Profiler - Add partition support
NOTE: The GPU ordering used here is not the same as in HSA/HIP. GPUs are enumerated via amdsmi, and then the GPU_ID fields (the amdsmi KFD ID and the profiler agent's gpu_id) are compared to map GPU partitions to each other.

Change-Id: If379214f5281d7d5ee98515b3e5ba7affc2e2197
Signed-off-by: Galantsev, Dmitrii <dmitrii.galantsev@amd.com>
このコミットが含まれているのは:
+21
-19
@@ -45,6 +45,7 @@ FLD_DESC_ENT(RDC_FI_REV_ID, "Revision ID of the device",
|
||||
FLD_DESC_ENT(RDC_FI_TARGET_GRAPHICS_VERSION, "GFX version of the device", "GFX", true)
|
||||
FLD_DESC_ENT(RDC_FI_NUM_OF_COMPUTE_UNITS, "Number of Compute Units", "COMPUTE_UNITS", true)
|
||||
FLD_DESC_ENT(RDC_FI_UUID, "Unique ID of the device AKA asic_serial", "UUID", true)
|
||||
FLD_DESC_ENT(RDC_FI_GPU_PARTITION_COUNT, "GPU partition count", "PARTITION_COUNT", true)
|
||||
|
||||
FLD_DESC_ENT(RDC_FI_GPU_CLOCK, "Current GPU clock frequencies", "GPU_CLOCK", true)
|
||||
FLD_DESC_ENT(RDC_FI_MEM_CLOCK, "Current Memory clock frequencies", "MEM_CLOCK", true)
|
||||
@@ -136,25 +137,25 @@ FLD_DESC_ENT(RDC_FI_XGMI_TOTAL_WRITE_KB, "XGMI accumlated data write size acr
|
||||
// This doesn't map to rocprofiler counters directly
|
||||
// See counter_map in rdc/include/rdc_libs/rdc_modules/rdc_rocp/RdcRocpBase.h
|
||||
// See metrics.xml in rocprofiler
|
||||
FLD_DESC_ENT(RDC_FI_PROF_OCCUPANCY_PERCENT, "Percent of GPU occupancy", "OCCUPANCY_PERCENT", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_ACTIVE_CYCLES, "Number of Active Cycles", "ACTIVE_CYCLES", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_ACTIVE_WAVES, "Number of Active Waves", "ACTIVE_WAVES", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_ELAPSED_CYCLES, "Number of Elapsed Cycles over all SMs", "ELAPSED_CYCLES", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_TENSOR_ACTIVE_PERCENT, "Percent of Active Pipe Tensors", "TENSOR_PERCENT", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_GPU_UTIL_PERCENT, "Percent of GPU Utilization", "GPU_UTIL_PERCENT", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_OCCUPANCY_PERCENT, "Percent of GPU occupancy", "OCCUPANCY_PERCENT", true)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_ACTIVE_CYCLES, "Number of Active Cycles", "ACTIVE_CYCLES", true)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_ACTIVE_WAVES, "Number of Active Waves", "ACTIVE_WAVES", true)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_ELAPSED_CYCLES, "Number of Elapsed Cycles over all SMs", "ELAPSED_CYCLES", true)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_TENSOR_ACTIVE_PERCENT, "Percent of Active Pipe Tensors", "TENSOR_PERCENT", true)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_GPU_UTIL_PERCENT, "Percent of GPU Utilization", "GPU_UTIL_PERCENT", true)
|
||||
// metrics with EVAL are divided by time passed
|
||||
FLD_DESC_ENT(RDC_FI_PROF_EVAL_MEM_R_BW, "Fetched from video memory kb / ms", "MEM_R_BW", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_EVAL_MEM_W_BW, "Written to video memory kb / ms", "MEM_W_BW", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_16, "Number of fp16 OPS / ms", "FLOPS_16", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_32, "Number of fp32 OPS / ms", "FLOPS_32", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_64, "Number of fp64 OPS / ms", "FLOPS_64", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_VALU_PIPE_ISSUE_UTIL, "Percent of Active Pipe VALU", "VALU_UTILIZATION", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_SM_ACTIVE, "Ratio of Cycles with active warp on SM","VALUBusy", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_OCC_PER_ACTIVE_CU, "Mean occ per active compute unit", "OCC_CU", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_OCC_ELAPSED, "Mean occ per active cu over elapsed", "OCC_CU_ELAPSED", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_16_PERCENT, "Number of fp16 OPS percent of max", "FLOPS_16_PERCENT", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_32_PERCENT, "Number of fp32 OPS percent of max", "FLOPS_32_PERCENT", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_64_PERCENT, "Number of fp64 OPS percent of max", "FLOPS_64_PERCENT", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_EVAL_MEM_R_BW, "Fetched from video memory kb / ms", "MEM_R_BW", true)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_EVAL_MEM_W_BW, "Written to video memory kb / ms", "MEM_W_BW", true)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_16, "Number of fp16 OPS / ms", "FLOPS_16", true)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_32, "Number of fp32 OPS / ms", "FLOPS_32", true)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_64, "Number of fp64 OPS / ms", "FLOPS_64", true)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_VALU_PIPE_ISSUE_UTIL, "Percent of Active Pipe VALU", "VALU_UTILIZATION", true)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_SM_ACTIVE, "Ratio of Cycles with active warp on SM","VALUBusy", true)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_OCC_PER_ACTIVE_CU, "Mean occ per active compute unit", "OCC_CU", true)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_OCC_ELAPSED, "Mean occ per active cu over elapsed", "OCC_CU_ELAPSED", true)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_16_PERCENT, "Number of fp16 OPS percent of max", "FLOPS_16_PERCENT", true)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_32_PERCENT, "Number of fp32 OPS percent of max", "FLOPS_32_PERCENT", true)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_64_PERCENT, "Number of fp64 OPS percent of max", "FLOPS_64_PERCENT", true)
|
||||
// CPC
|
||||
FLD_DESC_ENT(RDC_FI_PROF_CPC_CPC_STAT_BUSY, "", "CPC_CPC_STAT_BUSY", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_CPC_CPC_STAT_IDLE, "", "CPC_CPC_STAT_IDLE", false)
|
||||
@@ -194,7 +195,8 @@ FLD_DESC_ENT(RDC_FI_PROF_CPF_CPF_TCIU_IDLE, "", "CPF_CPF_TCIU_I
|
||||
FLD_DESC_ENT(RDC_FI_PROF_CPF_CPF_TCIU_STALL, "", "CPF_CPF_TCIU_STALL", false)
|
||||
// Misc
|
||||
FLD_DESC_ENT(RDC_FI_PROF_SIMD_UTILIZATION, "Fraction of time the SIMDs are being utilized", "SIMD_UTILIZATION", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_UUID, "UUID from rocprofiler", "PROF_UUID", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_UUID, "UUID from rocprofiler", "PROF_UUID", true)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_KFD_ID, "GPU_ID from rocprofiler, same as KFD_ID", "PROF_KFD_ID", true)
|
||||
|
||||
// Events
|
||||
FLD_DESC_ENT(RDC_EVNT_XGMI_0_NOP_TX, "NOPs sent to neighbor 0", "XGMI_NOP_0", false)
|
||||
|
||||
@@ -173,6 +173,7 @@ typedef enum {
|
||||
RDC_FI_TARGET_GRAPHICS_VERSION, //!< Target graphics version
|
||||
RDC_FI_NUM_OF_COMPUTE_UNITS, //!< Number of compute units
|
||||
RDC_FI_UUID, //!< Device UUID
|
||||
RDC_FI_GPU_PARTITION_COUNT,
|
||||
|
||||
/**
|
||||
* @brief Frequency related fields
|
||||
@@ -344,6 +345,7 @@ typedef enum {
|
||||
RDC_FI_PROF_CPF_CPF_TCIU_STALL,
|
||||
RDC_FI_PROF_SIMD_UTILIZATION,
|
||||
RDC_FI_PROF_UUID,
|
||||
RDC_FI_PROF_KFD_ID,
|
||||
|
||||
/**
|
||||
* @brief Raw XGMI counter events
|
||||
|
||||
@@ -69,18 +69,22 @@ class RdcRocpBase {
|
||||
*/
|
||||
static const uint32_t collection_duration_us_k = 10000;
|
||||
|
||||
double read_feature(rocprofiler_record_counter_t* record, uint32_t gpu_index);
|
||||
|
||||
/**
|
||||
* @brief By default all profiler values are read as doubles
|
||||
*/
|
||||
double run_profiler(uint32_t gpu_index, rdc_field_t field);
|
||||
void map_smi_to_profiler_by_uuid();
|
||||
double run_profiler(uint32_t agent_index, rdc_field_t field);
|
||||
|
||||
/**
|
||||
* @brief Create a map from entity_id to profiler agent_index.
|
||||
* This is required due to different structure and ordering.
|
||||
* Populates entity_to_prof_map.
|
||||
*/
|
||||
rdc_status_t map_entity_to_profiler();
|
||||
|
||||
std::vector<rocprofiler_agent_v0_t> agents = {};
|
||||
std::vector<std::shared_ptr<CounterSampler>> samplers = {};
|
||||
std::map<rdc_field_t, const char*> field_to_metric = {};
|
||||
std::map<uint32_t, uint32_t> smi_to_profiler_map = {};
|
||||
std::map<uint32_t, uint32_t> entity_to_prof_map = {};
|
||||
|
||||
// these fields must be divided by time passed
|
||||
std::unordered_set<rdc_field_t> eval_fields = {
|
||||
|
||||
@@ -178,6 +178,7 @@ rdc_status_t RdcEmbeddedHandler::rdc_device_get_all(uint32_t gpu_index_list[RDC_
|
||||
if (!count) {
|
||||
return RDC_ST_BAD_PARAMETER;
|
||||
}
|
||||
|
||||
rdc_field_value device_count;
|
||||
rdc_status_t status = metric_fetcher_->fetch_smi_field(0, RDC_FI_GPU_COUNT, &device_count);
|
||||
if (status != RDC_ST_OK) {
|
||||
|
||||
@@ -26,6 +26,8 @@ THE SOFTWARE.
|
||||
#include <sys/time.h>
|
||||
|
||||
#include <chrono> //NOLINT
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <set>
|
||||
#include <vector>
|
||||
|
||||
@@ -86,7 +88,7 @@ RdcMetricFetcherImpl::~RdcMetricFetcherImpl() {
|
||||
}
|
||||
|
||||
uint64_t RdcMetricFetcherImpl::now() {
|
||||
struct timeval tv;
|
||||
struct timeval tv {};
|
||||
gettimeofday(&tv, NULL);
|
||||
return static_cast<uint64_t>(tv.tv_sec) * 1000 + tv.tv_usec / 1000;
|
||||
}
|
||||
@@ -98,6 +100,7 @@ void RdcMetricFetcherImpl::get_ecc(uint32_t gpu_index, rdc_field_t field_id,
|
||||
|
||||
amdsmi_processor_handle processor_handle;
|
||||
err = get_processor_handle_from_id(gpu_index, &processor_handle);
|
||||
assert(err == AMDSMI_STATUS_SUCCESS);
|
||||
|
||||
// because RDC already had an established order that is different from amd-smi : map blocks to
|
||||
// fields manually
|
||||
@@ -521,9 +524,9 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field
|
||||
case RDC_FI_GPU_CLOCK: {
|
||||
const uint16_t* clock_array = gpu_metrics.current_gfxclks;
|
||||
std::vector<uint16_t> valid_clocks;
|
||||
valid_clocks.reserve(8);
|
||||
valid_clocks.reserve(AMDSMI_MAX_NUM_GFX_CLKS);
|
||||
|
||||
for (uint32_t i = 0; i < 8; i++) {
|
||||
for (uint32_t i = 0; i < AMDSMI_MAX_NUM_GFX_CLKS; i++) {
|
||||
uint16_t clk = clock_array[i];
|
||||
if (clk != 0 && clk != 0xFFFF) {
|
||||
valid_clocks.push_back(clk);
|
||||
@@ -540,7 +543,7 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field
|
||||
}
|
||||
|
||||
if (vc == num_partitions) {
|
||||
value->value.l_int = clock_array[info.instance_index] * 1000000;
|
||||
value->value.l_int = static_cast<int64_t>(clock_array[info.instance_index]) * 1000000;
|
||||
value->type = INTEGER;
|
||||
value->status = RDC_ST_OK;
|
||||
return RDC_ST_OK;
|
||||
@@ -620,10 +623,12 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field
|
||||
}
|
||||
|
||||
default:
|
||||
// All other fields => N/A for partition
|
||||
RDC_LOG(RDC_DEBUG, "Partition " << gpu_index << ": Field " << field_id_string(field_id)
|
||||
<< " not supported => NO_DATA.");
|
||||
return RDC_ST_NO_DATA;
|
||||
// for now we must let other plugins return valid data for partition metrics
|
||||
|
||||
// TODO: All other fields => N/A for partition IN AMDSMI
|
||||
// RDC_LOG(RDC_DEBUG, "Partition " << gpu_index << ": Field " << field_id_string(field_id)
|
||||
// << " not supported => NO_DATA.");
|
||||
break;
|
||||
}
|
||||
} // end if partition
|
||||
|
||||
@@ -748,6 +753,17 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field
|
||||
value->value.l_int = static_cast<int64_t>(socket_count);
|
||||
}
|
||||
} break;
|
||||
case RDC_FI_GPU_PARTITION_COUNT: {
|
||||
uint32_t partition_count = 0;
|
||||
amdsmi_gpu_metrics_t metrics;
|
||||
memset(&metrics, 0, sizeof(metrics));
|
||||
value->status = get_metrics_info(processor_handle, &metrics);
|
||||
partition_count = metrics.num_partition;
|
||||
value->type = INTEGER;
|
||||
if (value->status == AMDSMI_STATUS_SUCCESS) {
|
||||
value->value.l_int = static_cast<int64_t>(partition_count);
|
||||
}
|
||||
} break;
|
||||
case RDC_FI_POWER_USAGE: {
|
||||
amdsmi_power_info_t power_info = {};
|
||||
// Handle API breaking change in amdsmi commit dc4a16da6fb45d581a6e23c78d340172989418a0
|
||||
|
||||
@@ -186,7 +186,7 @@ rdc_status_t RdcSmiLib::rdc_telemetry_fields_query(uint32_t field_ids[MAX_NUM_FI
|
||||
RDC_FI_GPU_MEMORY_MAX_BANDWIDTH, RDC_FI_GPU_MEMORY_CUR_BANDWIDTH,
|
||||
RDC_FI_GPU_BUSY_PERCENT, RDC_FI_GPU_PAGE_RETRIED,
|
||||
RDC_FI_DEV_ID, RDC_FI_REV_ID, RDC_FI_TARGET_GRAPHICS_VERSION,
|
||||
RDC_FI_NUM_OF_COMPUTE_UNITS, RDC_FI_UUID,
|
||||
RDC_FI_NUM_OF_COMPUTE_UNITS, RDC_FI_UUID, RDC_FI_GPU_PARTITION_COUNT,
|
||||
};
|
||||
// clang-format on
|
||||
std::copy(fields.begin(), fields.end(), field_ids);
|
||||
|
||||
@@ -228,42 +228,6 @@ rdc_status_t RdcWatchTableImpl::rdc_field_watch(rdc_gpu_group_t group_id,
|
||||
return result;
|
||||
}
|
||||
|
||||
// Check for rocprof fields in partitions
|
||||
rdc_group_info_t ginfo;
|
||||
result = group_settings_->rdc_group_gpu_get_info(group_id, &ginfo);
|
||||
if (result != RDC_ST_OK) {
|
||||
return result;
|
||||
}
|
||||
bool groupHasPartition = false;
|
||||
for (unsigned int i = 0; i < ginfo.count; i++) {
|
||||
uint32_t entityId = ginfo.entity_ids[i];
|
||||
rdc_entity_info_t info = rdc_get_info_from_entity_index(entityId);
|
||||
if (info.entity_role == RDC_DEVICE_ROLE_PARTITION_INSTANCE) {
|
||||
groupHasPartition = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
rdc_field_group_info_t field_info;
|
||||
result = group_settings_->rdc_group_field_get_info(field_group_id, &field_info);
|
||||
if (result != RDC_ST_OK) {
|
||||
return result;
|
||||
}
|
||||
bool groupHasRocprof = false;
|
||||
if (result == RDC_ST_OK) {
|
||||
for (unsigned int i = 0; i < field_info.count; i++) {
|
||||
rdc_field_t fid = field_info.field_ids[i];
|
||||
if (fid >= 800 && fid < 900) { // Rocprof fields in the 800's
|
||||
groupHasRocprof = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (groupHasPartition && groupHasRocprof) {
|
||||
return RDC_ST_NOT_SUPPORTED;
|
||||
}
|
||||
|
||||
// See if any of the fields are notification fields, and
|
||||
// set them up, if so.
|
||||
result = notifications_->set_listen_events(fields_in_watch);
|
||||
|
||||
@@ -240,6 +240,10 @@ amdsmi_status_t get_num_partition(uint32_t index, uint16_t* num_partition) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (num_partition == nullptr) {
|
||||
return AMDSMI_STATUS_INVAL;
|
||||
}
|
||||
|
||||
amdsmi_gpu_metrics_t metrics;
|
||||
memset(&metrics, 0, sizeof(metrics));
|
||||
ret = get_metrics_info(proc_handle, &metrics);
|
||||
|
||||
@@ -51,12 +51,12 @@ THE SOFTWARE.
|
||||
namespace amd {
|
||||
namespace rdc {
|
||||
|
||||
double RdcRocpBase::run_profiler(uint32_t gpu_index, rdc_field_t field) {
|
||||
double RdcRocpBase::run_profiler(uint32_t agent_index, rdc_field_t field) {
|
||||
thread_local std::vector<rocprofiler_record_counter_t> records;
|
||||
|
||||
auto counter_sampler = CounterSampler::get_samplers()[gpu_index];
|
||||
auto counter_sampler = CounterSampler::get_samplers()[agent_index];
|
||||
if (!counter_sampler) {
|
||||
RDC_LOG(RDC_ERROR, "Error: Counter sampler not found for GPU index " << gpu_index);
|
||||
RDC_LOG(RDC_ERROR, "Error: Counter sampler not found for GPU index " << agent_index);
|
||||
return RDC_ST_BAD_PARAMETER;
|
||||
}
|
||||
|
||||
@@ -116,53 +116,88 @@ std::string uuid_to_string(const uint64_t uuid) {
|
||||
|
||||
std::string uuid_to_string(const rocprofiler_uuid_t& uuid) { return uuid_to_string(uuid.value); }
|
||||
|
||||
void RdcRocpBase::map_smi_to_profiler_by_uuid() {
|
||||
std::map<uint32_t, rocprofiler_uuid_t> index_to_prof_map;
|
||||
std::map<uint32_t, rocprofiler_uuid_t> index_to_smi_map;
|
||||
rdc_status_t RdcRocpBase::map_entity_to_profiler() {
|
||||
// std::map<uint32_t, uint32_t> entity_to_index_map;
|
||||
// kfd_id_t is only used inside this function
|
||||
typedef uint64_t kfd_id_t;
|
||||
std::map<uint32_t, kfd_id_t> prof_kfd_map;
|
||||
|
||||
// find intersection of supported and requested fields
|
||||
for (uint32_t gpu_index = 0; gpu_index < agents.size(); gpu_index++) {
|
||||
index_to_prof_map.insert({gpu_index, agents[gpu_index].uuid});
|
||||
|
||||
amdsmi_processor_handle processor_handle = nullptr;
|
||||
auto amdsmi_status = get_processor_handle_from_id(gpu_index, &processor_handle);
|
||||
if (amdsmi_status != AMDSMI_STATUS_SUCCESS) {
|
||||
continue;
|
||||
}
|
||||
amdsmi_asic_info_t asic_info;
|
||||
amdsmi_status = amdsmi_get_gpu_asic_info(processor_handle, &asic_info);
|
||||
if (amdsmi_status != AMDSMI_STATUS_SUCCESS) {
|
||||
continue;
|
||||
}
|
||||
rocprofiler_uuid_t temp_id = asic_serial_to_uuid(asic_info.asic_serial);
|
||||
index_to_smi_map.insert({gpu_index, temp_id});
|
||||
|
||||
// clang-format off
|
||||
RDC_LOG(RDC_DEBUG, "\n"
|
||||
"ID[" << gpu_index << "]:\n"
|
||||
" PROF: " << uuid_to_string(index_to_prof_map[gpu_index]) << "\n"
|
||||
" SMI: " << uuid_to_string(index_to_smi_map[gpu_index]));
|
||||
// clang-format on
|
||||
// populate profiler map
|
||||
for (uint32_t prof_gpu_index = 0; prof_gpu_index < agents.size(); prof_gpu_index++) {
|
||||
prof_kfd_map.insert({prof_gpu_index, agents[prof_gpu_index].gpu_id});
|
||||
}
|
||||
|
||||
// Create a mapping from SMI to ROCProfiler by comparing uuid
|
||||
for (const auto& [smi_index, smi_uuid] : index_to_smi_map) {
|
||||
for (const auto& [prof_index, prof_uuid] : index_to_prof_map) {
|
||||
if (std::memcmp(&smi_uuid, &prof_uuid, sizeof(rocprofiler_uuid_t)) == 0) {
|
||||
// match found
|
||||
smi_to_profiler_map[smi_index] = prof_index;
|
||||
break;
|
||||
std::vector<amdsmi_socket_handle> sockets;
|
||||
auto amdsmi_status = get_socket_handles(sockets);
|
||||
if (amdsmi_status != AMDSMI_STATUS_SUCCESS) {
|
||||
RDC_LOG(RDC_ERROR, "Failed to get socket handles: " << amdsmi_status);
|
||||
return Smi2RdcError(amdsmi_status);
|
||||
}
|
||||
|
||||
for (int socket_index = 0; socket_index < sockets.size(); socket_index++) {
|
||||
auto* socket = sockets[socket_index];
|
||||
std::vector<amdsmi_processor_handle> processors;
|
||||
amdsmi_status = get_processor_handles(socket, processors);
|
||||
if (amdsmi_status != AMDSMI_STATUS_SUCCESS) {
|
||||
RDC_LOG(RDC_ERROR, "Failed to get processor handles for socket " << socket_index << ": "
|
||||
<< amdsmi_status);
|
||||
return Smi2RdcError(amdsmi_status);
|
||||
}
|
||||
|
||||
for (int processor_index = 0; processor_index < processors.size(); processor_index++) {
|
||||
auto* processor = processors[processor_index];
|
||||
processor_type_t processor_type = AMDSMI_PROCESSOR_TYPE_UNKNOWN;
|
||||
amdsmi_status = amdsmi_get_processor_type(processor, &processor_type);
|
||||
if (amdsmi_status != AMDSMI_STATUS_SUCCESS) {
|
||||
RDC_LOG(RDC_ERROR, "Failed to get processor type for processor "
|
||||
<< processor_index << " on socket " << socket_index << ": "
|
||||
<< amdsmi_status);
|
||||
return Smi2RdcError(amdsmi_status);
|
||||
}
|
||||
if (processor_type != AMDSMI_PROCESSOR_TYPE_AMD_GPU) {
|
||||
continue;
|
||||
}
|
||||
|
||||
amdsmi_kfd_info_t kfd_info;
|
||||
amdsmi_status = amdsmi_get_gpu_kfd_info(processor, &kfd_info);
|
||||
if (amdsmi_status != AMDSMI_STATUS_SUCCESS) {
|
||||
RDC_LOG(RDC_ERROR, "Failed to get KFD info for processor "
|
||||
<< processor_index << " on socket " << socket_index << ": "
|
||||
<< amdsmi_status);
|
||||
return Smi2RdcError(amdsmi_status);
|
||||
}
|
||||
|
||||
rdc_entity_info_t entity_info = {
|
||||
.device_index = static_cast<uint32_t>(socket_index),
|
||||
.instance_index = static_cast<uint32_t>(processor_index),
|
||||
.entity_role = RDC_DEVICE_ROLE_PHYSICAL,
|
||||
.device_type = RDC_DEVICE_TYPE_GPU,
|
||||
};
|
||||
|
||||
uint32_t entity_index = rdc_get_entity_index_from_info(entity_info);
|
||||
|
||||
for (const auto& [prof_index, prof_id] : prof_kfd_map) {
|
||||
if (std::memcmp(&kfd_info.kfd_id, &prof_id, sizeof(kfd_id_t)) == 0) {
|
||||
// match found
|
||||
// clang-format off
|
||||
RDC_LOG(RDC_DEBUG, "SMI[" << entity_index << "] <-> Profiler[" << prof_index << "] = KFD_ID[" << prof_id << "]");
|
||||
// clang-format on
|
||||
if (entity_info.entity_role == RDC_DEVICE_ROLE_PHYSICAL) {
|
||||
entity_index = rdc_get_entity_index_from_info(entity_info);
|
||||
entity_to_prof_map.insert({entity_index, prof_index});
|
||||
}
|
||||
if (processors.size() > 1) {
|
||||
// if there are multiple processors, also add entity with partition instance type
|
||||
entity_info.entity_role = RDC_DEVICE_ROLE_PARTITION_INSTANCE;
|
||||
entity_index = rdc_get_entity_index_from_info(entity_info);
|
||||
entity_to_prof_map.insert({entity_index, prof_index});
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (const auto& [smi_index, prof_index] : smi_to_profiler_map) {
|
||||
const auto& prof_uuid = index_to_prof_map[prof_index];
|
||||
const auto& smi_uuid = index_to_smi_map[smi_index];
|
||||
RDC_LOG(RDC_DEBUG, "SMI index " << smi_index << " maps to ROCProfiler index " << prof_index
|
||||
<< " with UUID: " << uuid_to_string(prof_uuid) << " = "
|
||||
<< uuid_to_string(smi_uuid));
|
||||
}
|
||||
return RDC_ST_OK;
|
||||
}
|
||||
|
||||
RdcRocpBase::RdcRocpBase() {
|
||||
@@ -226,7 +261,8 @@ RdcRocpBase::RdcRocpBase() {
|
||||
{RDC_FI_PROF_CPF_CPF_TCIU_IDLE, "CPF_CPF_TCIU_IDLE"},
|
||||
{RDC_FI_PROF_CPF_CPF_TCIU_STALL, "CPF_CPF_TCIU_STALL"},
|
||||
{RDC_FI_PROF_SIMD_UTILIZATION, "SIMD_UTILIZATION"},
|
||||
{RDC_FI_PROF_UUID, "SQ_WAVES"}, // dummy value,
|
||||
{RDC_FI_PROF_UUID, "SQ_WAVES"}, // dummy value,
|
||||
{RDC_FI_PROF_KFD_ID, "SQ_WAVES"}, // dummy value,
|
||||
};
|
||||
|
||||
hsa_status_t status = hsa_init();
|
||||
@@ -251,23 +287,22 @@ RdcRocpBase::RdcRocpBase() {
|
||||
RDC_LOG(RDC_DEBUG, "Agent count: " << agents.size());
|
||||
samplers = CounterSampler::get_samplers();
|
||||
|
||||
map_smi_to_profiler_by_uuid();
|
||||
map_entity_to_profiler();
|
||||
|
||||
// find intersection of supported and requested fields
|
||||
for (uint32_t gpu_index = 0; gpu_index < agents.size(); gpu_index++) {
|
||||
auto& cs = *samplers[gpu_index];
|
||||
RDC_LOG(RDC_DEBUG, "gpu_index[" << gpu_index << "] = node_id[" << agents[gpu_index].node_id
|
||||
<< "] agent_id[" << agents[gpu_index].id.handle << "]");
|
||||
uint32_t agent_index = 0;
|
||||
auto& cs = *samplers[agent_index];
|
||||
RDC_LOG(RDC_DEBUG, "agent_index[" << agent_index << "] location_id["
|
||||
<< agents[agent_index].location_id << "]");
|
||||
|
||||
for (auto& [str, id] : CounterSampler::get_supported_counters(cs.get_agent())) {
|
||||
checked_fields.emplace_back(str);
|
||||
}
|
||||
for (auto& [str, id] : CounterSampler::get_supported_counters(cs.get_agent())) {
|
||||
checked_fields.emplace_back(str);
|
||||
}
|
||||
|
||||
for (const auto& [k, v] : temp_field_map_k) {
|
||||
auto found = std::find(checked_fields.begin(), checked_fields.end(), v);
|
||||
if (found != checked_fields.end()) {
|
||||
field_to_metric.insert({k, v});
|
||||
}
|
||||
for (const auto& [k, v] : temp_field_map_k) {
|
||||
auto found = std::find(checked_fields.begin(), checked_fields.end(), v);
|
||||
if (found != checked_fields.end()) {
|
||||
field_to_metric.insert({k, v});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -276,7 +311,7 @@ RdcRocpBase::RdcRocpBase() {
|
||||
all_fields.emplace_back(v);
|
||||
}
|
||||
|
||||
RDC_LOG(RDC_DEBUG, "Rocprofiler supports " << field_to_metric.size() << " fields");
|
||||
RDC_LOG(RDC_DEBUG, "Profiler supports " << field_to_metric.size() << " fields");
|
||||
}
|
||||
|
||||
RdcRocpBase::~RdcRocpBase() {
|
||||
@@ -292,7 +327,8 @@ rdc_status_t RdcRocpBase::rocp_lookup(rdc_gpu_field_t gpu_field, rdc_field_value
|
||||
// default type
|
||||
*type = DOUBLE;
|
||||
|
||||
const auto& gpu_index = smi_to_profiler_map[gpu_field.gpu_index];
|
||||
// convert from entity to flat index
|
||||
uint32_t agent_index = entity_to_prof_map[gpu_field.gpu_index];
|
||||
const auto& field = gpu_field.field_id;
|
||||
|
||||
if (data == nullptr) {
|
||||
@@ -303,7 +339,7 @@ rdc_status_t RdcRocpBase::rocp_lookup(rdc_gpu_field_t gpu_field, rdc_field_value
|
||||
|
||||
const auto start_time = std::chrono::high_resolution_clock::now();
|
||||
// direct read from rocprofiler
|
||||
const double read_dbl = run_profiler(gpu_index, field);
|
||||
const double read_dbl = run_profiler(agent_index, field);
|
||||
const auto stop_time = std::chrono::high_resolution_clock::now();
|
||||
const double elapsed = std::chrono::duration<double, std::milli>(stop_time - start_time).count();
|
||||
// divide by elapsed time if needed
|
||||
@@ -330,8 +366,8 @@ rdc_status_t RdcRocpBase::rocp_lookup(rdc_gpu_field_t gpu_field, rdc_field_value
|
||||
// function
|
||||
const double active_cycles_val = read_dbl;
|
||||
if (active_cycles_val != 0.0) {
|
||||
// read second value from rocprofiler
|
||||
const double occupancy_val = run_profiler(gpu_index, RDC_FI_PROF_OCC_PER_ACTIVE_CU);
|
||||
// read second value from profiler
|
||||
const double occupancy_val = run_profiler(agent_index, RDC_FI_PROF_OCC_PER_ACTIVE_CU);
|
||||
data->dbl = occupancy_val / active_cycles_val;
|
||||
} else {
|
||||
return RDC_ST_BAD_PARAMETER;
|
||||
@@ -343,14 +379,14 @@ rdc_status_t RdcRocpBase::rocp_lookup(rdc_gpu_field_t gpu_field, rdc_field_value
|
||||
return RDC_ST_BAD_PARAMETER;
|
||||
}
|
||||
// 1024, 2048, and 256 are taken from "INTRODUCING AMD CDNA 3 ARCHITECTURE" white paper
|
||||
const std::string target_version = agents[gpu_index].name;
|
||||
const std::string target_version = agents[agent_index].name;
|
||||
// TODO: Design a lookup table for other GPUs
|
||||
const bool isMI200 = (target_version.find("gfx90a") != std::string::npos);
|
||||
// FLOPS/clock/CU
|
||||
if (isMI200) {
|
||||
data->dbl = divided_dbl / (1024.0F / static_cast<double>(agents[gpu_index].simd_per_cu));
|
||||
data->dbl = divided_dbl / (1024.0F / static_cast<double>(agents[agent_index].simd_per_cu));
|
||||
} else { // Assume mi300
|
||||
data->dbl = divided_dbl / (2048.0F / static_cast<double>(agents[gpu_index].simd_per_cu));
|
||||
data->dbl = divided_dbl / (2048.0F / static_cast<double>(agents[agent_index].simd_per_cu));
|
||||
}
|
||||
} break;
|
||||
case RDC_FI_PROF_EVAL_FLOPS_32_PERCENT:
|
||||
@@ -360,15 +396,21 @@ rdc_status_t RdcRocpBase::rocp_lookup(rdc_gpu_field_t gpu_field, rdc_field_value
|
||||
return RDC_ST_BAD_PARAMETER;
|
||||
}
|
||||
// FLOPS/clock/CU
|
||||
data->dbl = divided_dbl / (256.0F / static_cast<double>(agents[gpu_index].simd_per_cu));
|
||||
data->dbl = divided_dbl / (256.0F / static_cast<double>(agents[agent_index].simd_per_cu));
|
||||
break;
|
||||
case RDC_FI_PROF_UUID: {
|
||||
// do not care what RDC_FI_PROF_UUID is mapped to. read value from agents
|
||||
*type = STRING;
|
||||
std::string uuid_str = uuid_to_string(agents[gpu_index].uuid);
|
||||
std::string uuid_str = uuid_to_string(agents[agent_index].uuid);
|
||||
strncpy_with_null(data->str, uuid_str.c_str(), uuid_str.length());
|
||||
break;
|
||||
}
|
||||
case RDC_FI_PROF_KFD_ID: {
|
||||
// do not care what RDC_FI_PROF_KFD_ID is mapped to. read value from agents
|
||||
*type = INTEGER;
|
||||
data->l_int = agents[agent_index].gpu_id;
|
||||
break;
|
||||
}
|
||||
default:
|
||||
// only support default fallback for doubles
|
||||
assert(*type == DOUBLE);
|
||||
|
||||
新しいイシューから参照
ユーザーをブロックする