From 85b619b2f02fd92155730677cf4899808bb800bc Mon Sep 17 00:00:00 2001 From: "Galantsev, Dmitrii" Date: Wed, 21 May 2025 18:40:15 -0500 Subject: [PATCH] Profiler - Add partition support NOTE: GPU ordering used is not the same as in HSA/HIP. GPUs are ordered via amdsmi and then GPU_ID fields are compared to map GPU partitions to each other. Change-Id: If379214f5281d7d5ee98515b3e5ba7affc2e2197 Signed-off-by: Galantsev, Dmitrii --- common/rdc_field.data | 40 +++-- include/rdc/rdc.h | 2 + include/rdc_modules/rdc_rocp/RdcRocpBase.h | 14 +- rdc_libs/rdc/src/RdcEmbeddedHandler.cc | 1 + rdc_libs/rdc/src/RdcMetricFetcherImpl.cc | 32 +++- rdc_libs/rdc/src/RdcSmiLib.cc | 2 +- rdc_libs/rdc/src/RdcWatchTableImpl.cc | 36 ---- rdc_libs/rdc/src/SmiUtils.cc | 4 + rdc_libs/rdc_modules/rdc_rocp/RdcRocpBase.cc | 178 ++++++++++++------- 9 files changed, 172 insertions(+), 137 deletions(-) diff --git a/common/rdc_field.data b/common/rdc_field.data index 95fbfaafee..6e447974ea 100644 --- a/common/rdc_field.data +++ b/common/rdc_field.data @@ -45,6 +45,7 @@ FLD_DESC_ENT(RDC_FI_REV_ID, "Revision ID of the device", FLD_DESC_ENT(RDC_FI_TARGET_GRAPHICS_VERSION, "GFX version of the device", "GFX", true) FLD_DESC_ENT(RDC_FI_NUM_OF_COMPUTE_UNITS, "Number of Compute Units", "COMPUTE_UNITS", true) FLD_DESC_ENT(RDC_FI_UUID, "Unique ID of the device AKA asic_serial", "UUID", true) +FLD_DESC_ENT(RDC_FI_GPU_PARTITION_COUNT, "GPU partition count", "PARTITION_COUNT", true) FLD_DESC_ENT(RDC_FI_GPU_CLOCK, "Current GPU clock frequencies", "GPU_CLOCK", true) FLD_DESC_ENT(RDC_FI_MEM_CLOCK, "Current Memory clock frequencies", "MEM_CLOCK", true) @@ -136,25 +137,25 @@ FLD_DESC_ENT(RDC_FI_XGMI_TOTAL_WRITE_KB, "XGMI accumlated data write size acr // This doesn't map to rocprofiler counters directly // See counter_map in rdc/include/rdc_libs/rdc_modules/rdc_rocp/RdcRocpBase.h // See metrics.xml in rocprofiler -FLD_DESC_ENT(RDC_FI_PROF_OCCUPANCY_PERCENT, "Percent of GPU occupancy", "OCCUPANCY_PERCENT", false) 
-FLD_DESC_ENT(RDC_FI_PROF_ACTIVE_CYCLES, "Number of Active Cycles", "ACTIVE_CYCLES", false) -FLD_DESC_ENT(RDC_FI_PROF_ACTIVE_WAVES, "Number of Active Waves", "ACTIVE_WAVES", false) -FLD_DESC_ENT(RDC_FI_PROF_ELAPSED_CYCLES, "Number of Elapsed Cycles over all SMs", "ELAPSED_CYCLES", false) -FLD_DESC_ENT(RDC_FI_PROF_TENSOR_ACTIVE_PERCENT, "Percent of Active Pipe Tensors", "TENSOR_PERCENT", false) -FLD_DESC_ENT(RDC_FI_PROF_GPU_UTIL_PERCENT, "Percent of GPU Utilization", "GPU_UTIL_PERCENT", false) +FLD_DESC_ENT(RDC_FI_PROF_OCCUPANCY_PERCENT, "Percent of GPU occupancy", "OCCUPANCY_PERCENT", true) +FLD_DESC_ENT(RDC_FI_PROF_ACTIVE_CYCLES, "Number of Active Cycles", "ACTIVE_CYCLES", true) +FLD_DESC_ENT(RDC_FI_PROF_ACTIVE_WAVES, "Number of Active Waves", "ACTIVE_WAVES", true) +FLD_DESC_ENT(RDC_FI_PROF_ELAPSED_CYCLES, "Number of Elapsed Cycles over all SMs", "ELAPSED_CYCLES", true) +FLD_DESC_ENT(RDC_FI_PROF_TENSOR_ACTIVE_PERCENT, "Percent of Active Pipe Tensors", "TENSOR_PERCENT", true) +FLD_DESC_ENT(RDC_FI_PROF_GPU_UTIL_PERCENT, "Percent of GPU Utilization", "GPU_UTIL_PERCENT", true) // metrics with EVAL are divided by time passed -FLD_DESC_ENT(RDC_FI_PROF_EVAL_MEM_R_BW, "Fetched from video memory kb / ms", "MEM_R_BW", false) -FLD_DESC_ENT(RDC_FI_PROF_EVAL_MEM_W_BW, "Written to video memory kb / ms", "MEM_W_BW", false) -FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_16, "Number of fp16 OPS / ms", "FLOPS_16", false) -FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_32, "Number of fp32 OPS / ms", "FLOPS_32", false) -FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_64, "Number of fp64 OPS / ms", "FLOPS_64", false) -FLD_DESC_ENT(RDC_FI_PROF_VALU_PIPE_ISSUE_UTIL, "Percent of Active Pipe VALU", "VALU_UTILIZATION", false) -FLD_DESC_ENT(RDC_FI_PROF_SM_ACTIVE, "Ratio of Cycles with active warp on SM","VALUBusy", false) -FLD_DESC_ENT(RDC_FI_PROF_OCC_PER_ACTIVE_CU, "Mean occ per active compute unit", "OCC_CU", false) -FLD_DESC_ENT(RDC_FI_PROF_OCC_ELAPSED, "Mean occ per active cu over elapsed", "OCC_CU_ELAPSED", false) 
-FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_16_PERCENT, "Number of fp16 OPS percent of max", "FLOPS_16_PERCENT", false) -FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_32_PERCENT, "Number of fp32 OPS percent of max", "FLOPS_32_PERCENT", false) -FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_64_PERCENT, "Number of fp64 OPS percent of max", "FLOPS_64_PERCENT", false) +FLD_DESC_ENT(RDC_FI_PROF_EVAL_MEM_R_BW, "Fetched from video memory kb / ms", "MEM_R_BW", true) +FLD_DESC_ENT(RDC_FI_PROF_EVAL_MEM_W_BW, "Written to video memory kb / ms", "MEM_W_BW", true) +FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_16, "Number of fp16 OPS / ms", "FLOPS_16", true) +FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_32, "Number of fp32 OPS / ms", "FLOPS_32", true) +FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_64, "Number of fp64 OPS / ms", "FLOPS_64", true) +FLD_DESC_ENT(RDC_FI_PROF_VALU_PIPE_ISSUE_UTIL, "Percent of Active Pipe VALU", "VALU_UTILIZATION", true) +FLD_DESC_ENT(RDC_FI_PROF_SM_ACTIVE, "Ratio of Cycles with active warp on SM","VALUBusy", true) +FLD_DESC_ENT(RDC_FI_PROF_OCC_PER_ACTIVE_CU, "Mean occ per active compute unit", "OCC_CU", true) +FLD_DESC_ENT(RDC_FI_PROF_OCC_ELAPSED, "Mean occ per active cu over elapsed", "OCC_CU_ELAPSED", true) +FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_16_PERCENT, "Number of fp16 OPS percent of max", "FLOPS_16_PERCENT", true) +FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_32_PERCENT, "Number of fp32 OPS percent of max", "FLOPS_32_PERCENT", true) +FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_64_PERCENT, "Number of fp64 OPS percent of max", "FLOPS_64_PERCENT", true) // CPC FLD_DESC_ENT(RDC_FI_PROF_CPC_CPC_STAT_BUSY, "", "CPC_CPC_STAT_BUSY", false) FLD_DESC_ENT(RDC_FI_PROF_CPC_CPC_STAT_IDLE, "", "CPC_CPC_STAT_IDLE", false) @@ -194,7 +195,8 @@ FLD_DESC_ENT(RDC_FI_PROF_CPF_CPF_TCIU_IDLE, "", "CPF_CPF_TCIU_I FLD_DESC_ENT(RDC_FI_PROF_CPF_CPF_TCIU_STALL, "", "CPF_CPF_TCIU_STALL", false) // Misc FLD_DESC_ENT(RDC_FI_PROF_SIMD_UTILIZATION, "Fraction of time the SIMDs are being utilized", "SIMD_UTILIZATION", false) 
-FLD_DESC_ENT(RDC_FI_PROF_UUID, "UUID from rocprofiler", "PROF_UUID", false) +FLD_DESC_ENT(RDC_FI_PROF_UUID, "UUID from rocprofiler", "PROF_UUID", true) +FLD_DESC_ENT(RDC_FI_PROF_KFD_ID, "GPU_ID from rocprofiler, same as KFD_ID", "PROF_KFD_ID", true) // Events FLD_DESC_ENT(RDC_EVNT_XGMI_0_NOP_TX, "NOPs sent to neighbor 0", "XGMI_NOP_0", false) diff --git a/include/rdc/rdc.h b/include/rdc/rdc.h index c6f655cf85..c975e6503b 100644 --- a/include/rdc/rdc.h +++ b/include/rdc/rdc.h @@ -173,6 +173,7 @@ typedef enum { RDC_FI_TARGET_GRAPHICS_VERSION, //!< Target graphics version RDC_FI_NUM_OF_COMPUTE_UNITS, //!< Number of compute units RDC_FI_UUID, //!< Device UUID + RDC_FI_GPU_PARTITION_COUNT, /** * @brief Frequency related fields @@ -344,6 +345,7 @@ typedef enum { RDC_FI_PROF_CPF_CPF_TCIU_STALL, RDC_FI_PROF_SIMD_UTILIZATION, RDC_FI_PROF_UUID, + RDC_FI_PROF_KFD_ID, /** * @brief Raw XGMI counter events diff --git a/include/rdc_modules/rdc_rocp/RdcRocpBase.h b/include/rdc_modules/rdc_rocp/RdcRocpBase.h index ccf71ab9df..55b63e0e5a 100644 --- a/include/rdc_modules/rdc_rocp/RdcRocpBase.h +++ b/include/rdc_modules/rdc_rocp/RdcRocpBase.h @@ -69,18 +69,22 @@ class RdcRocpBase { */ static const uint32_t collection_duration_us_k = 10000; - double read_feature(rocprofiler_record_counter_t* record, uint32_t gpu_index); - /** * @brief By default all profiler values are read as doubles */ - double run_profiler(uint32_t gpu_index, rdc_field_t field); - void map_smi_to_profiler_by_uuid(); + double run_profiler(uint32_t agent_index, rdc_field_t field); + + /** + * @brief Create a map from entity_id to profiler agent_index. + * This is required due to different structure and ordering. + * Populates entity_to_prof_map. 
+ */ + rdc_status_t map_entity_to_profiler(); std::vector agents = {}; std::vector> samplers = {}; std::map field_to_metric = {}; - std::map smi_to_profiler_map = {}; + std::map entity_to_prof_map = {}; // these fields must be divided by time passed std::unordered_set eval_fields = { diff --git a/rdc_libs/rdc/src/RdcEmbeddedHandler.cc b/rdc_libs/rdc/src/RdcEmbeddedHandler.cc index 05023cb86d..f989345878 100644 --- a/rdc_libs/rdc/src/RdcEmbeddedHandler.cc +++ b/rdc_libs/rdc/src/RdcEmbeddedHandler.cc @@ -178,6 +178,7 @@ rdc_status_t RdcEmbeddedHandler::rdc_device_get_all(uint32_t gpu_index_list[RDC_ if (!count) { return RDC_ST_BAD_PARAMETER; } + rdc_field_value device_count; rdc_status_t status = metric_fetcher_->fetch_smi_field(0, RDC_FI_GPU_COUNT, &device_count); if (status != RDC_ST_OK) { diff --git a/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc b/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc index 11c262b6f9..5a11ec7ed5 100644 --- a/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc +++ b/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc @@ -26,6 +26,8 @@ THE SOFTWARE. 
#include #include //NOLINT +#include +#include #include #include @@ -86,7 +88,7 @@ RdcMetricFetcherImpl::~RdcMetricFetcherImpl() { } uint64_t RdcMetricFetcherImpl::now() { - struct timeval tv; + struct timeval tv {}; gettimeofday(&tv, NULL); return static_cast(tv.tv_sec) * 1000 + tv.tv_usec / 1000; } @@ -98,6 +100,7 @@ void RdcMetricFetcherImpl::get_ecc(uint32_t gpu_index, rdc_field_t field_id, amdsmi_processor_handle processor_handle; err = get_processor_handle_from_id(gpu_index, &processor_handle); + assert(err == AMDSMI_STATUS_SUCCESS); // because RDC already had an established order that is different from amd-smi : map blocks to // fields manually @@ -521,9 +524,9 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field case RDC_FI_GPU_CLOCK: { const uint16_t* clock_array = gpu_metrics.current_gfxclks; std::vector valid_clocks; - valid_clocks.reserve(8); + valid_clocks.reserve(AMDSMI_MAX_NUM_GFX_CLKS); - for (uint32_t i = 0; i < 8; i++) { + for (uint32_t i = 0; i < AMDSMI_MAX_NUM_GFX_CLKS; i++) { uint16_t clk = clock_array[i]; if (clk != 0 && clk != 0xFFFF) { valid_clocks.push_back(clk); @@ -540,7 +543,7 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field } if (vc == num_partitions) { - value->value.l_int = clock_array[info.instance_index] * 1000000; + value->value.l_int = static_cast(clock_array[info.instance_index]) * 1000000; value->type = INTEGER; value->status = RDC_ST_OK; return RDC_ST_OK; @@ -620,10 +623,12 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field } default: - // All other fields => N/A for partition - RDC_LOG(RDC_DEBUG, "Partition " << gpu_index << ": Field " << field_id_string(field_id) - << " not supported => NO_DATA."); - return RDC_ST_NO_DATA; + // for now we must let other plugins return valid data for partition metrics + + // TODO: All other fields => N/A for partition IN AMDSMI + // RDC_LOG(RDC_DEBUG, "Partition " << gpu_index << ": Field " << 
field_id_string(field_id) + // << " not supported => NO_DATA."); + break; } } // end if partition @@ -748,6 +753,17 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field value->value.l_int = static_cast(socket_count); } } break; + case RDC_FI_GPU_PARTITION_COUNT: { + uint32_t partition_count = 0; + amdsmi_gpu_metrics_t metrics; + memset(&metrics, 0, sizeof(metrics)); + value->status = get_metrics_info(processor_handle, &metrics); + partition_count = metrics.num_partition; + value->type = INTEGER; + if (value->status == AMDSMI_STATUS_SUCCESS) { + value->value.l_int = static_cast(partition_count); + } + } break; case RDC_FI_POWER_USAGE: { amdsmi_power_info_t power_info = {}; // Handle API breaking change in amdsmi commit dc4a16da6fb45d581a6e23c78d340172989418a0 diff --git a/rdc_libs/rdc/src/RdcSmiLib.cc b/rdc_libs/rdc/src/RdcSmiLib.cc index 37db7207b6..ef4d5d53b8 100644 --- a/rdc_libs/rdc/src/RdcSmiLib.cc +++ b/rdc_libs/rdc/src/RdcSmiLib.cc @@ -186,7 +186,7 @@ rdc_status_t RdcSmiLib::rdc_telemetry_fields_query(uint32_t field_ids[MAX_NUM_FI RDC_FI_GPU_MEMORY_MAX_BANDWIDTH, RDC_FI_GPU_MEMORY_CUR_BANDWIDTH, RDC_FI_GPU_BUSY_PERCENT, RDC_FI_GPU_PAGE_RETRIED, RDC_FI_DEV_ID, RDC_FI_REV_ID, RDC_FI_TARGET_GRAPHICS_VERSION, - RDC_FI_NUM_OF_COMPUTE_UNITS, RDC_FI_UUID, + RDC_FI_NUM_OF_COMPUTE_UNITS, RDC_FI_UUID, RDC_FI_GPU_PARTITION_COUNT, }; // clang-format on std::copy(fields.begin(), fields.end(), field_ids); diff --git a/rdc_libs/rdc/src/RdcWatchTableImpl.cc b/rdc_libs/rdc/src/RdcWatchTableImpl.cc index fc2c5de698..1c1d50bee4 100644 --- a/rdc_libs/rdc/src/RdcWatchTableImpl.cc +++ b/rdc_libs/rdc/src/RdcWatchTableImpl.cc @@ -228,42 +228,6 @@ rdc_status_t RdcWatchTableImpl::rdc_field_watch(rdc_gpu_group_t group_id, return result; } - // Check for rocprof fields in partitions - rdc_group_info_t ginfo; - result = group_settings_->rdc_group_gpu_get_info(group_id, &ginfo); - if (result != RDC_ST_OK) { - return result; - } - bool groupHasPartition = 
false; - for (unsigned int i = 0; i < ginfo.count; i++) { - uint32_t entityId = ginfo.entity_ids[i]; - rdc_entity_info_t info = rdc_get_info_from_entity_index(entityId); - if (info.entity_role == RDC_DEVICE_ROLE_PARTITION_INSTANCE) { - groupHasPartition = true; - break; - } - } - - rdc_field_group_info_t field_info; - result = group_settings_->rdc_group_field_get_info(field_group_id, &field_info); - if (result != RDC_ST_OK) { - return result; - } - bool groupHasRocprof = false; - if (result == RDC_ST_OK) { - for (unsigned int i = 0; i < field_info.count; i++) { - rdc_field_t fid = field_info.field_ids[i]; - if (fid >= 800 && fid < 900) { // Rocprof fields in the 800's - groupHasRocprof = true; - break; - } - } - } - - if (groupHasPartition && groupHasRocprof) { - return RDC_ST_NOT_SUPPORTED; - } - // See if any of the fields are notification fields, and // set them up, if so. result = notifications_->set_listen_events(fields_in_watch); diff --git a/rdc_libs/rdc/src/SmiUtils.cc b/rdc_libs/rdc/src/SmiUtils.cc index cd555f1160..2fb5023d2b 100644 --- a/rdc_libs/rdc/src/SmiUtils.cc +++ b/rdc_libs/rdc/src/SmiUtils.cc @@ -240,6 +240,10 @@ amdsmi_status_t get_num_partition(uint32_t index, uint16_t* num_partition) { return ret; } + if (num_partition == nullptr) { + return AMDSMI_STATUS_INVAL; + } + amdsmi_gpu_metrics_t metrics; memset(&metrics, 0, sizeof(metrics)); ret = get_metrics_info(proc_handle, &metrics); diff --git a/rdc_libs/rdc_modules/rdc_rocp/RdcRocpBase.cc b/rdc_libs/rdc_modules/rdc_rocp/RdcRocpBase.cc index a904c88176..4b18147987 100644 --- a/rdc_libs/rdc_modules/rdc_rocp/RdcRocpBase.cc +++ b/rdc_libs/rdc_modules/rdc_rocp/RdcRocpBase.cc @@ -51,12 +51,12 @@ THE SOFTWARE. 
namespace amd { namespace rdc { -double RdcRocpBase::run_profiler(uint32_t gpu_index, rdc_field_t field) { +double RdcRocpBase::run_profiler(uint32_t agent_index, rdc_field_t field) { thread_local std::vector records; - auto counter_sampler = CounterSampler::get_samplers()[gpu_index]; + auto counter_sampler = CounterSampler::get_samplers()[agent_index]; if (!counter_sampler) { - RDC_LOG(RDC_ERROR, "Error: Counter sampler not found for GPU index " << gpu_index); + RDC_LOG(RDC_ERROR, "Error: Counter sampler not found for GPU index " << agent_index); return RDC_ST_BAD_PARAMETER; } @@ -116,53 +116,88 @@ std::string uuid_to_string(const uint64_t uuid) { std::string uuid_to_string(const rocprofiler_uuid_t& uuid) { return uuid_to_string(uuid.value); } -void RdcRocpBase::map_smi_to_profiler_by_uuid() { - std::map index_to_prof_map; - std::map index_to_smi_map; +rdc_status_t RdcRocpBase::map_entity_to_profiler() { + // std::map entity_to_index_map; + // kfd_id_t is only used inside this function + typedef uint64_t kfd_id_t; + std::map prof_kfd_map; - // find intersection of supported and requested fields - for (uint32_t gpu_index = 0; gpu_index < agents.size(); gpu_index++) { - index_to_prof_map.insert({gpu_index, agents[gpu_index].uuid}); - - amdsmi_processor_handle processor_handle = nullptr; - auto amdsmi_status = get_processor_handle_from_id(gpu_index, &processor_handle); - if (amdsmi_status != AMDSMI_STATUS_SUCCESS) { - continue; - } - amdsmi_asic_info_t asic_info; - amdsmi_status = amdsmi_get_gpu_asic_info(processor_handle, &asic_info); - if (amdsmi_status != AMDSMI_STATUS_SUCCESS) { - continue; - } - rocprofiler_uuid_t temp_id = asic_serial_to_uuid(asic_info.asic_serial); - index_to_smi_map.insert({gpu_index, temp_id}); - - // clang-format off - RDC_LOG(RDC_DEBUG, "\n" - "ID[" << gpu_index << "]:\n" - " PROF: " << uuid_to_string(index_to_prof_map[gpu_index]) << "\n" - " SMI: " << uuid_to_string(index_to_smi_map[gpu_index])); - // clang-format on + // populate 
profiler map + for (uint32_t prof_gpu_index = 0; prof_gpu_index < agents.size(); prof_gpu_index++) { + prof_kfd_map.insert({prof_gpu_index, agents[prof_gpu_index].gpu_id}); } - // Create a mapping from SMI to ROCProfiler by comparing uuid - for (const auto& [smi_index, smi_uuid] : index_to_smi_map) { - for (const auto& [prof_index, prof_uuid] : index_to_prof_map) { - if (std::memcmp(&smi_uuid, &prof_uuid, sizeof(rocprofiler_uuid_t)) == 0) { - // match found - smi_to_profiler_map[smi_index] = prof_index; - break; + std::vector sockets; + auto amdsmi_status = get_socket_handles(sockets); + if (amdsmi_status != AMDSMI_STATUS_SUCCESS) { + RDC_LOG(RDC_ERROR, "Failed to get socket handles: " << amdsmi_status); + return Smi2RdcError(amdsmi_status); + } + + for (int socket_index = 0; socket_index < sockets.size(); socket_index++) { + auto* socket = sockets[socket_index]; + std::vector processors; + amdsmi_status = get_processor_handles(socket, processors); + if (amdsmi_status != AMDSMI_STATUS_SUCCESS) { + RDC_LOG(RDC_ERROR, "Failed to get processor handles for socket " << socket_index << ": " + << amdsmi_status); + return Smi2RdcError(amdsmi_status); + } + + for (int processor_index = 0; processor_index < processors.size(); processor_index++) { + auto* processor = processors[processor_index]; + processor_type_t processor_type = AMDSMI_PROCESSOR_TYPE_UNKNOWN; + amdsmi_status = amdsmi_get_processor_type(processor, &processor_type); + if (amdsmi_status != AMDSMI_STATUS_SUCCESS) { + RDC_LOG(RDC_ERROR, "Failed to get processor type for processor " + << processor_index << " on socket " << socket_index << ": " + << amdsmi_status); + return Smi2RdcError(amdsmi_status); + } + if (processor_type != AMDSMI_PROCESSOR_TYPE_AMD_GPU) { + continue; + } + + amdsmi_kfd_info_t kfd_info; + amdsmi_status = amdsmi_get_gpu_kfd_info(processor, &kfd_info); + if (amdsmi_status != AMDSMI_STATUS_SUCCESS) { + RDC_LOG(RDC_ERROR, "Failed to get KFD info for processor " + << processor_index << " on 
socket " << socket_index << ": " + << amdsmi_status); + return Smi2RdcError(amdsmi_status); + } + + rdc_entity_info_t entity_info = { + .device_index = static_cast(socket_index), + .instance_index = static_cast(processor_index), + .entity_role = RDC_DEVICE_ROLE_PHYSICAL, + .device_type = RDC_DEVICE_TYPE_GPU, + }; + + uint32_t entity_index = rdc_get_entity_index_from_info(entity_info); + + for (const auto& [prof_index, prof_id] : prof_kfd_map) { + if (std::memcmp(&kfd_info.kfd_id, &prof_id, sizeof(kfd_id_t)) == 0) { + // match found + // clang-format off + RDC_LOG(RDC_DEBUG, "SMI[" << entity_index << "] <-> Profiler[" << prof_index << "] = KFD_ID[" << prof_id << "]"); + // clang-format on + if (entity_info.entity_role == RDC_DEVICE_ROLE_PHYSICAL) { + entity_index = rdc_get_entity_index_from_info(entity_info); + entity_to_prof_map.insert({entity_index, prof_index}); + } + if (processors.size() > 1) { + // if there are multiple processors, also add entity with partition instance type + entity_info.entity_role = RDC_DEVICE_ROLE_PARTITION_INSTANCE; + entity_index = rdc_get_entity_index_from_info(entity_info); + entity_to_prof_map.insert({entity_index, prof_index}); + } + break; + } } } } - - for (const auto& [smi_index, prof_index] : smi_to_profiler_map) { - const auto& prof_uuid = index_to_prof_map[prof_index]; - const auto& smi_uuid = index_to_smi_map[smi_index]; - RDC_LOG(RDC_DEBUG, "SMI index " << smi_index << " maps to ROCProfiler index " << prof_index - << " with UUID: " << uuid_to_string(prof_uuid) << " = " - << uuid_to_string(smi_uuid)); - } + return RDC_ST_OK; } RdcRocpBase::RdcRocpBase() { @@ -226,7 +261,8 @@ RdcRocpBase::RdcRocpBase() { {RDC_FI_PROF_CPF_CPF_TCIU_IDLE, "CPF_CPF_TCIU_IDLE"}, {RDC_FI_PROF_CPF_CPF_TCIU_STALL, "CPF_CPF_TCIU_STALL"}, {RDC_FI_PROF_SIMD_UTILIZATION, "SIMD_UTILIZATION"}, - {RDC_FI_PROF_UUID, "SQ_WAVES"}, // dummy value, + {RDC_FI_PROF_UUID, "SQ_WAVES"}, // dummy value, + {RDC_FI_PROF_KFD_ID, "SQ_WAVES"}, // dummy value, }; 
hsa_status_t status = hsa_init(); @@ -251,23 +287,22 @@ RdcRocpBase::RdcRocpBase() { RDC_LOG(RDC_DEBUG, "Agent count: " << agents.size()); samplers = CounterSampler::get_samplers(); - map_smi_to_profiler_by_uuid(); + map_entity_to_profiler(); // find intersection of supported and requested fields - for (uint32_t gpu_index = 0; gpu_index < agents.size(); gpu_index++) { - auto& cs = *samplers[gpu_index]; - RDC_LOG(RDC_DEBUG, "gpu_index[" << gpu_index << "] = node_id[" << agents[gpu_index].node_id - << "] agent_id[" << agents[gpu_index].id.handle << "]"); + uint32_t agent_index = 0; + auto& cs = *samplers[agent_index]; + RDC_LOG(RDC_DEBUG, "agent_index[" << agent_index << "] location_id[" + << agents[agent_index].location_id << "]"); - for (auto& [str, id] : CounterSampler::get_supported_counters(cs.get_agent())) { - checked_fields.emplace_back(str); - } + for (auto& [str, id] : CounterSampler::get_supported_counters(cs.get_agent())) { + checked_fields.emplace_back(str); + } - for (const auto& [k, v] : temp_field_map_k) { - auto found = std::find(checked_fields.begin(), checked_fields.end(), v); - if (found != checked_fields.end()) { - field_to_metric.insert({k, v}); - } + for (const auto& [k, v] : temp_field_map_k) { + auto found = std::find(checked_fields.begin(), checked_fields.end(), v); + if (found != checked_fields.end()) { + field_to_metric.insert({k, v}); } } @@ -276,7 +311,7 @@ RdcRocpBase::RdcRocpBase() { all_fields.emplace_back(v); } - RDC_LOG(RDC_DEBUG, "Rocprofiler supports " << field_to_metric.size() << " fields"); + RDC_LOG(RDC_DEBUG, "Profiler supports " << field_to_metric.size() << " fields"); } RdcRocpBase::~RdcRocpBase() { @@ -292,7 +327,8 @@ rdc_status_t RdcRocpBase::rocp_lookup(rdc_gpu_field_t gpu_field, rdc_field_value // default type *type = DOUBLE; - const auto& gpu_index = smi_to_profiler_map[gpu_field.gpu_index]; + // convert from entity to flat index + uint32_t agent_index = entity_to_prof_map[gpu_field.gpu_index]; const auto& field = 
gpu_field.field_id; if (data == nullptr) { @@ -303,7 +339,7 @@ rdc_status_t RdcRocpBase::rocp_lookup(rdc_gpu_field_t gpu_field, rdc_field_value const auto start_time = std::chrono::high_resolution_clock::now(); // direct read from rocprofiler - const double read_dbl = run_profiler(gpu_index, field); + const double read_dbl = run_profiler(agent_index, field); const auto stop_time = std::chrono::high_resolution_clock::now(); const double elapsed = std::chrono::duration(stop_time - start_time).count(); // divide by elapsed time if needed @@ -330,8 +366,8 @@ rdc_status_t RdcRocpBase::rocp_lookup(rdc_gpu_field_t gpu_field, rdc_field_value // function const double active_cycles_val = read_dbl; if (active_cycles_val != 0.0) { - // read second value from rocprofiler - const double occupancy_val = run_profiler(gpu_index, RDC_FI_PROF_OCC_PER_ACTIVE_CU); + // read second value from profiler + const double occupancy_val = run_profiler(agent_index, RDC_FI_PROF_OCC_PER_ACTIVE_CU); data->dbl = occupancy_val / active_cycles_val; } else { return RDC_ST_BAD_PARAMETER; @@ -343,14 +379,14 @@ rdc_status_t RdcRocpBase::rocp_lookup(rdc_gpu_field_t gpu_field, rdc_field_value return RDC_ST_BAD_PARAMETER; } // 1024, 2048, and 256 are taken from "INTRODUCING AMD CDNA 3 ARCHITECTURE" white paper - const std::string target_version = agents[gpu_index].name; + const std::string target_version = agents[agent_index].name; // TODO: Design a lookup table for other GPUs const bool isMI200 = (target_version.find("gfx90a") != std::string::npos); // FLOPS/clock/CU if (isMI200) { - data->dbl = divided_dbl / (1024.0F / static_cast(agents[gpu_index].simd_per_cu)); + data->dbl = divided_dbl / (1024.0F / static_cast(agents[agent_index].simd_per_cu)); } else { // Assume mi300 - data->dbl = divided_dbl / (2048.0F / static_cast(agents[gpu_index].simd_per_cu)); + data->dbl = divided_dbl / (2048.0F / static_cast(agents[agent_index].simd_per_cu)); } } break; case RDC_FI_PROF_EVAL_FLOPS_32_PERCENT: @@ -360,15 
+396,21 @@ rdc_status_t RdcRocpBase::rocp_lookup(rdc_gpu_field_t gpu_field, rdc_field_value return RDC_ST_BAD_PARAMETER; } // FLOPS/clock/CU - data->dbl = divided_dbl / (256.0F / static_cast(agents[gpu_index].simd_per_cu)); + data->dbl = divided_dbl / (256.0F / static_cast(agents[agent_index].simd_per_cu)); break; case RDC_FI_PROF_UUID: { // do not care what RDC_FI_PROF_UUID is mapped to. read value from agents *type = STRING; - std::string uuid_str = uuid_to_string(agents[gpu_index].uuid); + std::string uuid_str = uuid_to_string(agents[agent_index].uuid); strncpy_with_null(data->str, uuid_str.c_str(), uuid_str.length()); break; } + case RDC_FI_PROF_KFD_ID: { + // do not care what RDC_FI_PROF_KFD_ID is mapped to. read value from agents + *type = INTEGER; + data->l_int = agents[agent_index].gpu_id; + break; + } default: // only support default fallback for doubles assert(*type == DOUBLE);