diff --git a/projects/rdc/common/rdc_field.data b/projects/rdc/common/rdc_field.data index bdb904d658..1c0ace213e 100644 --- a/projects/rdc/common/rdc_field.data +++ b/projects/rdc/common/rdc_field.data @@ -46,6 +46,7 @@ FLD_DESC_ENT(RDC_FI_TARGET_GRAPHICS_VERSION, "GFX version of the device", FLD_DESC_ENT(RDC_FI_NUM_OF_COMPUTE_UNITS, "Number of Compute Units", "COMPUTE_UNITS", true) FLD_DESC_ENT(RDC_FI_UUID, "Unique ID of the device AKA asic_serial", "UUID", true) FLD_DESC_ENT(RDC_FI_GPU_PARTITION_COUNT, "GPU partition count", "PARTITION_COUNT", true) +FLD_DESC_ENT(RDC_FI_KFD_ID, "KFD_ID of GPU", "KFD_ID", true) FLD_DESC_ENT(RDC_FI_GPU_CLOCK, "Current GPU clock frequencies", "GPU_CLOCK", true) FLD_DESC_ENT(RDC_FI_MEM_CLOCK, "Current Memory clock frequencies", "MEM_CLOCK", true) @@ -198,7 +199,6 @@ FLD_DESC_ENT(RDC_FI_PROF_CPF_CPF_TCIU_IDLE, "", "CPF_CPF_TCIU_I FLD_DESC_ENT(RDC_FI_PROF_CPF_CPF_TCIU_STALL, "", "CPF_CPF_TCIU_STALL", false) // Misc FLD_DESC_ENT(RDC_FI_PROF_SIMD_UTILIZATION, "Fraction of time the SIMDs are being utilized", "SIMD_UTILIZATION", false) -FLD_DESC_ENT(RDC_FI_PROF_KFD_ID, "GPU_ID from rocprofiler, same as KFD_ID", "PROF_KFD_ID", true) // Events FLD_DESC_ENT(RDC_EVNT_XGMI_0_NOP_TX, "NOPs sent to neighbor 0", "XGMI_NOP_0", false) diff --git a/projects/rdc/include/rdc/rdc.h b/projects/rdc/include/rdc/rdc.h index 1d9b904d36..a7fe956c39 100644 --- a/projects/rdc/include/rdc/rdc.h +++ b/projects/rdc/include/rdc/rdc.h @@ -171,6 +171,7 @@ typedef enum { RDC_FI_NUM_OF_COMPUTE_UNITS, //!< Number of compute units RDC_FI_UUID, //!< Device UUID RDC_FI_GPU_PARTITION_COUNT, + RDC_FI_KFD_ID, //!< KFD ID of the GPU /** * @brief Frequency related fields @@ -344,7 +345,6 @@ typedef enum { RDC_FI_PROF_CPF_CPF_TCIU_IDLE, RDC_FI_PROF_CPF_CPF_TCIU_STALL, RDC_FI_PROF_SIMD_UTILIZATION, - RDC_FI_PROF_KFD_ID, /** * @brief Raw XGMI counter events diff --git a/projects/rdc/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc b/projects/rdc/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc index 
b0623b7eaf..753d1a2760 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc +++ b/projects/rdc/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc @@ -650,6 +650,15 @@ rdc_status_t RdcMetricFetcherImpl::fetch_gpu_field_(uint32_t gpu_index, rdc_fiel value->value.l_int = static_cast(partition_count); } } break; + case RDC_FI_KFD_ID: { + // Zero-initialized like power_info below; kfd_id is copied out only on success. + amdsmi_kfd_info_t kfd_info = {}; + value->status = amdsmi_get_gpu_kfd_info(processor_handle, &kfd_info); + value->type = INTEGER; + if (value->status == AMDSMI_STATUS_SUCCESS) { + value->value.l_int = static_cast(kfd_info.kfd_id); + } + } break; case RDC_FI_POWER_USAGE: { amdsmi_power_info_t power_info = {}; value->status = amdsmi_get_power_info(processor_handle, &power_info); diff --git a/projects/rdc/rdc_libs/rdc/src/RdcSmiLib.cc b/projects/rdc/rdc_libs/rdc/src/RdcSmiLib.cc index 080a2e4b73..84189e5d4f 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcSmiLib.cc +++ b/projects/rdc/rdc_libs/rdc/src/RdcSmiLib.cc @@ -188,11 +188,11 @@ rdc_status_t RdcSmiLib::rdc_telemetry_fields_query(uint32_t field_ids[MAX_NUM_FI RDC_FI_GPU_BUSY_PERCENT, RDC_FI_GPU_PAGE_RETRIED, RDC_FI_DEV_ID, RDC_FI_REV_ID, RDC_FI_TARGET_GRAPHICS_VERSION, RDC_FI_NUM_OF_COMPUTE_UNITS, RDC_FI_UUID, RDC_FI_GPU_PARTITION_COUNT, - RDC_FI_CPU_SKT_COUNT, RDC_FI_CPU_MODEL, RDC_FI_CPU_MODEL_ID, - RDC_FI_CPU_FAMILY, RDC_FI_CPU_CORES_PER_SKT, RDC_FI_CPU_SKT_ENERGY, - RDC_FI_CPU_HSMP_DRIVER_VERSION, RDC_FI_CPU_SMU_FW_VERSION, RDC_FI_CPU_HSMP_PROTO_VERSION, - RDC_FI_CPU_FCLK_FREQUENCY, RDC_FI_CPU_MCLK_FREQUENCY, RDC_FI_CPU_CCLK_LIMIT, - RDC_FI_CPU_SKT_ACTIVE_FREQ_LIMIT, + RDC_FI_KFD_ID, RDC_FI_CPU_SKT_COUNT, RDC_FI_CPU_MODEL, + RDC_FI_CPU_MODEL_ID, RDC_FI_CPU_FAMILY, RDC_FI_CPU_CORES_PER_SKT, + RDC_FI_CPU_SKT_ENERGY, RDC_FI_CPU_HSMP_DRIVER_VERSION, RDC_FI_CPU_SMU_FW_VERSION, + RDC_FI_CPU_HSMP_PROTO_VERSION, RDC_FI_CPU_FCLK_FREQUENCY, RDC_FI_CPU_MCLK_FREQUENCY, + RDC_FI_CPU_CCLK_LIMIT, RDC_FI_CPU_SKT_ACTIVE_FREQ_LIMIT, RDC_FI_CPU_SKT_FREQ_LIMIT_SRC, RDC_FI_CPU_SKT_FREQ_RANGE_MAX, 
RDC_FI_CPU_SKT_FREQ_RANGE_MIN, RDC_FI_CPU_SKT_C0_RESIDENCY, RDC_FI_CPU_SKT_LCLK_DPM_LEVEL, }; diff --git a/projects/rdc/rdc_libs/rdc_modules/rdc_rocp/RdcRocpBase.cc b/projects/rdc/rdc_libs/rdc_modules/rdc_rocp/RdcRocpBase.cc index e86905cfc5..f1a823ac76 100644 --- a/projects/rdc/rdc_libs/rdc_modules/rdc_rocp/RdcRocpBase.cc +++ b/projects/rdc/rdc_libs/rdc_modules/rdc_rocp/RdcRocpBase.cc @@ -108,7 +108,6 @@ static const std::map temp_field_map_k = { {RDC_FI_PROF_CPF_CPF_TCIU_IDLE, "CPF_CPF_TCIU_IDLE"}, {RDC_FI_PROF_CPF_CPF_TCIU_STALL, "CPF_CPF_TCIU_STALL"}, {RDC_FI_PROF_SIMD_UTILIZATION, "SIMD_UTILIZATION"}, - {RDC_FI_PROF_KFD_ID, "SQ_WAVES"}, // dummy value, }; double RdcRocpBase::run_profiler(uint32_t agent_index, rdc_field_t field) { @@ -388,13 +387,6 @@ rdc_status_t RdcRocpBase::rocp_lookup_bulk(const std::vector& f types[i] = DOUBLE; // Default type statuses[i] = RDC_ST_OK; - // Handle special case: RDC_FI_PROF_KFD_ID doesn't need sampling - if (field == RDC_FI_PROF_KFD_ID) { - types[i] = INTEGER; - values[i].l_int = agents[agent_index].gpu_id; - continue; - } - // Get metric name for this field auto field_it = field_to_metric.find(field); if (field_it == field_to_metric.end()) { @@ -442,9 +434,7 @@ rdc_status_t RdcRocpBase::rocp_lookup_bulk(const std::vector& f } catch (const std::exception& e) { RDC_LOG(RDC_ERROR, "Error while sampling counter values: " << e.what()); for (size_t i = 0; i < fields.size(); i++) { - if (fields[i].field_id != RDC_FI_PROF_KFD_ID) { statuses[i] = RDC_ST_BAD_PARAMETER; - } } return RDC_ST_BAD_PARAMETER; } @@ -456,11 +446,6 @@ rdc_status_t RdcRocpBase::rocp_lookup_bulk(const std::vector& f for (size_t i = 0; i < fields.size(); i++) { const auto& field = fields[i].field_id; - // Skip fields that already have values set (like RDC_FI_PROF_KFD_ID) - if (field == RDC_FI_PROF_KFD_ID) { - continue; - } - // Skip fields that had errors earlier if (statuses[i] != RDC_ST_OK) { continue; @@ -560,11 +545,6 @@ rdc_status_t 
RdcRocpBase::apply_field_transformation( output->dbl = divided_dbl / (256.0F / static_cast(agents[agent_index].simd_per_cu)); break; - case RDC_FI_PROF_KFD_ID: - *type = INTEGER; - output->l_int = agents[agent_index].gpu_id; - break; - default: if (is_eval_field) { output->dbl = divided_dbl; diff --git a/projects/rdc/rdc_libs/rdc_modules/rdc_rocp/RdcTelemetryLib.cc b/projects/rdc/rdc_libs/rdc_modules/rdc_rocp/RdcTelemetryLib.cc index fbf5fbfbd5..7d8e2d4c5c 100644 --- a/projects/rdc/rdc_libs/rdc_modules/rdc_rocp/RdcTelemetryLib.cc +++ b/projects/rdc/rdc_libs/rdc_modules/rdc_rocp/RdcTelemetryLib.cc @@ -210,7 +210,7 @@ rdc_status_t rdc_telemetry_fields_value_get(rdc_gpu_field_t* fields, const uint3 // Bulk lookup for this GPU // Note: rocp_lookup_bulk only handles rocprofiler-sdk metrics. - // Non-rocprofiler fields (e.g., RDC_FI_PROF_KFD_ID) are handled within - // the bulk lookup via special case logic in apply_field_transformation(). + // The KFD ID is not served here; RDC_FI_KFD_ID is fetched through AMD SMI + // in RdcMetricFetcherImpl::fetch_gpu_field_(). // Fields without rocprofiler metric mappings will return RDC_ST_BAD_PARAMETER. std::vector bulk_data;