[SWDEV-566924] Update KFD_ID metric to use amd-smi instead of rocprof (#2355)

This commit is contained in:
Adam Pryor
2025-12-18 08:39:19 -06:00
committed by GitHub
vanhempi fdf73116d5
commit bd6c6852fc
6 muutettua tiedostoa jossa 16 lisäystä ja 28 poistoa
+1 -1
Näytä tiedosto
@@ -46,6 +46,7 @@ FLD_DESC_ENT(RDC_FI_TARGET_GRAPHICS_VERSION, "GFX version of the device",
FLD_DESC_ENT(RDC_FI_NUM_OF_COMPUTE_UNITS, "Number of Compute Units", "COMPUTE_UNITS", true)
FLD_DESC_ENT(RDC_FI_UUID, "Unique ID of the device AKA asic_serial", "UUID", true)
FLD_DESC_ENT(RDC_FI_GPU_PARTITION_COUNT, "GPU partition count", "PARTITION_COUNT", true)
FLD_DESC_ENT(RDC_FI_KFD_ID, "KFD_ID of GPU", "KFD_ID", true)
FLD_DESC_ENT(RDC_FI_GPU_CLOCK, "Current GPU clock frequencies", "GPU_CLOCK", true)
FLD_DESC_ENT(RDC_FI_MEM_CLOCK, "Current Memory clock frequencies", "MEM_CLOCK", true)
@@ -198,7 +199,6 @@ FLD_DESC_ENT(RDC_FI_PROF_CPF_CPF_TCIU_IDLE, "", "CPF_CPF_TCIU_I
FLD_DESC_ENT(RDC_FI_PROF_CPF_CPF_TCIU_STALL, "", "CPF_CPF_TCIU_STALL", false)
// Misc
FLD_DESC_ENT(RDC_FI_PROF_SIMD_UTILIZATION, "Fraction of time the SIMDs are being utilized", "SIMD_UTILIZATION", false)
FLD_DESC_ENT(RDC_FI_PROF_KFD_ID, "GPU_ID from rocprofiler, same as KFD_ID", "PROF_KFD_ID", true)
// Events
FLD_DESC_ENT(RDC_EVNT_XGMI_0_NOP_TX, "NOPs sent to neighbor 0", "XGMI_NOP_0", false)
+1 -1
Näytä tiedosto
@@ -171,6 +171,7 @@ typedef enum {
RDC_FI_NUM_OF_COMPUTE_UNITS, //!< Number of compute units
RDC_FI_UUID, //!< Device UUID
RDC_FI_GPU_PARTITION_COUNT,
RDC_FI_KFD_ID,
/**
* @brief Frequency related fields
@@ -344,7 +345,6 @@ typedef enum {
RDC_FI_PROF_CPF_CPF_TCIU_IDLE,
RDC_FI_PROF_CPF_CPF_TCIU_STALL,
RDC_FI_PROF_SIMD_UTILIZATION,
RDC_FI_PROF_KFD_ID,
/**
* @brief Raw XGMI counter events
@@ -650,6 +650,14 @@ rdc_status_t RdcMetricFetcherImpl::fetch_gpu_field_(uint32_t gpu_index, rdc_fiel
value->value.l_int = static_cast<int64_t>(partition_count);
}
} break;
case RDC_FI_KFD_ID: {
amdsmi_kfd_info_t kfd_info;
value->status = amdsmi_get_gpu_kfd_info(processor_handle, &kfd_info);
value->type = INTEGER;
if (value->status == AMDSMI_STATUS_SUCCESS) {
value->value.l_int = static_cast<int64_t>(kfd_info.kfd_id);
}
} break;
case RDC_FI_POWER_USAGE: {
amdsmi_power_info_t power_info = {};
value->status = amdsmi_get_power_info(processor_handle, &power_info);
@@ -188,11 +188,11 @@ rdc_status_t RdcSmiLib::rdc_telemetry_fields_query(uint32_t field_ids[MAX_NUM_FI
RDC_FI_GPU_BUSY_PERCENT, RDC_FI_GPU_PAGE_RETRIED,
RDC_FI_DEV_ID, RDC_FI_REV_ID, RDC_FI_TARGET_GRAPHICS_VERSION,
RDC_FI_NUM_OF_COMPUTE_UNITS, RDC_FI_UUID, RDC_FI_GPU_PARTITION_COUNT,
RDC_FI_CPU_SKT_COUNT, RDC_FI_CPU_MODEL, RDC_FI_CPU_MODEL_ID,
RDC_FI_CPU_FAMILY, RDC_FI_CPU_CORES_PER_SKT, RDC_FI_CPU_SKT_ENERGY,
RDC_FI_CPU_HSMP_DRIVER_VERSION, RDC_FI_CPU_SMU_FW_VERSION, RDC_FI_CPU_HSMP_PROTO_VERSION,
RDC_FI_CPU_FCLK_FREQUENCY, RDC_FI_CPU_MCLK_FREQUENCY, RDC_FI_CPU_CCLK_LIMIT,
RDC_FI_CPU_SKT_ACTIVE_FREQ_LIMIT,
RDC_FI_KFD_ID, RDC_FI_CPU_SKT_COUNT, RDC_FI_CPU_MODEL,
RDC_FI_CPU_MODEL_ID, RDC_FI_CPU_FAMILY, RDC_FI_CPU_CORES_PER_SKT,
RDC_FI_CPU_SKT_ENERGY, RDC_FI_CPU_HSMP_DRIVER_VERSION,RDC_FI_CPU_SMU_FW_VERSION,
RDC_FI_CPU_HSMP_PROTO_VERSION, RDC_FI_CPU_FCLK_FREQUENCY, RDC_FI_CPU_MCLK_FREQUENCY,
RDC_FI_CPU_CCLK_LIMIT, RDC_FI_CPU_SKT_ACTIVE_FREQ_LIMIT,
RDC_FI_CPU_SKT_FREQ_LIMIT_SRC, RDC_FI_CPU_SKT_FREQ_RANGE_MAX, RDC_FI_CPU_SKT_FREQ_RANGE_MIN,
RDC_FI_CPU_SKT_C0_RESIDENCY, RDC_FI_CPU_SKT_LCLK_DPM_LEVEL,
};
@@ -108,7 +108,6 @@ static const std::map<rdc_field_t, const char*> temp_field_map_k = {
{RDC_FI_PROF_CPF_CPF_TCIU_IDLE, "CPF_CPF_TCIU_IDLE"},
{RDC_FI_PROF_CPF_CPF_TCIU_STALL, "CPF_CPF_TCIU_STALL"},
{RDC_FI_PROF_SIMD_UTILIZATION, "SIMD_UTILIZATION"},
{RDC_FI_PROF_KFD_ID, "SQ_WAVES"}, // dummy value,
};
double RdcRocpBase::run_profiler(uint32_t agent_index, rdc_field_t field) {
@@ -388,13 +387,6 @@ rdc_status_t RdcRocpBase::rocp_lookup_bulk(const std::vector<rdc_gpu_field_t>& f
types[i] = DOUBLE; // Default type
statuses[i] = RDC_ST_OK;
// Handle special case: RDC_FI_PROF_KFD_ID doesn't need sampling
if (field == RDC_FI_PROF_KFD_ID) {
types[i] = INTEGER;
values[i].l_int = agents[agent_index].gpu_id;
continue;
}
// Get metric name for this field
auto field_it = field_to_metric.find(field);
if (field_it == field_to_metric.end()) {
@@ -442,9 +434,7 @@ rdc_status_t RdcRocpBase::rocp_lookup_bulk(const std::vector<rdc_gpu_field_t>& f
} catch (const std::exception& e) {
RDC_LOG(RDC_ERROR, "Error while sampling counter values: " << e.what());
for (size_t i = 0; i < fields.size(); i++) {
if (fields[i].field_id != RDC_FI_PROF_KFD_ID) {
statuses[i] = RDC_ST_BAD_PARAMETER;
}
}
return RDC_ST_BAD_PARAMETER;
}
@@ -456,11 +446,6 @@ rdc_status_t RdcRocpBase::rocp_lookup_bulk(const std::vector<rdc_gpu_field_t>& f
for (size_t i = 0; i < fields.size(); i++) {
const auto& field = fields[i].field_id;
// Skip fields that already have values set (like RDC_FI_PROF_KFD_ID)
if (field == RDC_FI_PROF_KFD_ID) {
continue;
}
// Skip fields that had errors earlier
if (statuses[i] != RDC_ST_OK) {
continue;
@@ -560,11 +545,6 @@ rdc_status_t RdcRocpBase::apply_field_transformation(
output->dbl = divided_dbl / (256.0F / static_cast<double>(agents[agent_index].simd_per_cu));
break;
case RDC_FI_PROF_KFD_ID:
*type = INTEGER;
output->l_int = agents[agent_index].gpu_id;
break;
default:
if (is_eval_field) {
output->dbl = divided_dbl;
@@ -210,7 +210,7 @@ rdc_status_t rdc_telemetry_fields_value_get(rdc_gpu_field_t* fields, const uint3
// Bulk lookup for this GPU
// Note: rocp_lookup_bulk only handles rocprofiler-sdk metrics.
// Non-rocprofiler fields (e.g., RDC_FI_PROF_KFD_ID) are handled within
// Non-rocprofiler fields are handled within
// the bulk lookup via special case logic in apply_field_transformation().
// Fields without rocprofiler metric mappings will return RDC_ST_BAD_PARAMETER.
std::vector<rdc_field_value_data> bulk_data;