[SWDEV-566924] Update KFD_ID metric to use amd-smi instead of rocprof (#2355)
Cette révision appartient à :
@@ -46,6 +46,7 @@ FLD_DESC_ENT(RDC_FI_TARGET_GRAPHICS_VERSION, "GFX version of the device",
|
||||
FLD_DESC_ENT(RDC_FI_NUM_OF_COMPUTE_UNITS, "Number of Compute Units", "COMPUTE_UNITS", true)
|
||||
FLD_DESC_ENT(RDC_FI_UUID, "Unique ID of the device AKA asic_serial", "UUID", true)
|
||||
FLD_DESC_ENT(RDC_FI_GPU_PARTITION_COUNT, "GPU partition count", "PARTITION_COUNT", true)
|
||||
FLD_DESC_ENT(RDC_FI_KFD_ID, "KFD_ID of GPU", "KFD_ID", true)
|
||||
|
||||
FLD_DESC_ENT(RDC_FI_GPU_CLOCK, "Current GPU clock frequencies", "GPU_CLOCK", true)
|
||||
FLD_DESC_ENT(RDC_FI_MEM_CLOCK, "Current Memory clock frequencies", "MEM_CLOCK", true)
|
||||
@@ -198,7 +199,6 @@ FLD_DESC_ENT(RDC_FI_PROF_CPF_CPF_TCIU_IDLE, "", "CPF_CPF_TCIU_I
|
||||
FLD_DESC_ENT(RDC_FI_PROF_CPF_CPF_TCIU_STALL, "", "CPF_CPF_TCIU_STALL", false)
|
||||
// Misc
|
||||
FLD_DESC_ENT(RDC_FI_PROF_SIMD_UTILIZATION, "Fraction of time the SIMDs are being utilized", "SIMD_UTILIZATION", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_KFD_ID, "GPU_ID from rocprofiler, same as KFD_ID", "PROF_KFD_ID", true)
|
||||
|
||||
// Events
|
||||
FLD_DESC_ENT(RDC_EVNT_XGMI_0_NOP_TX, "NOPs sent to neighbor 0", "XGMI_NOP_0", false)
|
||||
|
||||
@@ -171,6 +171,7 @@ typedef enum {
|
||||
RDC_FI_NUM_OF_COMPUTE_UNITS, //!< Number of compute units
|
||||
RDC_FI_UUID, //!< Device UUID
|
||||
RDC_FI_GPU_PARTITION_COUNT,
|
||||
RDC_FI_KFD_ID,
|
||||
|
||||
/**
|
||||
* @brief Frequency related fields
|
||||
@@ -344,7 +345,6 @@ typedef enum {
|
||||
RDC_FI_PROF_CPF_CPF_TCIU_IDLE,
|
||||
RDC_FI_PROF_CPF_CPF_TCIU_STALL,
|
||||
RDC_FI_PROF_SIMD_UTILIZATION,
|
||||
RDC_FI_PROF_KFD_ID,
|
||||
|
||||
/**
|
||||
* @brief Raw XGMI counter events
|
||||
|
||||
@@ -650,6 +650,14 @@ rdc_status_t RdcMetricFetcherImpl::fetch_gpu_field_(uint32_t gpu_index, rdc_fiel
|
||||
value->value.l_int = static_cast<int64_t>(partition_count);
|
||||
}
|
||||
} break;
|
||||
case RDC_FI_KFD_ID: {
|
||||
amdsmi_kfd_info_t kfd_info;
|
||||
value->status = amdsmi_get_gpu_kfd_info(processor_handle, &kfd_info);
|
||||
value->type = INTEGER;
|
||||
if (value->status == AMDSMI_STATUS_SUCCESS) {
|
||||
value->value.l_int = static_cast<int64_t>(kfd_info.kfd_id);
|
||||
}
|
||||
} break;
|
||||
case RDC_FI_POWER_USAGE: {
|
||||
amdsmi_power_info_t power_info = {};
|
||||
value->status = amdsmi_get_power_info(processor_handle, &power_info);
|
||||
|
||||
@@ -188,11 +188,11 @@ rdc_status_t RdcSmiLib::rdc_telemetry_fields_query(uint32_t field_ids[MAX_NUM_FI
|
||||
RDC_FI_GPU_BUSY_PERCENT, RDC_FI_GPU_PAGE_RETRIED,
|
||||
RDC_FI_DEV_ID, RDC_FI_REV_ID, RDC_FI_TARGET_GRAPHICS_VERSION,
|
||||
RDC_FI_NUM_OF_COMPUTE_UNITS, RDC_FI_UUID, RDC_FI_GPU_PARTITION_COUNT,
|
||||
RDC_FI_CPU_SKT_COUNT, RDC_FI_CPU_MODEL, RDC_FI_CPU_MODEL_ID,
|
||||
RDC_FI_CPU_FAMILY, RDC_FI_CPU_CORES_PER_SKT, RDC_FI_CPU_SKT_ENERGY,
|
||||
RDC_FI_CPU_HSMP_DRIVER_VERSION, RDC_FI_CPU_SMU_FW_VERSION, RDC_FI_CPU_HSMP_PROTO_VERSION,
|
||||
RDC_FI_CPU_FCLK_FREQUENCY, RDC_FI_CPU_MCLK_FREQUENCY, RDC_FI_CPU_CCLK_LIMIT,
|
||||
RDC_FI_CPU_SKT_ACTIVE_FREQ_LIMIT,
|
||||
RDC_FI_KFD_ID, RDC_FI_CPU_SKT_COUNT, RDC_FI_CPU_MODEL,
|
||||
RDC_FI_CPU_MODEL_ID, RDC_FI_CPU_FAMILY, RDC_FI_CPU_CORES_PER_SKT,
|
||||
RDC_FI_CPU_SKT_ENERGY, RDC_FI_CPU_HSMP_DRIVER_VERSION,RDC_FI_CPU_SMU_FW_VERSION,
|
||||
RDC_FI_CPU_HSMP_PROTO_VERSION, RDC_FI_CPU_FCLK_FREQUENCY, RDC_FI_CPU_MCLK_FREQUENCY,
|
||||
RDC_FI_CPU_CCLK_LIMIT, RDC_FI_CPU_SKT_ACTIVE_FREQ_LIMIT,
|
||||
RDC_FI_CPU_SKT_FREQ_LIMIT_SRC, RDC_FI_CPU_SKT_FREQ_RANGE_MAX, RDC_FI_CPU_SKT_FREQ_RANGE_MIN,
|
||||
RDC_FI_CPU_SKT_C0_RESIDENCY, RDC_FI_CPU_SKT_LCLK_DPM_LEVEL,
|
||||
};
|
||||
|
||||
@@ -108,7 +108,6 @@ static const std::map<rdc_field_t, const char*> temp_field_map_k = {
|
||||
{RDC_FI_PROF_CPF_CPF_TCIU_IDLE, "CPF_CPF_TCIU_IDLE"},
|
||||
{RDC_FI_PROF_CPF_CPF_TCIU_STALL, "CPF_CPF_TCIU_STALL"},
|
||||
{RDC_FI_PROF_SIMD_UTILIZATION, "SIMD_UTILIZATION"},
|
||||
{RDC_FI_PROF_KFD_ID, "SQ_WAVES"}, // dummy value,
|
||||
};
|
||||
|
||||
double RdcRocpBase::run_profiler(uint32_t agent_index, rdc_field_t field) {
|
||||
@@ -388,13 +387,6 @@ rdc_status_t RdcRocpBase::rocp_lookup_bulk(const std::vector<rdc_gpu_field_t>& f
|
||||
types[i] = DOUBLE; // Default type
|
||||
statuses[i] = RDC_ST_OK;
|
||||
|
||||
// Handle special case: RDC_FI_PROF_KFD_ID doesn't need sampling
|
||||
if (field == RDC_FI_PROF_KFD_ID) {
|
||||
types[i] = INTEGER;
|
||||
values[i].l_int = agents[agent_index].gpu_id;
|
||||
continue;
|
||||
}
|
||||
|
||||
// Get metric name for this field
|
||||
auto field_it = field_to_metric.find(field);
|
||||
if (field_it == field_to_metric.end()) {
|
||||
@@ -442,9 +434,7 @@ rdc_status_t RdcRocpBase::rocp_lookup_bulk(const std::vector<rdc_gpu_field_t>& f
|
||||
} catch (const std::exception& e) {
|
||||
RDC_LOG(RDC_ERROR, "Error while sampling counter values: " << e.what());
|
||||
for (size_t i = 0; i < fields.size(); i++) {
|
||||
if (fields[i].field_id != RDC_FI_PROF_KFD_ID) {
|
||||
statuses[i] = RDC_ST_BAD_PARAMETER;
|
||||
}
|
||||
}
|
||||
return RDC_ST_BAD_PARAMETER;
|
||||
}
|
||||
@@ -456,11 +446,6 @@ rdc_status_t RdcRocpBase::rocp_lookup_bulk(const std::vector<rdc_gpu_field_t>& f
|
||||
for (size_t i = 0; i < fields.size(); i++) {
|
||||
const auto& field = fields[i].field_id;
|
||||
|
||||
// Skip fields that already have values set (like RDC_FI_PROF_KFD_ID)
|
||||
if (field == RDC_FI_PROF_KFD_ID) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Skip fields that had errors earlier
|
||||
if (statuses[i] != RDC_ST_OK) {
|
||||
continue;
|
||||
@@ -560,11 +545,6 @@ rdc_status_t RdcRocpBase::apply_field_transformation(
|
||||
output->dbl = divided_dbl / (256.0F / static_cast<double>(agents[agent_index].simd_per_cu));
|
||||
break;
|
||||
|
||||
case RDC_FI_PROF_KFD_ID:
|
||||
*type = INTEGER;
|
||||
output->l_int = agents[agent_index].gpu_id;
|
||||
break;
|
||||
|
||||
default:
|
||||
if (is_eval_field) {
|
||||
output->dbl = divided_dbl;
|
||||
|
||||
@@ -210,7 +210,7 @@ rdc_status_t rdc_telemetry_fields_value_get(rdc_gpu_field_t* fields, const uint3
|
||||
|
||||
// Bulk lookup for this GPU
|
||||
// Note: rocp_lookup_bulk only handles rocprofiler-sdk metrics.
|
||||
// Non-rocprofiler fields (e.g., RDC_FI_PROF_KFD_ID) are handled within
|
||||
// Fields derived from rocprofiler counters are post-processed within
|
||||
// the bulk lookup by apply_field_transformation().
|
||||
// Fields without rocprofiler metric mappings will return RDC_ST_BAD_PARAMETER.
|
||||
std::vector<rdc_field_value_data> bulk_data;
|
||||
|
||||
Référencer dans un nouveau ticket
Bloquer un utilisateur