[RDC] Fix GPU_COUNT metric to only count GPUs (#1453)

* [RDC] Fix GPU_COUNT metric to only count GPUs
* [RDC] Clean up float->double casts

---------

Signed-off-by: Galantsev, Dmitrii <dmitrii.galantsev@amd.com>
此提交包含在:
Dmitrii
2025-10-30 12:50:47 -05:00
提交者 GitHub
父節點 e0ec72ccdd
當前提交 a2cff3c84d
共有 3 個檔案被更改,包括 44 行新增7 行删除
+1 -1
查看文件
@@ -225,7 +225,7 @@ rdc_status_t RdcEmbeddedHandler::rdc_device_get_component_version(
}
if (component == RDC_AMDSMI_COMPONENT) {
amdsmi_status_t ret;
amdsmi_status_t ret = AMDSMI_STATUS_UNKNOWN_ERROR;
amdsmi_version_t ver = {0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, nullptr};
ret = amdsmi_get_lib_version(&ver);
+39 -2
查看文件
@@ -87,7 +87,10 @@ RdcMetricFetcherImpl::~RdcMetricFetcherImpl() {
}
uint64_t RdcMetricFetcherImpl::now() {
  // Returns the current time reported by gettimeofday(), in milliseconds.
  //
  // The clang-format guards stay around the declaration because the formatter
  // is inconsistent about the space between the variable name and `{}`.
  // clang-format off
  struct timeval current_time {};
  // clang-format on
  gettimeofday(&current_time, nullptr);

  // Whole seconds scaled to milliseconds, plus microseconds truncated to milliseconds.
  const uint64_t whole_seconds_ms = static_cast<uint64_t>(current_time.tv_sec) * 1000;
  return whole_seconds_ms + current_time.tv_usec / 1000;
}
@@ -592,12 +595,46 @@ rdc_status_t RdcMetricFetcherImpl::fetch_gpu_field_(uint32_t gpu_index, rdc_fiel
break;
}
case RDC_FI_GPU_COUNT: {
uint32_t gpu_count = 0;
uint32_t socket_count = 0;
std::vector<amdsmi_socket_handle> socket_handles;
value->status = amdsmi_get_socket_handles(&socket_count, nullptr);
value->type = INTEGER;
if (value->status == AMDSMI_STATUS_SUCCESS) {
value->value.l_int = static_cast<int64_t>(socket_count);
if (value->status != AMDSMI_STATUS_SUCCESS) {
break;
}
socket_handles.resize(socket_count);
value->status = amdsmi_get_socket_handles(&socket_count, socket_handles.data());
if (value->status != AMDSMI_STATUS_SUCCESS) {
break;
}
for (uint32_t i = 0; i < socket_count; i++) {
uint32_t proc_count = 0;
amdsmi_status_t status = AMDSMI_STATUS_UNKNOWN_ERROR;
status = amdsmi_get_processor_handles(socket_handles[i], &proc_count, nullptr);
if ((status != AMDSMI_STATUS_SUCCESS) || (proc_count < 1)) {
continue;
}
// only need to check the first processor in socket.
// sockets don't mix CPUs and GPUs.. I hope.
proc_count = 1;
amdsmi_processor_handle proc = nullptr;
status = amdsmi_get_processor_handles(socket_handles[i], &proc_count, &proc);
if ((status != AMDSMI_STATUS_SUCCESS) || (proc_count < 1)) {
continue;
}
processor_type_t proc_type = AMDSMI_PROCESSOR_TYPE_UNKNOWN;
status = amdsmi_get_processor_type(proc, &proc_type);
if (status != AMDSMI_STATUS_SUCCESS) {
continue;
}
// only count AMD GPUs
// only count 1 GPU per socket
if (proc_type == AMDSMI_PROCESSOR_TYPE_AMD_GPU) {
gpu_count++;
}
}
value->value.l_int = static_cast<int64_t>(gpu_count);
} break;
case RDC_FI_GPU_PARTITION_COUNT: {
uint32_t partition_count = 0;
+4 -4
查看文件
@@ -364,7 +364,7 @@ rdc_status_t RdcRocpBase::rocp_lookup(rdc_gpu_field_t gpu_field, rdc_field_value
// RDC_FI_PROF_GPU_UTIL_PERCENT is mapped to GPU_UTIL
// GPU_UTIL metric is available on more GPUs than ENGINE_ACTIVE.
// ENGINE_ACTIVE = GPU_UTIL/100, so do the math ourselves
data->dbl = read_dbl / 100.0F;
data->dbl = read_dbl / 100.0;
break;
case RDC_FI_PROF_OCC_ELAPSED: {
// RDC_FI_PROF_OCC_ELAPSED is mapped to GRBM_GUI_ACTIVE, the read happens earlier in this
@@ -389,9 +389,9 @@ rdc_status_t RdcRocpBase::rocp_lookup(rdc_gpu_field_t gpu_field, rdc_field_value
const bool isMI200 = (target_version.find("gfx90a") != std::string::npos);
// FLOPS/clock/CU
if (isMI200) {
data->dbl = divided_dbl / (1024.0F / static_cast<double>(agents[agent_index].simd_per_cu));
data->dbl = divided_dbl / (1024.0 / static_cast<double>(agents[agent_index].simd_per_cu));
} else { // Assume mi300
data->dbl = divided_dbl / (2048.0F / static_cast<double>(agents[agent_index].simd_per_cu));
data->dbl = divided_dbl / (2048.0 / static_cast<double>(agents[agent_index].simd_per_cu));
}
} break;
case RDC_FI_PROF_EVAL_FLOPS_32_PERCENT:
@@ -401,7 +401,7 @@ rdc_status_t RdcRocpBase::rocp_lookup(rdc_gpu_field_t gpu_field, rdc_field_value
return RDC_ST_BAD_PARAMETER;
}
// FLOPS/clock/CU
data->dbl = divided_dbl / (256.0F / static_cast<double>(agents[agent_index].simd_per_cu));
data->dbl = divided_dbl / (256.0 / static_cast<double>(agents[agent_index].simd_per_cu));
break;
case RDC_FI_PROF_KFD_ID: {
// do not care what it is mapped to. read value from agents