From a2cff3c84d1551c82e9a8fab91e620a3d90d6d9c Mon Sep 17 00:00:00 2001 From: Dmitrii <19967783+dmitrii-galantsev@users.noreply.github.com> Date: Thu, 30 Oct 2025 12:50:47 -0500 Subject: [PATCH] [RDC] Fix GPU_COUNT metric to only count GPUs (#1453) * [RDC] Fix GPU_COUNT metric to only count GPUs * [RDC] Clean up float->double casts --------- Signed-off-by: Galantsev, Dmitrii --- .../rdc_libs/rdc/src/RdcEmbeddedHandler.cc | 2 +- .../rdc_libs/rdc/src/RdcMetricFetcherImpl.cc | 41 ++++++++++++++++++- .../rdc_modules/rdc_rocp/RdcRocpBase.cc | 8 ++-- 3 files changed, 44 insertions(+), 7 deletions(-) diff --git a/projects/rdc/rdc_libs/rdc/src/RdcEmbeddedHandler.cc b/projects/rdc/rdc_libs/rdc/src/RdcEmbeddedHandler.cc index 0c8fff1dc2..f5220c1646 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcEmbeddedHandler.cc +++ b/projects/rdc/rdc_libs/rdc/src/RdcEmbeddedHandler.cc @@ -225,7 +225,7 @@ rdc_status_t RdcEmbeddedHandler::rdc_device_get_component_version( } if (component == RDC_AMDSMI_COMPONENT) { - amdsmi_status_t ret; + amdsmi_status_t ret = AMDSMI_STATUS_UNKNOWN_ERROR; amdsmi_version_t ver = {0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, nullptr}; ret = amdsmi_get_lib_version(&ver); diff --git a/projects/rdc/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc b/projects/rdc/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc index 732b2f801b..04669929de 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc +++ b/projects/rdc/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc @@ -87,7 +87,10 @@ RdcMetricFetcherImpl::~RdcMetricFetcherImpl() { } uint64_t RdcMetricFetcherImpl::now() { + // WHY does clang-format like to randomly add space after tv and then randomly remove it? + // clang-format off struct timeval tv {}; + // clang-format on gettimeofday(&tv, nullptr); return static_cast(tv.tv_sec) * 1000 + tv.tv_usec / 1000; } @@ -592,12 +595,46 @@ rdc_status_t RdcMetricFetcherImpl::fetch_gpu_field_(uint32_t gpu_index, rdc_fiel break; } case RDC_FI_GPU_COUNT: { + uint32_t gpu_count = 0; uint32_t socket_count = 0; + std::vector socket_handles; value->status = amdsmi_get_socket_handles(&socket_count, nullptr); value->type = INTEGER; - if (value->status == AMDSMI_STATUS_SUCCESS) { - value->value.l_int = static_cast(socket_count); + if (value->status != AMDSMI_STATUS_SUCCESS) { + break; } + socket_handles.resize(socket_count); + value->status = amdsmi_get_socket_handles(&socket_count, socket_handles.data()); + if (value->status != AMDSMI_STATUS_SUCCESS) { + break; + } + for (uint32_t i = 0; i < socket_count; i++) { + uint32_t proc_count = 0; + amdsmi_status_t status = AMDSMI_STATUS_UNKNOWN_ERROR; + status = amdsmi_get_processor_handles(socket_handles[i], &proc_count, nullptr); + if ((status != AMDSMI_STATUS_SUCCESS) || (proc_count < 1)) { + continue; + } + // only need to check the first processor in socket. + // sockets don't mix CPUs and GPUs.. I hope. + proc_count = 1; + amdsmi_processor_handle proc = nullptr; + status = amdsmi_get_processor_handles(socket_handles[i], &proc_count, &proc); + if ((status != AMDSMI_STATUS_SUCCESS) || (proc_count < 1)) { + continue; + } + processor_type_t proc_type = AMDSMI_PROCESSOR_TYPE_UNKNOWN; + status = amdsmi_get_processor_type(proc, &proc_type); + if (status != AMDSMI_STATUS_SUCCESS) { + continue; + } + // only count AMD GPUs + // only count 1 GPU per socket + if (proc_type == AMDSMI_PROCESSOR_TYPE_AMD_GPU) { + gpu_count++; + } + } + value->value.l_int = static_cast(gpu_count); } break; case RDC_FI_GPU_PARTITION_COUNT: { uint32_t partition_count = 0; diff --git a/projects/rdc/rdc_libs/rdc_modules/rdc_rocp/RdcRocpBase.cc b/projects/rdc/rdc_libs/rdc_modules/rdc_rocp/RdcRocpBase.cc index bf2a919355..63b1d010d6 100644 --- a/projects/rdc/rdc_libs/rdc_modules/rdc_rocp/RdcRocpBase.cc +++ b/projects/rdc/rdc_libs/rdc_modules/rdc_rocp/RdcRocpBase.cc @@ -364,7 +364,7 @@ rdc_status_t RdcRocpBase::rocp_lookup(rdc_gpu_field_t gpu_field, rdc_field_value // RDC_FI_PROF_GPU_UTIL_PERCENT is mapped to GPU_UTIL // GPU_UTIL metric is available on more GPUs than ENGINE_ACTIVE. // ENGINE_ACTIVE = GPU_UTIL/100, so do the math ourselves - data->dbl = read_dbl / 100.0F; + data->dbl = read_dbl / 100.0; break; case RDC_FI_PROF_OCC_ELAPSED: { // RDC_FI_PROF_OCC_ELAPSED is mapped to GRBM_GUI_ACTIVE, the read happens earlier in this @@ -389,9 +389,9 @@ rdc_status_t RdcRocpBase::rocp_lookup(rdc_gpu_field_t gpu_field, rdc_field_value const bool isMI200 = (target_version.find("gfx90a") != std::string::npos); // FLOPS/clock/CU if (isMI200) { - data->dbl = divided_dbl / (1024.0F / static_cast(agents[agent_index].simd_per_cu)); + data->dbl = divided_dbl / (1024.0 / static_cast(agents[agent_index].simd_per_cu)); } else { // Assume mi300 - data->dbl = divided_dbl / (2048.0F / static_cast(agents[agent_index].simd_per_cu)); + data->dbl = divided_dbl / (2048.0 / static_cast(agents[agent_index].simd_per_cu)); } } break; case RDC_FI_PROF_EVAL_FLOPS_32_PERCENT: @@ -401,7 +401,7 @@ rdc_status_t RdcRocpBase::rocp_lookup(rdc_gpu_field_t gpu_field, rdc_field_value return RDC_ST_BAD_PARAMETER; } // FLOPS/clock/CU - data->dbl = divided_dbl / (256.0F / static_cast(agents[agent_index].simd_per_cu)); + data->dbl = divided_dbl / (256.0 / static_cast(agents[agent_index].simd_per_cu)); break; case RDC_FI_PROF_KFD_ID: { // do not care what it is mapped to. read value from agents