[RDC] Fix GPU_COUNT metric to only count GPUs (#1453)
* [RDC] Fix GPU_COUNT metric to only count GPUs
* [RDC] Clean up float->double casts

---------

Signed-off-by: Galantsev, Dmitrii <dmitrii.galantsev@amd.com>
此提交包含在:
@@ -225,7 +225,7 @@ rdc_status_t RdcEmbeddedHandler::rdc_device_get_component_version(
|
||||
}
|
||||
|
||||
if (component == RDC_AMDSMI_COMPONENT) {
|
||||
amdsmi_status_t ret;
|
||||
amdsmi_status_t ret = AMDSMI_STATUS_UNKNOWN_ERROR;
|
||||
amdsmi_version_t ver = {0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, nullptr};
|
||||
|
||||
ret = amdsmi_get_lib_version(&ver);
|
||||
|
||||
@@ -87,7 +87,10 @@ RdcMetricFetcherImpl::~RdcMetricFetcherImpl() {
|
||||
}
|
||||
|
||||
// Returns the current time reported by gettimeofday(), converted to whole
// milliseconds (tv_sec * 1000 + tv_usec / 1000).
// The return value of gettimeofday() is not checked; tv is zero-initialized
// beforehand.
uint64_t RdcMetricFetcherImpl::now() {
|
||||
// clang-format keeps toggling the space between `tv` and `{}`, so formatting is disabled for this declaration.
|
||||
// clang-format off
|
||||
struct timeval tv {};
|
||||
// clang-format on
|
||||
gettimeofday(&tv, nullptr);
|
||||
// seconds -> milliseconds, plus microseconds -> milliseconds (integer division drops the remainder)
return static_cast<uint64_t>(tv.tv_sec) * 1000 + tv.tv_usec / 1000;
|
||||
}
|
||||
@@ -592,12 +595,46 @@ rdc_status_t RdcMetricFetcherImpl::fetch_gpu_field_(uint32_t gpu_index, rdc_fiel
|
||||
break;
|
||||
}
|
||||
case RDC_FI_GPU_COUNT: {
|
||||
uint32_t gpu_count = 0;
|
||||
uint32_t socket_count = 0;
|
||||
std::vector<amdsmi_socket_handle> socket_handles;
|
||||
value->status = amdsmi_get_socket_handles(&socket_count, nullptr);
|
||||
value->type = INTEGER;
|
||||
if (value->status == AMDSMI_STATUS_SUCCESS) {
|
||||
value->value.l_int = static_cast<int64_t>(socket_count);
|
||||
if (value->status != AMDSMI_STATUS_SUCCESS) {
|
||||
break;
|
||||
}
|
||||
socket_handles.resize(socket_count);
|
||||
value->status = amdsmi_get_socket_handles(&socket_count, socket_handles.data());
|
||||
if (value->status != AMDSMI_STATUS_SUCCESS) {
|
||||
break;
|
||||
}
|
||||
for (uint32_t i = 0; i < socket_count; i++) {
|
||||
uint32_t proc_count = 0;
|
||||
amdsmi_status_t status = AMDSMI_STATUS_UNKNOWN_ERROR;
|
||||
status = amdsmi_get_processor_handles(socket_handles[i], &proc_count, nullptr);
|
||||
if ((status != AMDSMI_STATUS_SUCCESS) || (proc_count < 1)) {
|
||||
continue;
|
||||
}
|
||||
// only need to check the first processor in socket.
|
||||
// sockets don't mix CPUs and GPUs.. I hope.
|
||||
proc_count = 1;
|
||||
amdsmi_processor_handle proc = nullptr;
|
||||
status = amdsmi_get_processor_handles(socket_handles[i], &proc_count, &proc);
|
||||
if ((status != AMDSMI_STATUS_SUCCESS) || (proc_count < 1)) {
|
||||
continue;
|
||||
}
|
||||
processor_type_t proc_type = AMDSMI_PROCESSOR_TYPE_UNKNOWN;
|
||||
status = amdsmi_get_processor_type(proc, &proc_type);
|
||||
if (status != AMDSMI_STATUS_SUCCESS) {
|
||||
continue;
|
||||
}
|
||||
// only count AMD GPUs
|
||||
// only count 1 GPU per socket
|
||||
if (proc_type == AMDSMI_PROCESSOR_TYPE_AMD_GPU) {
|
||||
gpu_count++;
|
||||
}
|
||||
}
|
||||
value->value.l_int = static_cast<int64_t>(gpu_count);
|
||||
} break;
|
||||
case RDC_FI_GPU_PARTITION_COUNT: {
|
||||
uint32_t partition_count = 0;
|
||||
|
||||
@@ -364,7 +364,7 @@ rdc_status_t RdcRocpBase::rocp_lookup(rdc_gpu_field_t gpu_field, rdc_field_value
|
||||
// RDC_FI_PROF_GPU_UTIL_PERCENT is mapped to GPU_UTIL
|
||||
// GPU_UTIL metric is available on more GPUs than ENGINE_ACTIVE.
|
||||
// ENGINE_ACTIVE = GPU_UTIL/100, so do the math ourselves
|
||||
data->dbl = read_dbl / 100.0F;
|
||||
data->dbl = read_dbl / 100.0;
|
||||
break;
|
||||
case RDC_FI_PROF_OCC_ELAPSED: {
|
||||
// RDC_FI_PROF_OCC_ELAPSED is mapped to GRBM_GUI_ACTIVE, the read happens earlier in this
|
||||
@@ -389,9 +389,9 @@ rdc_status_t RdcRocpBase::rocp_lookup(rdc_gpu_field_t gpu_field, rdc_field_value
|
||||
const bool isMI200 = (target_version.find("gfx90a") != std::string::npos);
|
||||
// FLOPS/clock/CU
|
||||
if (isMI200) {
|
||||
data->dbl = divided_dbl / (1024.0F / static_cast<double>(agents[agent_index].simd_per_cu));
|
||||
data->dbl = divided_dbl / (1024.0 / static_cast<double>(agents[agent_index].simd_per_cu));
|
||||
} else { // Assume mi300
|
||||
data->dbl = divided_dbl / (2048.0F / static_cast<double>(agents[agent_index].simd_per_cu));
|
||||
data->dbl = divided_dbl / (2048.0 / static_cast<double>(agents[agent_index].simd_per_cu));
|
||||
}
|
||||
} break;
|
||||
case RDC_FI_PROF_EVAL_FLOPS_32_PERCENT:
|
||||
@@ -401,7 +401,7 @@ rdc_status_t RdcRocpBase::rocp_lookup(rdc_gpu_field_t gpu_field, rdc_field_value
|
||||
return RDC_ST_BAD_PARAMETER;
|
||||
}
|
||||
// FLOPS/clock/CU
|
||||
data->dbl = divided_dbl / (256.0F / static_cast<double>(agents[agent_index].simd_per_cu));
|
||||
data->dbl = divided_dbl / (256.0 / static_cast<double>(agents[agent_index].simd_per_cu));
|
||||
break;
|
||||
case RDC_FI_PROF_KFD_ID: {
|
||||
// do not care what it is mapped to. read value from agents
|
||||
|
||||
新增問題並參考
封鎖使用者