Fix breaking changes introduced with CPU support

Changes introduced in f0f44d977f
broke RDC when it is compiled without ESMI support, or when the ESMI driver
is not loaded while RDC is in use.

Change-Id: Id54e1e9002d2e3cf09240081149eed84178700af
Signed-off-by: Galantsev, Dmitrii <dmitrii.galantsev@amd.com>


[ROCm/rdc commit: 0aeceefcb3]
This commit is contained in:
Galantsev, Dmitrii
2025-04-07 19:36:45 +00:00
کامیت شده توسط Galantsev, Dmitrii
والد f0f44d977f
کامیت 3e8f56c430
2 فایلهای تغییر یافته به همراه 22 افزوده شده و 10 حذف شده
@@ -50,11 +50,18 @@ class smi_initializer {
// Make sure smi will not be initialized multiple times
amdsmi_shut_down();
amdsmi_status_t ret;
uint64_t init_flag_;
//initialize CPU and GPU instances
init_flag_ = AMDSMI_INIT_AMD_GPUS | AMDSMI_INIT_AMD_CPUS;
uint64_t init_flag_ = AMDSMI_INIT_AMD_GPUS;
#ifdef ENABLE_ESMI_LIB
init_flag_ = init_flag_ | AMDSMI_INIT_AMD_CPUS;
#endif
ret = amdsmi_init(init_flag_);
if (init_flag_ & AMDSMI_INIT_AMD_CPUS) {
RDC_LOG(RDC_ERROR,
"Failed to initalize amdsmi with CPUs and GPUs enabled.. trying GPUs only.");
ret = amdsmi_init(AMDSMI_INIT_AMD_GPUS);
}
if (ret != AMDSMI_STATUS_SUCCESS) {
RDC_LOG(RDC_ERROR, "SMI FAILED with" << ret);
throw amd::rdc::RdcException(RDC_ST_FAIL_LOAD_MODULE, "SMI initialize fail");
}
}
@@ -198,14 +205,15 @@ rdc_status_t RdcEmbeddedHandler::rdc_device_get_all(uint32_t gpu_index_list[RDC_
}
// Discovery API
rdc_status_t RdcEmbeddedHandler::rdc_device_get_all_cpu(uint32_t cpu_index_list[RDC_MAX_NUM_DEVICES],
uint32_t* count) {
rdc_status_t RdcEmbeddedHandler::rdc_device_get_all_cpu(
uint32_t cpu_index_list[RDC_MAX_NUM_DEVICES], uint32_t* count) {
if (!count) {
return RDC_ST_BAD_PARAMETER;
}
rdc_field_value device_count;
rdc_status_t status = metric_fetcher_->fetch_smi_cpu_field(0, RDC_FI_DEV_CPU_COUNT, &device_count);
rdc_status_t status =
metric_fetcher_->fetch_smi_cpu_field(0, RDC_FI_DEV_CPU_COUNT, &device_count);
if (status != RDC_ST_OK) {
std::cout << "rdc_device_get_all_cpu failed to get cpu count";
return status;
@@ -216,9 +224,9 @@ rdc_status_t RdcEmbeddedHandler::rdc_device_get_all_cpu(uint32_t cpu_index_list[
for (uint32_t i = 0; i < *count; i++) {
cpu_index_list[i] = i;
}
return RDC_ST_OK;
}
rdc_status_t RdcEmbeddedHandler::rdc_device_get_attributes(uint32_t gpu_index,
rdc_device_attributes_t* p_rdc_attr) {
if (!p_rdc_attr) {
@@ -230,14 +238,15 @@ rdc_status_t RdcEmbeddedHandler::rdc_device_get_attributes(uint32_t gpu_index,
return status;
}
rdc_status_t RdcEmbeddedHandler::rdc_device_get_cpu_attributes(uint32_t cpu_index,
rdc_device_attributes_t* p_rdc_attr) {
rdc_status_t RdcEmbeddedHandler::rdc_device_get_cpu_attributes(
uint32_t cpu_index, rdc_device_attributes_t* p_rdc_attr) {
if (!p_rdc_attr) {
return RDC_ST_BAD_PARAMETER;
}
rdc_field_value device_name;
rdc_status_t status = metric_fetcher_->fetch_smi_cpu_field(cpu_index, RDC_FI_DEV_CPU_MODEL, &device_name);
rdc_status_t status =
metric_fetcher_->fetch_smi_cpu_field(cpu_index, RDC_FI_DEV_CPU_MODEL, &device_name);
strncpy_with_null(p_rdc_attr->device_name, device_name.value.str, RDC_MAX_STR_LENGTH);
return status;
@@ -1338,6 +1338,8 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_cpu_field(uint32_t cpu_index, rdc_f
value->status = Smi2RdcError(ret);
break;
}
#ifdef ENABLE_ESMI_LIB
// amdsmi_cpu_info_t is only defined if ENABLE_ESMI_LIB is set
case RDC_FI_DEV_CPU_MODEL: {
amdsmi_cpu_info_t cpu_info;
value->status = amdsmi_get_cpu_model_name(processor_handle, &cpu_info);
@@ -1347,6 +1349,7 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_cpu_field(uint32_t cpu_index, rdc_f
}
break;
}
#endif
default: {
RDC_LOG(RDC_ERROR, "field_id is not supported: " << field_id);
break;