From 0aeceefcb3086b181efbf867eb6f00e326983041 Mon Sep 17 00:00:00 2001 From: "Galantsev, Dmitrii" Date: Mon, 7 Apr 2025 19:36:45 +0000 Subject: [PATCH] Fix breaking changes introduced with CPU support Changes introduced in 3bdca8b8b67890d973b01ddfd1b8bf253d17431a broke RDC when it is compiled without ESMI support, or when the ESMI driver is not loaded while RDC is being used. Change-Id: Id54e1e9002d2e3cf09240081149eed84178700af Signed-off-by: Galantsev, Dmitrii --- rdc_libs/rdc/src/RdcEmbeddedHandler.cc | 29 ++++++++++++++++-------- rdc_libs/rdc/src/RdcMetricFetcherImpl.cc | 3 +++ 2 files changed, 22 insertions(+), 10 deletions(-) diff --git a/rdc_libs/rdc/src/RdcEmbeddedHandler.cc b/rdc_libs/rdc/src/RdcEmbeddedHandler.cc index 048d0ce500..fd51b573b9 100644 --- a/rdc_libs/rdc/src/RdcEmbeddedHandler.cc +++ b/rdc_libs/rdc/src/RdcEmbeddedHandler.cc @@ -50,11 +50,18 @@ class smi_initializer { // Make sure smi will not be initialized multiple times amdsmi_shut_down(); amdsmi_status_t ret; - uint64_t init_flag_; - //initialize CPU and GPU instances - init_flag_ = AMDSMI_INIT_AMD_GPUS | AMDSMI_INIT_AMD_CPUS; + uint64_t init_flag_ = AMDSMI_INIT_AMD_GPUS; +#ifdef ENABLE_ESMI_LIB + init_flag_ = init_flag_ | AMDSMI_INIT_AMD_CPUS; +#endif ret = amdsmi_init(init_flag_); + if (init_flag_ & AMDSMI_INIT_AMD_CPUS) { + RDC_LOG(RDC_ERROR, + "Failed to initalize amdsmi with CPUs and GPUs enabled.. 
trying GPUs only."); + ret = amdsmi_init(AMDSMI_INIT_AMD_GPUS); + } if (ret != AMDSMI_STATUS_SUCCESS) { + RDC_LOG(RDC_ERROR, "SMI FAILED with" << ret); throw amd::rdc::RdcException(RDC_ST_FAIL_LOAD_MODULE, "SMI initialize fail"); } } @@ -198,14 +205,15 @@ rdc_status_t RdcEmbeddedHandler::rdc_device_get_all(uint32_t gpu_index_list[RDC_ } // Discovery API -rdc_status_t RdcEmbeddedHandler::rdc_device_get_all_cpu(uint32_t cpu_index_list[RDC_MAX_NUM_DEVICES], - uint32_t* count) { +rdc_status_t RdcEmbeddedHandler::rdc_device_get_all_cpu( + uint32_t cpu_index_list[RDC_MAX_NUM_DEVICES], uint32_t* count) { if (!count) { return RDC_ST_BAD_PARAMETER; } rdc_field_value device_count; - rdc_status_t status = metric_fetcher_->fetch_smi_cpu_field(0, RDC_FI_DEV_CPU_COUNT, &device_count); + rdc_status_t status = + metric_fetcher_->fetch_smi_cpu_field(0, RDC_FI_DEV_CPU_COUNT, &device_count); if (status != RDC_ST_OK) { std::cout << "rdc_device_get_all_cpu failed to get cpu count"; return status; @@ -216,9 +224,9 @@ rdc_status_t RdcEmbeddedHandler::rdc_device_get_all_cpu(uint32_t cpu_index_list[ for (uint32_t i = 0; i < *count; i++) { cpu_index_list[i] = i; } - return RDC_ST_OK; } + rdc_status_t RdcEmbeddedHandler::rdc_device_get_attributes(uint32_t gpu_index, rdc_device_attributes_t* p_rdc_attr) { if (!p_rdc_attr) { @@ -230,14 +238,15 @@ rdc_status_t RdcEmbeddedHandler::rdc_device_get_attributes(uint32_t gpu_index, return status; } -rdc_status_t RdcEmbeddedHandler::rdc_device_get_cpu_attributes(uint32_t cpu_index, - rdc_device_attributes_t* p_rdc_attr) { +rdc_status_t RdcEmbeddedHandler::rdc_device_get_cpu_attributes( + uint32_t cpu_index, rdc_device_attributes_t* p_rdc_attr) { if (!p_rdc_attr) { return RDC_ST_BAD_PARAMETER; } rdc_field_value device_name; - rdc_status_t status = metric_fetcher_->fetch_smi_cpu_field(cpu_index, RDC_FI_DEV_CPU_MODEL, &device_name); + rdc_status_t status = + metric_fetcher_->fetch_smi_cpu_field(cpu_index, RDC_FI_DEV_CPU_MODEL, &device_name); 
strncpy_with_null(p_rdc_attr->device_name, device_name.value.str, RDC_MAX_STR_LENGTH); return status; diff --git a/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc b/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc index 9969d5997a..8f6e3fdc13 100644 --- a/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc +++ b/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc @@ -1338,6 +1338,8 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_cpu_field(uint32_t cpu_index, rdc_f value->status = Smi2RdcError(ret); break; } +#ifdef ENABLE_ESMI_LIB +// amdsmi_cpu_info_t is only defined if ENABLE_ESMI_LIB is set case RDC_FI_DEV_CPU_MODEL: { amdsmi_cpu_info_t cpu_info; value->status = amdsmi_get_cpu_model_name(processor_handle, &cpu_info); @@ -1347,6 +1349,7 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_cpu_field(uint32_t cpu_index, rdc_f } break; } +#endif default: { RDC_LOG(RDC_ERROR, "field_id is not supported: " << field_id); break;