From 2db6ddea6935cf52ab08b65cd01f8e32e3bd2af0 Mon Sep 17 00:00:00 2001 From: "Pryor, Adam" Date: Mon, 28 Apr 2025 11:57:51 -0500 Subject: [PATCH] [SWDEV-523349/SWDEV-527257] Fix Rdci Config (#161) Change-Id: Iae21ea8061205f186086a3ed59c6259ddeb1dbe7 Signed-off-by: adapryor --- include/rdc/rdc.h | 4 +- python_binding/rdc_bootstrap.py | 2 +- rdc_libs/bootstrap/src/RdcBootStrap.cc | 15 +++-- rdc_libs/rdc/src/RdcConfigSettingsImpl.cc | 67 +++++++++++++---------- rdc_libs/rdc/src/RdcEmbeddedHandler.cc | 4 +- rdc_libs/rdc/src/RdcMetricFetcherImpl.cc | 2 +- rdc_libs/rdc/src/RdcPolicyImpl.cc | 2 +- rdc_libs/rdc/src/RdcSmiDiagnosticImpl.cc | 2 +- rdci/src/RdciConfigSubSystem.cc | 22 +++++--- rdci/src/RdciDiscoverySubSystem.cc | 2 +- 10 files changed, 66 insertions(+), 56 deletions(-) diff --git a/include/rdc/rdc.h b/include/rdc/rdc.h index b6bb8309f0..7cf1adb86b 100644 --- a/include/rdc/rdc.h +++ b/include/rdc/rdc.h @@ -64,7 +64,7 @@ extern "C" { typedef enum { RDC_ST_OK = 0, //!< Success RDC_ST_NOT_SUPPORTED, //!< Not supported feature - RDC_ST_MSI_ERROR, //!< The MSI library error + RDC_ST_SMI_ERROR, //!< The SMI library error RDC_ST_FAIL_LOAD_MODULE, //!< Fail to load the library RDC_ST_INVALID_HANDLER, //!< Invalid handler RDC_ST_BAD_PARAMETER, //!< A parameter is invalid @@ -593,7 +593,7 @@ typedef enum { * @brief Type of Components */ typedef enum { - RDC_AMDMSI_COMPONENT + RDC_AMDSMI_COMPONENT // If needed later, add them one by one } rdc_component_t; diff --git a/python_binding/rdc_bootstrap.py b/python_binding/rdc_bootstrap.py index c2b8bb699b..d7d0ded34c 100644 --- a/python_binding/rdc_bootstrap.py +++ b/python_binding/rdc_bootstrap.py @@ -42,7 +42,7 @@ class rdc_status_t(Enum): return int(obj) RDC_ST_OK = 0 RDC_ST_NOT_SUPPORTED = 1 - RDC_ST_MSI_ERROR = 2 + RDC_ST_SMI_ERROR = 2 RDC_ST_FAIL_LOAD_MODULE = 3 RDC_ST_INVALID_HANDLER = 4 RDC_ST_BAD_PARAMETER = 5 diff --git a/rdc_libs/bootstrap/src/RdcBootStrap.cc b/rdc_libs/bootstrap/src/RdcBootStrap.cc index e068fb2784..757dd626c2 100644 --- a/rdc_libs/bootstrap/src/RdcBootStrap.cc +++ b/rdc_libs/bootstrap/src/RdcBootStrap.cc @@ -21,8 +21,8 @@ THE SOFTWARE. */ #include #include -#include +#include #include #include "common/rdc_fields_supported.h" @@ -352,7 +352,7 @@ const char* rdc_status_string(rdc_status_t result) { return "Cannot find the value"; case RDC_ST_BAD_PARAMETER: return "Invalid parameters"; - case RDC_ST_MSI_ERROR: + case RDC_ST_SMI_ERROR: return "SMI error"; case RDC_ST_MAX_LIMIT: return "The max limit reached"; @@ -477,7 +477,6 @@ char* strncpy_with_null(char* dest, const char* src, size_t n) { return dest; } - rdc_status_t rdc_policy_set(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id, rdc_policy_t policy) { if (!p_rdc_handle) { @@ -541,20 +540,20 @@ rdc_status_t rdc_get_num_partition(rdc_handle_t p_rdc_handle, uint32_t index, return RDC_ST_INVALID_HANDLER; } return static_cast(p_rdc_handle) - ->rdc_get_num_partition(index, num_partition); + ->rdc_get_num_partition(index, num_partition); } rdc_status_t rdc_instance_profile_get(rdc_handle_t p_rdc_handle, uint32_t entity_index, - rdc_instance_resource_type_t resource_type, - rdc_resource_profile_t* profile) { + rdc_instance_resource_type_t resource_type, + rdc_resource_profile_t* profile) { if (!p_rdc_handle || !profile) { return RDC_ST_INVALID_HANDLER; } return static_cast(p_rdc_handle) - ->rdc_instance_profile_get(entity_index, resource_type, profile); + ->rdc_instance_profile_get(entity_index, resource_type, profile); } -const char * get_rocm_path(const char * search_string) { +const char* get_rocm_path(const char* search_string) { // set default rocm path in case lookup fails static std::string rocm_path("/opt/rocm"); const char* rocm_path_env = getenv("ROCM_PATH"); diff --git a/rdc_libs/rdc/src/RdcConfigSettingsImpl.cc b/rdc_libs/rdc/src/RdcConfigSettingsImpl.cc index 213c480ae0..a126884ea7 100644 --- a/rdc_libs/rdc/src/RdcConfigSettingsImpl.cc +++ b/rdc_libs/rdc/src/RdcConfigSettingsImpl.cc @@ -44,8 +44,7 @@ void RdcConfigSettingsImpl::monitorSettings() { rdc_status_t rdc_status; rdc_group_info_t rdc_group_info = {}; amdsmi_power_cap_info_t cap_info = {}; - amdsmi_dev_perf_level_t perf_info = {}; - uint32_t od; + amdsmi_frequencies_t freqs = {}; uint64_t cached_value; while (true) { @@ -103,19 +102,18 @@ void RdcConfigSettingsImpl::monitorSettings() { } // Mem clock - status = amdsmi_get_gpu_overdrive_level(processor_handle, &od); + status = amdsmi_get_clk_freq(processor_handle, AMDSMI_CLK_TYPE_MEM, &freqs); if (status != AMDSMI_STATUS_SUCCESS) { RDC_LOG( RDC_ERROR, - "RdcConfigSettingsImpl::monitorSettings(); amdsmi_get_gpu_overdrive_level failed: " - << status); + "RdcConfigSettingsImpl::monitorSettings(); amdsmi_get_clk_freq failed: " << status); continue; } auto mem_clk_it = cached_settings.find(RDC_CFG_MEMORY_CLOCK_LIMIT); if (mem_clk_it != cached_settings.end()) { cached_value = mem_clk_it->second.target_value; - if (od == cached_value) { + if (freqs.frequency[freqs.current] == cached_value) { status = amdsmi_set_gpu_clk_limit(processor_handle, AMDSMI_CLK_TYPE_MEM, CLK_LIMIT_MAX, cached_value); if (status != AMDSMI_STATUS_SUCCESS) { @@ -129,18 +127,18 @@ void RdcConfigSettingsImpl::monitorSettings() { } // GFX clock - status = amdsmi_get_gpu_perf_level(processor_handle, &perf_info); + status = amdsmi_get_clk_freq(processor_handle, AMDSMI_CLK_TYPE_GFX, &freqs); if (status != AMDSMI_STATUS_SUCCESS) { - RDC_LOG(RDC_ERROR, - "RdcConfigSettingsImpl::monitorSettings(); amdsmi_get_gpu_perf_level failed: " - << status); + RDC_LOG( + RDC_ERROR, + "RdcConfigSettingsImpl::monitorSettings(); amdsmi_get_clk_freq failed: " << status); continue; } auto gfx_clk_it = cached_settings.find(RDC_CFG_GFX_CLOCK_LIMIT); if (gfx_clk_it != cached_settings.end()) { cached_value = gfx_clk_it->second.target_value; - if (od == cached_value) { + if (freqs.frequency[freqs.current] == cached_value) { status = amdsmi_set_gpu_clk_limit(processor_handle, AMDSMI_CLK_TYPE_GFX, CLK_LIMIT_MAX, cached_value); if (status != AMDSMI_STATUS_SUCCESS) { @@ -313,31 +311,40 @@ rdc_status_t RdcConfigSettingsImpl::rdc_config_clear(rdc_gpu_group_t group_id) { // Reset GFX clock limit if it was set if (group_iter->second.find(RDC_CFG_GFX_CLOCK_LIMIT) != group_iter->second.end()) { - amdsmi_dev_perf_level_t perf_info = {}; - amd_ret = amdsmi_get_gpu_perf_level(processor_handle, &perf_info); - if (amd_ret == AMDSMI_STATUS_SUCCESS && perf_info != AMDSMI_DEV_PERF_LEVEL_AUTO) { - amd_ret = amdsmi_set_gpu_clk_limit(processor_handle, AMDSMI_CLK_TYPE_GFX, CLK_LIMIT_MAX, - AMDSMI_DEV_PERF_LEVEL_AUTO); - if (amd_ret != AMDSMI_STATUS_SUCCESS) { - RDC_LOG(RDC_ERROR, - "RdcConfigSettingsImpl::rdc_config_clear: Failed to reset GFX clock limit : " - << amd_ret); - break; + amdsmi_frequencies_t freqs = {}; + amd_ret = amdsmi_get_clk_freq(processor_handle, AMDSMI_CLK_TYPE_GFX, &freqs); + if (amd_ret == AMDSMI_STATUS_SUCCESS) { + uint64_t curr = freqs.frequency[freqs.current]; + uint64_t maxf = freqs.frequency[freqs.num_supported - 1]; + if (curr != maxf) { + amd_ret = amdsmi_set_gpu_clk_limit(processor_handle, AMDSMI_CLK_TYPE_GFX, CLK_LIMIT_MAX, + AMDSMI_DEV_PERF_LEVEL_AUTO); + if (amd_ret != AMDSMI_STATUS_SUCCESS) { + RDC_LOG(RDC_ERROR, + "RdcConfigSettingsImpl::rdc_config_clear: Failed to reset GFX clock limit : " + << amd_ret); + break; + } } } } // Reset memory clock limit if it was set if (group_iter->second.find(RDC_CFG_MEMORY_CLOCK_LIMIT) != group_iter->second.end()) { - uint32_t od = 0; - amd_ret = amdsmi_get_gpu_overdrive_level(processor_handle, &od); - if (amd_ret == AMDSMI_STATUS_SUCCESS && od != 0) { - amd_ret = amdsmi_set_gpu_clk_limit(processor_handle, AMDSMI_CLK_TYPE_MEM, CLK_LIMIT_MAX, 0); - if (amd_ret != AMDSMI_STATUS_SUCCESS) { - RDC_LOG(RDC_ERROR, - "RdcConfigSettingsImpl::rdc_config_clear: Failed to reset memory clock limit:" - << amd_ret); - break; + amdsmi_frequencies_t freqs = {}; + amd_ret = amdsmi_get_clk_freq(processor_handle, AMDSMI_CLK_TYPE_MEM, &freqs); + if (amd_ret == AMDSMI_STATUS_SUCCESS) { + uint64_t curr = freqs.frequency[freqs.current]; + uint64_t maxf = freqs.frequency[freqs.num_supported - 1]; + if (curr != maxf) { + amd_ret = + amdsmi_set_gpu_clk_limit(processor_handle, AMDSMI_CLK_TYPE_MEM, CLK_LIMIT_MAX, 0); + if (amd_ret != AMDSMI_STATUS_SUCCESS) { + RDC_LOG(RDC_ERROR, + "RdcConfigSettingsImpl::rdc_config_clear: Failed to reset memory clock limit:" + << amd_ret); + break; + } } } } diff --git a/rdc_libs/rdc/src/RdcEmbeddedHandler.cc b/rdc_libs/rdc/src/RdcEmbeddedHandler.cc index c9c9068ac1..ce1009f16d 100644 --- a/rdc_libs/rdc/src/RdcEmbeddedHandler.cc +++ b/rdc_libs/rdc/src/RdcEmbeddedHandler.cc @@ -210,7 +210,7 @@ rdc_status_t RdcEmbeddedHandler::rdc_device_get_component_version( return RDC_ST_BAD_PARAMETER; } - if (component == RDC_AMDMSI_COMPONENT) { + if (component == RDC_AMDSMI_COMPONENT) { amdsmi_status_t ret; amdsmi_version_t ver = {0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, nullptr}; @@ -219,7 +219,7 @@ rdc_status_t RdcEmbeddedHandler::rdc_device_get_component_version( if (ret != AMDSMI_STATUS_SUCCESS) { RDC_LOG(RDC_ERROR, "Failed to obtain the version of the server's amd-smi library. reason: " << (ret == AMDSMI_STATUS_INVAL ? "Invalid parameters" : "unknown")); - return RDC_ST_MSI_ERROR; + return RDC_ST_SMI_ERROR; } strncpy_with_null(p_rdc_compv->version, ver.build, RDC_MAX_VERSION_STR_LENGTH); diff --git a/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc b/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc index 3c2de034f2..2be028f979 100644 --- a/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc +++ b/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc @@ -1125,7 +1125,7 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field << value->value.str << ", latency " << latency); } - return value->status == AMDSMI_STATUS_SUCCESS ? RDC_ST_OK : RDC_ST_MSI_ERROR; + return value->status == AMDSMI_STATUS_SUCCESS ? RDC_ST_OK : RDC_ST_SMI_ERROR; } std::shared_ptr RdcMetricFetcherImpl::get_smi_data(RdcFieldKey key) { diff --git a/rdc_libs/rdc/src/RdcPolicyImpl.cc b/rdc_libs/rdc/src/RdcPolicyImpl.cc index 1d3bb76881..e2fc262401 100644 --- a/rdc_libs/rdc/src/RdcPolicyImpl.cc +++ b/rdc_libs/rdc/src/RdcPolicyImpl.cc @@ -82,7 +82,7 @@ rdc_status_t RdcPolicyImpl::rdc_policy_set(rdc_gpu_group_t group_id, rdc_policy_ gpu_index = group_info.entity_ids[i]; status = metric_fetcher_->fetch_smi_field(gpu_index, RDC_FI_GPU_PAGE_RETRIED, &value); - if (status == RDC_ST_MSI_ERROR) return RDC_ST_NOT_SUPPORTED; + if (status == RDC_ST_SMI_ERROR) return RDC_ST_NOT_SUPPORTED; } } diff --git a/rdc_libs/rdc/src/RdcSmiDiagnosticImpl.cc b/rdc_libs/rdc/src/RdcSmiDiagnosticImpl.cc index 8f190bc8a4..50e5a6d795 100644 --- a/rdc_libs/rdc/src/RdcSmiDiagnosticImpl.cc +++ b/rdc_libs/rdc/src/RdcSmiDiagnosticImpl.cc @@ -212,7 +212,7 @@ rdc_status_t RdcSmiDiagnosticImpl::check_smi_topo_info(uint32_t gpu_index[RDC_MA err_info += " fail"; strncpy_with_null(result->details.msg, err_info.c_str(), MAX_DIAG_MSG_LENGTH); strncpy_with_null(result->info, err_info.c_str(), MAX_DIAG_MSG_LENGTH); - return RDC_ST_MSI_ERROR; + return RDC_ST_SMI_ERROR; } info += std::to_string(i) + "=>"; diff --git a/rdci/src/RdciConfigSubSystem.cc b/rdci/src/RdciConfigSubSystem.cc index 7a9c4d458f..394b18b74a 100644 --- a/rdci/src/RdciConfigSubSystem.cc +++ b/rdci/src/RdciConfigSubSystem.cc @@ -228,7 +228,7 @@ void RdciConfigSubSystem::process() { if (is_json_output()) { json_ss << "\"group_id\": \"" << group_id_ << "\", \"status\": \"ok\""; } else { - std::cout << "Successfully cleared all configurationbelongs for group: " << group_id_ + std::cout << "Successfully cleared all configuration for group: " << group_id_ << std::endl; } std::cout << json_ss.str() << std::endl; @@ -292,11 +292,13 @@ void RdciConfigSubSystem::display_config_settings(rdc_config_setting_list_t& rdc json_ss << "\"N/A\""; } else { if (value.type == INTEGER) { - ss << value.value.l_int; - json_ss << value.value.l_int; + double mhz = static_cast(value.value.l_int) / 1'000'000.0; + ss << std::fixed << std::setprecision(0) << mhz; + json_ss << mhz; } else if (value.type == DOUBLE) { - ss << std::fixed << std::setprecision(3) << value.value.dbl; - json_ss << value.value.dbl; + double mhz = value.value.dbl / 1'000'000.0; + ss << std::fixed << std::setprecision(0) << mhz; + json_ss << mhz; } else { ss << value.value.str; json_ss << value.value.str; @@ -320,11 +322,13 @@ void RdciConfigSubSystem::display_config_settings(rdc_config_setting_list_t& rdc json_ss << "\"N/A\""; } else { if (value.type == INTEGER) { - ss << value.value.l_int; - json_ss << value.value.l_int; + double mhz = static_cast(value.value.l_int) / 1'000'000.0; + ss << std::fixed << std::setprecision(0) << mhz; + json_ss << mhz; } else if (value.type == DOUBLE) { - ss << std::fixed << std::setprecision(3) << value.value.dbl; - json_ss << value.value.dbl; + double mhz = value.value.dbl / 1'000'000.0; + ss << std::fixed << std::setprecision(0) << mhz; + json_ss << mhz; } else { ss << value.value.str; json_ss << value.value.str; diff --git a/rdci/src/RdciDiscoverySubSystem.cc b/rdci/src/RdciDiscoverySubSystem.cc index 02ca20964a..c58d5a2d42 100644 --- a/rdci/src/RdciDiscoverySubSystem.cc +++ b/rdci/src/RdciDiscoverySubSystem.cc @@ -297,7 +297,7 @@ void RdciDiscoverySubSystem::show_attributes_with_partitions() { void RdciDiscoverySubSystem::show_version() { rdc_component_version_t smiv; - rdc_status_t result = rdc_device_get_component_version(rdc_handle_, RDC_AMDMSI_COMPONENT, &smiv); + rdc_status_t result = rdc_device_get_component_version(rdc_handle_, RDC_AMDSMI_COMPONENT, &smiv); if (result != RDC_ST_OK) { return; }