[SWDEV-523349/SWDEV-527257] Fix Rdci Config (#161)

Change-Id: Iae21ea8061205f186086a3ed59c6259ddeb1dbe7

Signed-off-by: adapryor <Adam.pryor@amd.com>
Этот коммит содержится в:
Pryor, Adam
2025-04-28 11:57:51 -05:00
коммит произвёл GitHub
родитель 9edacdeac4
Коммит 2db6ddea69
10 изменённых файлов: 66 добавлений и 56 удалений
+2 -2
Просмотреть файл
@@ -64,7 +64,7 @@ extern "C" {
typedef enum {
RDC_ST_OK = 0, //!< Success
RDC_ST_NOT_SUPPORTED, //!< Not supported feature
RDC_ST_MSI_ERROR, //!< The MSI library error
RDC_ST_SMI_ERROR, //!< The SMI library error
RDC_ST_FAIL_LOAD_MODULE, //!< Fail to load the library
RDC_ST_INVALID_HANDLER, //!< Invalid handler
RDC_ST_BAD_PARAMETER, //!< A parameter is invalid
@@ -593,7 +593,7 @@ typedef enum {
* @brief Type of Components
*/
typedef enum {
RDC_AMDMSI_COMPONENT
RDC_AMDSMI_COMPONENT
// If needed later, add them one by one
} rdc_component_t;
+1 -1
Просмотреть файл
@@ -42,7 +42,7 @@ class rdc_status_t(Enum):
return int(obj)
RDC_ST_OK = 0
RDC_ST_NOT_SUPPORTED = 1
RDC_ST_MSI_ERROR = 2
RDC_ST_SMI_ERROR = 2
RDC_ST_FAIL_LOAD_MODULE = 3
RDC_ST_INVALID_HANDLER = 4
RDC_ST_BAD_PARAMETER = 5
+7 -8
Просмотреть файл
@@ -21,8 +21,8 @@ THE SOFTWARE.
*/
#include <dlfcn.h>
#include <string.h>
#include <fstream>
#include <fstream>
#include <map>
#include "common/rdc_fields_supported.h"
@@ -352,7 +352,7 @@ const char* rdc_status_string(rdc_status_t result) {
return "Cannot find the value";
case RDC_ST_BAD_PARAMETER:
return "Invalid parameters";
case RDC_ST_MSI_ERROR:
case RDC_ST_SMI_ERROR:
return "SMI error";
case RDC_ST_MAX_LIMIT:
return "The max limit reached";
@@ -477,7 +477,6 @@ char* strncpy_with_null(char* dest, const char* src, size_t n) {
return dest;
}
rdc_status_t rdc_policy_set(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
rdc_policy_t policy) {
if (!p_rdc_handle) {
@@ -541,20 +540,20 @@ rdc_status_t rdc_get_num_partition(rdc_handle_t p_rdc_handle, uint32_t index,
return RDC_ST_INVALID_HANDLER;
}
return static_cast<amd::rdc::RdcHandler*>(p_rdc_handle)
->rdc_get_num_partition(index, num_partition);
->rdc_get_num_partition(index, num_partition);
}
rdc_status_t rdc_instance_profile_get(rdc_handle_t p_rdc_handle, uint32_t entity_index,
rdc_instance_resource_type_t resource_type,
rdc_resource_profile_t* profile) {
rdc_instance_resource_type_t resource_type,
rdc_resource_profile_t* profile) {
if (!p_rdc_handle || !profile) {
return RDC_ST_INVALID_HANDLER;
}
return static_cast<amd::rdc::RdcHandler*>(p_rdc_handle)
->rdc_instance_profile_get(entity_index, resource_type, profile);
->rdc_instance_profile_get(entity_index, resource_type, profile);
}
const char * get_rocm_path(const char * search_string) {
const char* get_rocm_path(const char* search_string) {
// set default rocm path in case lookup fails
static std::string rocm_path("/opt/rocm");
const char* rocm_path_env = getenv("ROCM_PATH");
+37 -30
Просмотреть файл
@@ -44,8 +44,7 @@ void RdcConfigSettingsImpl::monitorSettings() {
rdc_status_t rdc_status;
rdc_group_info_t rdc_group_info = {};
amdsmi_power_cap_info_t cap_info = {};
amdsmi_dev_perf_level_t perf_info = {};
uint32_t od;
amdsmi_frequencies_t freqs = {};
uint64_t cached_value;
while (true) {
@@ -103,19 +102,18 @@ void RdcConfigSettingsImpl::monitorSettings() {
}
// Mem clock
status = amdsmi_get_gpu_overdrive_level(processor_handle, &od);
status = amdsmi_get_clk_freq(processor_handle, AMDSMI_CLK_TYPE_MEM, &freqs);
if (status != AMDSMI_STATUS_SUCCESS) {
RDC_LOG(
RDC_ERROR,
"RdcConfigSettingsImpl::monitorSettings(); amdsmi_get_gpu_overdrive_level failed: "
<< status);
"RdcConfigSettingsImpl::monitorSettings(); amdsmi_get_clk_freq failed: " << status);
continue;
}
auto mem_clk_it = cached_settings.find(RDC_CFG_MEMORY_CLOCK_LIMIT);
if (mem_clk_it != cached_settings.end()) {
cached_value = mem_clk_it->second.target_value;
if (od == cached_value) {
if (freqs.frequency[freqs.current] == cached_value) {
status = amdsmi_set_gpu_clk_limit(processor_handle, AMDSMI_CLK_TYPE_MEM,
CLK_LIMIT_MAX, cached_value);
if (status != AMDSMI_STATUS_SUCCESS) {
@@ -129,18 +127,18 @@ void RdcConfigSettingsImpl::monitorSettings() {
}
// GFX clock
status = amdsmi_get_gpu_perf_level(processor_handle, &perf_info);
status = amdsmi_get_clk_freq(processor_handle, AMDSMI_CLK_TYPE_GFX, &freqs);
if (status != AMDSMI_STATUS_SUCCESS) {
RDC_LOG(RDC_ERROR,
"RdcConfigSettingsImpl::monitorSettings(); amdsmi_get_gpu_perf_level failed: "
<< status);
RDC_LOG(
RDC_ERROR,
"RdcConfigSettingsImpl::monitorSettings(); amdsmi_get_clk_freq failed: " << status);
continue;
}
auto gfx_clk_it = cached_settings.find(RDC_CFG_GFX_CLOCK_LIMIT);
if (gfx_clk_it != cached_settings.end()) {
cached_value = gfx_clk_it->second.target_value;
if (od == cached_value) {
if (freqs.frequency[freqs.current] == cached_value) {
status = amdsmi_set_gpu_clk_limit(processor_handle, AMDSMI_CLK_TYPE_GFX,
CLK_LIMIT_MAX, cached_value);
if (status != AMDSMI_STATUS_SUCCESS) {
@@ -313,31 +311,40 @@ rdc_status_t RdcConfigSettingsImpl::rdc_config_clear(rdc_gpu_group_t group_id) {
// Reset GFX clock limit if it was set
if (group_iter->second.find(RDC_CFG_GFX_CLOCK_LIMIT) != group_iter->second.end()) {
amdsmi_dev_perf_level_t perf_info = {};
amd_ret = amdsmi_get_gpu_perf_level(processor_handle, &perf_info);
if (amd_ret == AMDSMI_STATUS_SUCCESS && perf_info != AMDSMI_DEV_PERF_LEVEL_AUTO) {
amd_ret = amdsmi_set_gpu_clk_limit(processor_handle, AMDSMI_CLK_TYPE_GFX, CLK_LIMIT_MAX,
AMDSMI_DEV_PERF_LEVEL_AUTO);
if (amd_ret != AMDSMI_STATUS_SUCCESS) {
RDC_LOG(RDC_ERROR,
"RdcConfigSettingsImpl::rdc_config_clear: Failed to reset GFX clock limit : "
<< amd_ret);
break;
amdsmi_frequencies_t freqs = {};
amd_ret = amdsmi_get_clk_freq(processor_handle, AMDSMI_CLK_TYPE_GFX, &freqs);
if (amd_ret == AMDSMI_STATUS_SUCCESS) {
uint64_t curr = freqs.frequency[freqs.current];
uint64_t maxf = freqs.frequency[freqs.num_supported - 1];
if (curr != maxf) {
amd_ret = amdsmi_set_gpu_clk_limit(processor_handle, AMDSMI_CLK_TYPE_GFX, CLK_LIMIT_MAX,
AMDSMI_DEV_PERF_LEVEL_AUTO);
if (amd_ret != AMDSMI_STATUS_SUCCESS) {
RDC_LOG(RDC_ERROR,
"RdcConfigSettingsImpl::rdc_config_clear: Failed to reset GFX clock limit : "
<< amd_ret);
break;
}
}
}
}
// Reset memory clock limit if it was set
if (group_iter->second.find(RDC_CFG_MEMORY_CLOCK_LIMIT) != group_iter->second.end()) {
uint32_t od = 0;
amd_ret = amdsmi_get_gpu_overdrive_level(processor_handle, &od);
if (amd_ret == AMDSMI_STATUS_SUCCESS && od != 0) {
amd_ret = amdsmi_set_gpu_clk_limit(processor_handle, AMDSMI_CLK_TYPE_MEM, CLK_LIMIT_MAX, 0);
if (amd_ret != AMDSMI_STATUS_SUCCESS) {
RDC_LOG(RDC_ERROR,
"RdcConfigSettingsImpl::rdc_config_clear: Failed to reset memory clock limit:"
<< amd_ret);
break;
amdsmi_frequencies_t freqs = {};
amd_ret = amdsmi_get_clk_freq(processor_handle, AMDSMI_CLK_TYPE_MEM, &freqs);
if (amd_ret == AMDSMI_STATUS_SUCCESS) {
uint64_t curr = freqs.frequency[freqs.current];
uint64_t maxf = freqs.frequency[freqs.num_supported - 1];
if (curr != maxf) {
amd_ret =
amdsmi_set_gpu_clk_limit(processor_handle, AMDSMI_CLK_TYPE_MEM, CLK_LIMIT_MAX, 0);
if (amd_ret != AMDSMI_STATUS_SUCCESS) {
RDC_LOG(RDC_ERROR,
"RdcConfigSettingsImpl::rdc_config_clear: Failed to reset memory clock limit:"
<< amd_ret);
break;
}
}
}
}
+2 -2
Просмотреть файл
@@ -210,7 +210,7 @@ rdc_status_t RdcEmbeddedHandler::rdc_device_get_component_version(
return RDC_ST_BAD_PARAMETER;
}
if (component == RDC_AMDMSI_COMPONENT) {
if (component == RDC_AMDSMI_COMPONENT) {
amdsmi_status_t ret;
amdsmi_version_t ver = {0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, nullptr};
@@ -219,7 +219,7 @@ rdc_status_t RdcEmbeddedHandler::rdc_device_get_component_version(
if (ret != AMDSMI_STATUS_SUCCESS) {
RDC_LOG(RDC_ERROR, "Failed to obtain the version of the server's amd-smi library. reason: "
<< (ret == AMDSMI_STATUS_INVAL ? "Invalid parameters" : "unknown"));
return RDC_ST_MSI_ERROR;
return RDC_ST_SMI_ERROR;
}
strncpy_with_null(p_rdc_compv->version, ver.build, RDC_MAX_VERSION_STR_LENGTH);
+1 -1
Просмотреть файл
@@ -1125,7 +1125,7 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field
<< value->value.str << ", latency " << latency);
}
return value->status == AMDSMI_STATUS_SUCCESS ? RDC_ST_OK : RDC_ST_MSI_ERROR;
return value->status == AMDSMI_STATUS_SUCCESS ? RDC_ST_OK : RDC_ST_SMI_ERROR;
}
std::shared_ptr<FieldSMIData> RdcMetricFetcherImpl::get_smi_data(RdcFieldKey key) {
+1 -1
Просмотреть файл
@@ -82,7 +82,7 @@ rdc_status_t RdcPolicyImpl::rdc_policy_set(rdc_gpu_group_t group_id, rdc_policy_
gpu_index = group_info.entity_ids[i];
status = metric_fetcher_->fetch_smi_field(gpu_index, RDC_FI_GPU_PAGE_RETRIED, &value);
if (status == RDC_ST_MSI_ERROR) return RDC_ST_NOT_SUPPORTED;
if (status == RDC_ST_SMI_ERROR) return RDC_ST_NOT_SUPPORTED;
}
}
+1 -1
Просмотреть файл
@@ -212,7 +212,7 @@ rdc_status_t RdcSmiDiagnosticImpl::check_smi_topo_info(uint32_t gpu_index[RDC_MA
err_info += " fail";
strncpy_with_null(result->details.msg, err_info.c_str(), MAX_DIAG_MSG_LENGTH);
strncpy_with_null(result->info, err_info.c_str(), MAX_DIAG_MSG_LENGTH);
return RDC_ST_MSI_ERROR;
return RDC_ST_SMI_ERROR;
}
info += std::to_string(i) + "=>";
+13 -9
Просмотреть файл
@@ -228,7 +228,7 @@ void RdciConfigSubSystem::process() {
if (is_json_output()) {
json_ss << "\"group_id\": \"" << group_id_ << "\", \"status\": \"ok\"";
} else {
std::cout << "Successfully cleared all configurationbelongs for group: " << group_id_
std::cout << "Successfully cleared all configuration for group: " << group_id_
<< std::endl;
}
std::cout << json_ss.str() << std::endl;
@@ -292,11 +292,13 @@ void RdciConfigSubSystem::display_config_settings(rdc_config_setting_list_t& rdc
json_ss << "\"N/A\"";
} else {
if (value.type == INTEGER) {
ss << value.value.l_int;
json_ss << value.value.l_int;
double mhz = static_cast<double>(value.value.l_int) / 1'000'000.0;
ss << std::fixed << std::setprecision(0) << mhz;
json_ss << mhz;
} else if (value.type == DOUBLE) {
ss << std::fixed << std::setprecision(3) << value.value.dbl;
json_ss << value.value.dbl;
double mhz = value.value.dbl / 1'000'000.0;
ss << std::fixed << std::setprecision(0) << mhz;
json_ss << mhz;
} else {
ss << value.value.str;
json_ss << value.value.str;
@@ -320,11 +322,13 @@ void RdciConfigSubSystem::display_config_settings(rdc_config_setting_list_t& rdc
json_ss << "\"N/A\"";
} else {
if (value.type == INTEGER) {
ss << value.value.l_int;
json_ss << value.value.l_int;
double mhz = static_cast<double>(value.value.l_int) / 1'000'000.0;
ss << std::fixed << std::setprecision(0) << mhz;
json_ss << mhz;
} else if (value.type == DOUBLE) {
ss << std::fixed << std::setprecision(3) << value.value.dbl;
json_ss << value.value.dbl;
double mhz = value.value.dbl / 1'000'000.0;
ss << std::fixed << std::setprecision(0) << mhz;
json_ss << mhz;
} else {
ss << value.value.str;
json_ss << value.value.str;
+1 -1
Просмотреть файл
@@ -297,7 +297,7 @@ void RdciDiscoverySubSystem::show_attributes_with_partitions() {
void RdciDiscoverySubSystem::show_version() {
rdc_component_version_t smiv;
rdc_status_t result = rdc_device_get_component_version(rdc_handle_, RDC_AMDMSI_COMPONENT, &smiv);
rdc_status_t result = rdc_device_get_component_version(rdc_handle_, RDC_AMDSMI_COMPONENT, &smiv);
if (result != RDC_ST_OK) {
return;
}