[SWDEV-523349/SWDEV-527257] Fix Rdci Config (#161)
Change-Id: Iae21ea8061205f186086a3ed59c6259ddeb1dbe7 Signed-off-by: adapryor <Adam.pryor@amd.com>
Этот коммит содержится в:
@@ -64,7 +64,7 @@ extern "C" {
|
||||
typedef enum {
|
||||
RDC_ST_OK = 0, //!< Success
|
||||
RDC_ST_NOT_SUPPORTED, //!< Not supported feature
|
||||
RDC_ST_MSI_ERROR, //!< The MSI library error
|
||||
RDC_ST_SMI_ERROR, //!< The SMI library error
|
||||
RDC_ST_FAIL_LOAD_MODULE, //!< Fail to load the library
|
||||
RDC_ST_INVALID_HANDLER, //!< Invalid handler
|
||||
RDC_ST_BAD_PARAMETER, //!< A parameter is invalid
|
||||
@@ -593,7 +593,7 @@ typedef enum {
|
||||
* @brief Type of Components
|
||||
*/
|
||||
typedef enum {
|
||||
RDC_AMDMSI_COMPONENT
|
||||
RDC_AMDSMI_COMPONENT
|
||||
// If needed later, add them one by one
|
||||
} rdc_component_t;
|
||||
|
||||
|
||||
@@ -42,7 +42,7 @@ class rdc_status_t(Enum):
|
||||
return int(obj)
|
||||
RDC_ST_OK = 0
|
||||
RDC_ST_NOT_SUPPORTED = 1
|
||||
RDC_ST_MSI_ERROR = 2
|
||||
RDC_ST_SMI_ERROR = 2
|
||||
RDC_ST_FAIL_LOAD_MODULE = 3
|
||||
RDC_ST_INVALID_HANDLER = 4
|
||||
RDC_ST_BAD_PARAMETER = 5
|
||||
|
||||
@@ -21,8 +21,8 @@ THE SOFTWARE.
|
||||
*/
|
||||
#include <dlfcn.h>
|
||||
#include <string.h>
|
||||
#include <fstream>
|
||||
|
||||
#include <fstream>
|
||||
#include <map>
|
||||
|
||||
#include "common/rdc_fields_supported.h"
|
||||
@@ -352,7 +352,7 @@ const char* rdc_status_string(rdc_status_t result) {
|
||||
return "Cannot find the value";
|
||||
case RDC_ST_BAD_PARAMETER:
|
||||
return "Invalid parameters";
|
||||
case RDC_ST_MSI_ERROR:
|
||||
case RDC_ST_SMI_ERROR:
|
||||
return "SMI error";
|
||||
case RDC_ST_MAX_LIMIT:
|
||||
return "The max limit reached";
|
||||
@@ -477,7 +477,6 @@ char* strncpy_with_null(char* dest, const char* src, size_t n) {
|
||||
return dest;
|
||||
}
|
||||
|
||||
|
||||
rdc_status_t rdc_policy_set(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
|
||||
rdc_policy_t policy) {
|
||||
if (!p_rdc_handle) {
|
||||
@@ -541,20 +540,20 @@ rdc_status_t rdc_get_num_partition(rdc_handle_t p_rdc_handle, uint32_t index,
|
||||
return RDC_ST_INVALID_HANDLER;
|
||||
}
|
||||
return static_cast<amd::rdc::RdcHandler*>(p_rdc_handle)
|
||||
->rdc_get_num_partition(index, num_partition);
|
||||
->rdc_get_num_partition(index, num_partition);
|
||||
}
|
||||
|
||||
rdc_status_t rdc_instance_profile_get(rdc_handle_t p_rdc_handle, uint32_t entity_index,
|
||||
rdc_instance_resource_type_t resource_type,
|
||||
rdc_resource_profile_t* profile) {
|
||||
rdc_instance_resource_type_t resource_type,
|
||||
rdc_resource_profile_t* profile) {
|
||||
if (!p_rdc_handle || !profile) {
|
||||
return RDC_ST_INVALID_HANDLER;
|
||||
}
|
||||
return static_cast<amd::rdc::RdcHandler*>(p_rdc_handle)
|
||||
->rdc_instance_profile_get(entity_index, resource_type, profile);
|
||||
->rdc_instance_profile_get(entity_index, resource_type, profile);
|
||||
}
|
||||
|
||||
const char * get_rocm_path(const char * search_string) {
|
||||
const char* get_rocm_path(const char* search_string) {
|
||||
// set default rocm path in case lookup fails
|
||||
static std::string rocm_path("/opt/rocm");
|
||||
const char* rocm_path_env = getenv("ROCM_PATH");
|
||||
|
||||
@@ -44,8 +44,7 @@ void RdcConfigSettingsImpl::monitorSettings() {
|
||||
rdc_status_t rdc_status;
|
||||
rdc_group_info_t rdc_group_info = {};
|
||||
amdsmi_power_cap_info_t cap_info = {};
|
||||
amdsmi_dev_perf_level_t perf_info = {};
|
||||
uint32_t od;
|
||||
amdsmi_frequencies_t freqs = {};
|
||||
uint64_t cached_value;
|
||||
|
||||
while (true) {
|
||||
@@ -103,19 +102,18 @@ void RdcConfigSettingsImpl::monitorSettings() {
|
||||
}
|
||||
|
||||
// Mem clock
|
||||
status = amdsmi_get_gpu_overdrive_level(processor_handle, &od);
|
||||
status = amdsmi_get_clk_freq(processor_handle, AMDSMI_CLK_TYPE_MEM, &freqs);
|
||||
if (status != AMDSMI_STATUS_SUCCESS) {
|
||||
RDC_LOG(
|
||||
RDC_ERROR,
|
||||
"RdcConfigSettingsImpl::monitorSettings(); amdsmi_get_gpu_overdrive_level failed: "
|
||||
<< status);
|
||||
"RdcConfigSettingsImpl::monitorSettings(); amdsmi_get_clk_freq failed: " << status);
|
||||
continue;
|
||||
}
|
||||
|
||||
auto mem_clk_it = cached_settings.find(RDC_CFG_MEMORY_CLOCK_LIMIT);
|
||||
if (mem_clk_it != cached_settings.end()) {
|
||||
cached_value = mem_clk_it->second.target_value;
|
||||
if (od == cached_value) {
|
||||
if (freqs.frequency[freqs.current] == cached_value) {
|
||||
status = amdsmi_set_gpu_clk_limit(processor_handle, AMDSMI_CLK_TYPE_MEM,
|
||||
CLK_LIMIT_MAX, cached_value);
|
||||
if (status != AMDSMI_STATUS_SUCCESS) {
|
||||
@@ -129,18 +127,18 @@ void RdcConfigSettingsImpl::monitorSettings() {
|
||||
}
|
||||
|
||||
// GFX clock
|
||||
status = amdsmi_get_gpu_perf_level(processor_handle, &perf_info);
|
||||
status = amdsmi_get_clk_freq(processor_handle, AMDSMI_CLK_TYPE_GFX, &freqs);
|
||||
if (status != AMDSMI_STATUS_SUCCESS) {
|
||||
RDC_LOG(RDC_ERROR,
|
||||
"RdcConfigSettingsImpl::monitorSettings(); amdsmi_get_gpu_perf_level failed: "
|
||||
<< status);
|
||||
RDC_LOG(
|
||||
RDC_ERROR,
|
||||
"RdcConfigSettingsImpl::monitorSettings(); amdsmi_get_clk_freq failed: " << status);
|
||||
continue;
|
||||
}
|
||||
|
||||
auto gfx_clk_it = cached_settings.find(RDC_CFG_GFX_CLOCK_LIMIT);
|
||||
if (gfx_clk_it != cached_settings.end()) {
|
||||
cached_value = gfx_clk_it->second.target_value;
|
||||
if (od == cached_value) {
|
||||
if (freqs.frequency[freqs.current] == cached_value) {
|
||||
status = amdsmi_set_gpu_clk_limit(processor_handle, AMDSMI_CLK_TYPE_GFX,
|
||||
CLK_LIMIT_MAX, cached_value);
|
||||
if (status != AMDSMI_STATUS_SUCCESS) {
|
||||
@@ -313,31 +311,40 @@ rdc_status_t RdcConfigSettingsImpl::rdc_config_clear(rdc_gpu_group_t group_id) {
|
||||
|
||||
// Reset GFX clock limit if it was set
|
||||
if (group_iter->second.find(RDC_CFG_GFX_CLOCK_LIMIT) != group_iter->second.end()) {
|
||||
amdsmi_dev_perf_level_t perf_info = {};
|
||||
amd_ret = amdsmi_get_gpu_perf_level(processor_handle, &perf_info);
|
||||
if (amd_ret == AMDSMI_STATUS_SUCCESS && perf_info != AMDSMI_DEV_PERF_LEVEL_AUTO) {
|
||||
amd_ret = amdsmi_set_gpu_clk_limit(processor_handle, AMDSMI_CLK_TYPE_GFX, CLK_LIMIT_MAX,
|
||||
AMDSMI_DEV_PERF_LEVEL_AUTO);
|
||||
if (amd_ret != AMDSMI_STATUS_SUCCESS) {
|
||||
RDC_LOG(RDC_ERROR,
|
||||
"RdcConfigSettingsImpl::rdc_config_clear: Failed to reset GFX clock limit : "
|
||||
<< amd_ret);
|
||||
break;
|
||||
amdsmi_frequencies_t freqs = {};
|
||||
amd_ret = amdsmi_get_clk_freq(processor_handle, AMDSMI_CLK_TYPE_GFX, &freqs);
|
||||
if (amd_ret == AMDSMI_STATUS_SUCCESS) {
|
||||
uint64_t curr = freqs.frequency[freqs.current];
|
||||
uint64_t maxf = freqs.frequency[freqs.num_supported - 1];
|
||||
if (curr != maxf) {
|
||||
amd_ret = amdsmi_set_gpu_clk_limit(processor_handle, AMDSMI_CLK_TYPE_GFX, CLK_LIMIT_MAX,
|
||||
AMDSMI_DEV_PERF_LEVEL_AUTO);
|
||||
if (amd_ret != AMDSMI_STATUS_SUCCESS) {
|
||||
RDC_LOG(RDC_ERROR,
|
||||
"RdcConfigSettingsImpl::rdc_config_clear: Failed to reset GFX clock limit : "
|
||||
<< amd_ret);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Reset memory clock limit if it was set
|
||||
if (group_iter->second.find(RDC_CFG_MEMORY_CLOCK_LIMIT) != group_iter->second.end()) {
|
||||
uint32_t od = 0;
|
||||
amd_ret = amdsmi_get_gpu_overdrive_level(processor_handle, &od);
|
||||
if (amd_ret == AMDSMI_STATUS_SUCCESS && od != 0) {
|
||||
amd_ret = amdsmi_set_gpu_clk_limit(processor_handle, AMDSMI_CLK_TYPE_MEM, CLK_LIMIT_MAX, 0);
|
||||
if (amd_ret != AMDSMI_STATUS_SUCCESS) {
|
||||
RDC_LOG(RDC_ERROR,
|
||||
"RdcConfigSettingsImpl::rdc_config_clear: Failed to reset memory clock limit:"
|
||||
<< amd_ret);
|
||||
break;
|
||||
amdsmi_frequencies_t freqs = {};
|
||||
amd_ret = amdsmi_get_clk_freq(processor_handle, AMDSMI_CLK_TYPE_MEM, &freqs);
|
||||
if (amd_ret == AMDSMI_STATUS_SUCCESS) {
|
||||
uint64_t curr = freqs.frequency[freqs.current];
|
||||
uint64_t maxf = freqs.frequency[freqs.num_supported - 1];
|
||||
if (curr != maxf) {
|
||||
amd_ret =
|
||||
amdsmi_set_gpu_clk_limit(processor_handle, AMDSMI_CLK_TYPE_MEM, CLK_LIMIT_MAX, 0);
|
||||
if (amd_ret != AMDSMI_STATUS_SUCCESS) {
|
||||
RDC_LOG(RDC_ERROR,
|
||||
"RdcConfigSettingsImpl::rdc_config_clear: Failed to reset memory clock limit:"
|
||||
<< amd_ret);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -210,7 +210,7 @@ rdc_status_t RdcEmbeddedHandler::rdc_device_get_component_version(
|
||||
return RDC_ST_BAD_PARAMETER;
|
||||
}
|
||||
|
||||
if (component == RDC_AMDMSI_COMPONENT) {
|
||||
if (component == RDC_AMDSMI_COMPONENT) {
|
||||
amdsmi_status_t ret;
|
||||
amdsmi_version_t ver = {0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, nullptr};
|
||||
|
||||
@@ -219,7 +219,7 @@ rdc_status_t RdcEmbeddedHandler::rdc_device_get_component_version(
|
||||
if (ret != AMDSMI_STATUS_SUCCESS) {
|
||||
RDC_LOG(RDC_ERROR, "Failed to obtain the version of the server's amd-smi library. reason: "
|
||||
<< (ret == AMDSMI_STATUS_INVAL ? "Invalid parameters" : "unknown"));
|
||||
return RDC_ST_MSI_ERROR;
|
||||
return RDC_ST_SMI_ERROR;
|
||||
}
|
||||
|
||||
strncpy_with_null(p_rdc_compv->version, ver.build, RDC_MAX_VERSION_STR_LENGTH);
|
||||
|
||||
@@ -1125,7 +1125,7 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field
|
||||
<< value->value.str << ", latency " << latency);
|
||||
}
|
||||
|
||||
return value->status == AMDSMI_STATUS_SUCCESS ? RDC_ST_OK : RDC_ST_MSI_ERROR;
|
||||
return value->status == AMDSMI_STATUS_SUCCESS ? RDC_ST_OK : RDC_ST_SMI_ERROR;
|
||||
}
|
||||
|
||||
std::shared_ptr<FieldSMIData> RdcMetricFetcherImpl::get_smi_data(RdcFieldKey key) {
|
||||
|
||||
@@ -82,7 +82,7 @@ rdc_status_t RdcPolicyImpl::rdc_policy_set(rdc_gpu_group_t group_id, rdc_policy_
|
||||
gpu_index = group_info.entity_ids[i];
|
||||
|
||||
status = metric_fetcher_->fetch_smi_field(gpu_index, RDC_FI_GPU_PAGE_RETRIED, &value);
|
||||
if (status == RDC_ST_MSI_ERROR) return RDC_ST_NOT_SUPPORTED;
|
||||
if (status == RDC_ST_SMI_ERROR) return RDC_ST_NOT_SUPPORTED;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -212,7 +212,7 @@ rdc_status_t RdcSmiDiagnosticImpl::check_smi_topo_info(uint32_t gpu_index[RDC_MA
|
||||
err_info += " fail";
|
||||
strncpy_with_null(result->details.msg, err_info.c_str(), MAX_DIAG_MSG_LENGTH);
|
||||
strncpy_with_null(result->info, err_info.c_str(), MAX_DIAG_MSG_LENGTH);
|
||||
return RDC_ST_MSI_ERROR;
|
||||
return RDC_ST_SMI_ERROR;
|
||||
}
|
||||
|
||||
info += std::to_string(i) + "=>";
|
||||
|
||||
@@ -228,7 +228,7 @@ void RdciConfigSubSystem::process() {
|
||||
if (is_json_output()) {
|
||||
json_ss << "\"group_id\": \"" << group_id_ << "\", \"status\": \"ok\"";
|
||||
} else {
|
||||
std::cout << "Successfully cleared all configurationbelongs for group: " << group_id_
|
||||
std::cout << "Successfully cleared all configuration for group: " << group_id_
|
||||
<< std::endl;
|
||||
}
|
||||
std::cout << json_ss.str() << std::endl;
|
||||
@@ -292,11 +292,13 @@ void RdciConfigSubSystem::display_config_settings(rdc_config_setting_list_t& rdc
|
||||
json_ss << "\"N/A\"";
|
||||
} else {
|
||||
if (value.type == INTEGER) {
|
||||
ss << value.value.l_int;
|
||||
json_ss << value.value.l_int;
|
||||
double mhz = static_cast<double>(value.value.l_int) / 1'000'000.0;
|
||||
ss << std::fixed << std::setprecision(0) << mhz;
|
||||
json_ss << mhz;
|
||||
} else if (value.type == DOUBLE) {
|
||||
ss << std::fixed << std::setprecision(3) << value.value.dbl;
|
||||
json_ss << value.value.dbl;
|
||||
double mhz = value.value.dbl / 1'000'000.0;
|
||||
ss << std::fixed << std::setprecision(0) << mhz;
|
||||
json_ss << mhz;
|
||||
} else {
|
||||
ss << value.value.str;
|
||||
json_ss << value.value.str;
|
||||
@@ -320,11 +322,13 @@ void RdciConfigSubSystem::display_config_settings(rdc_config_setting_list_t& rdc
|
||||
json_ss << "\"N/A\"";
|
||||
} else {
|
||||
if (value.type == INTEGER) {
|
||||
ss << value.value.l_int;
|
||||
json_ss << value.value.l_int;
|
||||
double mhz = static_cast<double>(value.value.l_int) / 1'000'000.0;
|
||||
ss << std::fixed << std::setprecision(0) << mhz;
|
||||
json_ss << mhz;
|
||||
} else if (value.type == DOUBLE) {
|
||||
ss << std::fixed << std::setprecision(3) << value.value.dbl;
|
||||
json_ss << value.value.dbl;
|
||||
double mhz = value.value.dbl / 1'000'000.0;
|
||||
ss << std::fixed << std::setprecision(0) << mhz;
|
||||
json_ss << mhz;
|
||||
} else {
|
||||
ss << value.value.str;
|
||||
json_ss << value.value.str;
|
||||
|
||||
@@ -297,7 +297,7 @@ void RdciDiscoverySubSystem::show_attributes_with_partitions() {
|
||||
|
||||
void RdciDiscoverySubSystem::show_version() {
|
||||
rdc_component_version_t smiv;
|
||||
rdc_status_t result = rdc_device_get_component_version(rdc_handle_, RDC_AMDMSI_COMPONENT, &smiv);
|
||||
rdc_status_t result = rdc_device_get_component_version(rdc_handle_, RDC_AMDSMI_COMPONENT, &smiv);
|
||||
if (result != RDC_ST_OK) {
|
||||
return;
|
||||
}
|
||||
|
||||
Ссылка в новой задаче
Block a user