[SWDEV-484382] Fix VCLK/DCLK outputs for monitor, static, metric

The units were wrong, and VCLK/DCLK values were not being reported
correctly by amdsmi_get_clk_freq().

Now we match units sent back through rsmi_dev_gpu_clk_freq_get (MHz).

The CLI now shows a maximum of 2 VCLKs/DCLKs, and shows N/A for any
of them that has no current_freq listed.

Change-Id: I8a7b66cbb5263e8d396f8568c104e1ce3512923d
Signed-off-by: Charis Poag <Charis.Poag@amd.com>


[ROCm/amdsmi commit: 3226a1d0ea]
Этот коммит содержится в:
Charis Poag
2024-12-20 09:59:15 -06:00
коммит произвёл Arif, Maisam
родитель 9d5eada975
Коммит bf4bbef085
7 изменённых файлов: 128 добавлений и 48 удалений
+61 -36
Просмотреть файл
@@ -846,9 +846,14 @@ class AMDSMICommands():
logging.debug("Failed to get cache info for gpu %s | %s", gpu_id, e.get_error_info())
static_dict['cache_info'] = cache_info_list
if 'clock' in current_platform_args:
if isinstance(args.clock, bool) and args.clock == True:
# default to printing all clocks, if in current_platform_args; otherwise print specific clocks
if ((args.clock == True or isinstance(args.clock, list))
and 'clock' in current_platform_args):
original_clock_args = args.clock #save original args.clock value, so we can reset for multiple devices
if isinstance(args.clock, bool):
args.clock = ['sys', 'mem', 'df', 'soc', 'dcef', 'vclk0', 'vclk1', 'dclk0', 'dclk1']
if isinstance(args.clock, list):
# remove potential duplicates from list
args.clock = list(set(args.clock))
@@ -888,9 +893,15 @@ class AMDSMICommands():
freq_dict = {}
freq_dict.update({'current level':frequencies['current']})
freq_dict.update({'frequency_levels':{}})
for level in range(len(frequencies['frequency'])):
freq = str(self.helpers.convert_SI_unit(frequencies['frequency'][level], AMDSMIHelpers.SI_Unit.MICRO)) + " MHz"
freq_dict['frequency_levels'].update({level:freq})
if frequencies["num_supported"] != 0:
for level in range(len(frequencies['frequency'])):
if frequencies['frequency'][level] != "N/A":
freq = str(self.helpers.convert_SI_unit(frequencies['frequency'][level], AMDSMIHelpers.SI_Unit.MICRO)) + " MHz"
freq_dict['frequency_levels'].update({level:freq})
else:
freq_dict['frequency_levels'].update("N/A")
else:
freq_dict = "N/A"
except amdsmi_exception.AmdSmiLibraryException as e:
freq_dict = "N/A"
clk_dict.update({clk:freq_dict})
@@ -898,6 +909,9 @@ class AMDSMICommands():
static_dict['clock'] = clk_dict
else:
raise amdsmi_exception.AmdSmiParameterException(args.clock, list[str])
# if original_clock_args is a boolean, set it back to the original value
if isinstance(original_clock_args, bool):
args.clock = original_clock_args
# Convert and store output by pid for csv format
multiple_devices_csv_override = False
@@ -1678,7 +1692,8 @@ class AMDSMICommands():
"clk_locked" : "N/A",
"deep_sleep" : "N/A"}
for clock_index in range(amdsmi_interface.AMDSMI_MAX_NUM_CLKS):
kMAX_NUM_VCLKS = 2
for clock_index in range(kMAX_NUM_VCLKS):
vclk_index = f"vclk_{clock_index}"
clocks[vclk_index] = {"clk" : "N/A",
"min_clk" : "N/A",
@@ -1686,7 +1701,8 @@ class AMDSMICommands():
"clk_locked" : "N/A",
"deep_sleep" : "N/A"}
for clock_index in range(amdsmi_interface.AMDSMI_MAX_NUM_CLKS):
kMAX_NUM_DCLKS = 2
for clock_index in range(kMAX_NUM_DCLKS):
dclk_index = f"dclk_{clock_index}"
clocks[dclk_index] = {"clk" : "N/A",
"min_clk" : "N/A",
@@ -1851,34 +1867,43 @@ class AMDSMICommands():
logging.debug("Failed to get mem clock info for gpu %s | %s", gpu_id, e.get_error_info())
# VCLK & DCLK min and max clocks
try:
vclk0_clock_info_dict = amdsmi_interface.amdsmi_get_clock_info(args.gpu,
amdsmi_interface.AmdSmiClkType.VCLK0)
for clock_index in range(kMAX_NUM_DCLKS):
vclk_index = f"vclk_{clock_index}"
dclk_index = f"dclk_{clock_index}"
vclk_clock_info_dict = {"min_clk": "N/A", "max_clk": "N/A"}
dclk_clock_info_dict = {"min_clk": "N/A", "max_clk": "N/A"}
if clock_index == 0:
try:
vclk_clock_info_dict = amdsmi_interface.amdsmi_get_clock_info(args.gpu,
amdsmi_interface.AmdSmiClkType.VCLK0)
dclk_clock_info_dict = amdsmi_interface.amdsmi_get_clock_info(args.gpu,
amdsmi_interface.AmdSmiClkType.DCLK0)
except amdsmi_exception.AmdSmiLibraryException as e:
logging.debug("Failed to get vclk0 and/or dclk0 clock info for gpu %s | %s", gpu_id, e.get_error_info())
if clock_index == 1:
try:
vclk_clock_info_dict = amdsmi_interface.amdsmi_get_clock_info(args.gpu,
amdsmi_interface.AmdSmiClkType.VCLK1)
dclk_clock_info_dict = amdsmi_interface.amdsmi_get_clock_info(args.gpu,
amdsmi_interface.AmdSmiClkType.DCLK1)
except amdsmi_exception.AmdSmiLibraryException as e:
logging.debug("Failed to get vclk1 and/or dclk1 clock info for gpu %s | %s", gpu_id, e.get_error_info())
dclk0_clock_info_dict = amdsmi_interface.amdsmi_get_clock_info(args.gpu,
amdsmi_interface.AmdSmiClkType.DCLK0)
for clock_index in range(amdsmi_interface.AMDSMI_MAX_NUM_CLKS):
vclk_index = f"vclk_{clock_index}"
# if the current clock is N/A then we shouldn't populate the max and min values
if clocks[vclk_index]["clk"] != "N/A":
clocks[vclk_index]["min_clk"] = self.helpers.unit_format(self.logger,
vclk0_clock_info_dict["min_clk"],
clock_unit)
clocks[vclk_index]["max_clk"] = self.helpers.unit_format(self.logger,
vclk0_clock_info_dict["max_clk"],
clock_unit)
dclk_index = f"dclk_{clock_index}"
if clocks[dclk_index]["clk"] != "N/A":
clocks[dclk_index]["min_clk"] = self.helpers.unit_format(self.logger,
dclk0_clock_info_dict["min_clk"],
clock_unit)
clocks[dclk_index]["max_clk"] = self.helpers.unit_format(self.logger,
dclk0_clock_info_dict["max_clk"],
clock_unit)
except amdsmi_exception.AmdSmiLibraryException as e:
logging.debug("Failed to get vclk and/or dclk clock info for gpu %s | %s", gpu_id, e.get_error_info())
# if the current clock is N/A then we shouldn't populate the max and min values
if (vclk_clock_info_dict["min_clk"] != "N/A" or vclk_clock_info_dict["max_clk"] != "N/A") and clock_index == 0:
clocks[vclk_index]["min_clk"] = self.helpers.unit_format(self.logger,
vclk_clock_info_dict["min_clk"],
clock_unit)
clocks[vclk_index]["max_clk"] = self.helpers.unit_format(self.logger,
vclk_clock_info_dict["max_clk"],
clock_unit)
if (dclk_clock_info_dict["min_clk"] != "N/A" or dclk_clock_info_dict["max_clk"] != "N/A") and clock_index == 1:
clocks[dclk_index]["min_clk"] = self.helpers.unit_format(self.logger,
dclk_clock_info_dict["min_clk"],
clock_unit)
clocks[dclk_index]["max_clk"] = self.helpers.unit_format(self.logger,
dclk_clock_info_dict["max_clk"],
clock_unit)
# FCLK min and max clocks
try:
@@ -5145,7 +5170,7 @@ class AMDSMICommands():
monitor_values['vclock'] = "N/A"
logging.debug("Failed to get dclock on gpu %s | %s", gpu_id, e.get_error_info())
self.logger.table_header += 'VCLOCK'.rjust(8)
self.logger.table_header += 'VCLOCK'.rjust(10)
try:
dclock = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['current_dclk0']
@@ -5162,7 +5187,7 @@ class AMDSMICommands():
monitor_values['dclock'] = "N/A"
logging.debug("Failed to get vclock on gpu %s | %s", gpu_id, e.get_error_info())
self.logger.table_header += 'DCLOCK'.rjust(8)
self.logger.table_header += 'DCLOCK'.rjust(10)
if args.ecc:
try:
+1 -1
Просмотреть файл
@@ -120,7 +120,7 @@ class AMDSMILogger():
elif key in ('gfx_clock', 'mem_clock', 'vram_used'):
table_values += string_value.rjust(11)
elif key in ('vclock', 'dclock'):
table_values += string_value.rjust(8)
table_values += string_value.rjust(10)
elif key == 'vram_total' or 'ecc' in key or key == 'pcie_bw':
table_values += string_value.rjust(12)
elif key in ['pcie_replay']:
+2 -1
Просмотреть файл
@@ -3688,11 +3688,12 @@ def amdsmi_get_clk_freq(
)
)
return {
dict_ret = {
"num_supported": freq.num_supported,
"current": freq.current,
"frequency": list(freq.frequency)[: freq.num_supported],
}
return dict_ret
def amdsmi_get_soc_pstate(
+1
Просмотреть файл
@@ -124,6 +124,7 @@ std::string removeString(const std::string origStr,
const std::string &removeMe);
void system_wait(int milli_seconds);
int countDigit(uint64_t n);
uint64_t get_multiplier_from_str(char units_char);
template <typename T>
std::string print_int_as_hex(T i, bool showHexNotation = true,
int overloadBitSize = 0) {
+1 -1
Просмотреть файл
@@ -1212,7 +1212,7 @@ static rsmi_status_t get_frequencies(amd::smi::DevInfoTypes type, rsmi_clk_type_
return RSMI_STATUS_INVALID_ARGS;
}
memset(f, 0, sizeof(rsmi_frequencies_t));
f->current=0;
f->current = 0;
ret = GetDevValueVec(type, dv_ind, &val_vec);
if (ret != RSMI_STATUS_SUCCESS) {
+30
Просмотреть файл
@@ -1280,5 +1280,35 @@ int countDigit(uint64_t n) {
return static_cast<int>(std::floor(log10(static_cast<double>(n)) + 1));
}
/// @brief Translate the leading character of a unit suffix into its scale factor.
///
/// @param units_char One of 'G', 'M', 'K', 'V', 'T', 'H' or 'm'.
/// @return 1000000000 for 'G', 1000000 for 'M', 1000 for 'K' and 'V',
///         and 1 for 'T', 'H' and 'm'.
/// @throws amd::smi::rsmi_exception carrying RSMI_STATUS_UNEXPECTED_DATA when
///         the character is not one of the above (the assert fires first in
///         builds where assertions are enabled).
uint64_t get_multiplier_from_str(char units_char) {
  switch (units_char) {
    case 'G':  // GT or GHz
      return 1000000000;

    case 'M':  // MT or MHz
      return 1000000;

    case 'K':  // KT or KHz
    case 'V':  // volts; mV is treated as the default voltage unit
      return 1000;

    case 'T':  // Transactions
    case 'H':  // Hertz
    case 'm':  // mV, already in the default voltage unit
      return 1;

    default:
      break;
  }

  // Only reached for an unrecognised unit character.
  assert(false);  // Unexpected units for frequency
  throw amd::smi::rsmi_exception(RSMI_STATUS_UNEXPECTED_DATA, __FUNCTION__);
}
} // namespace smi
} // namespace amd
+32 -9
Просмотреть файл
@@ -1868,6 +1868,9 @@ amdsmi_status_t amdsmi_get_clk_freq(amdsmi_processor_handle processor_handle,
clk_type == AMDSMI_CLK_TYPE_VCLK1 ||
clk_type == AMDSMI_CLK_TYPE_DCLK0 ||
clk_type == AMDSMI_CLK_TYPE_DCLK1 ) {
// Default unit is MHz
char unit = 'M';
// when f == nullptr -> check if metrics are supported
amdsmi_gpu_metrics_t metric_info;
amdsmi_gpu_metrics_t * metric_info_p = nullptr;
@@ -1882,22 +1885,42 @@ amdsmi_status_t amdsmi_get_clk_freq(amdsmi_processor_handle processor_handle,
if (r_status != AMDSMI_STATUS_SUCCESS)
return r_status;
f->num_supported = 1;
f->num_supported = 0;
if (clk_type == AMDSMI_CLK_TYPE_VCLK0) {
f->current = metric_info_p->current_vclk0;
f->frequency[0] = metric_info_p->average_vclk0_frequency;
f->current = 0;
f->frequency[0] = std::numeric_limits<uint64_t>::max();
if (metric_info_p->current_vclk0 != std::numeric_limits<uint16_t>::max()) {
f->frequency[0] = static_cast<uint64_t>(metric_info_p->current_vclk0)
* amd::smi::get_multiplier_from_str(unit); // match MHz ROCm SMI provides
f->num_supported = 1;
}
}
if (clk_type == AMDSMI_CLK_TYPE_VCLK1) {
f->current = metric_info_p->current_vclk1;
f->frequency[0] = metric_info_p->average_vclk1_frequency;
f->current = 0;
f->frequency[0] = std::numeric_limits<uint64_t>::max();
if (metric_info_p->current_vclk1 != std::numeric_limits<uint16_t>::max()) {
f->frequency[0] = static_cast<uint64_t>(metric_info_p->current_vclk1)
* amd::smi::get_multiplier_from_str(unit); // match MHz ROCm SMI provides
f->num_supported = 1;
}
}
if (clk_type == AMDSMI_CLK_TYPE_DCLK0) {
f->current = metric_info_p->current_dclk0;
f->frequency[0] = metric_info_p->average_dclk0_frequency;
f->current = 0;
f->frequency[0] = std::numeric_limits<uint64_t>::max();
if (metric_info_p->current_dclk0 != std::numeric_limits<uint16_t>::max()) {
f->frequency[0] = static_cast<uint64_t>(metric_info_p->current_dclk0)
* amd::smi::get_multiplier_from_str(unit); // match MHz ROCm SMI provides
f->num_supported = 1;
}
}
if (clk_type == AMDSMI_CLK_TYPE_DCLK1) {
f->current = metric_info_p->current_dclk1;
f->frequency[0] = metric_info_p->average_dclk1_frequency;
f->current = 0;
f->frequency[0] = std::numeric_limits<uint64_t>::max();
if (metric_info_p->current_dclk1 != std::numeric_limits<uint16_t>::max()) {
f->frequency[0] = static_cast<uint64_t>(metric_info_p->current_dclk1)
* amd::smi::get_multiplier_from_str(unit); // match MHz ROCm SMI provides
f->num_supported = 1;
}
}
return r_status;