diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py index 4eb319c7a4..7171885e1a 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py @@ -846,9 +846,14 @@ class AMDSMICommands(): logging.debug("Failed to get cache info for gpu %s | %s", gpu_id, e.get_error_info()) static_dict['cache_info'] = cache_info_list - if 'clock' in current_platform_args: - if isinstance(args.clock, bool) and args.clock == True: + + # default to printing all clocks, if in current_platform_args; otherwise print specific clocks + if ((args.clock == True or isinstance(args.clock, list)) + and 'clock' in current_platform_args): + original_clock_args = args.clock #save original args.clock value, so we can reset for multiple devices + if isinstance(args.clock, bool): args.clock = ['sys', 'mem', 'df', 'soc', 'dcef', 'vclk0', 'vclk1', 'dclk0', 'dclk1'] + if isinstance(args.clock, list): # remove potential duplicates from list args.clock = list(set(args.clock)) @@ -888,9 +893,15 @@ class AMDSMICommands(): freq_dict = {} freq_dict.update({'current level':frequencies['current']}) freq_dict.update({'frequency_levels':{}}) - for level in range(len(frequencies['frequency'])): - freq = str(self.helpers.convert_SI_unit(frequencies['frequency'][level], AMDSMIHelpers.SI_Unit.MICRO)) + " MHz" - freq_dict['frequency_levels'].update({level:freq}) + if frequencies["num_supported"] != 0: + for level in range(len(frequencies['frequency'])): + if frequencies['frequency'][level] != "N/A": + freq = str(self.helpers.convert_SI_unit(frequencies['frequency'][level], AMDSMIHelpers.SI_Unit.MICRO)) + " MHz" + freq_dict['frequency_levels'].update({level:freq}) + else: + freq_dict['frequency_levels'].update("N/A") + else: + freq_dict = "N/A" except amdsmi_exception.AmdSmiLibraryException as e: freq_dict = "N/A" clk_dict.update({clk:freq_dict}) @@ -898,6 +909,9 @@ class AMDSMICommands(): static_dict['clock'] = clk_dict else: raise amdsmi_exception.AmdSmiParameterException(args.clock, list[str]) + # if original_clock_args is a boolean, set it back to the original value + if isinstance(original_clock_args, bool): + args.clock = original_clock_args # Convert and store output by pid for csv format multiple_devices_csv_override = False @@ -1678,7 +1692,8 @@ class AMDSMICommands(): "clk_locked" : "N/A", "deep_sleep" : "N/A"} - for clock_index in range(amdsmi_interface.AMDSMI_MAX_NUM_CLKS): + kMAX_NUM_VCLKS = 2 + for clock_index in range(kMAX_NUM_VCLKS): vclk_index = f"vclk_{clock_index}" clocks[vclk_index] = {"clk" : "N/A", "min_clk" : "N/A", @@ -1686,7 +1701,8 @@ class AMDSMICommands(): "clk_locked" : "N/A", "deep_sleep" : "N/A"} - for clock_index in range(amdsmi_interface.AMDSMI_MAX_NUM_CLKS): + kMAX_NUM_DCLKS = 2 + for clock_index in range(kMAX_NUM_DCLKS): dclk_index = f"dclk_{clock_index}" clocks[dclk_index] = {"clk" : "N/A", "min_clk" : "N/A", @@ -1851,34 +1867,43 @@ class AMDSMICommands(): logging.debug("Failed to get mem clock info for gpu %s | %s", gpu_id, e.get_error_info()) # VCLK & DCLK min and max clocks - try: - vclk0_clock_info_dict = amdsmi_interface.amdsmi_get_clock_info(args.gpu, - amdsmi_interface.AmdSmiClkType.VCLK0) + for clock_index in range(kMAX_NUM_DCLKS): + vclk_index = f"vclk_{clock_index}" + dclk_index = f"dclk_{clock_index}" + vclk_clock_info_dict = {"min_clk": "N/A", "max_clk": "N/A"} + dclk_clock_info_dict = {"min_clk": "N/A", "max_clk": "N/A"} + if clock_index == 0: + try: + vclk_clock_info_dict = amdsmi_interface.amdsmi_get_clock_info(args.gpu, + amdsmi_interface.AmdSmiClkType.VCLK0) + dclk_clock_info_dict = amdsmi_interface.amdsmi_get_clock_info(args.gpu, + amdsmi_interface.AmdSmiClkType.DCLK0) + except amdsmi_exception.AmdSmiLibraryException as e: + logging.debug("Failed to get vclk0 and/or dclk0 clock info for gpu %s | %s", gpu_id, e.get_error_info()) + if clock_index == 1: + try: + vclk_clock_info_dict = amdsmi_interface.amdsmi_get_clock_info(args.gpu, + amdsmi_interface.AmdSmiClkType.VCLK1) + dclk_clock_info_dict = amdsmi_interface.amdsmi_get_clock_info(args.gpu, + amdsmi_interface.AmdSmiClkType.DCLK1) + except amdsmi_exception.AmdSmiLibraryException as e: + logging.debug("Failed to get vclk1 and/or dclk1 clock info for gpu %s | %s", gpu_id, e.get_error_info()) - dclk0_clock_info_dict = amdsmi_interface.amdsmi_get_clock_info(args.gpu, - amdsmi_interface.AmdSmiClkType.DCLK0) - - for clock_index in range(amdsmi_interface.AMDSMI_MAX_NUM_CLKS): - vclk_index = f"vclk_{clock_index}" - # if the current clock is N/A then we shouldn't populate the max and min values - if clocks[vclk_index]["clk"] != "N/A": - clocks[vclk_index]["min_clk"] = self.helpers.unit_format(self.logger, - vclk0_clock_info_dict["min_clk"], - clock_unit) - clocks[vclk_index]["max_clk"] = self.helpers.unit_format(self.logger, - vclk0_clock_info_dict["max_clk"], - clock_unit) - - dclk_index = f"dclk_{clock_index}" - if clocks[dclk_index]["clk"] != "N/A": - clocks[dclk_index]["min_clk"] = self.helpers.unit_format(self.logger, - dclk0_clock_info_dict["min_clk"], - clock_unit) - clocks[dclk_index]["max_clk"] = self.helpers.unit_format(self.logger, - dclk0_clock_info_dict["max_clk"], - clock_unit) - except amdsmi_exception.AmdSmiLibraryException as e: - logging.debug("Failed to get vclk and/or dclk clock info for gpu %s | %s", gpu_id, e.get_error_info()) + # if the current clock is N/A then we shouldn't populate the max and min values + if (vclk_clock_info_dict["min_clk"] != "N/A" or vclk_clock_info_dict["max_clk"] != "N/A") and clock_index == 0: + clocks[vclk_index]["min_clk"] = self.helpers.unit_format(self.logger, + vclk_clock_info_dict["min_clk"], + clock_unit) + clocks[vclk_index]["max_clk"] = self.helpers.unit_format(self.logger, + vclk_clock_info_dict["max_clk"], + clock_unit) + if (dclk_clock_info_dict["min_clk"] != "N/A" or dclk_clock_info_dict["max_clk"] != "N/A") and clock_index == 1: + clocks[dclk_index]["min_clk"] = self.helpers.unit_format(self.logger, + dclk_clock_info_dict["min_clk"], + clock_unit) + clocks[dclk_index]["max_clk"] = self.helpers.unit_format(self.logger, + dclk_clock_info_dict["max_clk"], + clock_unit) # FCLK min and max clocks try: @@ -5145,7 +5170,7 @@ class AMDSMICommands(): monitor_values['vclock'] = "N/A" logging.debug("Failed to get dclock on gpu %s | %s", gpu_id, e.get_error_info()) - self.logger.table_header += 'VCLOCK'.rjust(8) + self.logger.table_header += 'VCLOCK'.rjust(10) try: dclock = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['current_dclk0'] @@ -5162,7 +5187,7 @@ class AMDSMICommands(): monitor_values['dclock'] = "N/A" logging.debug("Failed to get vclock on gpu %s | %s", gpu_id, e.get_error_info()) - self.logger.table_header += 'DCLOCK'.rjust(8) + self.logger.table_header += 'DCLOCK'.rjust(10) if args.ecc: try: diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_logger.py b/projects/amdsmi/amdsmi_cli/amdsmi_logger.py index 2bc183050a..4295a7b489 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_logger.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_logger.py @@ -120,7 +120,7 @@ class AMDSMILogger(): elif key in ('gfx_clock', 'mem_clock', 'vram_used'): table_values += string_value.rjust(11) elif key in ('vclock', 'dclock'): - table_values += string_value.rjust(8) + table_values += string_value.rjust(10) elif key == 'vram_total' or 'ecc' in key or key == 'pcie_bw': table_values += string_value.rjust(12) elif key in ['pcie_replay']: diff --git a/projects/amdsmi/py-interface/amdsmi_interface.py b/projects/amdsmi/py-interface/amdsmi_interface.py index 728dfb1e8a..a607a3c8d4 100644 --- a/projects/amdsmi/py-interface/amdsmi_interface.py +++ b/projects/amdsmi/py-interface/amdsmi_interface.py @@ -3688,11 +3688,12 @@ def amdsmi_get_clk_freq( ) ) - return { + dict_ret = { "num_supported": freq.num_supported, "current": freq.current, "frequency": list(freq.frequency)[: freq.num_supported], } + return dict_ret def amdsmi_get_soc_pstate( diff --git a/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_utils.h b/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_utils.h index e86780ee3f..d5e28839ea 100644 --- a/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_utils.h +++ b/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_utils.h @@ -124,6 +124,7 @@ std::string removeString(const std::string origStr, const std::string &removeMe); void system_wait(int milli_seconds); int countDigit(uint64_t n); +uint64_t get_multiplier_from_str(char units_char); template std::string print_int_as_hex(T i, bool showHexNotation = true, int overloadBitSize = 0) { diff --git a/projects/amdsmi/rocm_smi/src/rocm_smi.cc b/projects/amdsmi/rocm_smi/src/rocm_smi.cc index 2a4b6d1aae..ae43b844ba 100644 --- a/projects/amdsmi/rocm_smi/src/rocm_smi.cc +++ b/projects/amdsmi/rocm_smi/src/rocm_smi.cc @@ -1212,7 +1212,7 @@ static rsmi_status_t get_frequencies(amd::smi::DevInfoTypes type, rsmi_clk_type_ return RSMI_STATUS_INVALID_ARGS; } memset(f, 0, sizeof(rsmi_frequencies_t)); - f->current=0; + f->current = 0; ret = GetDevValueVec(type, dv_ind, &val_vec); if (ret != RSMI_STATUS_SUCCESS) { diff --git a/projects/amdsmi/rocm_smi/src/rocm_smi_utils.cc b/projects/amdsmi/rocm_smi/src/rocm_smi_utils.cc index 0dafa5795a..550b7dc054 100644 --- a/projects/amdsmi/rocm_smi/src/rocm_smi_utils.cc +++ b/projects/amdsmi/rocm_smi/src/rocm_smi_utils.cc @@ -1280,5 +1280,35 @@ int countDigit(uint64_t n) { return static_cast(std::floor(log10(static_cast(n)) + 1)); } +uint64_t get_multiplier_from_str(char units_char) { + uint32_t multiplier = 0; + + switch (units_char) { + case 'G': // GT or GHz + multiplier = 1000000000; + break; + + case 'M': // MT or MHz + multiplier = 1000000; + break; + + case 'K': // KT or KHz + case 'V': // default unit for voltage is mV + multiplier = 1000; + break; + + case 'T': // Transactions + case 'H': // Hertz + case 'm': // mV (we will make mV the default unit for voltage) + multiplier = 1; + break; + + default: + assert(false); // Unexpected units for frequency + throw amd::smi::rsmi_exception(RSMI_STATUS_UNEXPECTED_DATA, __FUNCTION__); + } + return multiplier; +} + } // namespace smi } // namespace amd diff --git a/projects/amdsmi/src/amd_smi/amd_smi.cc b/projects/amdsmi/src/amd_smi/amd_smi.cc index 86656c3a64..923b244915 100644 --- a/projects/amdsmi/src/amd_smi/amd_smi.cc +++ b/projects/amdsmi/src/amd_smi/amd_smi.cc @@ -1868,6 +1868,9 @@ amdsmi_status_t amdsmi_get_clk_freq(amdsmi_processor_handle processor_handle, clk_type == AMDSMI_CLK_TYPE_VCLK1 || clk_type == AMDSMI_CLK_TYPE_DCLK0 || clk_type == AMDSMI_CLK_TYPE_DCLK1 ) { + // Default unit is MHz + char unit = 'M'; + // when f == nullptr -> check if metrics are supported amdsmi_gpu_metrics_t metric_info; amdsmi_gpu_metrics_t * metric_info_p = nullptr; @@ -1882,22 +1885,42 @@ amdsmi_status_t amdsmi_get_clk_freq(amdsmi_processor_handle processor_handle, if (r_status != AMDSMI_STATUS_SUCCESS) return r_status; - f->num_supported = 1; + f->num_supported = 0; if (clk_type == AMDSMI_CLK_TYPE_VCLK0) { - f->current = metric_info_p->current_vclk0; - f->frequency[0] = metric_info_p->average_vclk0_frequency; + f->current = 0; + f->frequency[0] = std::numeric_limits::max(); + if (metric_info_p->current_vclk0 != std::numeric_limits::max()) { + f->frequency[0] = static_cast(metric_info_p->current_vclk0) + * amd::smi::get_multiplier_from_str(unit); // match MHz ROCm SMI provides + f->num_supported = 1; + } } if (clk_type == AMDSMI_CLK_TYPE_VCLK1) { - f->current = metric_info_p->current_vclk1; - f->frequency[0] = metric_info_p->average_vclk1_frequency; + f->current = 0; + f->frequency[0] = std::numeric_limits::max(); + if (metric_info_p->current_vclk1 != std::numeric_limits::max()) { + f->frequency[0] = static_cast(metric_info_p->current_vclk1) + * amd::smi::get_multiplier_from_str(unit); // match MHz ROCm SMI provides + f->num_supported = 1; + } } if (clk_type == AMDSMI_CLK_TYPE_DCLK0) { - f->current = metric_info_p->current_dclk0; - f->frequency[0] = metric_info_p->average_dclk0_frequency; + f->current = 0; + f->frequency[0] = std::numeric_limits::max(); + if (metric_info_p->current_dclk0 != std::numeric_limits::max()) { + f->frequency[0] = static_cast(metric_info_p->current_dclk0) + * amd::smi::get_multiplier_from_str(unit); // match MHz ROCm SMI provides + f->num_supported = 1; + } } if (clk_type == AMDSMI_CLK_TYPE_DCLK1) { - f->current = metric_info_p->current_dclk1; - f->frequency[0] = metric_info_p->average_dclk1_frequency; + f->current = 0; + f->frequency[0] = std::numeric_limits::max(); + if (metric_info_p->current_dclk1 != std::numeric_limits::max()) { + f->frequency[0] = static_cast(metric_info_p->current_dclk1) + * amd::smi::get_multiplier_from_str(unit); // match MHz ROCm SMI provides + f->num_supported = 1; + } } return r_status;