diff --git a/include/rocm_smi/rocm_smi.h b/include/rocm_smi/rocm_smi.h index 97b9bcd019..b57e7dfd1d 100755 --- a/include/rocm_smi/rocm_smi.h +++ b/include/rocm_smi/rocm_smi.h @@ -2269,11 +2269,14 @@ rsmi_status_t rsmi_dev_overdrive_level_get(uint32_t dv_ind, uint32_t *od); * * @param[inout] f a pointer to a caller provided ::rsmi_frequencies_t structure * to which the frequency information will be written. Frequency values are in - * Hz. + * Hz. * If this parameter is nullptr, this function will return * ::RSMI_STATUS_INVALID_ARGS if the function is supported with the provided, * arguments and ::RSMI_STATUS_NOT_SUPPORTED if it is not supported with the * provided arguments. + * If multiple current frequencies are found, a warning is shown. If no + * current frequency is found, it is reflected as -1. If frequencies are not + * read from low to high a warning is shown as well. * * @retval ::RSMI_STATUS_SUCCESS call was successful * @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not diff --git a/python_smi_tools/rocm_smi.py b/python_smi_tools/rocm_smi.py index 5070994789..a85d65f630 100755 --- a/python_smi_tools/rocm_smi.py +++ b/python_smi_tools/rocm_smi.py @@ -1453,6 +1453,9 @@ def showCurrentClocks(deviceList, clk_defined=None, concise=False): ret = rocmsmi.rsmi_dev_gpu_clk_freq_get(device, rsmi_clk_names_dict[clk_defined], byref(freq)) if rsmi_ret_ok(ret, device, clk_defined, True): levl = freq.current + if levl >= freq.num_supported: + printLog(device, '%s current clock frequency not found' % (clk_defined), None) + continue fr = freq.frequency[levl] / 1000000 if concise: # in case function is used for concise output, no need to print. 
return '{:.0f}Mhz'.format(fr) @@ -1466,6 +1469,9 @@ def showCurrentClocks(deviceList, clk_defined=None, concise=False): ret = rocmsmi.rsmi_dev_gpu_clk_freq_get(device, rsmi_clk_names_dict[clk_type], byref(freq)) if rsmi_ret_ok(ret, device, clk_type, True): levl = freq.current + if levl >= freq.num_supported: + printLog(device, '%s current clock frequency not found' % (clk_type), None) + continue fr = freq.frequency[levl] / 1000000 if PRINT_JSON: printLog(device, '%s clock speed:' % (clk_type), '(%sMhz)' % (str(fr)[:-2])) @@ -1479,6 +1485,9 @@ def showCurrentClocks(deviceList, clk_defined=None, concise=False): ret = rocmsmi.rsmi_dev_pci_bandwidth_get(device, byref(bw)) if rsmi_ret_ok(ret, device, 'PCIe', True): current_f = bw.transfer_rate.current + if current_f >= bw.transfer_rate.num_supported: + printLog(device, 'PCIe current clock frequency not found', None ) + continue fr = '{:.1f}GT/s x{}'.format(bw.transfer_rate.frequency[current_f] / 1000000000, bw.lanes[current_f]) printLog(device, 'pcie clock level', '{} ({})'.format(current_f, fr)) diff --git a/src/rocm_smi.cc b/src/rocm_smi.cc index c8e521aac0..b8b2d0b7aa 100755 --- a/src/rocm_smi.cc +++ b/src/rocm_smi.cc @@ -77,6 +77,15 @@ static const uint32_t kMaxOverdriveLevel = 20; static const float kEnergyCounterResolution = 15.3f; +std::map<rsmi_clk_type_t, std::string> ClkStateMap = { + {RSMI_CLK_TYPE_SYS, "SCLK"}, + {RSMI_CLK_TYPE_DF, "DFCLK"}, + {RSMI_CLK_TYPE_DCEF, "DCEFCLK"}, + {RSMI_CLK_TYPE_SOC, "SOCCLK"}, + {RSMI_CLK_TYPE_MEM, "MCLK"}, + {RSMI_CLK_TYPE_PCIE, "PCIECLK"}, + }; + #define TRY try { #define CATCH } catch (...) 
{return amd::smi::handleException();} @@ -895,7 +904,7 @@ rsmi_dev_perf_level_set_v1(uint32_t dv_ind, rsmi_dev_perf_level_t perf_level) { } -static rsmi_status_t get_frequencies(amd::smi::DevInfoTypes type, +static rsmi_status_t get_frequencies(amd::smi::DevInfoTypes type, rsmi_clk_type_t clk_type, uint32_t dv_ind, rsmi_frequencies_t *f, uint32_t *lanes = nullptr) { TRY std::vector<std::string> val_vec; @@ -925,19 +934,34 @@ static rsmi_status_t get_frequencies(amd::smi::DevInfoTypes type, // Our assumption is that frequencies are read in from lowest to highest. // Check that that is true. if (i > 0) { - assert(f->frequency[i-1] <= f->frequency[i]); + if (f->frequency[i] < f->frequency[i-1]) { + std::string sysvalue = ClkStateMap[clk_type]; + sysvalue += " Current Value"; + sysvalue += ' ' + std::to_string(f->frequency[i]); + sysvalue += " Previous Value"; + sysvalue += ' ' + std::to_string(f->frequency[i-1]); + DEBUG_LOG("Frequencies are not read from lowest to highest. ", sysvalue); + } } if (current) { - // Should only be 1 current frequency - assert(f->current == RSMI_MAX_NUM_FREQUENCIES + 1); - f->current = i; + // set the current frequency + if (f->current != RSMI_MAX_NUM_FREQUENCIES + 1) { + std::string sysvalue = ClkStateMap[clk_type]; + sysvalue += " Current Value"; + sysvalue += ' ' + std::to_string(f->frequency[i]); + sysvalue += " Previous Value"; + sysvalue += ' ' + std::to_string(f->frequency[f->current]); + DEBUG_LOG("More than one current clock. 
", sysvalue); + } + else + f->current = i; } } // Some older drivers will not have the current frequency set // assert(f->current < f->num_supported); if (f->current >= f->num_supported) { - return RSMI_STATUS_NOT_SUPPORTED; + f->current = -1; } return RSMI_STATUS_SUCCESS; @@ -1444,7 +1468,7 @@ rsmi_dev_gpu_clk_freq_get(uint32_t dv_ind, rsmi_clk_type_t clk_type, DEVICE_MUTEX - return get_frequencies(dev_type, dv_ind, f); + return get_frequencies(dev_type, clk_type, dv_ind, f); CATCH } @@ -2006,7 +2030,7 @@ rsmi_dev_pci_bandwidth_get(uint32_t dv_ind, rsmi_pcie_bandwidth_t *b) { DEVICE_MUTEX - return get_frequencies(amd::smi::kDevPCIEClk, dv_ind, + return get_frequencies(amd::smi::kDevPCIEClk, RSMI_CLK_TYPE_PCIE, dv_ind, &b->transfer_rate, b->lanes); CATCH