From f4e33b90c993d38eb62d029b7a793f86deecf91c Mon Sep 17 00:00:00 2001 From: Divya Shikre Date: Tue, 3 May 2022 18:41:45 -0400 Subject: [PATCH] Update get_frequencies to handle failures. Show an optional debug log (RSMI_DEBUG_BITFIELD=2) to the user in the following scenarios: 1. If more than one current frequency is found 2. If frequencies are not read in increasing order of their value If the current frequency is not available, its index is set to -1 and no value is marked with * in the output. This will also be handled in rocm_smi.py. Signed-off-by: Divya Shikre Change-Id: I477ec065f7513c8045d6392f12ef6cb835a6b8f6 [ROCm/amdsmi commit: afe996c2edb791ee3f7a7bdb4803da8a886fc016] --- projects/amdsmi/include/rocm_smi/rocm_smi.h | 5 ++- projects/amdsmi/python_smi_tools/rocm_smi.py | 9 +++++ projects/amdsmi/src/rocm_smi.cc | 40 ++++++++++++++++---- 3 files changed, 45 insertions(+), 9 deletions(-) diff --git a/projects/amdsmi/include/rocm_smi/rocm_smi.h b/projects/amdsmi/include/rocm_smi/rocm_smi.h index 97b9bcd019..b57e7dfd1d 100755 --- a/projects/amdsmi/include/rocm_smi/rocm_smi.h +++ b/projects/amdsmi/include/rocm_smi/rocm_smi.h @@ -2269,11 +2269,14 @@ rsmi_status_t rsmi_dev_overdrive_level_get(uint32_t dv_ind, uint32_t *od); * * @param[inout] f a pointer to a caller provided ::rsmi_frequencies_t structure * to which the frequency information will be written. Frequency values are in - * Hz. + * Hz. * If this parameter is nullptr, this function will return * ::RSMI_STATUS_INVALID_ARGS if the function is supported with the provided, * arguments and ::RSMI_STATUS_NOT_SUPPORTED if it is not supported with the * provided arguments. + * If multiple current frequencies are found, a warning is shown. If no + * current frequency is found, it is reflected as -1. If frequencies are not + * read from low to high a warning is shown as well. 
* * @retval ::RSMI_STATUS_SUCCESS call was successful * @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not diff --git a/projects/amdsmi/python_smi_tools/rocm_smi.py b/projects/amdsmi/python_smi_tools/rocm_smi.py index 5070994789..a85d65f630 100755 --- a/projects/amdsmi/python_smi_tools/rocm_smi.py +++ b/projects/amdsmi/python_smi_tools/rocm_smi.py @@ -1453,6 +1453,9 @@ def showCurrentClocks(deviceList, clk_defined=None, concise=False): ret = rocmsmi.rsmi_dev_gpu_clk_freq_get(device, rsmi_clk_names_dict[clk_defined], byref(freq)) if rsmi_ret_ok(ret, device, clk_defined, True): levl = freq.current + if levl >= freq.num_supported: + printLog(device, '%s current clock frequency not found' % (clk_defined), None) + continue fr = freq.frequency[levl] / 1000000 if concise: # in case function is used for concise output, no need to print. return '{:.0f}Mhz'.format(fr) @@ -1466,6 +1469,9 @@ def showCurrentClocks(deviceList, clk_defined=None, concise=False): ret = rocmsmi.rsmi_dev_gpu_clk_freq_get(device, rsmi_clk_names_dict[clk_type], byref(freq)) if rsmi_ret_ok(ret, device, clk_type, True): levl = freq.current + if levl >= freq.num_supported: + printLog(device, '%s current clock frequency not found' % (clk_type), None) + continue fr = freq.frequency[levl] / 1000000 if PRINT_JSON: printLog(device, '%s clock speed:' % (clk_type), '(%sMhz)' % (str(fr)[:-2])) @@ -1479,6 +1485,9 @@ def showCurrentClocks(deviceList, clk_defined=None, concise=False): ret = rocmsmi.rsmi_dev_pci_bandwidth_get(device, byref(bw)) if rsmi_ret_ok(ret, device, 'PCIe', True): current_f = bw.transfer_rate.current + if current_f >= bw.transfer_rate.num_supported: + printLog(device, 'PCIe current clock frequency not found', None ) + continue fr = '{:.1f}GT/s x{}'.format(bw.transfer_rate.frequency[current_f] / 1000000000, bw.lanes[current_f]) printLog(device, 'pcie clock level', '{} ({})'.format(current_f, fr)) diff --git a/projects/amdsmi/src/rocm_smi.cc 
b/projects/amdsmi/src/rocm_smi.cc index c8e521aac0..b8b2d0b7aa 100755 --- a/projects/amdsmi/src/rocm_smi.cc +++ b/projects/amdsmi/src/rocm_smi.cc @@ -77,6 +77,15 @@ static const uint32_t kMaxOverdriveLevel = 20; static const float kEnergyCounterResolution = 15.3f; +std::map ClkStateMap = { + {RSMI_CLK_TYPE_SYS, "SCLK"}, + {RSMI_CLK_TYPE_DF, "DFCLK"}, + {RSMI_CLK_TYPE_DCEF, "DCEFCLK"}, + {RSMI_CLK_TYPE_SOC, "SOCCLK"}, + {RSMI_CLK_TYPE_MEM, "MCLK"}, + {RSMI_CLK_TYPE_PCIE, "PCIECLK"}, + }; + #define TRY try { #define CATCH } catch (...) {return amd::smi::handleException();} @@ -895,7 +904,7 @@ rsmi_dev_perf_level_set_v1(uint32_t dv_ind, rsmi_dev_perf_level_t perf_level) { } -static rsmi_status_t get_frequencies(amd::smi::DevInfoTypes type, +static rsmi_status_t get_frequencies(amd::smi::DevInfoTypes type, rsmi_clk_type_t clk_type, uint32_t dv_ind, rsmi_frequencies_t *f, uint32_t *lanes = nullptr) { TRY std::vector val_vec; @@ -925,19 +934,34 @@ static rsmi_status_t get_frequencies(amd::smi::DevInfoTypes type, // Our assumption is that frequencies are read in from lowest to highest. // Check that that is true. if (i > 0) { - assert(f->frequency[i-1] <= f->frequency[i]); + if (f->frequency[i] < f->frequency[i-1]) { + std::string sysvalue = ClkStateMap[clk_type]; + sysvalue += " Current Value"; + sysvalue += ' ' + std::to_string(f->frequency[i]); + sysvalue += " Previous Value"; + sysvalue += ' ' + std::to_string(f->frequency[i-1]); + DEBUG_LOG("Frequencies are not read from lowest to highest. 
", sysvalue); + } } if (current) { - // Should only be 1 current frequency - assert(f->current == RSMI_MAX_NUM_FREQUENCIES + 1); - f->current = i; + // set the current frequency + if (f->current != RSMI_MAX_NUM_FREQUENCIES + 1) { + std::string sysvalue = ClkStateMap[clk_type]; + sysvalue += " Current Value"; + sysvalue += ' ' + std::to_string(f->frequency[i]); + sysvalue += " Previous Value"; + sysvalue += ' ' + std::to_string(f->frequency[f->current]); + DEBUG_LOG("More than one current clock. ", sysvalue); + } + else + f->current = i; } } // Some older drivers will not have the current frequency set // assert(f->current < f->num_supported); if (f->current >= f->num_supported) { - return RSMI_STATUS_NOT_SUPPORTED; + f->current = -1; } return RSMI_STATUS_SUCCESS; @@ -1444,7 +1468,7 @@ rsmi_dev_gpu_clk_freq_get(uint32_t dv_ind, rsmi_clk_type_t clk_type, DEVICE_MUTEX - return get_frequencies(dev_type, dv_ind, f); + return get_frequencies(dev_type, clk_type, dv_ind, f); CATCH } @@ -2006,7 +2030,7 @@ rsmi_dev_pci_bandwidth_get(uint32_t dv_ind, rsmi_pcie_bandwidth_t *b) { DEVICE_MUTEX - return get_frequencies(amd::smi::kDevPCIEClk, dv_ind, + return get_frequencies(amd::smi::kDevPCIEClk, RSMI_CLK_TYPE_PCIE, dv_ind, &b->transfer_rate, b->lanes); CATCH