From 848697c287f2cded5573c381c817cd48e2b1b77e Mon Sep 17 00:00:00 2001 From: Ori Messinger Date: Fri, 18 Dec 2020 07:32:57 -0500 Subject: [PATCH] ROCm SMI Python CLI: Fix --showclkfrq/--showclocks Failure The purpose of this patch is to check if each valid clock is supported on the GPU before attempting to retrieve its value. The valid clocks are: dcefclk, fclk, mclk, pcie, sclk, socclk. This should get rid of the 'one or more commands failed' message when running --showclkfrq or --showclocks on a machine that doesn't support all the possible valid clocks. Signed-off-by: Ori Messinger Change-Id: I1fb10989fc1a36f38b68a23e17e6e600ed0ac85b [ROCm/amdsmi commit: 3b52c895cc50d039af3f9e804d784b0690619a52] --- projects/amdsmi/python_smi_tools/rocm_smi.py | 91 ++++++++++++-------- 1 file changed, 54 insertions(+), 37 deletions(-) diff --git a/projects/amdsmi/python_smi_tools/rocm_smi.py b/projects/amdsmi/python_smi_tools/rocm_smi.py index cfab54bc8b..b03e2f08bf 100755 --- a/projects/amdsmi/python_smi_tools/rocm_smi.py +++ b/projects/amdsmi/python_smi_tools/rocm_smi.py @@ -1293,26 +1293,34 @@ def showClocks(deviceList): for device in deviceList: for clk_type in sorted(rsmi_clk_names_dict): freq_list = [] - ret = rocmsmi.rsmi_dev_gpu_clk_freq_get(device, rsmi_clk_names_dict[clk_type], byref(freq)) - if rsmi_ret_ok(ret, device, clk_type, True): - printLog(device, 'Supported %s frequencies on GPU%s' % (clk_type, str(device)), None) - for x in range(freq.num_supported): - fr = '{:>.0f}Mhz'.format(freq.frequency[x] / 1000000) - if x == freq.current: + if rocmsmi.rsmi_dev_gpu_clk_freq_get(device, rsmi_clk_names_dict[clk_type], None) == 1: + ret = rocmsmi.rsmi_dev_gpu_clk_freq_get(device, rsmi_clk_names_dict[clk_type], byref(freq)) + if rsmi_ret_ok(ret, device, clk_type, True): + printLog(device, 'Supported %s frequencies on GPU%s' % (clk_type, str(device)), None) + for x in range(freq.num_supported): + fr = '{:>.0f}Mhz'.format(freq.frequency[x] / 1000000) + if x == freq.current: + printLog(device, str(x), str(fr) + ' *') + else: + printLog(device, str(x), str(fr)) + printLog(device, '', None) + else: + printErrLog(device, '%s frequency is unsupported' % (clk_type)) + printLog(device, '', None) + if rocmsmi.rsmi_dev_pci_bandwidth_get(device, None) == 1: + ret = rocmsmi.rsmi_dev_pci_bandwidth_get(device, byref(bw)) + if rsmi_ret_ok(ret, device, 'PCIe', True): + printLog(device, 'Supported %s frequencies on GPU%s' % ('PCIe', str(device)), None) + freq_list = [] + for x in range(bw.transfer_rate.num_supported): + fr = '{:>.1f}GT/s x{}'.format(bw.transfer_rate.frequency[x] / 1000000000, bw.lanes[x]) + if x == bw.transfer_rate.current: printLog(device, str(x), str(fr) + ' *') else: printLog(device, str(x), str(fr)) printLog(device, '', None) - ret = rocmsmi.rsmi_dev_pci_bandwidth_get(device, byref(bw)) - if rsmi_ret_ok(ret, device, 'PCIe', True): - printLog(device, 'Supported %s frequencies on GPU%s' % ('PCIe', str(device)), None) - freq_list = [] - for x in range(bw.transfer_rate.num_supported): - fr = '{:>.1f}GT/s x{}'.format(bw.transfer_rate.frequency[x] / 1000000000, bw.lanes[x]) - if x == bw.transfer_rate.current: - printLog(device, str(x), str(fr) + ' *') - else: - printLog(device, str(x), str(fr)) + else: + printErrLog(device, 'PCIe frequency is unsupported') printLog(device, '', None) printLogSpacer(None, '-') # divider between devices for better visibility printLogSpacer() @@ -1333,31 +1341,40 @@ def showCurrentClocks(deviceList, clk_defined=None, concise=False): printLogSpacer(' Current clock frequencies ') for device in deviceList: if clk_defined: - ret = rocmsmi.rsmi_dev_gpu_clk_freq_get(device, rsmi_clk_names_dict[clk_defined], byref(freq)) - if rsmi_ret_ok(ret, device, clk_defined, True): - levl = freq.current - fr = freq.frequency[levl] / 1000000 - if concise: # in case function is used for concise output, no need to print. - return '{:.0f}Mhz'.format(fr) - printLog(device, '{} clock level'.format(clk_defined), '{} ({:.0f}Mhz)'.format(levl, fr)) + if rocmsmi.rsmi_dev_gpu_clk_freq_get(device, rsmi_clk_names_dict[clk_defined], None) == 1: + ret = rocmsmi.rsmi_dev_gpu_clk_freq_get(device, rsmi_clk_names_dict[clk_defined], byref(freq)) + if rsmi_ret_ok(ret, device, clk_defined, True): + levl = freq.current + fr = freq.frequency[levl] / 1000000 + if concise: # in case function is used for concise output, no need to print. + return '{:.0f}Mhz'.format(fr) + printLog(device, '{} clock level'.format(clk_defined), '{} ({:.0f}Mhz)'.format(levl, fr)) + else: + printErrLog(device, '%s clock is unsupported' % (clk_defined)) else: # if clk is not defined, will display all current clk for clk_type in sorted(rsmi_clk_names_dict): - ret = rocmsmi.rsmi_dev_gpu_clk_freq_get(device, rsmi_clk_names_dict[clk_type], byref(freq)) - if rsmi_ret_ok(ret, device, clk_type, True): - levl = freq.current - fr = freq.frequency[levl] / 1000000 - if PRINT_JSON: - printLog(device, '%s clock speed:' % (clk_type), '(%sMhz)' % (str(fr)[:-2])) - printLog(device, '%s clock level:' % (clk_type), levl) - else: - printLog(device, '%s clock level: %s' % (clk_type, levl), '(%sMhz)' % (str(fr)[:-2])) + if rocmsmi.rsmi_dev_gpu_clk_freq_get(device, rsmi_clk_names_dict[clk_type], None) == 1: + ret = rocmsmi.rsmi_dev_gpu_clk_freq_get(device, rsmi_clk_names_dict[clk_type], byref(freq)) + if rsmi_ret_ok(ret, device, clk_type, True): + levl = freq.current + fr = freq.frequency[levl] / 1000000 + if PRINT_JSON: + printLog(device, '%s clock speed:' % (clk_type), '(%sMhz)' % (str(fr)[:-2])) + printLog(device, '%s clock level:' % (clk_type), levl) + else: + printLog(device, '%s clock level: %s' % (clk_type, levl), '(%sMhz)' % (str(fr)[:-2])) + else: + printErrLog(device, '%s clock is unsupported' % (clk_type)) # pcie clocks - ret = rocmsmi.rsmi_dev_pci_bandwidth_get(device, byref(bw)) - if rsmi_ret_ok(ret, device, 'PCIe', True): - current_f = bw.transfer_rate.current - fr = '{:.1f}GT/s x{}'.format(bw.transfer_rate.frequency[current_f] / 1000000000, bw.lanes[current_f]) - printLog(device, 'pcie clock level', '{} ({})'.format(current_f, fr)) + if rocmsmi.rsmi_dev_pci_bandwidth_get(device, None) == 1: + ret = rocmsmi.rsmi_dev_pci_bandwidth_get(device, byref(bw)) + if rsmi_ret_ok(ret, device, 'PCIe', True): + current_f = bw.transfer_rate.current + fr = '{:.1f}GT/s x{}'.format(bw.transfer_rate.frequency[current_f] / 1000000000, bw.lanes[current_f]) + printLog(device, 'pcie clock level', '{} ({})'.format(current_f, fr)) + else: + printErrLog(device, 'PCIe clock is unsupported') printLogSpacer()