From 4298cbb400ff38950c3c9541ffd409e57e4a9a0a Mon Sep 17 00:00:00 2001 From: Ori Messinger Date: Fri, 8 Apr 2022 12:48:42 -0400 Subject: [PATCH 1/3] ROCm SMI CLI: Fix setPowerOverdrive resetPowerOverdrive Bugs Fixes bug in the 'setPowerOverdrive' function which mishandles GPUs with secondary dies. Secondary dies have a default power cap of 0W and cannot be changed, so they are now skipped. Fixes bug in the 'resetPowerOverdrive' function which incorrectly resets the wattage to the current value. Signed-off-by: Ori Messinger Change-Id: I483fa3f58b1fa44a3bf7bae3b52c59ce523ae152 --- python_smi_tools/rocm_smi.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/python_smi_tools/rocm_smi.py b/python_smi_tools/rocm_smi.py index a85d65f630..fa8fa8d655 100755 --- a/python_smi_tools/rocm_smi.py +++ b/python_smi_tools/rocm_smi.py @@ -1170,6 +1170,10 @@ def setPowerOverDrive(deviceList, value, autoRespond): strValue = value specWarningConfirmed = False for device in deviceList: + # Continue to next device in deviceList loop if the device is a secondary die + if checkIfSecondaryDie(device): + logging.debug("Unavailable for secondary die.") + continue power_cap_min = c_uint64() power_cap_max = c_uint64() current_power_cap = c_uint64() @@ -1195,17 +1199,17 @@ def setPowerOverDrive(deviceList, value, autoRespond): if rsmi_ret_ok(ret, device) == False: printErrLog(device, 'Unable to parse Power OverDrive range') RETCODE = 1 - return + continue if int(strValue) > (power_cap_max.value / 1000000): printErrLog(device, 'Unable to set Power OverDrive') logging.error('GPU[%s]\t\t: Value cannot be greater than: %dW ', device, power_cap_max.value / 1000000) RETCODE = 1 - return + continue if int(strValue) < (power_cap_min.value / 1000000): printErrLog(device, 'Unable to set Power OverDrive') logging.error('GPU[%s]\t\t: Value cannot be less than: %dW ', device, power_cap_min.value / 1000000) RETCODE = 1 - return + continue if new_power_cap.value == 
current_power_cap.value: printErrLog(device,'Max power was already at: {}W'.format(new_power_cap.value / 1000000)) From 786f66671a8a0118e12078c6855a71cf25920dde Mon Sep 17 00:00:00 2001 From: Ori Messinger Date: Thu, 19 May 2022 03:44:32 -0400 Subject: [PATCH 2/3] ROCm SMI CLI: Fix --showvoltagerange bug This patch fixes a --showvoltagerange bug in which the tool attempts to check the voltage curve on a device that does not have any voltage regions in its OverDrive voltage frequency data (odvf). Signed-off-by: Ori Messinger Change-Id: I647c30c978ffb13f6819ac3d069ee340710a7f99 --- python_smi_tools/rocm_smi.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python_smi_tools/rocm_smi.py b/python_smi_tools/rocm_smi.py index fa8fa8d655..fac0c99153 100755 --- a/python_smi_tools/rocm_smi.py +++ b/python_smi_tools/rocm_smi.py @@ -2061,6 +2061,9 @@ def showRange(deviceList, rangeType): printLog(device, 'Valid mclk range: %sMhz - %sMhz' % ( int(odvf.curr_mclk_range.lower_bound / 1000000), int(odvf.curr_mclk_range.upper_bound / 1000000)), None) if rangeType == 'voltage': + if odvf.num_regions == 0: + printErrLog(device, 'Voltage curve regions unsupported.') + continue num_regions = c_uint32(odvf.num_regions) regions = (rsmi_freq_volt_region_t * odvf.num_regions)() ret = rocmsmi.rsmi_dev_od_volt_curve_regions_get(device, byref(num_regions), byref(regions)) From 44ea49eb013b6daf07585e7a5b532ee7561b9627 Mon Sep 17 00:00:00 2001 From: Elena Sakhnovitch Date: Thu, 19 May 2022 15:26:21 -0400 Subject: [PATCH 3/3] [rocm_smi.py]: shownodesbw fix for non xgmi Improve error output for non-xgmi nodes bandwidth Signed-off-by: Elena Sakhnovitch Change-Id: I833970d3200a75c7639d33bf19e0e83afe176c8d --- python_smi_tools/rocm_smi.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/python_smi_tools/rocm_smi.py b/python_smi_tools/rocm_smi.py index fac0c99153..2224edd8ec 100755 --- a/python_smi_tools/rocm_smi.py +++ b/python_smi_tools/rocm_smi.py @@ -2548,13 
+2548,25 @@ def showNodesBw(deviceList): devices_ind = range(len(deviceList)) minBW = c_uint32() maxBW = c_uint32() + hops = c_uint64() + linktype = c_uint64() + silent = False + nonXgmi = False gpu_links_type = [[0 for x in devices_ind] for y in devices_ind] printLogSpacer(' Bandwidth ') for srcdevice in deviceList: for destdevice in deviceList: if srcdevice != destdevice: ret = rocmsmi.rsmi_minmax_bandwidth_get(srcdevice, destdevice, byref(minBW), byref(maxBW)) - if rsmi_ret_ok(ret, " {} to {}".format(srcdevice, destdevice),None ): + #verify that link type is xgmi + ret2 = rocmsmi.rsmi_topo_get_link_type(srcdevice, destdevice, byref(hops), byref(linktype)) + if rsmi_ret_ok(ret2," {} to {}".format(srcdevice, destdevice), None, True): + if linktype.value != 2: + nonXgmi = True + silent= True + gpu_links_type[srcdevice][destdevice] = "N/A" + + if rsmi_ret_ok(ret, " {} to {}".format(srcdevice, destdevice),None,silent): gpu_links_type[srcdevice][destdevice] = "{}-{}".format(minBW.value, maxBW.value) else: gpu_links_type[srcdevice][destdevice] = "N/A" @@ -2573,8 +2585,9 @@ def showNodesBw(deviceList): printTableRow('%-12s', gpu_links_type[gpu1][gpu2]) printEmptyLine() printLog(None,"Format: min-max; Units: mps", None) - printLog(None,'"0-0" min-max bandwidth indicates devices are not connected dirrectly', None) - + printLog(None,'"0-0" min-max bandwidth indicates devices are not connected directly', None) + if nonXgmi: + printLog(None,"Non-xGMI links detected and is currently not supported", None) def checkAmdGpus(deviceList): """ Check if there are any AMD GPUs being queried,