From cf61df76ad71473a59fabd30b1f21fb9cd9974be Mon Sep 17 00:00:00 2001 From: Ori Messinger Date: Fri, 8 Apr 2022 12:48:42 -0400 Subject: [PATCH] ROCm SMI CLI: Fix setPowerOverdrive resetPowerOverdrive Bugs Fixes bug in the 'setPowerOverdrive' function which mishandles GPUs with secondary dies. Secondary dies have a default power cap of 0W and cannot be changed, so they are now skipped. Fixes bug in the 'resetPowerOverdrive' function which incorrectly resets the wattage to the current value. Signed-off-by: Ori Messinger Change-Id: I483fa3f58b1fa44a3bf7bae3b52c59ce523ae152 [ROCm/amdsmi commit: 4298cbb400ff38950c3c9541ffd409e57e4a9a0a] --- projects/amdsmi/python_smi_tools/rocm_smi.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/projects/amdsmi/python_smi_tools/rocm_smi.py b/projects/amdsmi/python_smi_tools/rocm_smi.py index a85d65f630..fa8fa8d655 100755 --- a/projects/amdsmi/python_smi_tools/rocm_smi.py +++ b/projects/amdsmi/python_smi_tools/rocm_smi.py @@ -1170,6 +1170,10 @@ def setPowerOverDrive(deviceList, value, autoRespond): strValue = value specWarningConfirmed = False for device in deviceList: + # Continue to next device in deviceList loop if the device is a secondary die + if checkIfSecondaryDie(device): + logging.debug("Unavailable for secondary die.") + continue power_cap_min = c_uint64() power_cap_max = c_uint64() current_power_cap = c_uint64() @@ -1195,17 +1199,17 @@ def setPowerOverDrive(deviceList, value, autoRespond): if rsmi_ret_ok(ret, device) == False: printErrLog(device, 'Unable to parse Power OverDrive range') RETCODE = 1 - return + continue if int(strValue) > (power_cap_max.value / 1000000): printErrLog(device, 'Unable to set Power OverDrive') logging.error('GPU[%s]\t\t: Value cannot be greater than: %dW ', device, power_cap_max.value / 1000000) RETCODE = 1 - return + continue if int(strValue) < (power_cap_min.value / 1000000): printErrLog(device, 'Unable to set Power OverDrive') logging.error('GPU[%s]\t\t: Value 
cannot be less than: %dW ', device, power_cap_min.value / 1000000) RETCODE = 1 - return + continue if new_power_cap.value == current_power_cap.value: printErrLog(device,'Max power was already at: {}W'.format(new_power_cap.value / 1000000))