From 2e3cc74c795f23bdc22e1adb2b08faee83dad9e4 Mon Sep 17 00:00:00 2001 From: Charis Poag Date: Fri, 2 Feb 2024 00:00:38 -0600 Subject: [PATCH 1/3] [SWDEV-437365] Fix --showpower Updates: - [CLI] Switching to use generic rsmi_dev_power_get() this is a backwards compatible function to retrieve power values. More consistent than previous fixes. - [API] Update API for rsmi_dev_power_get() Now provides @deprecated for this function. Providing notes on newer ASICs only support current socket power, whereas previous ASICs only provided average power. Change-Id: I34da0e925cf0b6c669bdd801b017f33f3b3ee86a Signed-off-by: Charis Poag [ROCm/rocm_smi_lib commit: 51aec98edd34187cc6d42bbf7f6ea099257ea0b6] --- .../rocm-smi-lib/include/rocm_smi/rocm_smi.h | 10 ++- .../rocm-smi-lib/python_smi_tools/rocm_smi.py | 82 ++++++++++--------- .../python_smi_tools/rsmiBindings.py.in | 10 +++ 3 files changed, 64 insertions(+), 38 deletions(-) diff --git a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h index c916bccfd9..da3f75add1 100755 --- a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h +++ b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h @@ -1818,6 +1818,11 @@ rsmi_status_t rsmi_dev_pci_bandwidth_set(uint32_t dv_ind, uint64_t bw_bitmask); * @p power, this function will write the current average power consumption * (in microwatts) to the uint64_t pointed to by @p power. * + * @deprecated ::rsmi_dev_power_get() is preferred due to providing + * backwards compatibility, which looks at both average and current power + * values, whereas ::rsmi_dev_power_ave_get() only looks for average power + * consumption. Newer ASICs will support current power only. + * * @param[in] dv_ind a device index * * @param[in] sensor_ind a 0-based sensor index. Normally, this will be 0. @@ -1886,7 +1891,10 @@ rsmi_dev_current_socket_power_get(uint32_t dv_ind, uint64_t *socket_power); * @param[inout] type a pointer to RSMI_POWER_TYPE object.
def getPower(device):
    """ Return dictionary of power responses.

    @param device: DRM device identifier

    Response power dictionary:
        {
            'power': string wattage response or 'N/A' (for not RSMI_STATUS_SUCCESS),
            'power_type': power type string - 'CURRENT SOCKET', 'AVERAGE' or
                          'INVALID_POWER_TYPE' ('N/A' for not RSMI_STATUS_SUCCESS),
            'unit': W (Watt)
            'ret': response of rsmi_dev_power_get(device, byref(power), byref(power_type))
        }
    """
    # The library reports power in microwatts through an unsigned 64-bit
    # out-parameter, so use the matching unsigned ctypes type.
    power = c_uint64(0)
    power_type = rsmi_power_type_t()
    power_ret_dict = {
        'power': "N/A",
        'power_type': "N/A",
        'unit': 'W',
        'ret': rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED
    }
    ret = rocmsmi.rsmi_dev_power_get(device, byref(power), byref(power_type))
    power_ret_dict['ret'] = ret
    if ret == rsmi_status_t.RSMI_STATUS_SUCCESS:
        # Microwatts -> watts.
        power_ret_dict['power'] = str(power.value / 1000000)
        # rsmi_power_type_t is a signed c_int, so the invalid marker
        # (0xFFFFFFFF) can read back as -1; fall back to the invalid label
        # instead of raising KeyError on any unmapped value.
        power_ret_dict['power_type'] = rsmi_power_type_dict.get(
            power_type.value, 'INVALID_POWER_TYPE')
    return power_ret_dict
class rsmi_power_type_t(c_int):
    """ ctypes mirror of the library's power type enumeration.

    The class attributes are plain integers (no trailing commas, which
    would silently turn them into one-element tuples).
    """
    RSMI_AVERAGE_POWER = 0
    RSMI_CURRENT_POWER = 1
    RSMI_INVALID_POWER = 0xFFFFFFFF


# Maps a power type value to the label used by the CLI.
# Because rsmi_power_type_t is a signed c_int, an invalid type written by the
# library as 0xFFFFFFFF reads back through `.value` as -1, so both spellings
# map to the invalid label.
rsmi_power_type_dict = {
    0: 'AVERAGE',
    1: 'CURRENT SOCKET',
    0xFFFFFFFF: 'INVALID_POWER_TYPE',
    -1: 'INVALID_POWER_TYPE',
}
I741a2760283da8c21b95e5b516f78e39a9d9a0a1 Signed-off-by: Galantsev, Dmitrii [ROCm/rocm_smi_lib commit: 1015cba4895f3d32d85e32be8a809af6eea83672] --- projects/rocm-smi-lib/lychee.toml | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 projects/rocm-smi-lib/lychee.toml diff --git a/projects/rocm-smi-lib/lychee.toml b/projects/rocm-smi-lib/lychee.toml new file mode 100644 index 0000000000..25193c0706 --- /dev/null +++ b/projects/rocm-smi-lib/lychee.toml @@ -0,0 +1,2 @@ +exclude = ['^file://.*', '.*localhost.*'] +exclude_path = ["./build"] From 194f98b8093803cc990040f8d4e48f5fe16b5cb2 Mon Sep 17 00:00:00 2001 From: "Bill(Shuzhou) Liu" Date: Fri, 9 Feb 2024 09:22:51 -0600 Subject: [PATCH 3/3] Support set min or max clock In addition to being able to set clock range, new setextremum option is added to set only min/max clock as sometimes one of them may not be supported. Change-Id: I7c91ba308f3fc6c78efc88117509c515d403a6cb [ROCm/rocm_smi_lib commit: 4e0a7f2f67695d1ca44bc9361dfdf5d80dc832ea] --- .../rocm-smi-lib/include/rocm_smi/rocm_smi.h | 25 +++++++++ .../rocm-smi-lib/python_smi_tools/rocm_smi.py | 50 +++++++++++++++++- projects/rocm-smi-lib/src/rocm_smi.cc | 52 +++++++++++++++++++ 3 files changed, 126 insertions(+), 1 deletion(-) diff --git a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h index da3f75add1..28f0bd795b 100755 --- a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h +++ b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h @@ -2761,6 +2761,31 @@ rsmi_status_t rsmi_dev_clk_range_set(uint32_t dv_ind, uint64_t minclkvalue, uint64_t maxclkvalue, rsmi_clk_type_t clkType); +/** + * @brief This function sets the clock min/max level + * + * @details Given a device index @p dv_ind, a clock level @p level, + * a clock value @p clkvalue and a clock type @p clkType this function + * will set only the minimum or only the maximum sclk|mclk value + * + * @param[in] dv_ind a device index + * + * @param[in] level
def setClockExtremum(deviceList, level, clkType, clkValue, autoRespond):
    """ Set only the minimum or only the maximum clock of the specified type for a list of devices.

    Unlike setClockRange, which sets both ends of the range at once, this
    changes a single end (min or max) and leaves the other one untouched.

    Parameters:
    deviceList -- List of DRM devices (can be a single-item list)
    level -- [min|max] Minimum value or Maximum value
    clkType -- [sclk|mclk] Which clock type to apply the value to
    clkValue -- clock value (MHz) to apply to the level; must be a non-negative integer
    autoRespond -- Response to automatically provide for all prompts

    On any validation or device failure the global RETCODE is set to 1.
    """
    global RETCODE
    if level not in {'min', 'max'}:
        printLog(None, 'Invalid extremum identifier %s, use min or max' % (level), None)
        logging.error('Unsupported clock extremum %s', level)
        RETCODE = 1
        return

    if clkType not in {'sclk', 'mclk'}:
        printLog(None, 'Invalid clock type identifier %s, use sclk or mclk ' % (clkType), None)
        logging.error('Unsupported clock type %s', clkType)
        RETCODE = 1
        return

    # Validate and convert the clock value once, up front.
    try:
        clkMhz = int(clkValue)
    except ValueError:
        printErrLog(None, 'Unable to set %s' % (clkValue))
        logging.error('%s is not an integer', clkValue)
        RETCODE = 1
        return
    # The library takes the clock as an unsigned value; a negative number
    # would wrap around instead of being rejected, so refuse it here.
    if clkMhz < 0:
        printErrLog(None, 'Unable to set %s' % (clkValue))
        logging.error('%s must not be negative', clkValue)
        RETCODE = 1
        return

    # Index 0 selects the minimum, index 1 the maximum.
    point = 1 if level == 'max' else 0

    confirmOutOfSpecWarning(autoRespond)
    printLogSpacer(' Set Valid %s Extremum ' % (clkType))
    for device in deviceList:
        ret = rocmsmi.rsmi_dev_clk_extremum_set(device, rsmi_freq_ind_t(point), clkMhz,
                                                rsmi_clk_names_dict[clkType])
        if rsmi_ret_ok(ret, device, silent=True):
            printLog(device, 'Successfully set %s %s to %s(MHz)' % (level, clkType, clkValue), None)
        else:
            printErrLog(device, 'Unable to set %s %s to %s(MHz)' % (level, clkType, clkValue))
            RETCODE = 1
            if ret == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED:
                printLog(device, 'Setting %s %s clock is not supported for this device.' % (level, clkType), None)
@@ -3786,6 +3831,7 @@ if __name__ == '__main__': groupAction.add_argument('--setvc', help='Change SCLK Voltage Curve (MHz mV) for a specific point', metavar=('POINT', 'SCLK', 'SVOLT'), nargs=3) groupAction.add_argument('--setsrange', help='Set min and max SCLK speed', metavar=('SCLKMIN', 'SCLKMAX'), nargs=2) + groupAction.add_argument('--setextremum', help='Set min/max of SCLK/MCLK speed', metavar=('min|max', "sclk|mclk", 'CLK'), nargs=3) groupAction.add_argument('--setmrange', help='Set min and max MCLK speed', metavar=('MCLKMIN', 'MCLKMAX'), nargs=2) groupAction.add_argument('--setfan', help='Set GPU Fan Speed (Level or %%)', metavar='LEVEL') groupAction.add_argument('--setperflevel', help='Set Performance Level', metavar='LEVEL') @@ -3855,7 +3901,7 @@ if __name__ == '__main__': or args.resetclocks or args.setprofile or args.resetprofile or args.setoverdrive or args.setmemoverdrive \ or args.setpoweroverdrive or args.resetpoweroverdrive or args.rasenable or args.rasdisable or \ args.rasinject or args.gpureset or args.setperfdeterminism or args.setslevel or args.setmlevel or \ - args.setvc or args.setsrange or args.setmrange or args.setclock or \ + args.setvc or args.setsrange or args.setextremum or args.setmrange or args.setclock or \ args.setcomputepartition or args.setmemorypartition or args.resetcomputepartition or args.resetmemorypartition: relaunchAsSudo() @@ -4082,6 +4128,8 @@ if __name__ == '__main__': setProfile(deviceList, args.setprofile) if args.setvc: setVoltageCurve(deviceList, args.setvc[0], args.setvc[1], args.setvc[2], args.autorespond) + if args.setextremum: + setClockExtremum(deviceList, args.setextremum[0], args.setextremum[1], args.setextremum[2], args.autorespond) if args.setsrange: setClockRange(deviceList, 'sclk', args.setsrange[0], args.setsrange[1], args.autorespond) if args.setmrange: diff --git a/projects/rocm-smi-lib/src/rocm_smi.cc b/projects/rocm-smi-lib/src/rocm_smi.cc index 29a535c509..200466947c 100755 --- 
a/projects/rocm-smi-lib/src/rocm_smi.cc +++ b/projects/rocm-smi-lib/src/rocm_smi.cc @@ -1377,6 +1377,58 @@ static rsmi_status_t get_od_clk_volt_info(uint32_t dv_ind, CATCH } +rsmi_status_t rsmi_dev_clk_extremum_set(uint32_t dv_ind, rsmi_freq_ind_t level, + uint64_t clkvalue, + rsmi_clk_type_t clkType) { + TRY + rsmi_status_t ret; + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ss); + + if (clkType != RSMI_CLK_TYPE_SYS && clkType != RSMI_CLK_TYPE_MEM) { + return RSMI_STATUS_INVALID_ARGS; + } + if (level != RSMI_FREQ_IND_MIN && level != RSMI_FREQ_IND_MAX) { + return RSMI_STATUS_INVALID_ARGS; + } + + std::map clk_char_map = { + {RSMI_CLK_TYPE_SYS, "s"}, + {RSMI_CLK_TYPE_MEM, "m"}, + }; + DEVICE_MUTEX + + // Set perf. level to manual so that we can then set the power profile + ret = rsmi_dev_perf_level_set_v1(dv_ind, RSMI_DEV_PERF_LEVEL_MANUAL); + if (ret != RSMI_STATUS_SUCCESS) { + return ret; + } + + // For clock frequency setting, enter a new value by writing a string that + // contains "s/m index clock" to the file. The index should be 0 if to set + // minimum clock. And 1 if to set maximum clock. E.g., "s 0 500" will update + // minimum sclk to be 500 MHz. "m 1 800" will update maximum mclk to 800Mhz. + + std::string sysvalue = clk_char_map[clkType]; + sysvalue += ' ' + std::to_string(level); + sysvalue += ' ' + std::to_string(clkvalue); + sysvalue += '\n'; + + ret = set_dev_range(dv_ind, sysvalue); + if (ret != RSMI_STATUS_SUCCESS) { + return ret; + } + + ret = set_dev_range(dv_ind, "c"); + if (ret != RSMI_STATUS_SUCCESS) { + return ret; + } + + return RSMI_STATUS_SUCCESS; + CATCH +} + rsmi_status_t rsmi_dev_clk_range_set(uint32_t dv_ind, uint64_t minclkvalue, uint64_t maxclkvalue, rsmi_clk_type_t clkType) {