diff --git a/projects/amdsmi/CHANGELOG.md b/projects/amdsmi/CHANGELOG.md index eae998bfdb..ad3eba258e 100644 --- a/projects/amdsmi/CHANGELOG.md +++ b/projects/amdsmi/CHANGELOG.md @@ -4,6 +4,76 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/]( ***All information listed below is for reference and subject to change.*** +## amd_smi_lib for ROCm 6.2 (Unreleased) + +### Added + +- **Added `MIN_POWER` to output of `amd-smi static --limit`** +This change helps users identify the range within which they can set the power cap of the GPU. We added this to clarify why a device supports (or does not support) power capping (also known as overdrive). See `amd-smi set -g all --power-cap ` or `amd-smi reset -g all --power-cap`. +```shell +$ amd-smi static --limit +GPU: 0 + LIMIT: + MAX_POWER: 203 W + MIN_POWER: 0 W + SOCKET_POWER: 203 W + SLOWDOWN_EDGE_TEMPERATURE: 100 °C + SLOWDOWN_HOTSPOT_TEMPERATURE: 110 °C + SLOWDOWN_VRAM_TEMPERATURE: 100 °C + SHUTDOWN_EDGE_TEMPERATURE: 105 °C + SHUTDOWN_HOTSPOT_TEMPERATURE: 115 °C + SHUTDOWN_VRAM_TEMPERATURE: 105 °C + +GPU: 1 + LIMIT: + MAX_POWER: 213 W + MIN_POWER: 213 W + SOCKET_POWER: 213 W + SLOWDOWN_EDGE_TEMPERATURE: 109 °C + SLOWDOWN_HOTSPOT_TEMPERATURE: 110 °C + SLOWDOWN_VRAM_TEMPERATURE: 100 °C + SHUTDOWN_EDGE_TEMPERATURE: 114 °C + SHUTDOWN_HOTSPOT_TEMPERATURE: 115 °C + SHUTDOWN_VRAM_TEMPERATURE: 105 °C +``` + +### Changed + +- **`amdsmi_get_power_cap_info` now returns values in uW instead of W** +`amdsmi_get_power_cap_info` now returns values in uW, as originally reported by the driver. Previously `amdsmi_get_power_cap_info` returned values in W, which conflicted with our set calls and modified the values retrieved from the driver. We decided to keep the values returned from the driver untouched (in original units, uW). The CLI then converts them to watts (as previously done - no changes here). 
Additionally, the driver changed the min power cap reported for devices when overdrive is disabled, which prompted this change (in this case min_power_cap and max_power_cap are the same). + +### Optimizations + +- N/A + +### Fixed +- **Fixed `amd-smi metric --power` so that it now provides power output for Navi2x/Navi3x/MI1x** +These systems use an older version of gpu_metrics in amdgpu. This fix only updates what the CLI outputs. +No change in any of our APIs. +```shell +$ amd-smi metric --power +GPU: 0 + POWER: + SOCKET_POWER: 11 W + GFX_VOLTAGE: 768 mV + SOC_VOLTAGE: 925 mV + MEM_VOLTAGE: 1250 mV + POWER_MANAGEMENT: ENABLED + THROTTLE_STATUS: UNTHROTTLED + +GPU: 1 + POWER: + SOCKET_POWER: 17 W + GFX_VOLTAGE: 781 mV + SOC_VOLTAGE: 806 mV + MEM_VOLTAGE: 1250 mV + POWER_MANAGEMENT: ENABLED + THROTTLE_STATUS: UNTHROTTLED +``` +- **Fixed `amdsmitstReadWrite.TestPowerCapReadWrite` test for Navi3X, Navi2X, MI100** +This fix required `amdsmi_get_power_cap_info` to return values in uW, as originally reported by the driver. Previously `amdsmi_get_power_cap_info` returned values in W, which conflicted with our set calls and modified the values retrieved from the driver. We decided to keep the values returned from the driver untouched (in original units, uW). The CLI then converts them to watts (as previously done - no changes here). Additionally, the driver changed the min power cap reported for devices when overdrive is disabled, which prompted this change (in this case min_power_cap and max_power_cap are the same). 
+ + ## amd_smi_lib for ROCm 6.1.2 ### Added diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py index 54539ddd66..6cf63f9b92 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py @@ -414,7 +414,11 @@ class AMDSMICommands(): power_limit_error = False power_cap_info = amdsmi_interface.amdsmi_get_power_cap_info(args.gpu) max_power_limit = power_cap_info['max_power_cap'] + max_power_limit = AMDSMIHelpers.convert_SI_unit(max_power_limit, AMDSMIHelpers.SI_Unit.MICRO) + min_power_limit = power_cap_info['min_power_cap'] + min_power_limit = AMDSMIHelpers.convert_SI_unit(min_power_limit, AMDSMIHelpers.SI_Unit.MICRO) socket_power_limit = power_cap_info['power_cap'] + socket_power_limit = AMDSMIHelpers.convert_SI_unit(socket_power_limit, AMDSMIHelpers.SI_Unit.MICRO) except amdsmi_exception.AmdSmiLibraryException as e: power_limit_error = True max_power_limit = "N/A" @@ -492,11 +496,18 @@ class AMDSMICommands(): power_unit = 'W' temp_unit_human_readable = '\N{DEGREE SIGN}C' temp_unit_json = 'C' - if self.logger.is_human_readable_format(): - if not power_limit_error: - max_power_limit = f"{max_power_limit} {power_unit}" - socket_power_limit = f"{socket_power_limit} {power_unit}" + if not power_limit_error: + max_power_limit = self.helpers.unit_format(self.logger, + max_power_limit, + power_unit) + min_power_limit = self.helpers.unit_format(self.logger, + min_power_limit, + power_unit) + socket_power_limit = self.helpers.unit_format(self.logger, + socket_power_limit, + power_unit) + if self.logger.is_human_readable_format(): if not slowdown_temp_edge_limit_error: slowdown_temp_edge_limit = f"{slowdown_temp_edge_limit} {temp_unit_human_readable}" if not slowdown_temp_hotspot_limit_error: @@ -509,13 +520,8 @@ class AMDSMICommands(): shutdown_temp_hotspot_limit = f"{shutdown_temp_hotspot_limit} {temp_unit_human_readable}" if not shutdown_temp_vram_limit_error: 
shutdown_temp_vram_limit = f"{shutdown_temp_vram_limit} {temp_unit_human_readable}" - if self.logger.is_json_format(): - if not power_limit_error: - max_power_limit = {"value" : max_power_limit, - "unit" : power_unit} - socket_power_limit = {"value" : socket_power_limit, - "unit" : power_unit} + if self.logger.is_json_format(): if not slowdown_temp_edge_limit_error: slowdown_temp_edge_limit = {"value" : slowdown_temp_edge_limit, "unit" : temp_unit_json} @@ -538,6 +544,7 @@ class AMDSMICommands(): limit_info = {} # Power limits limit_info['max_power'] = max_power_limit + limit_info['min_power'] = min_power_limit limit_info['socket_power'] = socket_power_limit # Shutdown limits @@ -1326,24 +1333,19 @@ class AMDSMICommands(): for key, value in power_info.items(): if value == 0xFFFF: power_info[key] = "N/A" - elif self.logger.is_human_readable_format(): - if "voltage" in key: - power_info[key] = f"{value} {voltage_unit}" - elif "power" in key: - power_info[key] = f"{value} {power_unit}" - elif self.logger.is_json_format(): - if "voltage" in key: - power_info[key] = {"value" : value, - "unit" : voltage_unit} - elif "power" in key: - power_info[key] = {"value" : value, - "unit" : power_unit} - - power_dict['socket_power'] = power_info['current_socket_power'] - - if power_dict['socket_power'] == "N/A": - # For older gpu's when current power doesn't populate we use the average socket power instead - power_dict['socket_power'] = power_info['average_socket_power'] + elif "voltage" in key: + power_info[key] = self.helpers.unit_format(self.logger, + value, + voltage_unit) + elif "power" in key: + if ((key == "current_socket_power" or key == "average_socket_power") + and value != "N/A"): + power_dict['socket_power'] = self.helpers.unit_format(self.logger, + value, + power_unit) + power_info[key] = self.helpers.unit_format(self.logger, + value, + power_unit) power_dict['gfx_voltage'] = power_info['gfx_voltage'] power_dict['soc_voltage'] = power_info['soc_voltage'] @@ -3478,8 
+3480,11 @@ class AMDSMICommands(): power_cap_info = amdsmi_interface.amdsmi_get_power_cap_info(args.gpu) logging.debug(f"Power cap info for gpu {gpu_id} | {power_cap_info}") min_power_cap = power_cap_info["min_power_cap"] + min_power_cap = AMDSMIHelpers.convert_SI_unit(min_power_cap, AMDSMIHelpers.SI_Unit.MICRO) max_power_cap = power_cap_info["max_power_cap"] + max_power_cap = AMDSMIHelpers.convert_SI_unit(max_power_cap, AMDSMIHelpers.SI_Unit.MICRO) current_power_cap = power_cap_info["power_cap"] + current_power_cap = AMDSMIHelpers.convert_SI_unit(current_power_cap, AMDSMIHelpers.SI_Unit.MICRO) except amdsmi_exception.AmdSmiLibraryException as e: raise ValueError(f"Unable to get power cap info from {gpu_string}") from e @@ -3487,7 +3492,9 @@ class AMDSMICommands(): self.logger.store_output(args.gpu, 'powercap', f"Power cap is already set to {args.power_cap}") elif args.power_cap >= min_power_cap and args.power_cap <= max_power_cap: try: - amdsmi_interface.amdsmi_set_power_cap(args.gpu, 0, args.power_cap * 1000000) + new_power_cap = AMDSMIHelpers.convert_SI_unit(args.power_cap, AMDSMIHelpers.SI_Unit.BASE, + AMDSMIHelpers.SI_Unit.MICRO) + amdsmi_interface.amdsmi_set_power_cap(args.gpu, 0, new_power_cap) except amdsmi_exception.AmdSmiLibraryException as e: if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: raise PermissionError('Command requires elevation') from e @@ -3882,20 +3889,26 @@ class AMDSMICommands(): try: power_cap_info = amdsmi_interface.amdsmi_get_power_cap_info(args.gpu) logging.debug(f"Power cap info for gpu {gpu_id} | {power_cap_info}") - default_power_cap = power_cap_info["default_power_cap"] + default_power_cap_in_w = power_cap_info["default_power_cap"] + default_power_cap_in_w = AMDSMIHelpers.convert_SI_unit(default_power_cap_in_w, AMDSMIHelpers.SI_Unit.MICRO) + current_power_cap_in_w = power_cap_info["power_cap"] + current_power_cap_in_w = AMDSMIHelpers.convert_SI_unit(current_power_cap_in_w, 
AMDSMIHelpers.SI_Unit.MICRO) except amdsmi_exception.AmdSmiLibraryException as e: raise ValueError(f"Unable to get power cap info from {gpu_id}") from e - if args.power_cap == default_power_cap: - self.logger.store_output(args.gpu, 'powercap', f"Power cap is already set to {default_power_cap}") + if current_power_cap_in_w == default_power_cap_in_w: + self.logger.store_output(args.gpu, 'powercap', f"Power cap is already set to {default_power_cap_in_w}") else: try: - amdsmi_interface.amdsmi_set_power_cap(args.gpu, 0, default_power_cap * 1000000) + default_power_cap_in_uw = AMDSMIHelpers.convert_SI_unit(default_power_cap_in_w, + AMDSMIHelpers.SI_Unit.BASE, + AMDSMIHelpers.SI_Unit.MICRO) + amdsmi_interface.amdsmi_set_power_cap(args.gpu, 0, default_power_cap_in_uw) except amdsmi_exception.AmdSmiLibraryException as e: if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: raise PermissionError('Command requires elevation') from e - raise ValueError(f"Unable to reset power cap to {default_power_cap} on GPU {gpu_id}") from e - self.logger.store_output(args.gpu, 'powercap', f"Successfully set power cap to {default_power_cap}") + raise ValueError(f"Unable to reset power cap to {default_power_cap_in_w} on GPU {gpu_id}") from e + self.logger.store_output(args.gpu, 'powercap', f"Successfully set power cap to {default_power_cap_in_w}") if multiple_devices: self.logger.store_multiple_device_output() diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py b/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py index 6383969a6a..fc19194727 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py @@ -29,6 +29,7 @@ import time from subprocess import run from subprocess import PIPE, STDOUT from typing import List +from enum import Enum from amdsmi_init import * from BDF import BDF @@ -726,3 +727,46 @@ class AMDSMIHelpers(): if logger.is_human_readable_format(): return f"{value} {unit}" return f"{value}" + + class 
SI_Unit(float, Enum): + GIGA = 1000000000 # 10^9 + MEGA = 1000000 # 10^6 + KILO = 1000 # 10^3 + HECTO = 100 # 10^2 + DEKA = 10 # 10^1 + BASE = 1 # 10^0 + DECI = 0.1 # 10^-1 + CENTI = 0.01 # 10^-2 + MILLI = 0.001 # 10^-3 + MICRO = 0.000001 # 10^-6 + NANO = 0.000000001 # 10^-9 + + def convert_SI_unit(val: float, unit_in: SI_Unit, unit_out = SI_Unit.BASE) -> float: + """This function will convert a value into another + scientific (SI) unit. Defaults unit_out to SI_Unit.BASE + This function returns a float. + + params: + val: float unit to convert + unit_in: Requires using SI_Unit to set current value's SI unit (eg. SI_Unit.MICRO) + unit_out - Requires using SI_Unit to set current value's SI unit + default value is SI_Unit.BASE (eg. SI_Unit.MICRO) + return: + float : converted SI unit of value requested + """ + return val * unit_in / unit_out + + def convert_SI_unit(val: int, unit_in: SI_Unit, unit_out=SI_Unit.BASE) -> int: + """This function will convert a value into another + scientific (SI) unit. Defaults unit_out to SI_Unit.BASE + This function returns a int. + + params: + val: int unit to convert + unit_in: Requires using SI_Unit to set current value's SI unit (eg. SI_Unit.MICRO) + unit_out - Requires using SI_Unit to set current value's SI unit + default value is SI_Unit.BASE (eg. 
SI_Unit.MICRO) + return: + int : converted SI unit of value requested + """ + return int(float(val) * unit_in / unit_out) diff --git a/projects/amdsmi/example/amd_smi_drm_example.cc b/projects/amdsmi/example/amd_smi_drm_example.cc index 25ac6ade00..92103085e9 100644 --- a/projects/amdsmi/example/amd_smi_drm_example.cc +++ b/projects/amdsmi/example/amd_smi_drm_example.cc @@ -657,15 +657,15 @@ int main() { CHK_AMDSMI_RET(ret) printf(" Output of amdsmi_get_power_cap_info:\n"); std::cout << "\t\t Power Cap: " << cap_info.power_cap - << "W\n"; + << " uW\n"; std::cout << "\t\t Default Power Cap: " << cap_info.default_power_cap - << "\n\n"; + << " uW\n\n"; std::cout << "\t\t Dpm Cap: " << cap_info.dpm_cap - << "\n\n"; + << " MHz\n\n"; std::cout << "\t\t Min Power Cap: " << cap_info.min_power_cap - << "\n\n"; + << " uW\n\n"; std::cout << "\t\t Max Power Cap: " << cap_info.max_power_cap - << "\n\n"; + << " uW\n\n"; /// Get GPU Metrics info std::cout << "\n\n"; diff --git a/projects/amdsmi/include/amd_smi/amdsmi.h b/projects/amdsmi/include/amd_smi/amdsmi.h index f354515ecb..80b8417dd3 100644 --- a/projects/amdsmi/include/amd_smi/amdsmi.h +++ b/projects/amdsmi/include/amd_smi/amdsmi.h @@ -522,11 +522,11 @@ typedef struct { } amdsmi_pcie_info_t; typedef struct { - uint64_t power_cap; - uint64_t default_power_cap; - uint64_t dpm_cap; - uint64_t min_power_cap; - uint64_t max_power_cap; + uint64_t power_cap; //!< current power cap (uW) + uint64_t default_power_cap; //!< default power cap (uW) + uint64_t dpm_cap; //!< dpm power cap (MHz) + uint64_t min_power_cap; //!< minimum power cap (uW) + uint64_t max_power_cap; //!< maximum power cap (uW) uint64_t reserved[3]; } amdsmi_power_cap_info_t; @@ -4615,7 +4615,8 @@ amdsmi_get_gpu_board_info(amdsmi_processor_handle processor_handle, amdsmi_board /** * @brief Returns the power caps as currently configured in the - * system. It is not supported on virtual machine guest + * system. Power in units of uW. 
+ * It is not supported on virtual machine guest * * @platform{gpu_bm_linux} @platform{host} * diff --git a/projects/amdsmi/py-interface/README.md b/projects/amdsmi/py-interface/README.md index 7d1f414565..e49a5699f4 100644 --- a/projects/amdsmi/py-interface/README.md +++ b/projects/amdsmi/py-interface/README.md @@ -414,13 +414,13 @@ Input parameters: Output: Dictionary with fields -Field | Description ----|--- -`power_cap` | power capability -`dpm_cap` | dynamic power management capability -`default_power_cap` | default power capability -`min_power_cap` | min power capability -`max_power_cap` | max power capability +Field | Description | Units +---|---|--- +`power_cap` | power capability | uW +`dpm_cap` | dynamic power management capability | MHz +`default_power_cap` | default power capability | uW +`min_power_cap` | min power capability | uW +`max_power_cap` | max power capability | uW Exceptions that can be thrown by `amdsmi_get_power_cap_info` function: diff --git a/projects/amdsmi/src/amd_smi/amd_smi.cc b/projects/amdsmi/src/amd_smi/amd_smi.cc index bb88f578cf..cc504ea3b9 100644 --- a/projects/amdsmi/src/amd_smi/amd_smi.cc +++ b/projects/amdsmi/src/amd_smi/amd_smi.cc @@ -1208,15 +1208,10 @@ amdsmi_get_power_cap_info(amdsmi_processor_handle processor_handle, if ((status == AMDSMI_STATUS_SUCCESS) && !set_ret_success) set_ret_success = true; - // Dividing by 1000000 to get measurement in Watts - (info->default_power_cap) /= 1000000; status = rsmi_wrapper(rsmi_dev_power_cap_range_get, processor_handle, sensor_ind, &(info->max_power_cap), &(info->min_power_cap)); - // Dividing by 1000000 to get measurement in Watts - (info->max_power_cap) /= 1000000; - (info->min_power_cap) /= 1000000; if ((status == AMDSMI_STATUS_SUCCESS) && !set_ret_success) set_ret_success = true; diff --git a/projects/amdsmi/src/amd_smi/amd_smi_utils.cc b/projects/amdsmi/src/amd_smi/amd_smi_utils.cc index 13762c3808..10f27ab309 100644 --- a/projects/amdsmi/src/amd_smi/amd_smi_utils.cc +++ 
b/projects/amdsmi/src/amd_smi/amd_smi_utils.cc @@ -201,8 +201,6 @@ amdsmi_status_t smi_amdgpu_get_power_cap(amd::smi::AMDSmiGPUDevice* device, int return AMDSMI_STATUS_API_FAILED; } - // Dividing by 1000000 to get measurement in Watts - *cap /= 1000000; return AMDSMI_STATUS_SUCCESS; } diff --git a/projects/amdsmi/tests/amd_smi_test/functional/power_cap_read_write.cc b/projects/amdsmi/tests/amd_smi_test/functional/power_cap_read_write.cc index dbf4726779..5e1a065a61 100755 --- a/projects/amdsmi/tests/amd_smi_test/functional/power_cap_read_write.cc +++ b/projects/amdsmi/tests/amd_smi_test/functional/power_cap_read_write.cc @@ -89,9 +89,10 @@ void TestPowerCapReadWrite::Close() { void TestPowerCapReadWrite::Run(void) { amdsmi_status_t ret; - uint64_t orig, min, max, new_cap; + uint64_t default_cap, min, max, new_cap, curr_cap; clock_t start, end; double cpu_time_used; + const uint64_t MICRO_CONVERSION = 1000000; TestBase::Run(); if (setup_failed_) { @@ -110,22 +111,24 @@ void TestPowerCapReadWrite::Run(void) { ASSERT_EQ(ret, AMDSMI_STATUS_INVAL); min = info.min_power_cap; max = info.max_power_cap; - orig = info.default_power_cap; + default_cap = info.default_power_cap; + curr_cap = info.power_cap; + new_cap = (max + min)/2; // Check if power cap is within the range // skip the test otherwise - if (orig < min || orig > max) { - std::cout << "Power cap is not within the range. Skipping test for " << dv_ind << std::endl; + if (new_cap < min || new_cap > max) { + std::cout << "Power cap requested (" << new_cap + << " uW) is not within the range. 
Skipping test for " << dv_ind << std::endl; continue; } - new_cap = (max + min)/2; - IF_VERB(STANDARD) { - std::cout << "Original Power Cap: " << orig << " uW" << std::endl; - std::cout << "Power Cap Range: " << max << " uW to " << min << + std::cout << "[Before Set] Default Power Cap: " << default_cap << " uW" << std::endl; + std::cout << "[Before Set] Current Power Cap: " << curr_cap << " uW" << std::endl; + std::cout << "[Before Set] Power Cap Range [max to min]: " << max << " uW to " << min << " uW" << std::endl; - std::cout << "Setting new cap to " << new_cap << "..." << std::endl; + std::cout << "[Before Set] Setting new cap to " << new_cap << "..." << std::endl; } start = clock(); ret = amdsmi_set_power_cap(processor_handles_[dv_ind], 0, new_cap); @@ -142,25 +145,35 @@ void TestPowerCapReadWrite::Run(void) { ret = amdsmi_get_power_cap_info(processor_handles_[dv_ind], 0, &info); CHK_ERR_ASRT(ret) - new_cap = info.default_power_cap; + curr_cap = info.power_cap; - // TODO(cfreehil) add some kind of assertion to verify new_cap is correct - // (or within a range) IF_VERB(STANDARD) { - std::cout << "Time spent: " << cpu_time_used << " uS" << std::endl; - std::cout << "New Power Cap: " << new_cap << " uW" << std::endl; - std::cout << "Resetting cap to " << orig << "..." << std::endl; + std::cout << "[After Set] Time spent: " << cpu_time_used << " uS" << std::endl; + std::cout << "[After Set] Current Power Cap: " << curr_cap << " uW" << std::endl; + std::cout << "[After Set] Requested Power Cap: " << new_cap << " uW" << std::endl; + std::cout << "[After Set] Power Cap Range [max to min]: " << max << " uW to " + << min << " uW" << std::endl; + std::cout << "[After Set] Resetting cap to " << default_cap << "..." 
<< std::endl; } + // Confirm in watts the values are equal + ASSERT_EQ(curr_cap/MICRO_CONVERSION, new_cap/MICRO_CONVERSION); - ret = amdsmi_set_power_cap(processor_handles_[dv_ind], 0, orig); + // Reset to default power cap + ret = amdsmi_set_power_cap(processor_handles_[dv_ind], 0, default_cap); CHK_ERR_ASRT(ret) ret = amdsmi_get_power_cap_info(processor_handles_[dv_ind], 0, &info); CHK_ERR_ASRT(ret) - new_cap = info.default_power_cap; + curr_cap = info.power_cap; IF_VERB(STANDARD) { - std::cout << "Current Power Cap: " << new_cap << " uW" << std::endl; + std::cout << "[After Reset] Current Power Cap: " << curr_cap << " uW" << std::endl; + std::cout << "[After Reset] Requested Power Cap (default): " << default_cap << " uW" + << std::endl; + std::cout << "[After Reset] Power Cap Range [max to min]: " << max << " uW to " + << min << " uW" << std::endl; } + // Confirm in watts the values are equal + ASSERT_EQ(curr_cap/MICRO_CONVERSION, default_cap/MICRO_CONVERSION); } }