diff --git a/projects/amdsmi/CHANGELOG.md b/projects/amdsmi/CHANGELOG.md index d4ccd0834b..2c3eda88a9 100644 --- a/projects/amdsmi/CHANGELOG.md +++ b/projects/amdsmi/CHANGELOG.md @@ -58,38 +58,94 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/pr - **Added support for PPT1 power limit information**. - Support has been added for querying and setting the PPT (Package Power Tracking) limits - There are two PPT limits, PPT0 has lower limit and tracks a filtered version of the input power and PPT1 has higher limit but tracks the raw input power. This is to catch spikes in the raw data. + - New API added: + - amdsmi_get_supported_power_cap(): Returns which power cap types are supported on the device (PPT0, PPT1). This will allow users to know which power cap types they can get/set. + - Original APIs remain the same but now can get/set both PPT0 and PPT1 limits (on supported hardware): + - amdsmi_get_power_cap_info() + - amdsmi_set_power_cap() - See the Changed section for changes made to the `set` and `static` commands regarding support for PPT1. ### Changed -- **`amd-smi set --power-cap` now requires sepcification of the power cap type**. - - command now takes the form: `amd-smi set --power-cap ` - - acceptable power cap types are "ppt0" and "ppt1" +- **`amd-smi set --power-cap` now requires specification of the power cap type**. + - Command now takes the form: `amd-smi set --power-cap <power-cap-type> <new-cap>`. Acceptable power cap types are "ppt0" and "ppt1". + Ex. - ```console - $ sudo amd-smi set --power-cap ppt1 1150 - GPU: 0 - POWERCAP: Successfully set ppt1 power cap to 1150W - ... - ``` + ```console + $ sudo amd-smi set --power-cap ppt1 1150 + GPU: 0 + POWERCAP: Successfully set PPT1 power cap to 1150W + ... + ``` +- **`amd-smi reset --power-cap` will attempt to reset both power caps**. + - When using the reset command, both PPT0 and PPT1 power caps will be reset to their default values. If a device only has PPT0, then only PPT0 will be reset. + Ex. 
+ ```console + $ sudo amd-smi reset --power-cap + GPU: 0 + POWERCAP: + PPT0: Successfully reset power cap to 203W + PPT1: [AMDSMI_STATUS_NOT_SUPPORTED] Unable to reset to default power cap + ... + ``` - **`amd-smi static --limit` now has a PPT1 section when PPT1 is available**. - - ```console - $ amd-smi static --limit - GPU: 0 - LIMIT: - PPT0: - MAX_POWER_LIMIT: 1000 - MIN_POWER_LIMIT: 0 - SOCKET_POWER_LIMIT: 1000 - PPT1: - MAX_POWER_LIMIT: 1300 - MIN_POWER_LIMIT: 1100 - SOCKET_POWER_LIMIT: 1250 - SLOWDOWN_EDGE_TEMPERATURE: N/A - ... - ``` + - The static --limit command has been updated to include PPT1 power limit information when available on the device. + ```console + $ amd-smi static --limit + GPU: 0 + LIMIT: + PPT0: + MAX_POWER_LIMIT: 1000 + MIN_POWER_LIMIT: 0 + SOCKET_POWER_LIMIT: 1000 + PPT1: + MAX_POWER_LIMIT: 1300 + MIN_POWER_LIMIT: 1100 + SOCKET_POWER_LIMIT: 1250 + SLOWDOWN_EDGE_TEMPERATURE: N/A + ... + ``` + - JSON and CSV formats are updated to reflect this change as well. + Ex. + ```console + $ amd-smi static --limit --json + { + "gpu_data": [ + { + "gpu": 0, + "limit": { + "ppt0": { + "max_power_limit": { + "value": 203, + "unit": "W" + }, + "min_power_limit": { + "value": 0, + "unit": "W" + }, + "socket_power_limit": { + "value": 100, + "unit": "W" + } + }, + "ppt1": { + "max_power_limit": "N/A", + "min_power_limit": "N/A", + "socket_power_limit": "N/A" + }, + ... + } + }, + ... 
+ ``` + + ```console + $ amd-smi static --limit --csv + gpu,ppt0_max_power_limit,ppt0_min_power_limit,ppt0_socket_power_limit,ppt1_max_power_limit,ppt1_min_power_limit,ppt1_socket_power_limit,slowdown_edge_temperature,slowdown_hotspot_temperature,slowdown_vram_temperature,shutdown_edge_temperature,shutdown_hotspot_temperature,shutdown_vram_temperature + 0,203,0,100,N/A,N/A,N/A,100,110,100,105,115,105 + 1,213,0,100,N/A,N/A,N/A,109,110,100,114,115,105 + ``` ### Removed diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py index 2c6a113f6c..fd7b38b8cf 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py @@ -611,10 +611,13 @@ class AMDSMICommands(): for power_type in amdsmi_interface.AmdSmiPowerCapType: # Strip 'AMDSMI_POWER_CAP_TYPE_' prefix and convert to lowercase key = power_type.name.replace('AMDSMI_POWER_CAP_TYPE_', '').lower() - power_limit_types[key] = "N/A" + power_limit_types[key] = { + "max_power_limit" : "N/A", + "min_power_limit" : "N/A", + "socket_power_limit" : "N/A" + } try: - power_limit_error = False power_cap_types = amdsmi_interface.amdsmi_get_supported_power_cap(args.gpu) for sensor in power_cap_types['sensor_inds']: power_cap_info = amdsmi_interface.amdsmi_get_power_cap_info(args.gpu, sensor) @@ -625,9 +628,9 @@ class AMDSMICommands(): socket_power_limit = power_cap_info['power_cap'] socket_power_limit = self.helpers.convert_SI_unit(socket_power_limit, AMDSMIHelpers.SI_Unit.MICRO) ppt = { - "max_power_limit" : max_power_limit, - "min_power_limit" : min_power_limit, - "socket_power_limit" : socket_power_limit + "max_power_limit" : self.helpers.unit_format(self.logger, max_power_limit, 'W'), + "min_power_limit" : self.helpers.unit_format(self.logger, min_power_limit, 'W'), + "socket_power_limit" : self.helpers.unit_format(self.logger, socket_power_limit, 'W') } sensor_name = power_cap_types['sensor_types'][sensor] @@ -635,7 +638,6 @@ class 
AMDSMICommands(): sensor_key = sensor_name.name.replace('AMDSMI_POWER_CAP_TYPE_', '').lower() power_limit_types[sensor_key] = ppt except amdsmi_exception.AmdSmiLibraryException as e: - power_limit_error = True logging.debug("Failed to get power cap info for gpu %s | %s", gpu_id, e.get_error_info()) # Edge temperature limits @@ -709,16 +711,6 @@ class AMDSMICommands(): power_unit = 'W' temp_unit_human_readable = '\N{DEGREE SIGN}C' temp_unit_json = 'C' - if not power_limit_error: - max_power_limit = self.helpers.unit_format(self.logger, - max_power_limit, - power_unit) - min_power_limit = self.helpers.unit_format(self.logger, - min_power_limit, - power_unit) - socket_power_limit = self.helpers.unit_format(self.logger, - socket_power_limit, - power_unit) if self.logger.is_human_readable_format(): if not slowdown_temp_edge_limit_error: @@ -5488,65 +5480,34 @@ class AMDSMICommands(): self.logger.clear_multiple_devices_output() return if args.power_cap: + final_output = {"ppt0": "[AMDSMI_STATUS_NOT_SUPPORTED] Unable to reset to default power cap", "ppt1": "[AMDSMI_STATUS_NOT_SUPPORTED] Unable to reset to default power cap"} power_limit_types = {} for power_type in amdsmi_interface.AmdSmiPowerCapType: # Strip 'AMDSMI_POWER_CAP_TYPE_' prefix and convert to lowercase key = power_type.name.replace('AMDSMI_POWER_CAP_TYPE_', '').lower() power_limit_types[key] = "N/A" + current_sensor_num = 0 try: power_cap_types = amdsmi_interface.amdsmi_get_supported_power_cap(args.gpu) for sensor in power_cap_types['sensor_inds']: + current_sensor_num = sensor power_cap_info = amdsmi_interface.amdsmi_get_power_cap_info(args.gpu, sensor) logging.debug(f"Power cap info for gpu {gpu_id} ppt{sensor} | {power_cap_info}") - default_power_cap_in_w = power_cap_info["default_power_cap"] - default_power_cap_in_w = self.helpers.convert_SI_unit(default_power_cap_in_w, AMDSMIHelpers.SI_Unit.MICRO) - current_power_cap_in_w = power_cap_info["power_cap"] - current_power_cap_in_w = 
self.helpers.convert_SI_unit(current_power_cap_in_w, AMDSMIHelpers.SI_Unit.MICRO) + default_power_cap_in_mw = power_cap_info["default_power_cap"] + default_power_cap_in_w = self.helpers.convert_SI_unit(default_power_cap_in_mw, AMDSMIHelpers.SI_Unit.MICRO) + current_power_cap_in_mw = power_cap_info["power_cap"] + current_power_cap_in_w = self.helpers.convert_SI_unit(current_power_cap_in_mw, AMDSMIHelpers.SI_Unit.MICRO) sensor_name = power_cap_types['sensor_types'][sensor] # Strip 'AMDSMI_POWER_CAP_TYPE_' prefix and convert to lowercase sensor_key = sensor_name.name.replace('AMDSMI_POWER_CAP_TYPE_', '').lower() power_limit_types[sensor_key] = (default_power_cap_in_w, current_power_cap_in_w) + amdsmi_interface.amdsmi_set_power_cap(args.gpu, sensor, default_power_cap_in_mw) + final_output[f"ppt{current_sensor_num}"] = f"Successfully reset power cap to {default_power_cap_in_w}W" except amdsmi_exception.AmdSmiLibraryException as e: - self.logger.store_output(args.gpu, 'powercap', f"[{e.get_error_info(detailed=False)}] Unable to reset power cap to default") - self.logger.print_output() - self.logger.clear_multiple_devices_output() - return - - # TODO Make agnostic to number of power cap types - final_output = {"ppt0": "", "ppt1": ""} - if power_limit_types['ppt0'] == "N/A": - final_output['ppt0'] = f"PPT0 Power cap information is not available" - elif power_limit_types['ppt0'][1] == power_limit_types['ppt0'][0]: - final_output['ppt0'] = f"PPT0 Power cap is already set to {power_limit_types['ppt0'][0]}W" - else: - try: - default_ppt0_power_cap_in_uw = self.helpers.convert_SI_unit(power_limit_types['ppt0'][0], - AMDSMIHelpers.SI_Unit.BASE, - AMDSMIHelpers.SI_Unit.MICRO) - amdsmi_interface.amdsmi_set_power_cap(args.gpu, 0, default_ppt0_power_cap_in_uw) - except amdsmi_exception.AmdSmiLibraryException as e: - if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: - raise PermissionError('Command requires elevation') from e - raise ValueError(f"Unable 
to reset PPT0 power cap to {power_limit_types['ppt0'][0]} on GPU {gpu_id}") from e - final_output['ppt0'] = f"Successfully reset PPT0 power cap to {power_limit_types['ppt0'][0]}W" - - if power_limit_types['ppt1'] == "N/A": - final_output['ppt1'] = f"PPT1 Power cap information is not available" - elif power_limit_types['ppt1'][1] == power_limit_types['ppt1'][0]: - final_output['ppt1'] = f"PPT1 Power cap is already set to {power_limit_types['ppt1'][0]}W" - else: - try: - default_ppt1_power_cap_in_uw = self.helpers.convert_SI_unit(power_limit_types['ppt1'][0], - AMDSMIHelpers.SI_Unit.BASE, - AMDSMIHelpers.SI_Unit.MICRO) - amdsmi_interface.amdsmi_set_power_cap(args.gpu, 1, default_ppt1_power_cap_in_uw) - except amdsmi_exception.AmdSmiLibraryException as e: - if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: - raise PermissionError('Command requires elevation') from e - raise ValueError(f"Unable to reset PPT1 power cap to {power_limit_types['ppt1'][0]} on GPU {gpu_id}") from e - final_output['ppt1'] = f"Successfully reset PPT1 power cap to {power_limit_types['ppt1'][0]}W" - + if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: + raise PermissionError('Command requires elevation') from e + final_output[f"ppt{current_sensor_num}"] = f"[{e.get_error_info(detailed=False)}] Unable to reset cap to default power cap" self.logger.store_output(args.gpu, 'powercap', final_output) self.logger.print_output() self.logger.clear_multiple_devices_output() diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py b/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py index 5e394b0f15..9f10d5b9df 100755 --- a/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py @@ -846,12 +846,12 @@ class AMDSMIHelpers(): if power_limit_types[ppt_key]['power_cap_max'] == 0: power_limit_types[ppt_key]['power_cap_max'] = "N/A" - ppt0_power_cap_max = self.format_power_cap(power_limit_types['ppt0']['power_cap_min']) 
- ppt0_power_cap_min = self.format_power_cap(power_limit_types['ppt0']['power_cap_max']) + ppt0_power_cap_max = self.format_power_cap(power_limit_types['ppt0']['power_cap_max']) + ppt0_power_cap_min = self.format_power_cap(power_limit_types['ppt0']['power_cap_min']) ppt1_power_cap_max = self.format_power_cap(power_limit_types['ppt1']['power_cap_max']) ppt1_power_cap_min = self.format_power_cap(power_limit_types['ppt1']['power_cap_min']) - return (ppt0_power_cap_min, ppt0_power_cap_min, ppt1_power_cap_max, ppt1_power_cap_min) + return (ppt0_power_cap_min, ppt0_power_cap_max, ppt1_power_cap_min, ppt1_power_cap_max) def format_power_cap(self, value): diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_parser.py b/projects/amdsmi/amdsmi_cli/amdsmi_parser.py index 466e34b176..558205df45 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_parser.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_parser.py @@ -1260,7 +1260,7 @@ class AMDSMIParser(argparse.ArgumentParser): set_xgmi_plpd_help = f"Set the GPU XGMI per-link power down policy using policy id, an integer. Valid id's include:\n\t{xgmi_plpd_help_info}" set_clock_freq_help = "Set one or more sclk (aka gfxclk), mclk, fclk, pcie, or socclk frequency levels.\n\tUse `amd-smi static --clock` to find acceptable levels.\n\tUse `amd-smi static --bus` to find acceptable pcie levels." 
ppt0_power_cap_min, ppt0_power_cap_max, ppt1_power_cap_min, ppt1_power_cap_max = self.helpers.get_power_caps() - set_power_cap_help = f"Set either PPT0 or PPT1 power capacity limit:\n\tex: amd-smi set -o ppt0 1300\n\tPPT0 min cap: {ppt0_power_cap_min}, PPT0 max cap: {ppt0_power_cap_max}\n\tPPT1 min cap: {ppt1_power_cap_min}, PPT1 max cap: {ppt1_power_cap_max}" + set_power_cap_help = f"Set either PPT0 or PPT1 power capacity limit:\n\tEx: `amd-smi set -o ppt0 1300`\n\tPPT0 min cap: {ppt0_power_cap_min}, PPT0 max cap: {ppt0_power_cap_max}\n\tPPT1 min cap: {ppt1_power_cap_min}, PPT1 max cap: {ppt1_power_cap_max}" set_clk_limit_help = "Sets the sclk (aka gfxclk) or mclk minimum and maximum frequencies. \n\tex: amd-smi set -L (sclk | mclk) (min | max) value" set_process_isolation_help = "Enable or disable the GPU process isolation on a per partition basis:\n 0 for disable and 1 for enable.\n"