From 2feb0ae998cf01d346473f4955ecac3698e774e1 Mon Sep 17 00:00:00 2001 From: Maisam Arif Date: Thu, 4 Dec 2025 09:52:59 -0600 Subject: [PATCH] Fix powercap default to enum for sensor_ind (#2004) * Fix powercap default to enum for sensor_ind Signed-off-by: Maisam Arif * [SWDEV-559965] Refactor amdsmi set power cap Modified power cap set to accept args with optional power_cap type. Added power_cap helper validate_and_set_power_cap(). Fixed JSON output format. Signed-off-by: Bindhiya Kanangot Balakrishnan --------- Signed-off-by: Maisam Arif Signed-off-by: Bindhiya Kanangot Balakrishnan Co-authored-by: Bindhiya Kanangot Balakrishnan --- projects/amdsmi/amdsmi_cli/amdsmi_commands.py | 55 ++++--------- projects/amdsmi/amdsmi_cli/amdsmi_helpers.py | 81 +++++++++++++++++++ projects/amdsmi/amdsmi_cli/amdsmi_parser.py | 22 ++--- .../amdsmi/py-interface/amdsmi_interface.py | 2 +- 4 files changed, 109 insertions(+), 51 deletions(-) diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py index 6d6107cce5..411b348b83 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py @@ -4983,50 +4983,23 @@ class AMDSMICommands(): # Universal args if isinstance(args.power_cap, tuple): pwr_type = args.power_cap.pwr_type - pwr_type_as_int = (0 if pwr_type == "ppt0" else 1 if pwr_type == "ppt1" else None) - pwr_type = pwr_type.upper() requested_power_cap = args.power_cap.watts - try: - power_cap_info = amdsmi_interface.amdsmi_get_power_cap_info(args.gpu, pwr_type_as_int) - logging.debug(f"Power cap info for gpu {gpu_id} | {power_cap_info}") - min_power_cap = power_cap_info["min_power_cap"] - min_power_cap = self.helpers.convert_SI_unit(min_power_cap, AMDSMIHelpers.SI_Unit.MICRO) - max_power_cap = power_cap_info["max_power_cap"] - max_power_cap = self.helpers.convert_SI_unit(max_power_cap, AMDSMIHelpers.SI_Unit.MICRO) - current_power_cap = power_cap_info["power_cap"] - current_power_cap = self.helpers.convert_SI_unit(current_power_cap, AMDSMIHelpers.SI_Unit.MICRO) - except amdsmi_exception.AmdSmiLibraryException as e: - min_power_cap = "N/A" - max_power_cap = "N/A" - current_power_cap = "N/A" - self.logger.store_output(args.gpu, 'powercap', f"[{e.get_error_info(detailed=False)}] Unable to set {pwr_type} power cap to {requested_power_cap}W") - self.logger.print_output() - self.logger.clear_multiple_devices_output() - return - if requested_power_cap == current_power_cap: - self.logger.store_output(args.gpu, 'powercap', f"{pwr_type} power cap is already set to {requested_power_cap}W") - elif current_power_cap == 0: - self.logger.store_output(args.gpu, 'powercap', f"Unable to set {pwr_type} power cap to {requested_power_cap}W, current value is {current_power_cap}W") - elif requested_power_cap >= min_power_cap and requested_power_cap <= max_power_cap and requested_power_cap > 0: - try: - new_power_cap = self.helpers.convert_SI_unit(requested_power_cap, AMDSMIHelpers.SI_Unit.BASE, - AMDSMIHelpers.SI_Unit.MICRO) - amdsmi_interface.amdsmi_set_power_cap(args.gpu, pwr_type_as_int, new_power_cap) - except amdsmi_exception.AmdSmiLibraryException as e: - if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: - raise PermissionError('Command requires elevation') from e - self.logger.store_output(args.gpu, 'powercap', f"[{e.get_error_info(detailed=False)}] Unable to set {pwr_type} power cap to {requested_power_cap}W") - self.logger.print_output() - self.logger.clear_multiple_devices_output() - return - - self.logger.store_output(args.gpu, 'powercap', f"Successfully set {pwr_type} power cap to {requested_power_cap}W") + # If pwr_type is None, default to ppt0 (legacy behavior) + if pwr_type is None: + pwr_type = "ppt0" + pwr_type_as_int = 0 else: - # setting power cap to 0 will return the current power cap so the technical minimum value is 1 - if min_power_cap == 0: - min_power_cap = 1 - self.logger.store_output(args.gpu, 'powercap', f"Power cap must be between {min_power_cap}W and {max_power_cap}W") + pwr_type_as_int = 0 if pwr_type == "ppt0" else 1 + + # Set the power cap for the specified sensor + pwr_type_upper = pwr_type.upper() + result = self.helpers.validate_and_set_power_cap( + args.gpu, pwr_type_as_int, pwr_type_upper, requested_power_cap, self.logger) + self.logger.store_output(args.gpu, 'powercap', result) + if multiple_devices: + self.logger.store_multiple_device_output() + return # Skip printing when there are multiple devices self.logger.print_output() self.logger.clear_multiple_devices_output() return diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py b/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py index 0933579ae2..04767456eb 100755 --- a/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py @@ -2042,3 +2042,84 @@ class AMDSMIHelpers(): type_name, gpu_id, e.get_error_info()) return base_board_temp_dict + + def validate_and_set_power_cap(self, device_handle, power_type, power_type_key, requested_power_cap, logger): + """Validate and set power cap for a specific sensor. + + Args: + device_handle: GPU device handle + power_type: Sensor ID (0 for ppt0, 1 for ppt1) + power_type_key: Display name for the sensor (e.g., "PPT0") + requested_power_cap: Requested power cap value in watts + logger: AMDSMILogger instance for format-aware output + + Returns: + dict or str: Structured data for JSON/CSV or formatted string for human-readable output + """ + try: + power_cap_info = amdsmi_interface.amdsmi_get_power_cap_info(device_handle, power_type) + gpu_id = self.get_gpu_id_from_device_handle(device_handle) + logging.debug(f"Power cap info for gpu {gpu_id} {power_type_key} | {power_cap_info}") + + min_power_cap = self.convert_SI_unit(power_cap_info["min_power_cap"], AMDSMIHelpers.SI_Unit.MICRO) + max_power_cap = self.convert_SI_unit(power_cap_info["max_power_cap"], AMDSMIHelpers.SI_Unit.MICRO) + current_power_cap = self.convert_SI_unit(power_cap_info["power_cap"], AMDSMIHelpers.SI_Unit.MICRO) + + # Return structured data for JSON/CSV or formatted string for human-readable + if requested_power_cap == current_power_cap: + if logger.is_json_format() or logger.is_csv_format(): + return { + "status": "already_set", + "sensor": power_type_key, + "requested_power_cap": self.unit_format(logger, requested_power_cap, "W"), + "current_power_cap": self.unit_format(logger, current_power_cap, "W"), + "message": f"{power_type_key} power cap is already set to {requested_power_cap}W" + } + return f"{power_type_key} power cap is already set to {requested_power_cap}W" + elif current_power_cap == 0: + if logger.is_json_format() or logger.is_csv_format(): + return { + "status": "error", + "sensor": power_type_key, + "requested_power_cap": self.unit_format(logger, requested_power_cap, "W"), + "current_power_cap": self.unit_format(logger, current_power_cap, "W"), + "message": f"Unable to set {power_type_key} power cap to {requested_power_cap}W, current value is {current_power_cap}W" + } + return f"Unable to set {power_type_key} power cap to {requested_power_cap}W, current value is {current_power_cap}W" + elif not (min_power_cap < requested_power_cap <= max_power_cap and requested_power_cap > 0): + # setting power cap to 0 will return the current power cap so the technical minimum value is 1 + min_cap_display = 1 if min_power_cap == 0 else min_power_cap + if logger.is_json_format() or logger.is_csv_format(): + return { + "status": "error", + "sensor": power_type_key, + "requested_power_cap": self.unit_format(logger, requested_power_cap, "W"), + "min_power_cap": self.unit_format(logger, min_cap_display, "W"), + "max_power_cap": self.unit_format(logger, max_power_cap, "W"), + "message": f"Power cap must be between {min_cap_display}W and {max_power_cap}W" + } + return f"Power cap must be between {min_cap_display}W and {max_power_cap}W" + # Set the power cap + new_power_cap = self.convert_SI_unit(requested_power_cap, AMDSMIHelpers.SI_Unit.BASE, AMDSMIHelpers.SI_Unit.MICRO) + amdsmi_interface.amdsmi_set_power_cap(device_handle, power_type, new_power_cap) + if logger.is_json_format() or logger.is_csv_format(): + return { + "status": "success", + "sensor": power_type_key, + "power_cap": self.unit_format(logger, requested_power_cap, "W"), + "message": f"Successfully set {power_type_key} power cap to {requested_power_cap}W" + } + return f"Successfully set {power_type_key} power cap to {requested_power_cap}W" + except amdsmi_exception.AmdSmiLibraryException as e: + if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: + raise PermissionError('Command requires elevation') from e + error_msg = f"[{e.get_error_info(detailed=False)}] Unable to set {power_type_key} power cap to {requested_power_cap}W" + if logger.is_json_format() or logger.is_csv_format(): + return { + "status": "error", + "sensor": power_type_key, + "requested_power_cap": self.unit_format(logger, requested_power_cap, "W"), + "error": e.get_error_info(detailed=False), + "message": error_msg + } + return error_msg diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_parser.py b/projects/amdsmi/amdsmi_cli/amdsmi_parser.py index bb61927276..aa8b57c25a 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_parser.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_parser.py @@ -300,15 +300,19 @@ class AMDSMIParser(argparse.ArgumentParser): class AMDSMIPowerCapArgs(argparse.Action): def __call__(self, parser: AMDSMIParser, namespace: argparse.Namespace, values: list, option_string: Optional[str] = None) -> None: - if len(values) != 2: + if len(values) == 1: + # Only wattage provided - set all available power cap types + power_cap_value = values[0] + power_cap_type = None # None means all available sensors + elif len(values) == 2: + # Both power type and wattage provided + power_cap_value = values[0] + power_cap_type = values[1] + if power_cap_type not in ['ppt0', 'ppt1']: + raise amdsmi_cli_exceptions.AmdSmiInvalidParameterException(sys.argv[1], power_cap_type, output_format) + else: raise amdsmi_cli_exceptions.AmdSmiInvalidParameterException(sys.argv[1], values, output_format) - power_cap_type = values[0] - power_cap_value = values[1] - - if power_cap_type not in ['ppt0', 'ppt1']: - raise amdsmi_cli_exceptions.AmdSmiInvalidParameterException(sys.argv[1], power_cap_type, output_format) - if not power_cap_value.isdigit(): raise amdsmi_cli_exceptions.AmdSmiInvalidParameterValueException(sys.argv[1], power_cap_value, output_format) @@ -1320,7 +1324,7 @@ class AMDSMIParser(argparse.ArgumentParser): ptl_format_help_choices_str = ", ".join(self.helpers.get_ptl_values()[0][0:-1]) set_ptl_format_help = f"Set the PTL format on a GPU processor. For example, --ptl-format I8,F32\n\tSet to one of the following PTL formats: {ptl_format_help_choices_str}" ppt0_power_cap_min, ppt0_power_cap_max, ppt1_power_cap_min, ppt1_power_cap_max = self.helpers.get_power_caps() - set_power_cap_help = f"Set either PPT0 or PPT1 power capacity limit:\n\tEx: `amd-smi set -o ppt0 1300`\n\tPPT0 min cap: {ppt0_power_cap_min}, PPT0 max cap: {ppt0_power_cap_max}\n\tPPT1 min cap: {ppt1_power_cap_min}, PPT1 max cap: {ppt1_power_cap_max}" + set_power_cap_help = f"Set either PPT0 or PPT1 power capacity limit:\n\tEx: `amd-smi set -o 1300 ppt0`\n\tPPT0 min cap: {ppt0_power_cap_min}, PPT0 max cap: {ppt0_power_cap_max}\n\tPPT1 min cap: {ppt1_power_cap_min}, PPT1 max cap: {ppt1_power_cap_max}" set_clk_limit_help = "Sets the sclk (aka gfxclk) or mclk minimum and maximum frequencies. \n\tex: amd-smi set -L (sclk | mclk) (min | max) value" set_process_isolation_help = "Enable or disable the GPU process isolation on a per partition basis:\n 0 for disable and 1 for enable.\n" @@ -1359,7 +1363,7 @@ class AMDSMIParser(argparse.ArgumentParser): required=False, help=set_compute_partition_help, metavar=('TYPE/INDEX')) set_value_exclusive_group.add_argument('-M', '--memory-partition', action='store', choices=self.helpers.get_memory_partition_types(), type=str.upper, required=False, help=set_memory_partition_help, metavar='PARTITION') # Power cap is enabled on guest, maintain order - set_value_exclusive_group.add_argument('-o', '--power-cap', action=self._power_cap_options(), nargs=2, required=False, help=set_power_cap_help, metavar=('PWR_TYPE', 'WATTS')) + set_value_exclusive_group.add_argument('-o', '--power-cap', action=self._power_cap_options(), nargs='+', required=False, help=set_power_cap_help, metavar=('WATTS', '[PWR_TYPE]')) if self.helpers.is_baremetal(): set_value_exclusive_group.add_argument('-p', '--soc-pstate', action='store', required=False, type=lambda value: self._not_negative_int(value, '--soc-pstate'), help=set_soc_pstate_help, metavar='POLICY_ID') set_value_exclusive_group.add_argument('-x', '--xgmi-plpd', action='store', required=False, type=lambda value: self._not_negative_int(value, '--xgmi-plpd'), help=set_xgmi_plpd_help, metavar='POLICY_ID') diff --git a/projects/amdsmi/py-interface/amdsmi_interface.py b/projects/amdsmi/py-interface/amdsmi_interface.py index 656c6ef0be..fbe7bb65ae 100644 --- a/projects/amdsmi/py-interface/amdsmi_interface.py +++ b/projects/amdsmi/py-interface/amdsmi_interface.py @@ -2191,7 +2191,7 @@ def amdsmi_get_supported_power_cap( def amdsmi_get_power_cap_info( processor_handle: processor_handle_t, - sensor_ind: int = 0 + sensor_ind: int = AmdSmiPowerCapType.PPT0 ) -> Dict[str, Any]: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException(