From 4e568b2eea3517179de1437a8a11fd5a82be5e4f Mon Sep 17 00:00:00 2001 From: "Arif, Maisam" Date: Mon, 18 Aug 2025 14:59:14 -0500 Subject: [PATCH] [SWDEV-540665] Add power_cap set to Linux Guest (#626) Signed-off-by: Maisam Arif Change-Id: I3c8d707681c141390b40521231e0d638c81cdeaf [ROCm/amdsmi commit: 2d5accd000400fcc7e8cca8409960ac35b0c3f86] --- projects/amdsmi/amdsmi_cli/amdsmi_commands.py | 98 +++++++++---------- projects/amdsmi/amdsmi_cli/amdsmi_helpers.py | 1 + projects/amdsmi/amdsmi_cli/amdsmi_parser.py | 14 ++- 3 files changed, 58 insertions(+), 55 deletions(-) diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py index e739613077..9e3dea0a5a 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py @@ -4463,8 +4463,9 @@ class AMDSMICommands(): command = " ".join(sys.argv[1:]) raise AmdSmiRequiredCommandException(command, self.logger.format) else: - if not any([args.process_isolation is not None, - args.clk_limit is not None]): + if not any([args.power_cap is not None, + args.clk_limit is not None, + args.process_isolation is not None]): command = " ".join(sys.argv[1:]) raise AmdSmiRequiredCommandException(command, self.logger.format) @@ -4590,7 +4591,6 @@ class AMDSMICommands(): self.logger.print_output() self.logger.clear_multiple_devices_output() return - if args.memory_partition: #################################################################### # Get current and available memory partition modes # @@ -4634,52 +4634,6 @@ class AMDSMICommands(): self.logger.print_output() self.logger.clear_multiple_devices_output() return - - if isinstance(args.power_cap, int): - try: - power_cap_info = amdsmi_interface.amdsmi_get_power_cap_info(args.gpu) - logging.debug(f"Power cap info for gpu {gpu_id} | {power_cap_info}") - min_power_cap = power_cap_info["min_power_cap"] - min_power_cap = self.helpers.convert_SI_unit(min_power_cap, AMDSMIHelpers.SI_Unit.MICRO) - max_power_cap = power_cap_info["max_power_cap"] - max_power_cap = self.helpers.convert_SI_unit(max_power_cap, AMDSMIHelpers.SI_Unit.MICRO) - current_power_cap = power_cap_info["power_cap"] - current_power_cap = self.helpers.convert_SI_unit(current_power_cap, AMDSMIHelpers.SI_Unit.MICRO) - except amdsmi_exception.AmdSmiLibraryException as e: - min_power_cap = "N/A" - max_power_cap = "N/A" - current_power_cap = "N/A" - self.logger.store_output(args.gpu, 'powercap', f"[{e.get_error_info(detailed=False)}] Unable to set power cap to {args.power_cap}W") - self.logger.print_output() - self.logger.clear_multiple_devices_output() - return - - if args.power_cap == current_power_cap: - self.logger.store_output(args.gpu, 'powercap', f"Power cap is already set to {args.power_cap}W") - elif current_power_cap == 0: - self.logger.store_output(args.gpu, 'powercap', f"Unable to set power cap to {args.power_cap}W, current value is {current_power_cap}W") - elif args.power_cap >= min_power_cap and args.power_cap <= max_power_cap: - try: - new_power_cap = self.helpers.convert_SI_unit(args.power_cap, AMDSMIHelpers.SI_Unit.BASE, - AMDSMIHelpers.SI_Unit.MICRO) - amdsmi_interface.amdsmi_set_power_cap(args.gpu, 0, new_power_cap) - except amdsmi_exception.AmdSmiLibraryException as e: - if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: - raise PermissionError('Command requires elevation') from e - self.logger.store_output(args.gpu, 'powercap', f"[{e.get_error_info(detailed=False)}] Unable to set power cap to {args.power_cap}W") - self.logger.print_output() - self.logger.clear_multiple_devices_output() - return - - self.logger.store_output(args.gpu, 'powercap', f"Successfully set power cap to {args.power_cap}W") - else: - # setting power cap to 0 will return the current power cap so the technical minimum value is 1 - if min_power_cap == 0: - min_power_cap = 1 - self.logger.store_output(args.gpu, 'powercap', f"Power cap must be between {min_power_cap}W and {max_power_cap}W") - self.logger.print_output() - self.logger.clear_multiple_devices_output() - return if isinstance(args.soc_pstate, int): try: amdsmi_interface.amdsmi_set_soc_pstate(args.gpu, args.soc_pstate) @@ -4819,7 +4773,52 @@ class AMDSMICommands(): self.logger.print_output() self.logger.clear_multiple_devices_output() return + # Universal args + if isinstance(args.power_cap, int): + try: + power_cap_info = amdsmi_interface.amdsmi_get_power_cap_info(args.gpu) + logging.debug(f"Power cap info for gpu {gpu_id} | {power_cap_info}") + min_power_cap = power_cap_info["min_power_cap"] + min_power_cap = self.helpers.convert_SI_unit(min_power_cap, AMDSMIHelpers.SI_Unit.MICRO) + max_power_cap = power_cap_info["max_power_cap"] + max_power_cap = self.helpers.convert_SI_unit(max_power_cap, AMDSMIHelpers.SI_Unit.MICRO) + current_power_cap = power_cap_info["power_cap"] + current_power_cap = self.helpers.convert_SI_unit(current_power_cap, AMDSMIHelpers.SI_Unit.MICRO) + except amdsmi_exception.AmdSmiLibraryException as e: + min_power_cap = "N/A" + max_power_cap = "N/A" + current_power_cap = "N/A" + self.logger.store_output(args.gpu, 'powercap', f"[{e.get_error_info(detailed=False)}] Unable to set power cap to {args.power_cap}W") + self.logger.print_output() + self.logger.clear_multiple_devices_output() + return + if args.power_cap == current_power_cap: + self.logger.store_output(args.gpu, 'powercap', f"Power cap is already set to {args.power_cap}W") + elif current_power_cap == 0: + self.logger.store_output(args.gpu, 'powercap', f"Unable to set power cap to {args.power_cap}W, current value is {current_power_cap}W") + elif args.power_cap >= min_power_cap and args.power_cap <= max_power_cap: + try: + new_power_cap = self.helpers.convert_SI_unit(args.power_cap, AMDSMIHelpers.SI_Unit.BASE, + AMDSMIHelpers.SI_Unit.MICRO) + amdsmi_interface.amdsmi_set_power_cap(args.gpu, 0, new_power_cap) + except amdsmi_exception.AmdSmiLibraryException as e: + if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: + raise PermissionError('Command requires elevation') from e + self.logger.store_output(args.gpu, 'powercap', f"[{e.get_error_info(detailed=False)}] Unable to set power cap to {args.power_cap}W") + self.logger.print_output() + self.logger.clear_multiple_devices_output() + return + + self.logger.store_output(args.gpu, 'powercap', f"Successfully set power cap to {args.power_cap}W") + else: + # setting power cap to 0 will return the current power cap so the technical minimum value is 1 + if min_power_cap == 0: + min_power_cap = 1 + self.logger.store_output(args.gpu, 'powercap', f"Power cap must be between {min_power_cap}W and {max_power_cap}W") + self.logger.print_output() + self.logger.clear_multiple_devices_output() + return if isinstance(args.clk_limit, tuple): clk_type = args.clk_limit.clk_type lim_type = args.clk_limit.lim_type @@ -4886,7 +4885,6 @@ class AMDSMICommands(): self.logger.print_output() self.logger.clear_multiple_devices_output() return - if isinstance(args.process_isolation, int): status_string = "Enabled" if args.process_isolation else "Disabled" result = f"Requested process isolation to {status_string}" # This should not print out diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py b/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py index aaa9053886..afb0eb42dc 100755 --- a/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py @@ -821,6 +821,7 @@ class AMDSMIHelpers(): power_cap_min = power_cap_info['min_power_cap'] except amdsmi_interface.AmdSmiLibraryException as e: logging.debug(f"AMDSMIHelpers.get_power_caps - Unable to get power cap info for device {dev}: {str(e)}") + power_cap_min, power_cap_max = "N/A", "N/A" continue return (power_cap_min, power_cap_max) diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_parser.py b/projects/amdsmi/amdsmi_cli/amdsmi_parser.py index 0accb11520..9019fded73 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_parser.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_parser.py @@ -1220,15 +1220,17 @@ class AMDSMIParser(argparse.ArgumentParser): memory_partition_choices_str = ", ".join(self.helpers.get_memory_partition_types()) set_compute_partition_help = f"Set one of the following the accelerator TYPE or profile INDEX:\n\t{accelerator_set_choices}.\n\tUse `sudo amd-smi partition --accelerator` to find acceptable values." set_memory_partition_help = f"Set one of the following the memory partition modes:\n\t{memory_partition_choices_str}" - power_cap_min, power_cap_max = self.helpers.get_power_caps() - power_cap_max = self.helpers.convert_SI_unit(power_cap_max, AMDSMIHelpers.SI_Unit.MICRO) - power_cap_min = self.helpers.convert_SI_unit(power_cap_min, AMDSMIHelpers.SI_Unit.MICRO) - set_power_cap_help = f"Set power capacity limit:\n\tmin cap: {power_cap_min} W, max cap: {power_cap_max} W" soc_pstate_help_info = ", ".join(self.helpers.get_soc_pstates()) set_soc_pstate_help = f"Set the GPU soc pstate policy using policy id, an integer. Valid id's include:\n\t{soc_pstate_help_info}" xgmi_plpd_help_info = ", ".join(self.helpers.get_xgmi_plpd_policies()) set_xgmi_plpd_help = f"Set the GPU XGMI per-link power down policy using policy id, an integer. Valid id's include:\n\t{xgmi_plpd_help_info}" set_clock_freq_help = "Set one or more sclk (aka gfxclk), mclk, fclk, pcie, or socclk frequency levels.\n\tUse `amd-smi static --clock` to find acceptable levels." + power_cap_min, power_cap_max = self.helpers.get_power_caps() + if power_cap_max != "N/A": + power_cap_max = self.helpers.convert_SI_unit(power_cap_max, AMDSMIHelpers.SI_Unit.MICRO) + if power_cap_min != "N/A": + power_cap_min = self.helpers.convert_SI_unit(power_cap_min, AMDSMIHelpers.SI_Unit.MICRO) + set_power_cap_help = f"Set power capacity limit:\n\tmin cap: {power_cap_min} W, max cap: {power_cap_max} W" set_clk_limit_help = "Sets the sclk (aka gfxclk) or mclk minimum and maximum frequencies. \n\tex: amd-smi set -L (sclk | mclk) (min | max) value" set_process_isolation_help = "Enable or disable the GPU process isolation on a per partition basis: 0 for disable and 1 for enable.\n" @@ -1266,7 +1268,9 @@ class AMDSMIParser(argparse.ArgumentParser): set_value_exclusive_group.add_argument('-C', '--compute-partition', action='store', choices=accelerator_set_choices, type=lambda value: self._is_command_supported(value, accelerator_set_choices, '--compute-partition'), required=False, help=set_compute_partition_help, metavar=('TYPE/INDEX')) set_value_exclusive_group.add_argument('-M', '--memory-partition', action='store', choices=self.helpers.get_memory_partition_types(), type=str.upper, required=False, help=set_memory_partition_help, metavar='PARTITION') - set_value_exclusive_group.add_argument('-o', '--power-cap', action='store', type=lambda value: self._positive_int(value, '--power-cap'), required=False, help=set_power_cap_help, metavar='WATTS') + # Power cap is enabled on guest, maintain order + set_value_exclusive_group.add_argument('-o', '--power-cap', action='store', type=lambda value: self._positive_int(value, '--power-cap'), required=False, help=set_power_cap_help, metavar='WATTS') + if self.helpers.is_baremetal(): set_value_exclusive_group.add_argument('-p', '--soc-pstate', action='store', required=False, type=lambda value: self._not_negative_int(value, '--soc-pstate'), help=set_soc_pstate_help, metavar='POLICY_ID') set_value_exclusive_group.add_argument('-x', '--xgmi-plpd', action='store', required=False, type=lambda value: self._not_negative_int(value, '--xgmi-plpd'), help=set_xgmi_plpd_help, metavar='POLICY_ID') set_value_exclusive_group.add_argument('-c', '--clk-level', action=self._level_select(), nargs='+', required=False, help=set_clock_freq_help, metavar=('CLK_TYPE', 'FREQ_LEVELS'))