[SWDEV-540665] Add power_cap set to Linux Guest (#626)

Signed-off-by: Maisam Arif <Maisam.Arif@amd.com>
Change-Id: I3c8d707681c141390b40521231e0d638c81cdeaf

[ROCm/amdsmi commit: 2d5accd000]
Этот коммит содержится в:
Arif, Maisam
2025-08-18 14:59:14 -05:00
коммит произвёл GitHub
родитель 7ab967ec69
Коммит 4e568b2eea
3 изменённых файла: 58 добавлений и 55 удалений
+48 -50
Просмотреть файл
@@ -4463,8 +4463,9 @@ class AMDSMICommands():
command = " ".join(sys.argv[1:])
raise AmdSmiRequiredCommandException(command, self.logger.format)
else:
if not any([args.process_isolation is not None,
args.clk_limit is not None]):
if not any([args.power_cap is not None,
args.clk_limit is not None,
args.process_isolation is not None]):
command = " ".join(sys.argv[1:])
raise AmdSmiRequiredCommandException(command, self.logger.format)
@@ -4590,7 +4591,6 @@ class AMDSMICommands():
self.logger.print_output()
self.logger.clear_multiple_devices_output()
return
if args.memory_partition:
####################################################################
# Get current and available memory partition modes #
@@ -4634,52 +4634,6 @@ class AMDSMICommands():
self.logger.print_output()
self.logger.clear_multiple_devices_output()
return
if isinstance(args.power_cap, int):
try:
power_cap_info = amdsmi_interface.amdsmi_get_power_cap_info(args.gpu)
logging.debug(f"Power cap info for gpu {gpu_id} | {power_cap_info}")
min_power_cap = power_cap_info["min_power_cap"]
min_power_cap = self.helpers.convert_SI_unit(min_power_cap, AMDSMIHelpers.SI_Unit.MICRO)
max_power_cap = power_cap_info["max_power_cap"]
max_power_cap = self.helpers.convert_SI_unit(max_power_cap, AMDSMIHelpers.SI_Unit.MICRO)
current_power_cap = power_cap_info["power_cap"]
current_power_cap = self.helpers.convert_SI_unit(current_power_cap, AMDSMIHelpers.SI_Unit.MICRO)
except amdsmi_exception.AmdSmiLibraryException as e:
min_power_cap = "N/A"
max_power_cap = "N/A"
current_power_cap = "N/A"
self.logger.store_output(args.gpu, 'powercap', f"[{e.get_error_info(detailed=False)}] Unable to set power cap to {args.power_cap}W")
self.logger.print_output()
self.logger.clear_multiple_devices_output()
return
if args.power_cap == current_power_cap:
self.logger.store_output(args.gpu, 'powercap', f"Power cap is already set to {args.power_cap}W")
elif current_power_cap == 0:
self.logger.store_output(args.gpu, 'powercap', f"Unable to set power cap to {args.power_cap}W, current value is {current_power_cap}W")
elif args.power_cap >= min_power_cap and args.power_cap <= max_power_cap:
try:
new_power_cap = self.helpers.convert_SI_unit(args.power_cap, AMDSMIHelpers.SI_Unit.BASE,
AMDSMIHelpers.SI_Unit.MICRO)
amdsmi_interface.amdsmi_set_power_cap(args.gpu, 0, new_power_cap)
except amdsmi_exception.AmdSmiLibraryException as e:
if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM:
raise PermissionError('Command requires elevation') from e
self.logger.store_output(args.gpu, 'powercap', f"[{e.get_error_info(detailed=False)}] Unable to set power cap to {args.power_cap}W")
self.logger.print_output()
self.logger.clear_multiple_devices_output()
return
self.logger.store_output(args.gpu, 'powercap', f"Successfully set power cap to {args.power_cap}W")
else:
# setting power cap to 0 will return the current power cap so the technical minimum value is 1
if min_power_cap == 0:
min_power_cap = 1
self.logger.store_output(args.gpu, 'powercap', f"Power cap must be between {min_power_cap}W and {max_power_cap}W")
self.logger.print_output()
self.logger.clear_multiple_devices_output()
return
if isinstance(args.soc_pstate, int):
try:
amdsmi_interface.amdsmi_set_soc_pstate(args.gpu, args.soc_pstate)
@@ -4819,7 +4773,52 @@ class AMDSMICommands():
self.logger.print_output()
self.logger.clear_multiple_devices_output()
return
# Universal args
if isinstance(args.power_cap, int):
try:
power_cap_info = amdsmi_interface.amdsmi_get_power_cap_info(args.gpu)
logging.debug(f"Power cap info for gpu {gpu_id} | {power_cap_info}")
min_power_cap = power_cap_info["min_power_cap"]
min_power_cap = self.helpers.convert_SI_unit(min_power_cap, AMDSMIHelpers.SI_Unit.MICRO)
max_power_cap = power_cap_info["max_power_cap"]
max_power_cap = self.helpers.convert_SI_unit(max_power_cap, AMDSMIHelpers.SI_Unit.MICRO)
current_power_cap = power_cap_info["power_cap"]
current_power_cap = self.helpers.convert_SI_unit(current_power_cap, AMDSMIHelpers.SI_Unit.MICRO)
except amdsmi_exception.AmdSmiLibraryException as e:
min_power_cap = "N/A"
max_power_cap = "N/A"
current_power_cap = "N/A"
self.logger.store_output(args.gpu, 'powercap', f"[{e.get_error_info(detailed=False)}] Unable to set power cap to {args.power_cap}W")
self.logger.print_output()
self.logger.clear_multiple_devices_output()
return
if args.power_cap == current_power_cap:
self.logger.store_output(args.gpu, 'powercap', f"Power cap is already set to {args.power_cap}W")
elif current_power_cap == 0:
self.logger.store_output(args.gpu, 'powercap', f"Unable to set power cap to {args.power_cap}W, current value is {current_power_cap}W")
elif args.power_cap >= min_power_cap and args.power_cap <= max_power_cap:
try:
new_power_cap = self.helpers.convert_SI_unit(args.power_cap, AMDSMIHelpers.SI_Unit.BASE,
AMDSMIHelpers.SI_Unit.MICRO)
amdsmi_interface.amdsmi_set_power_cap(args.gpu, 0, new_power_cap)
except amdsmi_exception.AmdSmiLibraryException as e:
if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM:
raise PermissionError('Command requires elevation') from e
self.logger.store_output(args.gpu, 'powercap', f"[{e.get_error_info(detailed=False)}] Unable to set power cap to {args.power_cap}W")
self.logger.print_output()
self.logger.clear_multiple_devices_output()
return
self.logger.store_output(args.gpu, 'powercap', f"Successfully set power cap to {args.power_cap}W")
else:
# setting power cap to 0 will return the current power cap so the technical minimum value is 1
if min_power_cap == 0:
min_power_cap = 1
self.logger.store_output(args.gpu, 'powercap', f"Power cap must be between {min_power_cap}W and {max_power_cap}W")
self.logger.print_output()
self.logger.clear_multiple_devices_output()
return
if isinstance(args.clk_limit, tuple):
clk_type = args.clk_limit.clk_type
lim_type = args.clk_limit.lim_type
@@ -4886,7 +4885,6 @@ class AMDSMICommands():
self.logger.print_output()
self.logger.clear_multiple_devices_output()
return
if isinstance(args.process_isolation, int):
status_string = "Enabled" if args.process_isolation else "Disabled"
result = f"Requested process isolation to {status_string}" # This should not print out
+1
Просмотреть файл
@@ -821,6 +821,7 @@ class AMDSMIHelpers():
power_cap_min = power_cap_info['min_power_cap']
except amdsmi_interface.AmdSmiLibraryException as e:
logging.debug(f"AMDSMIHelpers.get_power_caps - Unable to get power cap info for device {dev}: {str(e)}")
power_cap_min, power_cap_max = "N/A", "N/A"
continue
return (power_cap_min, power_cap_max)
+9 -5
Просмотреть файл
@@ -1220,15 +1220,17 @@ class AMDSMIParser(argparse.ArgumentParser):
memory_partition_choices_str = ", ".join(self.helpers.get_memory_partition_types())
set_compute_partition_help = f"Set one of the following the accelerator TYPE or profile INDEX:\n\t{accelerator_set_choices}.\n\tUse `sudo amd-smi partition --accelerator` to find acceptable values."
set_memory_partition_help = f"Set one of the following the memory partition modes:\n\t{memory_partition_choices_str}"
power_cap_min, power_cap_max = self.helpers.get_power_caps()
power_cap_max = self.helpers.convert_SI_unit(power_cap_max, AMDSMIHelpers.SI_Unit.MICRO)
power_cap_min = self.helpers.convert_SI_unit(power_cap_min, AMDSMIHelpers.SI_Unit.MICRO)
set_power_cap_help = f"Set power capacity limit:\n\tmin cap: {power_cap_min} W, max cap: {power_cap_max} W"
soc_pstate_help_info = ", ".join(self.helpers.get_soc_pstates())
set_soc_pstate_help = f"Set the GPU soc pstate policy using policy id, an integer. Valid id's include:\n\t{soc_pstate_help_info}"
xgmi_plpd_help_info = ", ".join(self.helpers.get_xgmi_plpd_policies())
set_xgmi_plpd_help = f"Set the GPU XGMI per-link power down policy using policy id, an integer. Valid id's include:\n\t{xgmi_plpd_help_info}"
set_clock_freq_help = "Set one or more sclk (aka gfxclk), mclk, fclk, pcie, or socclk frequency levels.\n\tUse `amd-smi static --clock` to find acceptable levels."
power_cap_min, power_cap_max = self.helpers.get_power_caps()
if power_cap_max != "N/A":
power_cap_max = self.helpers.convert_SI_unit(power_cap_max, AMDSMIHelpers.SI_Unit.MICRO)
if power_cap_min != "N/A":
power_cap_min = self.helpers.convert_SI_unit(power_cap_min, AMDSMIHelpers.SI_Unit.MICRO)
set_power_cap_help = f"Set power capacity limit:\n\tmin cap: {power_cap_min} W, max cap: {power_cap_max} W"
set_clk_limit_help = "Sets the sclk (aka gfxclk) or mclk minimum and maximum frequencies. \n\tex: amd-smi set -L (sclk | mclk) (min | max) value"
set_process_isolation_help = "Enable or disable the GPU process isolation on a per partition basis: 0 for disable and 1 for enable.\n"
@@ -1266,7 +1268,9 @@ class AMDSMIParser(argparse.ArgumentParser):
set_value_exclusive_group.add_argument('-C', '--compute-partition', action='store', choices=accelerator_set_choices, type=lambda value: self._is_command_supported(value, accelerator_set_choices, '--compute-partition'),
required=False, help=set_compute_partition_help, metavar=('TYPE/INDEX'))
set_value_exclusive_group.add_argument('-M', '--memory-partition', action='store', choices=self.helpers.get_memory_partition_types(), type=str.upper, required=False, help=set_memory_partition_help, metavar='PARTITION')
set_value_exclusive_group.add_argument('-o', '--power-cap', action='store', type=lambda value: self._positive_int(value, '--power-cap'), required=False, help=set_power_cap_help, metavar='WATTS')
# Power cap is enabled on guest, maintain order
set_value_exclusive_group.add_argument('-o', '--power-cap', action='store', type=lambda value: self._positive_int(value, '--power-cap'), required=False, help=set_power_cap_help, metavar='WATTS')
if self.helpers.is_baremetal():
set_value_exclusive_group.add_argument('-p', '--soc-pstate', action='store', required=False, type=lambda value: self._not_negative_int(value, '--soc-pstate'), help=set_soc_pstate_help, metavar='POLICY_ID')
set_value_exclusive_group.add_argument('-x', '--xgmi-plpd', action='store', required=False, type=lambda value: self._not_negative_int(value, '--xgmi-plpd'), help=set_xgmi_plpd_help, metavar='POLICY_ID')
set_value_exclusive_group.add_argument('-c', '--clk-level', action=self._level_select(), nargs='+', required=False, help=set_clock_freq_help, metavar=('CLK_TYPE', 'FREQ_LEVELS'))