Fix powercap default to enum for sensor_ind (#2004)

* Fix powercap default to enum for sensor_ind

Signed-off-by: Maisam Arif <Maisam.Arif@amd.com>

* [SWDEV-559965] Refactor amdsmi set power cap

Modified power cap set to accept args with
optional power_cap type. Added power_cap helper
validate_and_set_power_cap(). Fixed JSON output
format.

Signed-off-by: Bindhiya Kanangot Balakrishnan <Bindhiya.KanangotBalakrishnan@amd.com>

---------

Signed-off-by: Maisam Arif <Maisam.Arif@amd.com>
Signed-off-by: Bindhiya Kanangot Balakrishnan <Bindhiya.KanangotBalakrishnan@amd.com>
Co-authored-by: Bindhiya Kanangot Balakrishnan <Bindhiya.KanangotBalakrishnan@amd.com>
このコミットが含まれているのは:
Maisam Arif
2025-12-04 09:52:59 -06:00
committed by GitHub
コミット 2feb0ae998
4個のファイルの変更109行の追加51行の削除
+14 -41
ファイルの表示
@@ -4983,50 +4983,23 @@ class AMDSMICommands():
# Universal args
if isinstance(args.power_cap, tuple):
pwr_type = args.power_cap.pwr_type
pwr_type_as_int = (0 if pwr_type == "ppt0" else 1 if pwr_type == "ppt1" else None)
pwr_type = pwr_type.upper()
requested_power_cap = args.power_cap.watts
try:
power_cap_info = amdsmi_interface.amdsmi_get_power_cap_info(args.gpu, pwr_type_as_int)
logging.debug(f"Power cap info for gpu {gpu_id} | {power_cap_info}")
min_power_cap = power_cap_info["min_power_cap"]
min_power_cap = self.helpers.convert_SI_unit(min_power_cap, AMDSMIHelpers.SI_Unit.MICRO)
max_power_cap = power_cap_info["max_power_cap"]
max_power_cap = self.helpers.convert_SI_unit(max_power_cap, AMDSMIHelpers.SI_Unit.MICRO)
current_power_cap = power_cap_info["power_cap"]
current_power_cap = self.helpers.convert_SI_unit(current_power_cap, AMDSMIHelpers.SI_Unit.MICRO)
except amdsmi_exception.AmdSmiLibraryException as e:
min_power_cap = "N/A"
max_power_cap = "N/A"
current_power_cap = "N/A"
self.logger.store_output(args.gpu, 'powercap', f"[{e.get_error_info(detailed=False)}] Unable to set {pwr_type} power cap to {requested_power_cap}W")
self.logger.print_output()
self.logger.clear_multiple_devices_output()
return
if requested_power_cap == current_power_cap:
self.logger.store_output(args.gpu, 'powercap', f"{pwr_type} power cap is already set to {requested_power_cap}W")
elif current_power_cap == 0:
self.logger.store_output(args.gpu, 'powercap', f"Unable to set {pwr_type} power cap to {requested_power_cap}W, current value is {current_power_cap}W")
elif requested_power_cap >= min_power_cap and requested_power_cap <= max_power_cap and requested_power_cap > 0:
try:
new_power_cap = self.helpers.convert_SI_unit(requested_power_cap, AMDSMIHelpers.SI_Unit.BASE,
AMDSMIHelpers.SI_Unit.MICRO)
amdsmi_interface.amdsmi_set_power_cap(args.gpu, pwr_type_as_int, new_power_cap)
except amdsmi_exception.AmdSmiLibraryException as e:
if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM:
raise PermissionError('Command requires elevation') from e
self.logger.store_output(args.gpu, 'powercap', f"[{e.get_error_info(detailed=False)}] Unable to set {pwr_type} power cap to {requested_power_cap}W")
self.logger.print_output()
self.logger.clear_multiple_devices_output()
return
self.logger.store_output(args.gpu, 'powercap', f"Successfully set {pwr_type} power cap to {requested_power_cap}W")
# If pwr_type is None, default to ppt0 (legacy behavior)
if pwr_type is None:
pwr_type = "ppt0"
pwr_type_as_int = 0
else:
# setting power cap to 0 will return the current power cap so the technical minimum value is 1
if min_power_cap == 0:
min_power_cap = 1
self.logger.store_output(args.gpu, 'powercap', f"Power cap must be between {min_power_cap}W and {max_power_cap}W")
pwr_type_as_int = 0 if pwr_type == "ppt0" else 1
# Set the power cap for the specified sensor
pwr_type_upper = pwr_type.upper()
result = self.helpers.validate_and_set_power_cap(
args.gpu, pwr_type_as_int, pwr_type_upper, requested_power_cap, self.logger)
self.logger.store_output(args.gpu, 'powercap', result)
if multiple_devices:
self.logger.store_multiple_device_output()
return # Skip printing when there are multiple devices
self.logger.print_output()
self.logger.clear_multiple_devices_output()
return
+81
ファイルの表示
@@ -2042,3 +2042,84 @@ class AMDSMIHelpers():
type_name, gpu_id, e.get_error_info())
return base_board_temp_dict
def validate_and_set_power_cap(self, device_handle, power_type, power_type_key, requested_power_cap, logger):
"""Validate and set power cap for a specific sensor.
Args:
device_handle: GPU device handle
power_type: Sensor ID (0 for ppt0, 1 for ppt1)
power_type_key: Display name for the sensor (e.g., "PPT0")
requested_power_cap: Requested power cap value in watts
logger: AMDSMILogger instance for format-aware output
Returns:
dict or str: Structured data for JSON/CSV or formatted string for human-readable output
"""
try:
power_cap_info = amdsmi_interface.amdsmi_get_power_cap_info(device_handle, power_type)
gpu_id = self.get_gpu_id_from_device_handle(device_handle)
logging.debug(f"Power cap info for gpu {gpu_id} {power_type_key} | {power_cap_info}")
min_power_cap = self.convert_SI_unit(power_cap_info["min_power_cap"], AMDSMIHelpers.SI_Unit.MICRO)
max_power_cap = self.convert_SI_unit(power_cap_info["max_power_cap"], AMDSMIHelpers.SI_Unit.MICRO)
current_power_cap = self.convert_SI_unit(power_cap_info["power_cap"], AMDSMIHelpers.SI_Unit.MICRO)
# Return structured data for JSON/CSV or formatted string for human-readable
if requested_power_cap == current_power_cap:
if logger.is_json_format() or logger.is_csv_format():
return {
"status": "already_set",
"sensor": power_type_key,
"requested_power_cap": self.unit_format(logger, requested_power_cap, "W"),
"current_power_cap": self.unit_format(logger, current_power_cap, "W"),
"message": f"{power_type_key} power cap is already set to {requested_power_cap}W"
}
return f"{power_type_key} power cap is already set to {requested_power_cap}W"
elif current_power_cap == 0:
if logger.is_json_format() or logger.is_csv_format():
return {
"status": "error",
"sensor": power_type_key,
"requested_power_cap": self.unit_format(logger, requested_power_cap, "W"),
"current_power_cap": self.unit_format(logger, current_power_cap, "W"),
"message": f"Unable to set {power_type_key} power cap to {requested_power_cap}W, current value is {current_power_cap}W"
}
return f"Unable to set {power_type_key} power cap to {requested_power_cap}W, current value is {current_power_cap}W"
elif not (min_power_cap < requested_power_cap <= max_power_cap and requested_power_cap > 0):
# setting power cap to 0 will return the current power cap so the technical minimum value is 1
min_cap_display = 1 if min_power_cap == 0 else min_power_cap
if logger.is_json_format() or logger.is_csv_format():
return {
"status": "error",
"sensor": power_type_key,
"requested_power_cap": self.unit_format(logger, requested_power_cap, "W"),
"min_power_cap": self.unit_format(logger, min_cap_display, "W"),
"max_power_cap": self.unit_format(logger, max_power_cap, "W"),
"message": f"Power cap must be between {min_cap_display}W and {max_power_cap}W"
}
return f"Power cap must be between {min_cap_display}W and {max_power_cap}W"
# Set the power cap
new_power_cap = self.convert_SI_unit(requested_power_cap, AMDSMIHelpers.SI_Unit.BASE, AMDSMIHelpers.SI_Unit.MICRO)
amdsmi_interface.amdsmi_set_power_cap(device_handle, power_type, new_power_cap)
if logger.is_json_format() or logger.is_csv_format():
return {
"status": "success",
"sensor": power_type_key,
"power_cap": self.unit_format(logger, requested_power_cap, "W"),
"message": f"Successfully set {power_type_key} power cap to {requested_power_cap}W"
}
return f"Successfully set {power_type_key} power cap to {requested_power_cap}W"
except amdsmi_exception.AmdSmiLibraryException as e:
if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM:
raise PermissionError('Command requires elevation') from e
error_msg = f"[{e.get_error_info(detailed=False)}] Unable to set {power_type_key} power cap to {requested_power_cap}W"
if logger.is_json_format() or logger.is_csv_format():
return {
"status": "error",
"sensor": power_type_key,
"requested_power_cap": self.unit_format(logger, requested_power_cap, "W"),
"error": e.get_error_info(detailed=False),
"message": error_msg
}
return error_msg
+13 -9
ファイルの表示
@@ -300,15 +300,19 @@ class AMDSMIParser(argparse.ArgumentParser):
class AMDSMIPowerCapArgs(argparse.Action):
def __call__(self, parser: AMDSMIParser, namespace: argparse.Namespace,
values: list, option_string: Optional[str] = None) -> None:
if len(values) != 2:
if len(values) == 1:
# Only wattage provided - set all available power cap types
power_cap_value = values[0]
power_cap_type = None # None means all available sensors
elif len(values) == 2:
# Both power type and wattage provided
power_cap_value = values[0]
power_cap_type = values[1]
if power_cap_type not in ['ppt0', 'ppt1']:
raise amdsmi_cli_exceptions.AmdSmiInvalidParameterException(sys.argv[1], power_cap_type, output_format)
else:
raise amdsmi_cli_exceptions.AmdSmiInvalidParameterException(sys.argv[1], values, output_format)
power_cap_type = values[0]
power_cap_value = values[1]
if power_cap_type not in ['ppt0', 'ppt1']:
raise amdsmi_cli_exceptions.AmdSmiInvalidParameterException(sys.argv[1], power_cap_type, output_format)
if not power_cap_value.isdigit():
raise amdsmi_cli_exceptions.AmdSmiInvalidParameterValueException(sys.argv[1], power_cap_value, output_format)
@@ -1320,7 +1324,7 @@ class AMDSMIParser(argparse.ArgumentParser):
ptl_format_help_choices_str = ", ".join(self.helpers.get_ptl_values()[0][0:-1])
set_ptl_format_help = f"Set the PTL format on a GPU processor. For example, --ptl-format I8,F32\n\tSet to one of the following PTL formats: {ptl_format_help_choices_str}"
ppt0_power_cap_min, ppt0_power_cap_max, ppt1_power_cap_min, ppt1_power_cap_max = self.helpers.get_power_caps()
set_power_cap_help = f"Set either PPT0 or PPT1 power capacity limit:\n\tEx: `amd-smi set -o ppt0 1300`\n\tPPT0 min cap: {ppt0_power_cap_min}, PPT0 max cap: {ppt0_power_cap_max}\n\tPPT1 min cap: {ppt1_power_cap_min}, PPT1 max cap: {ppt1_power_cap_max}"
set_power_cap_help = f"Set either PPT0 or PPT1 power capacity limit:\n\tEx: `amd-smi set -o 1300 ppt0`\n\tPPT0 min cap: {ppt0_power_cap_min}, PPT0 max cap: {ppt0_power_cap_max}\n\tPPT1 min cap: {ppt1_power_cap_min}, PPT1 max cap: {ppt1_power_cap_max}"
set_clk_limit_help = "Sets the sclk (aka gfxclk) or mclk minimum and maximum frequencies. \n\tex: amd-smi set -L (sclk | mclk) (min | max) value"
set_process_isolation_help = "Enable or disable the GPU process isolation on a per partition basis:\n 0 for disable and 1 for enable.\n"
@@ -1359,7 +1363,7 @@ class AMDSMIParser(argparse.ArgumentParser):
required=False, help=set_compute_partition_help, metavar=('TYPE/INDEX'))
set_value_exclusive_group.add_argument('-M', '--memory-partition', action='store', choices=self.helpers.get_memory_partition_types(), type=str.upper, required=False, help=set_memory_partition_help, metavar='PARTITION')
# Power cap is enabled on guest, maintain order
set_value_exclusive_group.add_argument('-o', '--power-cap', action=self._power_cap_options(), nargs=2, required=False, help=set_power_cap_help, metavar=('PWR_TYPE', 'WATTS'))
set_value_exclusive_group.add_argument('-o', '--power-cap', action=self._power_cap_options(), nargs='+', required=False, help=set_power_cap_help, metavar=('WATTS', '[PWR_TYPE]'))
if self.helpers.is_baremetal():
set_value_exclusive_group.add_argument('-p', '--soc-pstate', action='store', required=False, type=lambda value: self._not_negative_int(value, '--soc-pstate'), help=set_soc_pstate_help, metavar='POLICY_ID')
set_value_exclusive_group.add_argument('-x', '--xgmi-plpd', action='store', required=False, type=lambda value: self._not_negative_int(value, '--xgmi-plpd'), help=set_xgmi_plpd_help, metavar='POLICY_ID')
+1 -1
ファイルの表示
@@ -2191,7 +2191,7 @@ def amdsmi_get_supported_power_cap(
def amdsmi_get_power_cap_info(
processor_handle: processor_handle_t,
sensor_ind: int = 0
sensor_ind: int = AmdSmiPowerCapType.PPT0
) -> Dict[str, Any]:
if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle):
raise AmdSmiParameterException(