Added set & reset --power-cap
Signed-off-by: Maisam Arif <maisarif@amd.com>
Change-Id: I9fa6378cfcdb2ad9f8406c51d885209605330627
[ROCm/amdsmi commit: a1b2fb5e0e]
此提交包含在:
@@ -123,10 +123,10 @@ For convenience, here is the help output for each command
|
||||
usage: amd-smi list [-h] [--json | --csv] [--file FILE] [--loglevel LEVEL]
|
||||
[-g GPU [GPU ...]]
|
||||
|
||||
Lists all the devices on the system and the links between devices.
|
||||
Lists all the sockets and for each socket, GPUs and/or CPUs associated to
|
||||
that socket alongside some basic information for each device.
|
||||
In virtualization environments, it can also list VFs associated to each
|
||||
Lists all the devices on the system and the links between devices.
|
||||
Lists all the sockets and for each socket, GPUs and/or CPUs associated to
|
||||
that socket alongside some basic information for each device.
|
||||
In virtualization environments, it can also list VFs associated to each
|
||||
GPU with some basic information for each VF.
|
||||
|
||||
optional arguments:
|
||||
@@ -149,7 +149,7 @@ usage: amd-smi static [-h] [--json | --csv] [--file FILE] [--loglevel LEVEL]
|
||||
[-g GPU [GPU ...]] [-a] [-b] [-V] [-d] [-v] [-c] [-B] [-r] [-p] [-l]
|
||||
[-u]
|
||||
|
||||
If no GPU is specified, returns static information for all GPUs on the system.
|
||||
If no GPU is specified, returns static information for all GPUs on the system.
|
||||
If no static argument is provided, all static information will be displayed.
|
||||
|
||||
Static Arguments:
|
||||
@@ -229,7 +229,7 @@ usage: amd-smi metric [-h] [--json | --csv] [--file FILE] [--loglevel LEVEL]
|
||||
[-g GPU [GPU ...]] [-w INTERVAL] [-W TIME] [-i ITERATIONS] [-m] [-u]
|
||||
[-p] [-c] [-t] [-e] [-k] [-P] [-f] [-C] [-o] [-l] [-x] [-E]
|
||||
|
||||
If no GPU is specified, returns metric information for all GPUs on the system.
|
||||
If no GPU is specified, returns metric information for all GPUs on the system.
|
||||
If no metric argument is provided all metric information will be displayed.
|
||||
|
||||
Metric arguments:
|
||||
@@ -269,29 +269,29 @@ usage: amd-smi process [-h] [--json | --csv] [--file FILE] [--loglevel LEVEL]
|
||||
[-g GPU [GPU ...]] [-w INTERVAL] [-W TIME] [-i ITERATIONS] [-G]
|
||||
[-e] [-p PID] [-n NAME]
|
||||
|
||||
If no GPU is specified, returns information for all GPUs on the system.
|
||||
If no GPU is specified, returns information for all GPUs on the system.
|
||||
If no process argument is provided all process information will be displayed.
|
||||
|
||||
Process arguments:
|
||||
-h, --help show this help message and exit
|
||||
-g, --gpu GPU [GPU ...] Select a GPU ID, BDF, or UUID from the possible choices:
|
||||
ID:0 | BDF:0000:23:00.0 | UUID:ffff73bf-0000-1000-80ff-ffffffffffff
|
||||
all | Selects all devices
|
||||
ID: 0 | BDF: 0000:23:00.0 | UUID: c4ff73bf-0000-1000-802e-0812b504ed69
|
||||
all | Selects all devices
|
||||
-w, --watch INTERVAL Reprint the command in a loop of INTERVAL seconds
|
||||
-W, --watch_time TIME The total TIME to watch the given command
|
||||
-i, --iterations ITERATIONS Total number of ITERATIONS to loop on the given command
|
||||
-G, --general pid, process name, memory usage
|
||||
-e, --engine All engine usages
|
||||
-p, --pid PID Gets all process information about the specified process based on Process ID
|
||||
-n, --name NAME Gets all process information about the specified process based on Process Name.
|
||||
If multiple processes have the same name information is returned for all of them.
|
||||
-n, --name NAME Gets all process information about the specified process based on Process Name.
|
||||
If multiple processes have the same name information is returned for all of them.
|
||||
|
||||
Command Modifiers:
|
||||
--json Displays output in JSON format (human readable by default).
|
||||
--csv Displays output in CSV format (human readable by default).
|
||||
--file FILE Saves output into a file on the provided path (stdout by default).
|
||||
--loglevel LEVEL Set the logging level from the possible choices:
|
||||
DEBUG, INFO, WARNING, ERROR, CRITICAL
|
||||
DEBUG, INFO, WARNING, ERROR, CRITICAL
|
||||
```
|
||||
|
||||
```bash
|
||||
@@ -320,7 +320,7 @@ Command Modifiers:
|
||||
usage: amd-smi topology [-h] [--json | --csv] [--file FILE] [--loglevel LEVEL]
|
||||
[-g GPU [GPU ...]] [-a] [-w] [-o] [-t] [-b]
|
||||
|
||||
If no GPU is specified, returns information for all GPUs on the system.
|
||||
If no GPU is specified, returns information for all GPUs on the system.
|
||||
If no topology argument is provided all topology information will be displayed.
|
||||
|
||||
Topology arguments:
|
||||
@@ -347,9 +347,9 @@ Command Modifiers:
|
||||
~$ amd-smi set --help
|
||||
usage: amd-smi set [-h] [--json | --csv] [--file FILE] [--loglevel LEVEL] -g GPU [GPU ...]
|
||||
[-f %] [-l LEVEL] [-P SETPROFILE] [-d SCLKMAX] [-C PARTITION]
|
||||
[-M PARTITION]
|
||||
[-M PARTITION] [-o WATTS]
|
||||
|
||||
A GPU must be specified to set a configuration.
|
||||
A GPU must be specified to set a configuration.
|
||||
A set argument must be provided; Multiple set arguments are accepted
|
||||
|
||||
Set Arguments:
|
||||
@@ -365,6 +365,7 @@ Set Arguments:
|
||||
CPX, SPX, DPX, TPX, QPX
|
||||
-M, --memory-partition PARTITION Set one of the following the memory partition modes:
|
||||
NPS1, NPS2, NPS4, NPS8
|
||||
-o, --power-cap WATTS Set power capacity limit
|
||||
|
||||
Command Modifiers:
|
||||
--json Displays output in JSON format (human readable by default).
|
||||
@@ -377,9 +378,9 @@ Command Modifiers:
|
||||
```bash
|
||||
~$ amd-smi reset --help
|
||||
usage: amd-smi reset [-h] [--json | --csv] [--file FILE] [--loglevel LEVEL] -g GPU
|
||||
[GPU ...] [-G] [-c] [-f] [-p] [-x] [-d] [-C] [-M]
|
||||
[GPU ...] [-G] [-c] [-f] [-p] [-x] [-d] [-C] [-M] [-o]
|
||||
|
||||
A GPU must be specified to reset a configuration.
|
||||
A GPU must be specified to reset a configuration.
|
||||
A reset argument must be provided; Multiple reset arguments are accepted
|
||||
|
||||
Reset Arguments:
|
||||
@@ -395,6 +396,7 @@ Reset Arguments:
|
||||
-d, --perf-determinism Disable performance determinism
|
||||
-C, --compute-partition Reset compute partitions on the specified GPU
|
||||
-M, --memory-partition Reset memory partitions on the specified GPU
|
||||
-o, --power-cap Reset power capacity limit to max capable
|
||||
|
||||
Command Modifiers:
|
||||
--json Displays output in JSON format (human readable by default).
|
||||
|
||||
@@ -290,9 +290,9 @@ class AMDSMICommands():
|
||||
# Power limits
|
||||
try:
|
||||
power_limit_error = False
|
||||
power_info = amdsmi_interface.amdsmi_get_power_cap_info(args.gpu)
|
||||
max_power_limit = power_info['max_power_cap']
|
||||
current_power_limit = power_info['power_cap']
|
||||
power_cap_info = amdsmi_interface.amdsmi_get_power_cap_info(args.gpu)
|
||||
max_power_limit = power_cap_info['max_power_cap']
|
||||
current_power_limit = power_cap_info['power_cap']
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
power_limit_error = True
|
||||
max_power_limit = "N/A"
|
||||
@@ -1761,7 +1761,7 @@ class AMDSMICommands():
|
||||
|
||||
def set_value(self, args, multiple_devices=False, gpu=None, fan=None, perf_level=None,
|
||||
profile=None, perf_determinism=None, compute_partition=None,
|
||||
memory_partition=None):
|
||||
memory_partition=None, power_cap=None):
|
||||
"""Issue set commands to target gpu(s)
|
||||
|
||||
Args:
|
||||
@@ -1774,6 +1774,7 @@ class AMDSMICommands():
|
||||
perf_determinism (int, optional): Value override for args.perf_determinism. Defaults to None.
|
||||
compute_partition (amdsmi_interface.AmdSmiComputePartitionType, optional): Value override for args.compute_partition. Defaults to None.
|
||||
memory_partition (amdsmi_interface.AmdSmiMemoryPartitionType, optional): Value override for args.memory_partition. Defaults to None.
|
||||
power_cap (int, optional): Value override for args.power_cap. Defaults to None.
|
||||
|
||||
Raises:
|
||||
ValueError: Value error if no gpu value is provided
|
||||
@@ -1785,18 +1786,20 @@ class AMDSMICommands():
|
||||
# Set args.* to passed in arguments
|
||||
if gpu:
|
||||
args.gpu = gpu
|
||||
if fan:
|
||||
if fan is not None:
|
||||
args.fan = fan
|
||||
if perf_level:
|
||||
args.perf_level = perf_level
|
||||
if profile:
|
||||
args.profile = profile
|
||||
if perf_determinism:
|
||||
if perf_determinism is not None:
|
||||
args.perf_determinism = perf_determinism
|
||||
if compute_partition:
|
||||
args.compute_partition = compute_partition
|
||||
if memory_partition:
|
||||
args.memory_partition = memory_partition
|
||||
if power_cap:
|
||||
args.power_cap = power_cap
|
||||
|
||||
# Handle No GPU passed
|
||||
if args.gpu == None:
|
||||
@@ -1810,7 +1813,11 @@ class AMDSMICommands():
|
||||
args.gpu = device_handle
|
||||
|
||||
# Error if no subcommand args are passed
|
||||
if not any([args.fan, args.perflevel, args.profile, args.perf_determinism]):
|
||||
if not any([args.fan is not None,
|
||||
args.perf_level,
|
||||
args.profile,
|
||||
args.perf_determinism is not None,
|
||||
args.power_cap]):
|
||||
command = " ".join(sys.argv[1:])
|
||||
raise AmdSmiRequiredCommandException(command, self.logger.format)
|
||||
|
||||
@@ -1874,6 +1881,31 @@ class AMDSMICommands():
|
||||
raise PermissionError('Command requires elevation') from e
|
||||
raise ValueError(f"Unable to set memory partition to {args.memory_partition} on {gpu_string}") from e
|
||||
self.logger.store_output(args.gpu, 'memorypartition', f"Successfully set memory partition to {args.memory_partition}")
|
||||
if isinstance(args.power_cap, int):
|
||||
try:
|
||||
power_cap_info = amdsmi_interface.amdsmi_get_power_cap_info(args.gpu)
|
||||
logging.debug(f"Power cap info for gpu {gpu_id} | {power_cap_info}")
|
||||
min_power_cap = power_cap_info["min_power_cap"]
|
||||
max_power_cap = power_cap_info["max_power_cap"]
|
||||
current_power_cap = power_cap_info["power_cap"]
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
raise ValueError(f"Unable to get power cap info from {gpu_string}") from e
|
||||
|
||||
if args.power_cap == current_power_cap:
|
||||
self.logger.store_output(args.gpu, 'powercap', f"Power cap is already set to {args.power_cap}")
|
||||
elif args.power_cap >= min_power_cap and args.power_cap <= max_power_cap:
|
||||
try:
|
||||
amdsmi_interface.amdsmi_set_power_cap(args.gpu, 0, args.power_cap * 1000000)
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM:
|
||||
raise PermissionError('Command requires elevation') from e
|
||||
raise ValueError(f"Unable to set power cap to {args.power_cap} on {gpu_string}") from e
|
||||
self.logger.store_output(args.gpu, 'powercap', f"Successfully set power cap to {args.power_cap}")
|
||||
else:
|
||||
# setting power cap to 0 will return the current power cap so the technical minimum value is 1
|
||||
if min_power_cap == 0:
|
||||
min_power_cap = 1
|
||||
self.logger.store_output(args.gpu, 'powercap', f"Power cap must be between {min_power_cap} and {max_power_cap}")
|
||||
|
||||
if multiple_devices:
|
||||
self.logger.store_multiple_device_output()
|
||||
@@ -1884,7 +1916,7 @@ class AMDSMICommands():
|
||||
|
||||
def reset(self, args, multiple_devices=False, gpu=None, gpureset=None,
|
||||
clocks=None, fans=None, profile=None, xgmierr=None, perf_determinism=None,
|
||||
compute_partition=None, memory_partition=None):
|
||||
compute_partition=None, memory_partition=None, power_cap=None):
|
||||
"""Issue reset commands to target gpu(s)
|
||||
|
||||
Args:
|
||||
@@ -1899,6 +1931,7 @@ class AMDSMICommands():
|
||||
perf_determinism (bool, optional): Value override for args.perf_determinism. Defaults to None.
|
||||
compute_partition (bool, optional): Value override for args.compute_partition. Defaults to None.
|
||||
memory_partition (bool, optional): Value override for args.memory_partition. Defaults to None.
|
||||
power_cap (bool, optional): Value override for args.power_cap. Defaults to None.
|
||||
|
||||
Raises:
|
||||
ValueError: Value error if no gpu value is provided
|
||||
@@ -1926,6 +1959,8 @@ class AMDSMICommands():
|
||||
args.compute_partition = compute_partition
|
||||
if memory_partition:
|
||||
args.memory_partition = memory_partition
|
||||
if power_cap:
|
||||
args.power_cap = power_cap
|
||||
|
||||
# Handle No GPU passed
|
||||
if args.gpu == None:
|
||||
@@ -1942,7 +1977,9 @@ class AMDSMICommands():
|
||||
gpu_id = self.helpers.get_gpu_id_from_device_handle(args.gpu)
|
||||
|
||||
# Error if no subcommand args are passed
|
||||
if not any([args.gpureset, args.clocks, args.fans, args.profile, args.xgmierr, args.perf_determinism]):
|
||||
if not any([args.gpureset, args.clocks, args.fans, args.profile, args.xgmierr, \
|
||||
args.perf_determinism, args.compute_partition, args.memory_partition, \
|
||||
args.power_cap]):
|
||||
command = " ".join(sys.argv[1:])
|
||||
raise AmdSmiRequiredCommandException(command, self.logger.format)
|
||||
|
||||
@@ -1960,8 +1997,8 @@ class AMDSMICommands():
|
||||
|
||||
self.logger.store_output(args.gpu, 'gpu_reset', result)
|
||||
if args.clocks:
|
||||
reset_clocks_results = {'overdrive' : '',
|
||||
'clocks' : '',
|
||||
reset_clocks_results = {'overdrive': '',
|
||||
'clocks': '',
|
||||
'performance': ''}
|
||||
try:
|
||||
amdsmi_interface.amdsmi_set_gpu_overdrive_level(args.gpu, 0)
|
||||
@@ -2069,6 +2106,24 @@ class AMDSMICommands():
|
||||
result = "N/A"
|
||||
logging.debug("Failed to reset memory partition on gpu %s | %s", gpu_id, e.get_error_info())
|
||||
self.logger.store_output(args.gpu, 'reset_memory_partition', result)
|
||||
if args.power_cap:
|
||||
try:
|
||||
power_cap_info = amdsmi_interface.amdsmi_get_power_cap_info(args.gpu)
|
||||
logging.debug(f"Power cap info for gpu {gpu_id} | {power_cap_info}")
|
||||
default_power_cap = power_cap_info["default_power_cap"]
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
raise ValueError(f"Unable to get power cap info from {gpu_id}") from e
|
||||
|
||||
if args.power_cap == default_power_cap:
|
||||
self.logger.store_output(args.gpu, 'powercap', f"Power cap is already set to {default_power_cap}")
|
||||
else:
|
||||
try:
|
||||
amdsmi_interface.amdsmi_set_power_cap(args.gpu, 0, default_power_cap * 1000000)
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM:
|
||||
raise PermissionError('Command requires elevation') from e
|
||||
raise ValueError(f"Unable to reset power cap to {default_power_cap} on GPU {gpu_id}") from e
|
||||
self.logger.store_output(args.gpu, 'powercap', f"Successfully set power cap to {default_power_cap}")
|
||||
|
||||
if multiple_devices:
|
||||
self.logger.store_multiple_device_output()
|
||||
|
||||
@@ -108,13 +108,22 @@ class AMDSMIParser(argparse.ArgumentParser):
|
||||
self._add_rocm_smi_parser(self.subparsers, rocmsmi)
|
||||
|
||||
|
||||
def _not_negative_int(self, int_value):
|
||||
# Argument type validator
|
||||
if int_value.isdigit(): # Is digit doesn't work on negative numbers
|
||||
return int(int_value)
|
||||
|
||||
outputformat = self.helpers.get_output_format()
|
||||
raise amdsmi_cli_exceptions.AmdSmiInvalidParameterValueException(int_value, outputformat)
|
||||
|
||||
def _positive_int(self, int_value):
|
||||
# Argument type validator
|
||||
if int_value.isdigit(): # Is digit works only on positive numbers
|
||||
return int(int_value)
|
||||
else:
|
||||
outputformat = self.helpers.get_output_format()
|
||||
raise amdsmi_cli_exceptions.AmdSmiInvalidParameterValueException(int_value, outputformat)
|
||||
if int_value.isdigit(): # Is digit doesn't work on negative numbers
|
||||
if int(int_value) > 0:
|
||||
return int(int_value)
|
||||
|
||||
outputformat = self.helpers.get_output_format()
|
||||
raise amdsmi_cli_exceptions.AmdSmiInvalidParameterValueException(int_value, outputformat)
|
||||
|
||||
|
||||
def _check_output_file_path(self):
|
||||
@@ -539,7 +548,7 @@ class AMDSMIParser(argparse.ArgumentParser):
|
||||
# Optional Args
|
||||
process_parser.add_argument('-G', '--general', action='store_true', required=False, help=general_help)
|
||||
process_parser.add_argument('-e', '--engine', action='store_true', required=False, help=engine_help)
|
||||
process_parser.add_argument('-p', '--pid', action='store', type=self._positive_int, required=False, help=pid_help)
|
||||
process_parser.add_argument('-p', '--pid', action='store', type=self._not_negative_int, required=False, help=pid_help)
|
||||
process_parser.add_argument('-n', '--name', action='store', required=False, help=name_help)
|
||||
|
||||
|
||||
@@ -641,6 +650,7 @@ class AMDSMIParser(argparse.ArgumentParser):
|
||||
memory_partition_choices_str = ", ".join(self.helpers.get_memory_partition_types())
|
||||
set_compute_partition_help = f"Set one of the following the compute partition modes:\n\t{compute_partition_choices_str}"
|
||||
set_memory_partition_help = f"Set one of the following the memory partition modes:\n\t{memory_partition_choices_str}"
|
||||
set_power_cap_help = "Set power capacity limit"
|
||||
|
||||
# Create set_value subparser
|
||||
set_value_parser = subparsers.add_parser('set', help=set_value_help, description=set_value_subcommand_help)
|
||||
@@ -657,9 +667,10 @@ class AMDSMIParser(argparse.ArgumentParser):
|
||||
set_value_parser.add_argument('-f', '--fan', action=self._validate_fan_speed(), required=False, help=set_fan_help, metavar='%')
|
||||
set_value_parser.add_argument('-l', '--perf-level', action='store', choices=self.helpers.get_perf_levels()[0], type=str.upper, required=False, help=set_perf_level_help, metavar='LEVEL')
|
||||
set_value_parser.add_argument('-P', '--profile', action='store', required=False, help=set_profile_help, metavar='SETPROFILE')
|
||||
set_value_parser.add_argument('-d', '--perf-determinism', action='store', type=self._positive_int, required=False, help=set_perf_det_help, metavar='SCLKMAX')
|
||||
set_value_parser.add_argument('-d', '--perf-determinism', action='store', type=self._not_negative_int, required=False, help=set_perf_det_help, metavar='SCLKMAX')
|
||||
set_value_parser.add_argument('-C', '--compute-partition', action='store', choices=self.helpers.get_compute_partition_types(), type=str.upper, required=False, help=set_compute_partition_help, metavar='PARTITION')
|
||||
set_value_parser.add_argument('-M', '--memory-partition', action='store', choices=self.helpers.get_memory_partition_types(), type=str.upper, required=False, help=set_memory_partition_help, metavar='PARTITION')
|
||||
set_value_parser.add_argument('-o', '--power-cap', action='store', type=self._positive_int, required=False, help=set_power_cap_help, metavar='WATTS')
|
||||
|
||||
|
||||
def _validate_set_clock(self, validate_clock_type=True):
|
||||
@@ -775,6 +786,7 @@ class AMDSMIParser(argparse.ArgumentParser):
|
||||
reset_perf_det_help = "Disable performance determinism"
|
||||
reset_compute_help = "Reset compute partitions on the specified GPU"
|
||||
reset_memory_help = "Reset memory partitions on the specified GPU"
|
||||
reset_power_cap_help = "Reset power capacity limit to max capable"
|
||||
|
||||
# Create reset subparser
|
||||
reset_parser = subparsers.add_parser('reset', help=reset_help, description=reset_subcommand_help)
|
||||
@@ -796,6 +808,7 @@ class AMDSMIParser(argparse.ArgumentParser):
|
||||
reset_parser.add_argument('-d', '--perf-determinism', action='store_true', required=False, help=reset_perf_det_help)
|
||||
reset_parser.add_argument('-C', '--compute-partition', action='store_true', required=False, help=reset_compute_help)
|
||||
reset_parser.add_argument('-M', '--memory-partition', action='store_true', required=False, help=reset_memory_help)
|
||||
reset_parser.add_argument('-o', '--power-cap', action='store_true', required=False, help=reset_power_cap_help)
|
||||
|
||||
|
||||
def _add_rocm_smi_parser(self, subparsers, func):
|
||||
|
||||
新增問題並參考
封鎖使用者