diff --git a/projects/amdsmi/amdsmi_cli/README.md b/projects/amdsmi/amdsmi_cli/README.md index 45281d39f1..2d48154e50 100644 --- a/projects/amdsmi/amdsmi_cli/README.md +++ b/projects/amdsmi/amdsmi_cli/README.md @@ -123,10 +123,10 @@ For convenience, here is the help output for each command usage: amd-smi list [-h] [--json | --csv] [--file FILE] [--loglevel LEVEL] [-g GPU [GPU ...]] -Lists all the devices on the system and the links between devices. -Lists all the sockets and for each socket, GPUs and/or CPUs associated to -that socket alongside some basic information for each device. -In virtualization environments, it can also list VFs associated to each +Lists all the devices on the system and the links between devices. +Lists all the sockets and for each socket, GPUs and/or CPUs associated to +that socket alongside some basic information for each device. +In virtualization environments, it can also list VFs associated to each GPU with some basic information for each VF. optional arguments: @@ -149,7 +149,7 @@ usage: amd-smi static [-h] [--json | --csv] [--file FILE] [--loglevel LEVEL] [-g GPU [GPU ...]] [-a] [-b] [-V] [-d] [-v] [-c] [-B] [-r] [-p] [-l] [-u] -If no GPU is specified, returns static information for all GPUs on the system. +If no GPU is specified, returns static information for all GPUs on the system. If no static argument is provided, all static information will be displayed. Static Arguments: @@ -229,7 +229,7 @@ usage: amd-smi metric [-h] [--json | --csv] [--file FILE] [--loglevel LEVEL] [-g GPU [GPU ...]] [-w INTERVAL] [-W TIME] [-i ITERATIONS] [-m] [-u] [-p] [-c] [-t] [-e] [-k] [-P] [-f] [-C] [-o] [-l] [-x] [-E] -If no GPU is specified, returns metric information for all GPUs on the system. +If no GPU is specified, returns metric information for all GPUs on the system. If no metric argument is provided all metric information will be displayed. 
Metric arguments: @@ -269,29 +269,29 @@ usage: amd-smi process [-h] [--json | --csv] [--file FILE] [--loglevel LEVEL] [-g GPU [GPU ...]] [-w INTERVAL] [-W TIME] [-i ITERATIONS] [-G] [-e] [-p PID] [-n NAME] -If no GPU is specified, returns information for all GPUs on the system. +If no GPU is specified, returns information for all GPUs on the system. If no process argument is provided all process information will be displayed. Process arguments: -h, --help show this help message and exit -g, --gpu GPU [GPU ...] Select a GPU ID, BDF, or UUID from the possible choices: - ID:0 | BDF:0000:23:00.0 | UUID:ffff73bf-0000-1000-80ff-ffffffffffff - all | Selects all devices + ID: 0 | BDF: 0000:23:00.0 | UUID: c4ff73bf-0000-1000-802e-0812b504ed69 + all | Selects all devices -w, --watch INTERVAL Reprint the command in a loop of INTERVAL seconds -W, --watch_time TIME The total TIME to watch the given command -i, --iterations ITERATIONS Total number of ITERATIONS to loop on the given command -G, --general pid, process name, memory usage -e, --engine All engine usages -p, --pid PID Gets all process information about the specified process based on Process ID - -n, --name NAME Gets all process information about the specified process based on Process Name. - If multiple processes have the same name information is returned for all of them. + -n, --name NAME Gets all process information about the specified process based on Process Name. + If multiple processes have the same name information is returned for all of them. Command Modifiers: --json Displays output in JSON format (human readable by default). --csv Displays output in CSV format (human readable by default). --file FILE Saves output into a file on the provided path (stdout by default). 
--loglevel LEVEL Set the logging level from the possible choices: - DEBUG, INFO, WARNING, ERROR, CRITICAL + DEBUG, INFO, WARNING, ERROR, CRITICAL ``` ```bash @@ -320,7 +320,7 @@ Command Modifiers: usage: amd-smi topology [-h] [--json | --csv] [--file FILE] [--loglevel LEVEL] [-g GPU [GPU ...]] [-a] [-w] [-o] [-t] [-b] -If no GPU is specified, returns information for all GPUs on the system. +If no GPU is specified, returns information for all GPUs on the system. If no topology argument is provided all topology information will be displayed. Topology arguments: @@ -347,9 +347,9 @@ Command Modifiers: ~$ amd-smi set --help usage: amd-smi set [-h] [--json | --csv] [--file FILE] [--loglevel LEVEL] -g GPU [GPU ...] [-f %] [-l LEVEL] [-P SETPROFILE] [-d SCLKMAX] [-C PARTITION] - [-M PARTITION] + [-M PARTITION] [-o WATTS] -A GPU must be specified to set a configuration. +A GPU must be specified to set a configuration. A set argument must be provided; Multiple set arguments are accepted Set Arguments: @@ -365,6 +365,7 @@ Set Arguments: CPX, SPX, DPX, TPX, QPX -M, --memory-partition PARTITION Set one of the following the memory partition modes: NPS1, NPS2, NPS4, NPS8 + -o, --power-cap WATTS Set power capacity limit Command Modifiers: --json Displays output in JSON format (human readable by default). @@ -377,9 +378,9 @@ Command Modifiers: ```bash ~$ amd-smi reset --help usage: amd-smi reset [-h] [--json | --csv] [--file FILE] [--loglevel LEVEL] -g GPU - [GPU ...] [-G] [-c] [-f] [-p] [-x] [-d] [-C] [-M] + [GPU ...] [-G] [-c] [-f] [-p] [-x] [-d] [-C] [-M] [-o] -A GPU must be specified to reset a configuration. +A GPU must be specified to reset a configuration. 
A reset argument must be provided; Multiple reset arguments are accepted Reset Arguments: @@ -395,6 +396,7 @@ Reset Arguments: -d, --perf-determinism Disable performance determinism -C, --compute-partition Reset compute partitions on the specified GPU -M, --memory-partition Reset memory partitions on the specified GPU + -o, --power-cap Reset power capacity limit to max capable Command Modifiers: --json Displays output in JSON format (human readable by default). diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py index e4ecb17791..7bb175bf74 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py @@ -290,9 +290,9 @@ class AMDSMICommands(): # Power limits try: power_limit_error = False - power_info = amdsmi_interface.amdsmi_get_power_cap_info(args.gpu) - max_power_limit = power_info['max_power_cap'] - current_power_limit = power_info['power_cap'] + power_cap_info = amdsmi_interface.amdsmi_get_power_cap_info(args.gpu) + max_power_limit = power_cap_info['max_power_cap'] + current_power_limit = power_cap_info['power_cap'] except amdsmi_exception.AmdSmiLibraryException as e: power_limit_error = True max_power_limit = "N/A" @@ -1761,7 +1761,7 @@ class AMDSMICommands(): def set_value(self, args, multiple_devices=False, gpu=None, fan=None, perf_level=None, profile=None, perf_determinism=None, compute_partition=None, - memory_partition=None): + memory_partition=None, power_cap=None): """Issue reset commands to target gpu(s) Args: @@ -1774,6 +1774,7 @@ class AMDSMICommands(): perf_determinism (int, optional): Value override for args.perf_determinism. Defaults to None. compute_partition (amdsmi_interface.AmdSmiComputePartitionType, optional): Value override for args.compute_partition. Defaults to None. memory_partition (amdsmi_interface.AmdSmiMemoryPartitionType, optional): Value override for args.memory_partition. Defaults to None. 
+ power_cap (int, optional): Value override for args.power_cap. Defaults to None. Raises: ValueError: Value error if no gpu value is provided @@ -1785,18 +1786,20 @@ class AMDSMICommands(): # Set args.* to passed in arguments if gpu: args.gpu = gpu - if fan: + if fan is not None: args.fan = fan if perf_level: args.perf_level = perf_level if profile: args.profile = profile - if perf_determinism: + if perf_determinism is not None: args.perf_determinism = perf_determinism if compute_partition: args.compute_partition = compute_partition if memory_partition: args.memory_partition = memory_partition + if power_cap: + args.power_cap = power_cap # Handle No GPU passed if args.gpu == None: @@ -1810,7 +1813,11 @@ class AMDSMICommands(): args.gpu = device_handle # Error if no subcommand args are passed - if not any([args.fan, args.perflevel, args.profile, args.perf_determinism]): + if not any([args.fan is not None, + args.perf_level, + args.profile, + args.perf_determinism is not None, + args.power_cap]): command = " ".join(sys.argv[1:]) raise AmdSmiRequiredCommandException(command, self.logger.format) @@ -1874,6 +1881,31 @@ class AMDSMICommands(): raise PermissionError('Command requires elevation') from e raise ValueError(f"Unable to set memory partition to {args.memory_partition} on {gpu_string}") from e self.logger.store_output(args.gpu, 'memorypartition', f"Successfully set memory partition to {args.memory_partition}") + if isinstance(args.power_cap, int): + try: + power_cap_info = amdsmi_interface.amdsmi_get_power_cap_info(args.gpu) + logging.debug(f"Power cap info for gpu {gpu_id} | {power_cap_info}") + min_power_cap = power_cap_info["min_power_cap"] + max_power_cap = power_cap_info["max_power_cap"] + current_power_cap = power_cap_info["power_cap"] + except amdsmi_exception.AmdSmiLibraryException as e: + raise ValueError(f"Unable to get power cap info from {gpu_string}") from e + + if args.power_cap == current_power_cap: + self.logger.store_output(args.gpu, 'powercap', 
f"Power cap is already set to {args.power_cap}") + elif args.power_cap >= min_power_cap and args.power_cap <= max_power_cap: + try: + amdsmi_interface.amdsmi_set_power_cap(args.gpu, 0, args.power_cap * 1000000) + except amdsmi_exception.AmdSmiLibraryException as e: + if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: + raise PermissionError('Command requires elevation') from e + raise ValueError(f"Unable to set power cap to {args.power_cap} on {gpu_string}") from e + self.logger.store_output(args.gpu, 'powercap', f"Successfully set power cap to {args.power_cap}") + else: + # setting power cap to 0 will return the current power cap so the technical minimum value is 1 + if min_power_cap == 0: + min_power_cap = 1 + self.logger.store_output(args.gpu, 'powercap', f"Power cap must be between {min_power_cap} and {max_power_cap}") if multiple_devices: self.logger.store_multiple_device_output() @@ -1884,7 +1916,7 @@ class AMDSMICommands(): def reset(self, args, multiple_devices=False, gpu=None, gpureset=None, clocks=None, fans=None, profile=None, xgmierr=None, perf_determinism=None, - compute_partition=None, memory_partition=None): + compute_partition=None, memory_partition=None, power_cap=None): """Issue reset commands to target gpu(s) Args: @@ -1899,6 +1931,7 @@ class AMDSMICommands(): perf_determinism (bool, optional): Value override for args.perf_determinism. Defaults to None. compute_partition (bool, optional): Value override for args.compute_partition. Defaults to None. memory_partition (bool, optional): Value override for args.memory_partition. Defaults to None. + power_cap (bool, optional): Value override for args.power_cap. Defaults to None. 
Raises: ValueError: Value error if no gpu value is provided @@ -1926,6 +1959,8 @@ class AMDSMICommands(): args.compute_partition = compute_partition if memory_partition: args.memory_partition = memory_partition + if power_cap: + args.power_cap = power_cap # Handle No GPU passed if args.gpu == None: @@ -1942,7 +1977,9 @@ class AMDSMICommands(): gpu_id = self.helpers.get_gpu_id_from_device_handle(args.gpu) # Error if no subcommand args are passed - if not any([args.gpureset, args.clocks, args.fans, args.profile, args.xgmierr, args.perf_determinism]): + if not any([args.gpureset, args.clocks, args.fans, args.profile, args.xgmierr, \ + args.perf_determinism, args.compute_partition, args.memory_partition, \ + args.power_cap]): command = " ".join(sys.argv[1:]) raise AmdSmiRequiredCommandException(command, self.logger.format) @@ -1960,8 +1997,8 @@ class AMDSMICommands(): self.logger.store_output(args.gpu, 'gpu_reset', result) if args.clocks: - reset_clocks_results = {'overdrive' : '', - 'clocks' : '', + reset_clocks_results = {'overdrive': '', + 'clocks': '', 'performance': ''} try: amdsmi_interface.amdsmi_set_gpu_overdrive_level(args.gpu, 0) @@ -2069,6 +2106,24 @@ class AMDSMICommands(): result = "N/A" logging.debug("Failed to reset memory partition on gpu %s | %s", gpu_id, e.get_error_info()) self.logger.store_output(args.gpu, 'reset_memory_partition', result) + if args.power_cap: + try: + power_cap_info = amdsmi_interface.amdsmi_get_power_cap_info(args.gpu) + logging.debug(f"Power cap info for gpu {gpu_id} | {power_cap_info}") + default_power_cap = power_cap_info["default_power_cap"] + except amdsmi_exception.AmdSmiLibraryException as e: + raise ValueError(f"Unable to get power cap info from {gpu_id}") from e + + if args.power_cap == default_power_cap: + self.logger.store_output(args.gpu, 'powercap', f"Power cap is already set to {default_power_cap}") + else: + try: + amdsmi_interface.amdsmi_set_power_cap(args.gpu, 0, default_power_cap * 1000000) + except 
amdsmi_exception.AmdSmiLibraryException as e: + if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: + raise PermissionError('Command requires elevation') from e + raise ValueError(f"Unable to reset power cap to {default_power_cap} on GPU {gpu_id}") from e + self.logger.store_output(args.gpu, 'powercap', f"Successfully set power cap to {default_power_cap}") if multiple_devices: self.logger.store_multiple_device_output() diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_parser.py b/projects/amdsmi/amdsmi_cli/amdsmi_parser.py index 455b793aa7..c730a5785c 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_parser.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_parser.py @@ -108,13 +108,22 @@ class AMDSMIParser(argparse.ArgumentParser): self._add_rocm_smi_parser(self.subparsers, rocmsmi) + def _not_negative_int(self, int_value): + # Argument type validator + if int_value.isdigit(): # str.isdigit() is False for negative numbers (the '-' sign is not a digit) + return int(int_value) + + outputformat = self.helpers.get_output_format() + raise amdsmi_cli_exceptions.AmdSmiInvalidParameterValueException(int_value, outputformat) + def _positive_int(self, int_value): # Argument type validator - if int_value.isdigit(): # Is digit works only on positive numbers - return int(int_value) - else: - outputformat = self.helpers.get_output_format() - raise amdsmi_cli_exceptions.AmdSmiInvalidParameterValueException(int_value, outputformat) + if int_value.isdigit(): # str.isdigit() is False for negative numbers (the '-' sign is not a digit) + if int(int_value) > 0: + return int(int_value) + + outputformat = self.helpers.get_output_format() + raise amdsmi_cli_exceptions.AmdSmiInvalidParameterValueException(int_value, outputformat) def _check_output_file_path(self): @@ -539,7 +548,7 @@ class AMDSMIParser(argparse.ArgumentParser): # Optional Args process_parser.add_argument('-G', '--general', action='store_true', required=False, help=general_help) process_parser.add_argument('-e', '--engine', action='store_true', required=False, 
help=engine_help) - process_parser.add_argument('-p', '--pid', action='store', type=self._positive_int, required=False, help=pid_help) + process_parser.add_argument('-p', '--pid', action='store', type=self._not_negative_int, required=False, help=pid_help) process_parser.add_argument('-n', '--name', action='store', required=False, help=name_help) @@ -641,6 +650,7 @@ class AMDSMIParser(argparse.ArgumentParser): memory_partition_choices_str = ", ".join(self.helpers.get_memory_partition_types()) set_compute_partition_help = f"Set one of the following the compute partition modes:\n\t{compute_partition_choices_str}" set_memory_partition_help = f"Set one of the following the memory partition modes:\n\t{memory_partition_choices_str}" + set_power_cap_help = "Set power capacity limit" # Create set_value subparser set_value_parser = subparsers.add_parser('set', help=set_value_help, description=set_value_subcommand_help) @@ -657,9 +667,10 @@ class AMDSMIParser(argparse.ArgumentParser): set_value_parser.add_argument('-f', '--fan', action=self._validate_fan_speed(), required=False, help=set_fan_help, metavar='%') set_value_parser.add_argument('-l', '--perf-level', action='store', choices=self.helpers.get_perf_levels()[0], type=str.upper, required=False, help=set_perf_level_help, metavar='LEVEL') set_value_parser.add_argument('-P', '--profile', action='store', required=False, help=set_profile_help, metavar='SETPROFILE') - set_value_parser.add_argument('-d', '--perf-determinism', action='store', type=self._positive_int, required=False, help=set_perf_det_help, metavar='SCLKMAX') + set_value_parser.add_argument('-d', '--perf-determinism', action='store', type=self._not_negative_int, required=False, help=set_perf_det_help, metavar='SCLKMAX') set_value_parser.add_argument('-C', '--compute-partition', action='store', choices=self.helpers.get_compute_partition_types(), type=str.upper, required=False, help=set_compute_partition_help, metavar='PARTITION') 
set_value_parser.add_argument('-M', '--memory-partition', action='store', choices=self.helpers.get_memory_partition_types(), type=str.upper, required=False, help=set_memory_partition_help, metavar='PARTITION') + set_value_parser.add_argument('-o', '--power-cap', action='store', type=self._positive_int, required=False, help=set_power_cap_help, metavar='WATTS') def _validate_set_clock(self, validate_clock_type=True): @@ -775,6 +786,7 @@ class AMDSMIParser(argparse.ArgumentParser): reset_perf_det_help = "Disable performance determinism" reset_compute_help = "Reset compute partitions on the specified GPU" reset_memory_help = "Reset memory partitions on the specified GPU" + reset_power_cap_help = "Reset power capacity limit to max capable" # Create reset subparser reset_parser = subparsers.add_parser('reset', help=reset_help, description=reset_subcommand_help) @@ -796,6 +808,7 @@ class AMDSMIParser(argparse.ArgumentParser): reset_parser.add_argument('-d', '--perf-determinism', action='store_true', required=False, help=reset_perf_det_help) reset_parser.add_argument('-C', '--compute-partition', action='store_true', required=False, help=reset_compute_help) reset_parser.add_argument('-M', '--memory-partition', action='store_true', required=False, help=reset_memory_help) + reset_parser.add_argument('-o', '--power-cap', action='store_true', required=False, help=reset_power_cap_help) def _add_rocm_smi_parser(self, subparsers, func):