Added set & reset --power-cap

Signed-off-by: Maisam Arif <maisarif@amd.com> Change-Id: I9fa6378cfcdb2ad9f8406c51d885209605330627 [ROCm/amdsmi commit: a1b2fb5e0e]
2023-10-17 00:08:43 -05:00
@@ -123,10 +123,10 @@ For convenience, here is the help output for each command
 usage: amd-smi list [-h] [--json | --csv] [--file FILE] [--loglevel LEVEL]
                    [-g GPU [GPU ...]]

-Lists all the devices on the system and the links between devices.                            
-Lists all the sockets and for each socket, GPUs and/or CPUs associated to                            
-that socket alongside some basic information for each device.                            
-In virtualization environments, it can also list VFs associated to each                            
+Lists all the devices on the system and the links between devices.
+Lists all the sockets and for each socket, GPUs and/or CPUs associated to
+that socket alongside some basic information for each device.
+In virtualization environments, it can also list VFs associated to each
 GPU with some basic information for each VF.

 optional arguments:
@@ -149,7 +149,7 @@ usage: amd-smi static [-h] [--json | --csv] [--file FILE] [--loglevel LEVEL]
                      [-g GPU [GPU ...]] [-a] [-b] [-V] [-d] [-v] [-c] [-B] [-r] [-p] [-l]
                      [-u]

-If no GPU is specified, returns static information for all GPUs on the system.                                
+If no GPU is specified, returns static information for all GPUs on the system.
 If no static argument is provided, all static information will be displayed.

 Static Arguments:
@@ -229,7 +229,7 @@ usage: amd-smi metric [-h] [--json | --csv] [--file FILE] [--loglevel LEVEL]
                      [-g GPU [GPU ...]] [-w INTERVAL] [-W TIME] [-i ITERATIONS] [-m] [-u]
                      [-p] [-c] [-t] [-e] [-k] [-P] [-f] [-C] [-o] [-l] [-x] [-E]

-If no GPU is specified, returns metric information for all GPUs on the system.                                
+If no GPU is specified, returns metric information for all GPUs on the system.
 If no metric argument is provided all metric information will be displayed.

 Metric arguments:
@@ -269,29 +269,29 @@ usage: amd-smi process [-h] [--json | --csv] [--file FILE] [--loglevel LEVEL]
                       [-g GPU [GPU ...]] [-w INTERVAL] [-W TIME] [-i ITERATIONS] [-G]
                       [-e] [-p PID] [-n NAME]

-If no GPU is specified, returns information for all GPUs on the system.                                
+If no GPU is specified, returns information for all GPUs on the system.
 If no process argument is provided all process information will be displayed.

 Process arguments:
  -h, --help                   show this help message and exit
  -g, --gpu GPU [GPU ...]      Select a GPU ID, BDF, or UUID from the possible choices:
-                               ID:0 | BDF:0000:23:00.0 | UUID:ffff73bf-0000-1000-80ff-ffffffffffff
-                                all | Selects all devices
+                                ID: 0 | BDF: 0000:23:00.0 | UUID: c4ff73bf-0000-1000-802e-0812b504ed69
+                                  all | Selects all devices
  -w, --watch INTERVAL         Reprint the command in a loop of INTERVAL seconds
  -W, --watch_time TIME        The total TIME to watch the given command
  -i, --iterations ITERATIONS  Total number of ITERATIONS to loop on the given command
  -G, --general                pid, process name, memory usage
  -e, --engine                 All engine usages
  -p, --pid PID                Gets all process information about the specified process based on Process ID
-  -n, --name NAME              Gets all process information about the specified process based on Process Name.         
-                                                                                                                                                      If multiple processes have the same name information is returned for all of them.
+  -n, --name NAME              Gets all process information about the specified process based on Process Name.
+                               If multiple processes have the same name information is returned for all of them.

 Command Modifiers:
  --json                       Displays output in JSON format (human readable by default).
  --csv                        Displays output in CSV format (human readable by default).
  --file FILE                  Saves output into a file on the provided path (stdout by default).
  --loglevel LEVEL             Set the logging level from the possible choices:
-                                  DEBUG, INFO, WARNING, ERROR, CRITICAL
+                                DEBUG, INFO, WARNING, ERROR, CRITICAL
 ```

 ```bash
@@ -320,7 +320,7 @@ Command Modifiers:
 usage: amd-smi topology [-h] [--json | --csv] [--file FILE] [--loglevel LEVEL]
                        [-g GPU [GPU ...]] [-a] [-w] [-o] [-t] [-b]

-If no GPU is specified, returns information for all GPUs on the system.                                
+If no GPU is specified, returns information for all GPUs on the system.
 If no topology argument is provided all topology information will be displayed.

 Topology arguments:
@@ -347,9 +347,9 @@ Command Modifiers:
 ~$ amd-smi set --help
 usage: amd-smi set [-h] [--json | --csv] [--file FILE] [--loglevel LEVEL] -g GPU [GPU ...]
                   [-f %] [-l LEVEL] [-P SETPROFILE] [-d SCLKMAX] [-C PARTITION]
-                   [-M PARTITION]
+                   [-M PARTITION] [-o WATTS]

-A GPU must be specified to set a configuration.                                    
+A GPU must be specified to set a configuration.
 A set argument must be provided; Multiple set arguments are accepted

 Set Arguments:
@@ -365,6 +365,7 @@ Set Arguments:
                                        CPX, SPX, DPX, TPX, QPX
  -M, --memory-partition PARTITION   Set one of the following the memory partition modes:
                                        NPS1, NPS2, NPS4, NPS8
+  -o, --power-cap WATTS              Set power capacity limit

 Command Modifiers:
  --json                             Displays output in JSON format (human readable by default).
@@ -377,9 +378,9 @@ Command Modifiers:
 ```bash
 ~$ amd-smi reset --help
 usage: amd-smi reset [-h] [--json | --csv] [--file FILE] [--loglevel LEVEL] -g GPU
-                     [GPU ...] [-G] [-c] [-f] [-p] [-x] [-d] [-C] [-M]
+                     [GPU ...] [-G] [-c] [-f] [-p] [-x] [-d] [-C] [-M] [-o]

-A GPU must be specified to reset a configuration.                                
+A GPU must be specified to reset a configuration.
 A reset argument must be provided; Multiple reset arguments are accepted

 Reset Arguments:
@@ -395,6 +396,7 @@ Reset Arguments:
  -d, --perf-determinism   Disable performance determinism
  -C, --compute-partition  Reset compute partitions on the specified GPU
  -M, --memory-partition   Reset memory partitions on the specified GPU
+  -o, --power-cap          Reset power capacity limit to max capable

 Command Modifiers:
  --json                   Displays output in JSON format (human readable by default).
@@ -290,9 +290,9 @@ class AMDSMICommands():
                # Power limits
                try:
                    power_limit_error = False
-                    power_info = amdsmi_interface.amdsmi_get_power_cap_info(args.gpu)
-                    max_power_limit = power_info['max_power_cap']
-                    current_power_limit = power_info['power_cap']
+                    power_cap_info = amdsmi_interface.amdsmi_get_power_cap_info(args.gpu)
+                    max_power_limit = power_cap_info['max_power_cap']
+                    current_power_limit = power_cap_info['power_cap']
                except amdsmi_exception.AmdSmiLibraryException as e:
                    power_limit_error = True
                    max_power_limit = "N/A"
@@ -1761,7 +1761,7 @@ class AMDSMICommands():

    def set_value(self, args, multiple_devices=False, gpu=None, fan=None, perf_level=None,
                  profile=None, perf_determinism=None, compute_partition=None,
-                  memory_partition=None):
+                  memory_partition=None, power_cap=None):
        """Issue reset commands to target gpu(s)

        Args:
@@ -1774,6 +1774,7 @@ class AMDSMICommands():
            perf_determinism (int, optional): Value override for args.perf_determinism. Defaults to None.
            compute_partition (amdsmi_interface.AmdSmiComputePartitionType, optional): Value override for args.compute_partition. Defaults to None.
            memory_partition (amdsmi_interface.AmdSmiMemoryPartitionType, optional): Value override for args.memory_partition. Defaults to None.
+            power_cap (int, optional): Value override for args.power_cap. Defaults to None.

        Raises:
            ValueError: Value error if no gpu value is provided
@@ -1785,18 +1786,20 @@ class AMDSMICommands():
        # Set args.* to passed in arguments
        if gpu:
            args.gpu = gpu
-        if fan:
+        if fan is not None:
            args.fan = fan
        if perf_level:
            args.perf_level = perf_level
        if profile:
            args.profile = profile
-        if perf_determinism:
+        if perf_determinism is not None:
            args.perf_determinism = perf_determinism
        if compute_partition:
            args.compute_partition = compute_partition
        if memory_partition:
            args.memory_partition = memory_partition
+        if power_cap:
+            args.power_cap = power_cap

        # Handle No GPU passed
        if args.gpu == None:
@@ -1810,7 +1813,11 @@ class AMDSMICommands():
        args.gpu = device_handle

        # Error if no subcommand args are passed
-        if not any([args.fan, args.perflevel, args.profile, args.perf_determinism]):
+        if not any([args.fan is not None,
+                    args.perf_level,
+                    args.profile,
+                    args.perf_determinism is not None,
+                    args.power_cap]):
            command = " ".join(sys.argv[1:])
            raise AmdSmiRequiredCommandException(command, self.logger.format)

@@ -1874,6 +1881,31 @@ class AMDSMICommands():
                    raise PermissionError('Command requires elevation') from e
                raise ValueError(f"Unable to set memory partition to {args.memory_partition} on {gpu_string}") from e
            self.logger.store_output(args.gpu, 'memorypartition', f"Successfully set memory partition to {args.memory_partition}")
+        if isinstance(args.power_cap, int):  # skipped when no power cap value is present (e.g. None)
+            try:
+                power_cap_info = amdsmi_interface.amdsmi_get_power_cap_info(args.gpu)
+                logging.debug(f"Power cap info for gpu {gpu_id} | {power_cap_info}")
+                min_power_cap = power_cap_info["min_power_cap"]
+                max_power_cap = power_cap_info["max_power_cap"]
+                current_power_cap = power_cap_info["power_cap"]
+            except amdsmi_exception.AmdSmiLibraryException as e:
+                raise ValueError(f"Unable to get power cap info from {gpu_string}") from e
+
+            if args.power_cap == current_power_cap:  # nothing to change; report and skip the set call
+                self.logger.store_output(args.gpu, 'powercap', f"Power cap is already set to {args.power_cap}")
+            elif args.power_cap >= min_power_cap and args.power_cap <= max_power_cap:
+                try:
+                    amdsmi_interface.amdsmi_set_power_cap(args.gpu, 0, args.power_cap * 1000000)  # NOTE(review): presumably watts -> microwatts; confirm against amdsmi_set_power_cap
+                except amdsmi_exception.AmdSmiLibraryException as e:
+                    if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM:
+                        raise PermissionError('Command requires elevation') from e
+                    raise ValueError(f"Unable to set power cap to {args.power_cap} on {gpu_string}") from e
+                self.logger.store_output(args.gpu, 'powercap', f"Successfully set power cap to {args.power_cap}")
+            else:  # requested value is outside [min_power_cap, max_power_cap]; reported as output, nothing is raised
+                # setting power cap to 0 will return the current power cap so the technical minimum value is 1
+                if min_power_cap == 0:
+                    min_power_cap = 1
+                self.logger.store_output(args.gpu, 'powercap', f"Power cap must be between {min_power_cap} and {max_power_cap}")

        if multiple_devices:
            self.logger.store_multiple_device_output()
@@ -1884,7 +1916,7 @@ class AMDSMICommands():

    def reset(self, args, multiple_devices=False, gpu=None, gpureset=None,
                clocks=None, fans=None, profile=None, xgmierr=None, perf_determinism=None,
-                compute_partition=None, memory_partition=None):
+                compute_partition=None, memory_partition=None, power_cap=None):
        """Issue reset commands to target gpu(s)

        Args:
@@ -1899,6 +1931,7 @@ class AMDSMICommands():
            perf_determinism (bool, optional): Value override for args.perf_determinism. Defaults to None.
            compute_partition (bool, optional): Value override for args.compute_partition. Defaults to None.
            memory_partition (bool, optional): Value override for args.memory_partition. Defaults to None.
+            power_cap (bool, optional): Value override for args.power_cap. Defaults to None.

        Raises:
            ValueError: Value error if no gpu value is provided
@@ -1926,6 +1959,8 @@ class AMDSMICommands():
            args.compute_partition = compute_partition
        if memory_partition:
            args.memory_partition = memory_partition
+        if power_cap:
+            args.power_cap = power_cap

        # Handle No GPU passed
        if args.gpu == None:
@@ -1942,7 +1977,9 @@ class AMDSMICommands():
        gpu_id = self.helpers.get_gpu_id_from_device_handle(args.gpu)

        # Error if no subcommand args are passed
-        if not any([args.gpureset, args.clocks, args.fans, args.profile, args.xgmierr, args.perf_determinism]):
+        if not any([args.gpureset, args.clocks, args.fans, args.profile, args.xgmierr, \
+                    args.perf_determinism, args.compute_partition, args.memory_partition, \
+                    args.power_cap]):
            command = " ".join(sys.argv[1:])
            raise AmdSmiRequiredCommandException(command, self.logger.format)

@@ -1960,8 +1997,8 @@ class AMDSMICommands():

            self.logger.store_output(args.gpu, 'gpu_reset', result)
        if args.clocks:
-            reset_clocks_results = {'overdrive' : '',
-                                    'clocks' : '',
+            reset_clocks_results = {'overdrive': '',
+                                    'clocks': '',
                                    'performance': ''}
            try:
                amdsmi_interface.amdsmi_set_gpu_overdrive_level(args.gpu, 0)
@@ -2069,6 +2106,24 @@ class AMDSMICommands():
                result = "N/A"
                logging.debug("Failed to reset memory partition on gpu %s | %s", gpu_id, e.get_error_info())
            self.logger.store_output(args.gpu, 'reset_memory_partition', result)
+        if args.power_cap:  # reset flag: truthy means "restore the default power cap"
+            try:
+                power_cap_info = amdsmi_interface.amdsmi_get_power_cap_info(args.gpu)
+                logging.debug(f"Power cap info for gpu {gpu_id} | {power_cap_info}")
+                default_power_cap = power_cap_info["default_power_cap"]
+            except amdsmi_exception.AmdSmiLibraryException as e:
+                raise ValueError(f"Unable to get power cap info from {gpu_id}") from e
+
+            if power_cap_info["power_cap"] == default_power_cap:  # compare the cap currently applied, not the boolean flag
+                self.logger.store_output(args.gpu, 'powercap', f"Power cap is already set to {default_power_cap}")
+            else:
+                try:
+                    amdsmi_interface.amdsmi_set_power_cap(args.gpu, 0, default_power_cap * 1000000)  # NOTE(review): presumably watts -> microwatts; confirm
+                except amdsmi_exception.AmdSmiLibraryException as e:
+                    if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM:
+                        raise PermissionError('Command requires elevation') from e
+                    raise ValueError(f"Unable to reset power cap to {default_power_cap} on GPU {gpu_id}") from e
+                self.logger.store_output(args.gpu, 'powercap', f"Successfully set power cap to {default_power_cap}")

        if multiple_devices:
            self.logger.store_multiple_device_output()
@@ -108,13 +108,22 @@ class AMDSMIParser(argparse.ArgumentParser):
        self._add_rocm_smi_parser(self.subparsers, rocmsmi)


+    def _not_negative_int(self, int_value):
+        # Argument type validator: returns int(int_value) when the string is all digits (zero included)
+        if int_value.isdigit():  # str.isdigit() is False when a '-' sign is present, so negatives fall through
+            return int(int_value)
+
+        outputformat = self.helpers.get_output_format()
+        raise amdsmi_cli_exceptions.AmdSmiInvalidParameterValueException(int_value, outputformat)
+
     def _positive_int(self, int_value):
         # Argument type validator
-        if int_value.isdigit():  # Is digit works only on positive numbers
-            return int(int_value)
-        else:
-            outputformat = self.helpers.get_output_format()
-            raise amdsmi_cli_exceptions.AmdSmiInvalidParameterValueException(int_value, outputformat)
+        if int_value.isdigit():  # str.isdigit() is False when a '-' sign is present, so negatives fall through
+            if int(int_value) > 0:  # zero is rejected as well; only strictly positive values are returned
+                return int(int_value)
+
+        outputformat = self.helpers.get_output_format()
+        raise amdsmi_cli_exceptions.AmdSmiInvalidParameterValueException(int_value, outputformat)


    def _check_output_file_path(self):
@@ -539,7 +548,7 @@ class AMDSMIParser(argparse.ArgumentParser):
        # Optional Args
        process_parser.add_argument('-G', '--general', action='store_true', required=False, help=general_help)
        process_parser.add_argument('-e', '--engine', action='store_true', required=False, help=engine_help)
-        process_parser.add_argument('-p', '--pid', action='store', type=self._positive_int, required=False, help=pid_help)
+        process_parser.add_argument('-p', '--pid', action='store', type=self._not_negative_int, required=False, help=pid_help)
        process_parser.add_argument('-n', '--name', action='store', required=False, help=name_help)


@@ -641,6 +650,7 @@ class AMDSMIParser(argparse.ArgumentParser):
        memory_partition_choices_str = ", ".join(self.helpers.get_memory_partition_types())
        set_compute_partition_help = f"Set one of the following the compute partition modes:\n\t{compute_partition_choices_str}"
        set_memory_partition_help = f"Set one of the following the memory partition modes:\n\t{memory_partition_choices_str}"
+        set_power_cap_help = "Set power capacity limit"

        # Create set_value subparser
        set_value_parser = subparsers.add_parser('set', help=set_value_help, description=set_value_subcommand_help)
@@ -657,9 +667,10 @@ class AMDSMIParser(argparse.ArgumentParser):
        set_value_parser.add_argument('-f', '--fan', action=self._validate_fan_speed(), required=False, help=set_fan_help, metavar='%')
        set_value_parser.add_argument('-l', '--perf-level', action='store', choices=self.helpers.get_perf_levels()[0], type=str.upper, required=False, help=set_perf_level_help, metavar='LEVEL')
        set_value_parser.add_argument('-P', '--profile', action='store', required=False, help=set_profile_help, metavar='SETPROFILE')
-        set_value_parser.add_argument('-d', '--perf-determinism', action='store', type=self._positive_int, required=False, help=set_perf_det_help, metavar='SCLKMAX')
+        set_value_parser.add_argument('-d', '--perf-determinism', action='store', type=self._not_negative_int, required=False, help=set_perf_det_help, metavar='SCLKMAX')
        set_value_parser.add_argument('-C', '--compute-partition', action='store', choices=self.helpers.get_compute_partition_types(), type=str.upper, required=False, help=set_compute_partition_help, metavar='PARTITION')
        set_value_parser.add_argument('-M', '--memory-partition', action='store', choices=self.helpers.get_memory_partition_types(), type=str.upper, required=False, help=set_memory_partition_help, metavar='PARTITION')
+        set_value_parser.add_argument('-o', '--power-cap', action='store', type=self._positive_int, required=False, help=set_power_cap_help, metavar='WATTS')


    def _validate_set_clock(self, validate_clock_type=True):
@@ -775,6 +786,7 @@ class AMDSMIParser(argparse.ArgumentParser):
        reset_perf_det_help = "Disable performance determinism"
        reset_compute_help = "Reset compute partitions on the specified GPU"
        reset_memory_help = "Reset memory partitions on the specified GPU"
+        reset_power_cap_help = "Reset power capacity limit to max capable"

        # Create reset subparser
        reset_parser = subparsers.add_parser('reset', help=reset_help, description=reset_subcommand_help)
@@ -796,6 +808,7 @@ class AMDSMIParser(argparse.ArgumentParser):
        reset_parser.add_argument('-d', '--perf-determinism', action='store_true', required=False, help=reset_perf_det_help)
        reset_parser.add_argument('-C', '--compute-partition', action='store_true', required=False, help=reset_compute_help)
        reset_parser.add_argument('-M', '--memory-partition', action='store_true', required=False, help=reset_memory_help)
+        reset_parser.add_argument('-o', '--power-cap', action='store_true', required=False, help=reset_power_cap_help)


    def _add_rocm_smi_parser(self, subparsers, func):