diff --git a/projects/amdsmi/amdsmi_cli/README.md b/projects/amdsmi/amdsmi_cli/README.md index 7032b74d0c..19b269e63e 100644 --- a/projects/amdsmi/amdsmi_cli/README.md +++ b/projects/amdsmi/amdsmi_cli/README.md @@ -160,9 +160,9 @@ Command Modifiers: amd-smi static --help usage: amd-smi static [-h] [--json | --csv] [--file FILE] [--loglevel {DEBUG,INFO,WARNING,ERROR,CRITICAL}] [-g GPU [GPU ...]] - [-a] [-b] [-V] [-d] [-r] [-v] [-B] [-l] [-u] + [-a] [-b] [-V] [-d] [-v] [-c] [-B] [-r] [-p] [-l] [-u] -If no GPU is specified, returns static information for all GPUs on the system. +If no GPU is specified, returns static information for all GPUs on the system. If no static argument is provided, all static information will be displayed. Static Arguments: @@ -174,10 +174,11 @@ Static Arguments: -b, --bus All bus information -V, --vbios All video bios information (if available) -d, --driver Displays driver version - -r, --ras Displays RAS features information -v, --vram All vram information -c, --cache All cache information -B, --board All board information + -r, --ras Displays RAS features information + -p, --partition Partition information -l, --limit All limit metric values (i.e. power and thermal limits) -u, --numa All numa node information @@ -314,7 +315,8 @@ Command Modifiers: amd-smi set --help usage: amd-smi set [-h] [--json | --csv] [--file FILE] [--loglevel {DEBUG,INFO,WARNING,ERROR,CRITICAL}] -g GPU [GPU ...] - [-f %] [-l LEVEL] [-P SETPROFILE] [-d SCLKMAX] + [-f %] [-l LEVEL] [-P SETPROFILE] [-d SCLKMAX] [-C PARTITION] + [-M PARTITION] A GPU must be specified to set a configuration. A set argument must be provided; Multiple set arguments are accepted @@ -325,9 +327,11 @@ Set Arguments: ID:0 | BDF:0000:23:00.0 | UUID:c4ff73bf-0000-1000-80ff-ffffffffffff all | Selects all devices -f %, --fan % Sets GPU fan speed (0-255 or 0-100%) - -l LEVEL, --perflevel LEVEL Sets performance level + -l LEVEL, --perf-level LEVEL Sets performance level -P SETPROFILE, --profile SETPROFILE Set power profile level (#) or a quoted string of custom profile attributes - -d SCLKMAX, --perfdeterminism SCLKMAX Sets GPU clock frequency limit and performance level to determinism to get minimal performance variation + -d SCLKMAX, --perf-determinism SCLKMAX Sets GPU clock frequency limit and performance level to determinism to get minimal performance variation + -C PARTITION, --compute-partition PARTITION Sets compute partition mode + -M PARTITION, --memory-partition PARTITION Sets memory partition mode Command Modifiers: --json Displays output in JSON format (human readable by default). @@ -340,7 +344,7 @@ Command Modifiers: amd-smi reset --help usage: amd-smi reset [-h] [--json | --csv] [--file FILE] [--loglevel {DEBUG,INFO,WARNING,ERROR,CRITICAL}] -g GPU [GPU ...] - [-G] [-c] [-f] [-p] [-x] [-d] + [-G] [-c] [-f] [-p] [-x] [-d] [-C] [-M] A GPU must be specified to reset a configuration. A reset argument must be provided; Multiple reset arguments are accepted @@ -355,7 +359,9 @@ Reset Arguments: -f, --fans Reset fans to automatic (driver) control -p, --profile Reset power profile back to default -x, --xgmierr Reset XGMI error counts - -d, --perfdeterminism Disable performance determinism + -d, --perf-determinism Disable performance determinism + -C, --compute-partition Reset compute partitions on the specified GPU + -M, --memory-partition Reset memory partitions on the specified GPU Command Modifiers: --json Displays output in JSON format (human readable by default). diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py index 3d1cb78bf7..65afc71a79 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py @@ -130,8 +130,8 @@ class AMDSMICommands(): def static(self, args, multiple_devices=False, gpu=None, asic=None, - bus=None, vbios=None, limit=None, driver=None, - ras=None, board=None, numa=None, vram=None, cache=None): + bus=None, vbios=None, limit=None, driver=None, ras=None, + board=None, numa=None, vram=None, cache=None, partition=None): """Get Static information for target gpu Args: @@ -148,6 +148,7 @@ class AMDSMICommands(): numa (bool, optional): Value override for args.numa. Defaults to None. vram (bool, optional): Value override for args.vram. Defaults to None. cache (bool, optional): Value override for args.cache. Defaults to None. + partition (bool, optional): Value override for args.partition. Defaults to None. Raises: IndexError: Index error if gpu list is empty @@ -177,6 +178,8 @@ class AMDSMICommands(): if self.helpers.is_linux() and self.helpers.is_baremetal(): if ras: args.ras = ras + if partition: + args.partition = partition if limit: args.limit = limit @@ -192,8 +195,8 @@ class AMDSMICommands(): # If all arguments are False, it means that no argument was passed and the entire static should be printed if self.helpers.is_linux() and self.helpers.is_baremetal(): - if not any([args.asic, args.bus, args.vbios, args.limit, args.board, args.ras, args.driver, args.numa, args.vram, args.cache]): - args.asic = args.bus = args.vbios = args.limit = args.board = args.ras = args.driver = args.numa = args.vram = args.cache = self.all_arguments = True + if not any([args.asic, args.bus, args.vbios, args.limit, args.board, args.ras, args.driver, args.numa, args.vram, args.cache, args.partition]): + args.asic = args.bus = args.vbios = args.limit = args.board = args.ras = args.driver = args.numa = args.vram = args.cache = args.partition = self.all_arguments = True if self.helpers.is_linux() and self.helpers.is_virtual_os(): if not any([args.asic, args.bus, args.vbios, args.board, args.driver, args.vram, args.cache]): args.asic = args.bus = args.vbios = args.board = args.driver = args.vram = args.cache = self.all_arguments = True @@ -448,6 +451,7 @@ class AMDSMICommands(): logging.debug("Failed to get cache info for gpu %s | %s", gpu_id, e.get_error_info()) static_dict['cache'] = cache_info + if self.helpers.is_hypervisor() or self.helpers.is_baremetal(): if args.ras: ras_dict = {"eeprom_version": "N/A", @@ -469,6 +473,22 @@ class AMDSMICommands(): logging.debug("Failed to get ras block features for gpu %s | %s", gpu_id, e.get_error_info()) static_dict["ras"] = ras_dict + if args.partition: + try: + compute_partition = amdsmi_interface.amdsmi_dev_compute_partition_get(args.gpu) + except amdsmi_exception.AmdSmiLibraryException as e: + compute_partition = "N/A" + logging.debug("Failed to get compute partition info for gpu %s | %s", gpu_id, e.get_error_info()) + + try: + memory_partition = amdsmi_interface.amdsmi_dev_memory_partition_get(args.gpu) + except amdsmi_exception.AmdSmiLibraryException as e: + memory_partition = "N/A" + logging.debug("Failed to get memory partition info for gpu %s | %s", gpu_id, e.get_error_info()) + + static_dict['partition'] = {"compute_partition": compute_partition, + "memory_partition": memory_partition} + if self.helpers.is_linux() and self.helpers.is_baremetal(): if args.numa: try: @@ -1711,8 +1731,9 @@ class AMDSMICommands(): self.logger.print_output(multiple_device_enabled=True) - def set_value(self, args, multiple_devices=False, gpu=None, fan=None, perflevel=None, - profile=None, perfdeterminism=None): + def set_value(self, args, multiple_devices=False, gpu=None, fan=None, perf_level=None, + profile=None, perfdeterminism=None, compute_partition=None, + memory_partition=None): """Issue reset commands to target gpu(s) Args: @@ -1720,9 +1741,11 @@ class AMDSMICommands(): multiple_devices (bool, optional): True if checking for multiple devices. Defaults to False. gpu (device_handle, optional): device_handle for target device. Defaults to None. fan (int, optional): Value override for args.fan. Defaults to None. - perflevel (amdsmi_interface.AmdSmiDevPerfLevel, optional): Value override for args.perflevel. Defaults to None. + perf_level (amdsmi_interface.AmdSmiDevPerfLevel, optional): Value override for args.perf_level. Defaults to None. profile (bool, optional): Value override for args.profile. Defaults to None. perfdeterminism (int, optional): Value override for args.perfdeterminism. Defaults to None. + compute_partition (amdsmi_interface.AmdSmiComputePartitionType, optional): Value override for args.compute_partition. Defaults to None. + memory_partition (amdsmi_interface.AmdSmiMemoryPartitionType, optional): Value override for args.memory_partition. Defaults to None. Raises: ValueError: Value error if no gpu value is provided @@ -1736,12 +1759,16 @@ class AMDSMICommands(): args.gpu = gpu if fan: args.fan = fan - if perflevel: - args.perflevel = perflevel + if perf_level: + args.perf_level = perf_level if profile: args.profile = profile if perfdeterminism: args.perfdeterminism = perfdeterminism + if compute_partition: + args.compute_partition = compute_partition + if memory_partition: + args.memory_partition = memory_partition # Handle No GPU passed if args.gpu == None: @@ -1775,16 +1802,16 @@ class AMDSMICommands(): raise ValueError(f"Unable to set fan speed {args.fan} on {gpu_string}") from e self.logger.store_output(args.gpu, 'fan', f"Successfully set fan speed {args.fan}") - if args.perflevel: - perf_level = amdsmi_interface.AmdSmiDevPerfLevel[args.perflevel] + if args.perf_level: + perf_level = amdsmi_interface.AmdSmiDevPerfLevel[args.perf_level] try: amdsmi_interface.amdsmi_set_gpu_perf_level(args.gpu, perf_level) except amdsmi_exception.AmdSmiLibraryException as e: if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: raise PermissionError('Command requires elevation') from e - raise ValueError(f"Unable to set performance level {args.perflevel} on {gpu_string}") from e + raise ValueError(f"Unable to set performance level {args.perf_level} on {gpu_string}") from e - self.logger.store_output(args.gpu, 'perflevel', f"Successfully set performance level {args.perflevel}") + self.logger.store_output(args.gpu, 'perflevel', f"Successfully set performance level {args.perf_level}") if args.profile: self.logger.store_output(args.gpu, 'profile', "Not Yet Implemented") if isinstance(args.perfdeterminism, int): @@ -1796,7 +1823,22 @@ class AMDSMICommands(): raise ValueError(f"Unable to set performance determinism and clock frequency to {args.perfdeterminism} on {gpu_string}") from e self.logger.store_output(args.gpu, 'perfdeterminism', f"Successfully enabled performance determinism and set GFX clock frequency to {args.perfdeterminism}") - + if args.compute_partition: + compute_partition = amdsmi_interface.AmdSmiComputePartitionType[args.compute_partition] + try: + amdsmi_interface.amdsmi_dev_compute_partition_set(args.gpu, compute_partition) + except amdsmi_exception.AmdSmiLibraryException as e: + if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: + raise PermissionError('Command requires elevation') from e + raise ValueError(f"Unable to set compute partition to {args.compute_partition} on {gpu_string}") from e + if args.memory_partition: + memory_partition = amdsmi_interface.AmdSmiMemoryPartitionType[args.memory_partition] + try: + amdsmi_interface.amdsmi_dev_memory_partition_set(args.gpu, memory_partition) + except amdsmi_exception.AmdSmiLibraryException as e: + if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: + raise PermissionError('Command requires elevation') from e + raise ValueError(f"Unable to set memory partition to {args.memory_partition} on {gpu_string}") from e if multiple_devices: self.logger.store_multiple_device_output() return # Skip printing when there are multiple devices @@ -1805,7 +1847,8 @@ class AMDSMICommands(): def reset(self, args, multiple_devices=False, gpu=None, gpureset=None, - clocks=None, fans=None, profile=None, xgmierr=None, perfdeterminism=None): + clocks=None, fans=None, profile=None, xgmierr=None, perfdeterminism=None, + compute_partition=None, memory_partition=None): """Issue reset commands to target gpu(s) Args: @@ -1818,6 +1861,8 @@ class AMDSMICommands(): profile (bool, optional): Value override for args.profile. Defaults to None. xgmierr (bool, optional): Value override for args.xgmierr. Defaults to None. perfdeterminism (bool, optional): Value override for args.perfdeterminism. Defaults to None. + compute_partition (bool, optional): Value override for args.compute_partition. Defaults to None. + memory_partition (bool, optional): Value override for args.memory_partition. Defaults to None. Raises: ValueError: Value error if no gpu value is provided @@ -1841,6 +1886,10 @@ class AMDSMICommands(): args.xgmierr = xgmierr if perfdeterminism: args.perfdeterminism = perfdeterminism + if compute_partition: + args.compute_partition = compute_partition + if memory_partition: + args.memory_partition = memory_partition # Handle No GPU passed if args.gpu == None: @@ -1958,8 +2007,27 @@ class AMDSMICommands(): raise PermissionError('Command requires elevation') from e result = "N/A" logging.debug("Failed to set perf level on gpu %s | %s", gpu_id, e.get_error_info()) - self.logger.store_output(args.gpu, 'reset_perf_determinism', result) + if args.compute_partition: + try: + amdsmi_interface.amdsmi_reset_gpu_compute_partition(args.gpu) + result = 'Successfully reset compute partition' + except amdsmi_exception.AmdSmiLibraryException as e: + if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: + raise PermissionError('Command requires elevation') from e + result = "N/A" + logging.debug("Failed to reset compute partition on gpu %s | %s", gpu_id, e.get_error_info()) + self.logger.store_output(args.gpu, 'reset_compute_partition', result) + if args.memory_partition: + try: + amdsmi_interface.amdsmi_reset_gpu_memory_partition(args.gpu) + result = 'Successfully reset memory partition' + except amdsmi_exception.AmdSmiLibraryException as e: + if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: + raise PermissionError('Command requires elevation') from e + result = "N/A" + logging.debug("Failed to reset memory partition on gpu %s | %s", gpu_id, e.get_error_info()) + self.logger.store_output(args.gpu, 'reset_memory_partition', result) if multiple_devices: self.logger.store_multiple_device_output() diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py b/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py index f4914e01a4..9fbc4398ab 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py @@ -342,6 +342,20 @@ class AMDSMIHelpers(): return perf_levels_str, perf_levels_int + def get_compute_partition_types(self): + compute_partitions_str = [partition.name for partition in amdsmi_interface.AmdSmiComputePartitionType] + if 'INVALID' in compute_partitions_str: + compute_partitions_str.remove('INVALID') + return compute_partitions_str + + + def get_memory_partition_types(self): + memory_partitions_str = [partition.name for partition in amdsmi_interface.AmdSmiMemoryPartitionType] + if 'UNKNOWN' in memory_partitions_str: + memory_partitions_str.remove('UNKNOWN') + return memory_partitions_str + + def get_clock_types(self): clock_types_str = [clock.name for clock in amdsmi_interface.AmdSmiClkType] clock_types_int = list(set(clock.value for clock in amdsmi_interface.AmdSmiClkType)) diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_parser.py b/projects/amdsmi/amdsmi_cli/amdsmi_parser.py index 97c3a53573..098fe2f629 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_parser.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_parser.py @@ -311,6 +311,7 @@ class AMDSMIParser(argparse.ArgumentParser): # Options arguments help text for Hypervisors and Baremetal ras_help = "Displays RAS features information" numa_help = "All numa node information" # Linux Baremetal only + partition_help = "Partition information" # Options arguments help text for Hypervisors dfc_help = "All DFC FW table information" @@ -339,6 +340,7 @@ class AMDSMIParser(argparse.ArgumentParser): # Options to display on Hypervisors and Baremetal if self.helpers.is_hypervisor() or self.helpers.is_baremetal(): static_parser.add_argument('-r', '--ras', action='store_true', required=False, help=ras_help) + static_parser.add_argument('-p', '--partition', action='store_true', required=False, help=partition_help) if self.helpers.is_linux(): static_parser.add_argument('-l', '--limit', action='store_true', required=False, help=limit_help) static_parser.add_argument('-u', '--numa', action='store_true', required=False, help=numa_help) @@ -615,11 +617,13 @@ class AMDSMIParser(argparse.ArgumentParser): \nA set argument must be provided; Multiple set arguments are accepted" set_value_optionals_title = "Set Arguments" - # Help text for Arguments only on Guest and BM platforms + # Help text for Arguments only on BM platforms set_fan_help = "Sets GPU fan speed (0-255 or 0-100%%)" set_perf_level_help = "Sets performance level" set_profile_help = "Set power profile level (#) or a quoted string of custom profile attributes" set_perf_det_help = "Sets GPU clock frequency limit and performance level to determinism to get minimal performance variation" + set_compute_partition_help = "Sets compute partition mode" + set_memory_partition_help = "Sets memory partition mode" # Create set_value subparser set_value_parser = subparsers.add_parser('set', help=set_value_help, description=set_value_subcommand_help) @@ -633,9 +637,11 @@ class AMDSMIParser(argparse.ArgumentParser): # Optional Args set_value_parser.add_argument('-f', '--fan', action=self._validate_fan_speed(), required=False, help=set_fan_help, metavar='%') - set_value_parser.add_argument('-l', '--perflevel', action='store', choices=self.helpers.get_perf_levels()[0], type=str.upper, required=False, help=set_perf_level_help, metavar='LEVEL') + set_value_parser.add_argument('-l', '--perf-level', action='store', choices=self.helpers.get_perf_levels()[0], type=str.upper, required=False, help=set_perf_level_help, metavar='LEVEL') set_value_parser.add_argument('-P', '--profile', action='store', required=False, help=set_profile_help, metavar='SETPROFILE') - set_value_parser.add_argument('-d', '--perfdeterminism', action='store', type=self._positive_int, required=False, help=set_perf_det_help, metavar='SCLKMAX') + set_value_parser.add_argument('-d', '--perf-determinism', action='store', type=self._positive_int, required=False, help=set_perf_det_help, metavar='SCLKMAX') + set_value_parser.add_argument('-C', '--compute-partition', action='store', choices=self.helpers.get_compute_partition_types(), type=str.upper, required=False, help=set_compute_partition_help, metavar='PARTITION') + set_value_parser.add_argument('-M', '--memory-partition', action='store', choices=self.helpers.get_memory_partition_types(), type=str.upper, required=False, help=set_memory_partition_help, metavar='PARTITION') def _validate_set_clock(self, validate_clock_type=True): @@ -744,11 +750,13 @@ class AMDSMIParser(argparse.ArgumentParser): # Help text for Arguments only on Guest and BM platforms gpureset_help = "Reset the specified GPU" - resetclocks_help = "Reset clocks and overdrive to default" - resetfans_help = "Reset fans to automatic (driver) control" - resetprofile_help = "Reset power profile back to default" - resetxgmierr_help = "Reset XGMI error counts" - resetperfdet_help = "Disable performance determinism" + reset_clocks_help = "Reset clocks and overdrive to default" + reset_fans_help = "Reset fans to automatic (driver) control" + reset_profile_help = "Reset power profile back to default" + reset_xgmierr_help = "Reset XGMI error counts" + reset_perfdet_help = "Disable performance determinism" + reset_compute_help = "Reset compute partitions on the specified GPU" + reset_memory_help = "Reset memory partitions on the specified GPU" # Create reset subparser reset_parser = subparsers.add_parser('reset', help=reset_help, description=reset_subcommand_help) @@ -762,11 +770,13 @@ class AMDSMIParser(argparse.ArgumentParser): # Optional Args reset_parser.add_argument('-G', '--gpureset', action='store_true', required=False, help=gpureset_help) - reset_parser.add_argument('-c', '--clocks', action='store_true', required=False, help=resetclocks_help) - reset_parser.add_argument('-f', '--fans', action='store_true', required=False, help=resetfans_help) - reset_parser.add_argument('-p', '--profile', action='store_true', required=False, help=resetprofile_help) - reset_parser.add_argument('-x', '--xgmierr', action='store_true', required=False, help=resetxgmierr_help) - reset_parser.add_argument('-d', '--perfdeterminism', action='store_true', required=False, help=resetperfdet_help) + reset_parser.add_argument('-c', '--clocks', action='store_true', required=False, help=reset_clocks_help) + reset_parser.add_argument('-f', '--fans', action='store_true', required=False, help=reset_fans_help) + reset_parser.add_argument('-p', '--profile', action='store_true', required=False, help=reset_profile_help) + reset_parser.add_argument('-x', '--xgmierr', action='store_true', required=False, help=reset_xgmierr_help) + reset_parser.add_argument('-d', '--perf-determinism', action='store_true', required=False, help=reset_perfdet_help) + reset_parser.add_argument('-C', '--compute-partition', action='store_true', required=False, help=reset_compute_help) + reset_parser.add_argument('-M', '--memory-partition', action='store_true', required=False, help=reset_memory_help) def _add_rocm_smi_parser(self, subparsers, func):