Added memory & compute partitions to cli tool

Signed-off-by: Maisam Arif <maisarif@amd.com>
Change-Id: I0db6e2b9e3ae2e19397a012e095173ec550c1e42


[ROCm/amdsmi commit: a0c2735343]
Этот коммит содержится в:
Maisam Arif
2023-10-13 04:57:34 -05:00
коммит произвёл Maisam Arif
родитель 7ada0bba5a
Коммит 12e45f96da
4 изменённых файлов: 135 добавлений и 37 удалений
+14 -8
Просмотреть файл
@@ -160,9 +160,9 @@ Command Modifiers:
amd-smi static --help
usage: amd-smi static [-h] [--json | --csv] [--file FILE]
[--loglevel {DEBUG,INFO,WARNING,ERROR,CRITICAL}] [-g GPU [GPU ...]]
[-a] [-b] [-V] [-d] [-r] [-v] [-B] [-l] [-u]
[-a] [-b] [-V] [-d] [-v] [-c] [-B] [-r] [-p] [-l] [-u]
If no GPU is specified, returns static information for all GPUs on the system.
If no GPU is specified, returns static information for all GPUs on the system.
If no static argument is provided, all static information will be displayed.
Static Arguments:
@@ -174,10 +174,11 @@ Static Arguments:
-b, --bus All bus information
-V, --vbios All video bios information (if available)
-d, --driver Displays driver version
-r, --ras Displays RAS features information
-v, --vram All vram information
-c, --cache All cache information
-B, --board All board information
-r, --ras Displays RAS features information
-p, --partition Partition information
-l, --limit All limit metric values (i.e. power and thermal limits)
-u, --numa All numa node information
@@ -314,7 +315,8 @@ Command Modifiers:
amd-smi set --help
usage: amd-smi set [-h] [--json | --csv] [--file FILE]
[--loglevel {DEBUG,INFO,WARNING,ERROR,CRITICAL}] -g GPU [GPU ...]
[-f %] [-l LEVEL] [-P SETPROFILE] [-d SCLKMAX]
[-f %] [-l LEVEL] [-P SETPROFILE] [-d SCLKMAX] [-C PARTITION]
[-M PARTITION]
A GPU must be specified to set a configuration.
A set argument must be provided; Multiple set arguments are accepted
@@ -325,9 +327,11 @@ Set Arguments:
ID:0 | BDF:0000:23:00.0 | UUID:c4ff73bf-0000-1000-80ff-ffffffffffff
all | Selects all devices
-f %, --fan % Sets GPU fan speed (0-255 or 0-100%)
-l LEVEL, --perflevel LEVEL Sets performance level
-l LEVEL, --perf-level LEVEL Sets performance level
-P SETPROFILE, --profile SETPROFILE Set power profile level (#) or a quoted string of custom profile attributes
-d SCLKMAX, --perfdeterminism SCLKMAX Sets GPU clock frequency limit and performance level to determinism to get minimal performance variation
-d SCLKMAX, --perf-determinism SCLKMAX Sets GPU clock frequency limit and performance level to determinism to get minimal performance variation
-C PARTITION, --compute-partition PARTITION Sets compute partition mode
-M PARTITION, --memory-partition PARTITION Sets memory partition mode
Command Modifiers:
--json Displays output in JSON format (human readable by default).
@@ -340,7 +344,7 @@ Command Modifiers:
amd-smi reset --help
usage: amd-smi reset [-h] [--json | --csv] [--file FILE]
[--loglevel {DEBUG,INFO,WARNING,ERROR,CRITICAL}] -g GPU [GPU ...]
[-G] [-c] [-f] [-p] [-x] [-d]
[-G] [-c] [-f] [-p] [-x] [-d] [-C] [-M]
A GPU must be specified to reset a configuration.
A reset argument must be provided; Multiple reset arguments are accepted
@@ -355,7 +359,9 @@ Reset Arguments:
-f, --fans Reset fans to automatic (driver) control
-p, --profile Reset power profile back to default
-x, --xgmierr Reset XGMI error counts
-d, --perfdeterminism Disable performance determinism
-d, --perf-determinism Disable performance determinism
-C, --compute-partition Reset compute partitions on the specified GPU
-M, --memory-partition Reset memory partitions on the specified GPU
Command Modifiers:
--json Displays output in JSON format (human readable by default).
+84 -16
Просмотреть файл
@@ -130,8 +130,8 @@ class AMDSMICommands():
def static(self, args, multiple_devices=False, gpu=None, asic=None,
bus=None, vbios=None, limit=None, driver=None,
ras=None, board=None, numa=None, vram=None, cache=None):
bus=None, vbios=None, limit=None, driver=None, ras=None,
board=None, numa=None, vram=None, cache=None, partition=None):
"""Get Static information for target gpu
Args:
@@ -148,6 +148,7 @@ class AMDSMICommands():
numa (bool, optional): Value override for args.numa. Defaults to None.
vram (bool, optional): Value override for args.vram. Defaults to None.
cache (bool, optional): Value override for args.cache. Defaults to None.
partition (bool, optional): Value override for args.partition. Defaults to None.
Raises:
IndexError: Index error if gpu list is empty
@@ -177,6 +178,8 @@ class AMDSMICommands():
if self.helpers.is_linux() and self.helpers.is_baremetal():
if ras:
args.ras = ras
if partition:
args.partition = partition
if limit:
args.limit = limit
@@ -192,8 +195,8 @@ class AMDSMICommands():
# If all arguments are False, it means that no argument was passed and the entire static should be printed
if self.helpers.is_linux() and self.helpers.is_baremetal():
if not any([args.asic, args.bus, args.vbios, args.limit, args.board, args.ras, args.driver, args.numa, args.vram, args.cache]):
args.asic = args.bus = args.vbios = args.limit = args.board = args.ras = args.driver = args.numa = args.vram = args.cache = self.all_arguments = True
if not any([args.asic, args.bus, args.vbios, args.limit, args.board, args.ras, args.driver, args.numa, args.vram, args.cache, args.partition]):
args.asic = args.bus = args.vbios = args.limit = args.board = args.ras = args.driver = args.numa = args.vram = args.cache = args.partition = self.all_arguments = True
if self.helpers.is_linux() and self.helpers.is_virtual_os():
if not any([args.asic, args.bus, args.vbios, args.board, args.driver, args.vram, args.cache]):
args.asic = args.bus = args.vbios = args.board = args.driver = args.vram = args.cache = self.all_arguments = True
@@ -448,6 +451,7 @@ class AMDSMICommands():
logging.debug("Failed to get cache info for gpu %s | %s", gpu_id, e.get_error_info())
static_dict['cache'] = cache_info
if self.helpers.is_hypervisor() or self.helpers.is_baremetal():
if args.ras:
ras_dict = {"eeprom_version": "N/A",
@@ -469,6 +473,22 @@ class AMDSMICommands():
logging.debug("Failed to get ras block features for gpu %s | %s", gpu_id, e.get_error_info())
static_dict["ras"] = ras_dict
if args.partition:
try:
compute_partition = amdsmi_interface.amdsmi_dev_compute_partition_get(args.gpu)
except amdsmi_exception.AmdSmiLibraryException as e:
compute_partition = "N/A"
logging.debug("Failed to get compute partition info for gpu %s | %s", gpu_id, e.get_error_info())
try:
memory_partition = amdsmi_interface.amdsmi_dev_memory_partition_get(args.gpu)
except amdsmi_exception.AmdSmiLibraryException as e:
memory_partition = "N/A"
logging.debug("Failed to get memory partition info for gpu %s | %s", gpu_id, e.get_error_info())
static_dict['partition'] = {"compute_partition": compute_partition,
"memory_partition": memory_partition}
if self.helpers.is_linux() and self.helpers.is_baremetal():
if args.numa:
try:
@@ -1711,8 +1731,9 @@ class AMDSMICommands():
self.logger.print_output(multiple_device_enabled=True)
def set_value(self, args, multiple_devices=False, gpu=None, fan=None, perflevel=None,
profile=None, perfdeterminism=None):
def set_value(self, args, multiple_devices=False, gpu=None, fan=None, perf_level=None,
profile=None, perfdeterminism=None, compute_partition=None,
memory_partition=None):
"""Issue set commands to target gpu(s)
Args:
@@ -1720,9 +1741,11 @@ class AMDSMICommands():
multiple_devices (bool, optional): True if checking for multiple devices. Defaults to False.
gpu (device_handle, optional): device_handle for target device. Defaults to None.
fan (int, optional): Value override for args.fan. Defaults to None.
perflevel (amdsmi_interface.AmdSmiDevPerfLevel, optional): Value override for args.perflevel. Defaults to None.
perf_level (amdsmi_interface.AmdSmiDevPerfLevel, optional): Value override for args.perf_level. Defaults to None.
profile (bool, optional): Value override for args.profile. Defaults to None.
perfdeterminism (int, optional): Value override for args.perfdeterminism. Defaults to None.
compute_partition (amdsmi_interface.AmdSmiComputePartitionType, optional): Value override for args.compute_partition. Defaults to None.
memory_partition (amdsmi_interface.AmdSmiMemoryPartitionType, optional): Value override for args.memory_partition. Defaults to None.
Raises:
ValueError: Value error if no gpu value is provided
@@ -1736,12 +1759,16 @@ class AMDSMICommands():
args.gpu = gpu
if fan:
args.fan = fan
if perflevel:
args.perflevel = perflevel
if perf_level:
args.perf_level = perf_level
if profile:
args.profile = profile
if perfdeterminism:
args.perfdeterminism = perfdeterminism
if compute_partition:
args.compute_partition = compute_partition
if memory_partition:
args.memory_partition = memory_partition
# Handle No GPU passed
if args.gpu == None:
@@ -1775,16 +1802,16 @@ class AMDSMICommands():
raise ValueError(f"Unable to set fan speed {args.fan} on {gpu_string}") from e
self.logger.store_output(args.gpu, 'fan', f"Successfully set fan speed {args.fan}")
if args.perflevel:
perf_level = amdsmi_interface.AmdSmiDevPerfLevel[args.perflevel]
if args.perf_level:
perf_level = amdsmi_interface.AmdSmiDevPerfLevel[args.perf_level]
try:
amdsmi_interface.amdsmi_set_gpu_perf_level(args.gpu, perf_level)
except amdsmi_exception.AmdSmiLibraryException as e:
if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM:
raise PermissionError('Command requires elevation') from e
raise ValueError(f"Unable to set performance level {args.perflevel} on {gpu_string}") from e
raise ValueError(f"Unable to set performance level {args.perf_level} on {gpu_string}") from e
self.logger.store_output(args.gpu, 'perflevel', f"Successfully set performance level {args.perflevel}")
self.logger.store_output(args.gpu, 'perflevel', f"Successfully set performance level {args.perf_level}")
if args.profile:
self.logger.store_output(args.gpu, 'profile', "Not Yet Implemented")
if isinstance(args.perfdeterminism, int):
@@ -1796,7 +1823,22 @@ class AMDSMICommands():
raise ValueError(f"Unable to set performance determinism and clock frequency to {args.perfdeterminism} on {gpu_string}") from e
self.logger.store_output(args.gpu, 'perfdeterminism', f"Successfully enabled performance determinism and set GFX clock frequency to {args.perfdeterminism}")
if args.compute_partition:
compute_partition = amdsmi_interface.AmdSmiComputePartitionType[args.compute_partition]
try:
amdsmi_interface.amdsmi_dev_compute_partition_set(args.gpu, compute_partition)
except amdsmi_exception.AmdSmiLibraryException as e:
if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM:
raise PermissionError('Command requires elevation') from e
raise ValueError(f"Unable to set compute partition to {args.compute_partition} on {gpu_string}") from e
if args.memory_partition:
memory_partition = amdsmi_interface.AmdSmiMemoryPartitionType[args.memory_partition]
try:
amdsmi_interface.amdsmi_dev_memory_partition_set(args.gpu, memory_partition)
except amdsmi_exception.AmdSmiLibraryException as e:
if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM:
raise PermissionError('Command requires elevation') from e
raise ValueError(f"Unable to set memory partition to {args.memory_partition} on {gpu_string}") from e
if multiple_devices:
self.logger.store_multiple_device_output()
return # Skip printing when there are multiple devices
@@ -1805,7 +1847,8 @@ class AMDSMICommands():
def reset(self, args, multiple_devices=False, gpu=None, gpureset=None,
clocks=None, fans=None, profile=None, xgmierr=None, perfdeterminism=None):
clocks=None, fans=None, profile=None, xgmierr=None, perfdeterminism=None,
compute_partition=None, memory_partition=None):
"""Issue reset commands to target gpu(s)
Args:
@@ -1818,6 +1861,8 @@ class AMDSMICommands():
profile (bool, optional): Value override for args.profile. Defaults to None.
xgmierr (bool, optional): Value override for args.xgmierr. Defaults to None.
perfdeterminism (bool, optional): Value override for args.perfdeterminism. Defaults to None.
compute_partition (bool, optional): Value override for args.compute_partition. Defaults to None.
memory_partition (bool, optional): Value override for args.memory_partition. Defaults to None.
Raises:
ValueError: Value error if no gpu value is provided
@@ -1841,6 +1886,10 @@ class AMDSMICommands():
args.xgmierr = xgmierr
if perfdeterminism:
args.perfdeterminism = perfdeterminism
if compute_partition:
args.compute_partition = compute_partition
if memory_partition:
args.memory_partition = memory_partition
# Handle No GPU passed
if args.gpu == None:
@@ -1958,8 +2007,27 @@ class AMDSMICommands():
raise PermissionError('Command requires elevation') from e
result = "N/A"
logging.debug("Failed to set perf level on gpu %s | %s", gpu_id, e.get_error_info())
self.logger.store_output(args.gpu, 'reset_perf_determinism', result)
if args.compute_partition:
try:
amdsmi_interface.amdsmi_reset_gpu_compute_partition(args.gpu)
result = 'Successfully reset compute partition'
except amdsmi_exception.AmdSmiLibraryException as e:
if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM:
raise PermissionError('Command requires elevation') from e
result = "N/A"
logging.debug("Failed to reset compute partition on gpu %s | %s", gpu_id, e.get_error_info())
self.logger.store_output(args.gpu, 'reset_compute_partition', result)
if args.memory_partition:
try:
amdsmi_interface.amdsmi_reset_gpu_memory_partition(args.gpu)
result = 'Successfully reset memory partition'
except amdsmi_exception.AmdSmiLibraryException as e:
if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM:
raise PermissionError('Command requires elevation') from e
result = "N/A"
logging.debug("Failed to reset memory partition on gpu %s | %s", gpu_id, e.get_error_info())
self.logger.store_output(args.gpu, 'reset_memory_partition', result)
if multiple_devices:
self.logger.store_multiple_device_output()
+14
Просмотреть файл
@@ -342,6 +342,20 @@ class AMDSMIHelpers():
return perf_levels_str, perf_levels_int
def get_compute_partition_types(self):
    """Return the selectable compute partition names.

    Collects the member names of amdsmi_interface.AmdSmiComputePartitionType,
    leaving out the 'INVALID' entry when the enum defines one.

    Returns:
        list[str]: Partition type names, in enum iteration order.
    """
    return [
        member.name
        for member in amdsmi_interface.AmdSmiComputePartitionType
        if member.name != 'INVALID'
    ]
def get_memory_partition_types(self):
    """Return the selectable memory partition names.

    Collects the member names of amdsmi_interface.AmdSmiMemoryPartitionType,
    leaving out the 'UNKNOWN' entry when the enum defines one.

    Returns:
        list[str]: Partition type names, in enum iteration order.
    """
    return [
        member.name
        for member in amdsmi_interface.AmdSmiMemoryPartitionType
        if member.name != 'UNKNOWN'
    ]
def get_clock_types(self):
clock_types_str = [clock.name for clock in amdsmi_interface.AmdSmiClkType]
clock_types_int = list(set(clock.value for clock in amdsmi_interface.AmdSmiClkType))
+23 -13
Просмотреть файл
@@ -311,6 +311,7 @@ class AMDSMIParser(argparse.ArgumentParser):
# Options arguments help text for Hypervisors and Baremetal
ras_help = "Displays RAS features information"
numa_help = "All numa node information" # Linux Baremetal only
partition_help = "Partition information"
# Options arguments help text for Hypervisors
dfc_help = "All DFC FW table information"
@@ -339,6 +340,7 @@ class AMDSMIParser(argparse.ArgumentParser):
# Options to display on Hypervisors and Baremetal
if self.helpers.is_hypervisor() or self.helpers.is_baremetal():
static_parser.add_argument('-r', '--ras', action='store_true', required=False, help=ras_help)
static_parser.add_argument('-p', '--partition', action='store_true', required=False, help=partition_help)
if self.helpers.is_linux():
static_parser.add_argument('-l', '--limit', action='store_true', required=False, help=limit_help)
static_parser.add_argument('-u', '--numa', action='store_true', required=False, help=numa_help)
@@ -615,11 +617,13 @@ class AMDSMIParser(argparse.ArgumentParser):
\nA set argument must be provided; Multiple set arguments are accepted"
set_value_optionals_title = "Set Arguments"
# Help text for Arguments only on Guest and BM platforms
# Help text for Arguments only on BM platforms
set_fan_help = "Sets GPU fan speed (0-255 or 0-100%%)"
set_perf_level_help = "Sets performance level"
set_profile_help = "Set power profile level (#) or a quoted string of custom profile attributes"
set_perf_det_help = "Sets GPU clock frequency limit and performance level to determinism to get minimal performance variation"
set_compute_partition_help = "Sets compute partition mode"
set_memory_partition_help = "Sets memory partition mode"
# Create set_value subparser
set_value_parser = subparsers.add_parser('set', help=set_value_help, description=set_value_subcommand_help)
@@ -633,9 +637,11 @@ class AMDSMIParser(argparse.ArgumentParser):
# Optional Args
set_value_parser.add_argument('-f', '--fan', action=self._validate_fan_speed(), required=False, help=set_fan_help, metavar='%')
set_value_parser.add_argument('-l', '--perflevel', action='store', choices=self.helpers.get_perf_levels()[0], type=str.upper, required=False, help=set_perf_level_help, metavar='LEVEL')
set_value_parser.add_argument('-l', '--perf-level', action='store', choices=self.helpers.get_perf_levels()[0], type=str.upper, required=False, help=set_perf_level_help, metavar='LEVEL')
set_value_parser.add_argument('-P', '--profile', action='store', required=False, help=set_profile_help, metavar='SETPROFILE')
set_value_parser.add_argument('-d', '--perfdeterminism', action='store', type=self._positive_int, required=False, help=set_perf_det_help, metavar='SCLKMAX')
set_value_parser.add_argument('-d', '--perf-determinism', action='store', type=self._positive_int, required=False, help=set_perf_det_help, metavar='SCLKMAX')
set_value_parser.add_argument('-C', '--compute-partition', action='store', choices=self.helpers.get_compute_partition_types(), type=str.upper, required=False, help=set_compute_partition_help, metavar='PARTITION')
set_value_parser.add_argument('-M', '--memory-partition', action='store', choices=self.helpers.get_memory_partition_types(), type=str.upper, required=False, help=set_memory_partition_help, metavar='PARTITION')
def _validate_set_clock(self, validate_clock_type=True):
@@ -744,11 +750,13 @@ class AMDSMIParser(argparse.ArgumentParser):
# Help text for Arguments only on Guest and BM platforms
gpureset_help = "Reset the specified GPU"
resetclocks_help = "Reset clocks and overdrive to default"
resetfans_help = "Reset fans to automatic (driver) control"
resetprofile_help = "Reset power profile back to default"
resetxgmierr_help = "Reset XGMI error counts"
resetperfdet_help = "Disable performance determinism"
reset_clocks_help = "Reset clocks and overdrive to default"
reset_fans_help = "Reset fans to automatic (driver) control"
reset_profile_help = "Reset power profile back to default"
reset_xgmierr_help = "Reset XGMI error counts"
reset_perfdet_help = "Disable performance determinism"
reset_compute_help = "Reset compute partitions on the specified GPU"
reset_memory_help = "Reset memory partitions on the specified GPU"
# Create reset subparser
reset_parser = subparsers.add_parser('reset', help=reset_help, description=reset_subcommand_help)
@@ -762,11 +770,13 @@ class AMDSMIParser(argparse.ArgumentParser):
# Optional Args
reset_parser.add_argument('-G', '--gpureset', action='store_true', required=False, help=gpureset_help)
reset_parser.add_argument('-c', '--clocks', action='store_true', required=False, help=resetclocks_help)
reset_parser.add_argument('-f', '--fans', action='store_true', required=False, help=resetfans_help)
reset_parser.add_argument('-p', '--profile', action='store_true', required=False, help=resetprofile_help)
reset_parser.add_argument('-x', '--xgmierr', action='store_true', required=False, help=resetxgmierr_help)
reset_parser.add_argument('-d', '--perfdeterminism', action='store_true', required=False, help=resetperfdet_help)
reset_parser.add_argument('-c', '--clocks', action='store_true', required=False, help=reset_clocks_help)
reset_parser.add_argument('-f', '--fans', action='store_true', required=False, help=reset_fans_help)
reset_parser.add_argument('-p', '--profile', action='store_true', required=False, help=reset_profile_help)
reset_parser.add_argument('-x', '--xgmierr', action='store_true', required=False, help=reset_xgmierr_help)
reset_parser.add_argument('-d', '--perf-determinism', action='store_true', required=False, help=reset_perfdet_help)
reset_parser.add_argument('-C', '--compute-partition', action='store_true', required=False, help=reset_compute_help)
reset_parser.add_argument('-M', '--memory-partition', action='store_true', required=False, help=reset_memory_help)
def _add_rocm_smi_parser(self, subparsers, func):