[SWDEV-533305] Remove partition info from amd-smi static (-p/--partition still available) + CLI API call cleanup (#529)

Updates:
- Separate extra API calls from the amd-smi CLI so that they are made only by the specific CLI commands that need them.
- Remove extra current_compute_partition SYSFS calls from amd-smi static.
- Remove the partition information from the default `amd-smi static` CLI command.
- Users must now use the `-p` argument to view partition information with `amd-smi static`.
- The help text for the `partition` argument has been updated to reflect this change.
- The partition information can still be accessed using the `amd-smi partition -c -m` or `sudo amd-smi partition -a` commands.

---------

Signed-off-by: Charis Poag <Charis.Poag@amd.com>
This commit is contained in:
Poag, Charis
2025-07-07 11:21:46 -05:00
committed by GitHub
parent 8039ab9449
commit 88473b7fd0
5 changed files with 112 additions and 62 deletions
+10
View File
@@ -268,6 +268,16 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/pr
### Optimized
- **Reduced the number of API calls the amd-smi CLI makes before reading or (re)setting GPU features**.
- Every amd-smi CLI command now makes fewer API calls. Previously, even when a user
only read a GPU's status (for example), the CLI would also poll for other information that is helpful for the set/reset
CLI calls. This change improves the overall run-time performance of the CLI tool.
- **Removed partition information from the default `amd-smi static` CLI command**.
- Users can still retrieve the same data by calling `amd-smi`, `amd-smi static -p`, or `amd-smi partition -c -m`/`sudo amd-smi partition -a`.
***Reason for this change***:
Reading `current_compute_partition` may momentarily wake the GPU up. This is due to reading XCD registers, which is expected behavior. Changing partitions is not a trivial operation; the `current_compute_partition` SYSFS entry controls this action.
- **Optimized CLI command `amd-smi topology` in partition mode**.
- Reduced the number of `amdsmi_topo_get_p2p_status` API calls to one-fourth of the previous count.
+2 -1
View File
@@ -96,7 +96,8 @@ if __name__ == "__main__":
amd_smi_commands.xgmi,
amd_smi_commands.partition,
amd_smi_commands.ras,
amd_smi_commands.default)
amd_smi_commands.default,
sys_argv=sys.argv)
try:
try:
argcomplete.autocomplete(amd_smi_parser)
+42 -29
View File
@@ -404,13 +404,27 @@ class AMDSMICommands():
if args.clock == []:
args.clock = True
# Store args that are applicable to the current platform
# Store args that are applicable to the current platform (default arguments)
current_platform_args = ["asic", "bus", "vbios", "driver", "ras",
"vram", "cache", "board", "process_isolation",
"clock", "partition"]
"clock"]
current_platform_values = [args.asic, args.bus, args.vbios, args.driver, args.ras,
args.vram, args.cache, args.board, args.process_isolation,
args.clock, args.partition]
args.clock]
# amd-smi static default arguments:
# Exclude args that are not applicable to the current platform,
# but allow output if argument is passed.
#
# Note: Partition is a special case, it is no longer an amd-smi static
# default argument.
# Reason: Reading current_compute_partition may momentarily wake the
# GPU up. This is due to reading XCD registers, which is expected
# behavior. Changing partitions is not a trivial operation,
# current_compute_partition SYSFS controls this action.
if args.partition:
current_platform_args += ["partition"]
current_platform_values += [args.partition]
if not self.group_check_printed:
self.helpers.check_required_groups()
@@ -453,9 +467,12 @@ class AMDSMICommands():
# Get gpu_id for logging
gpu_id = self.helpers.get_gpu_id_from_device_handle(args.gpu)
logging.debug("=====================================================================")
logging.debug(f"Static Arg information for GPU {gpu_id} on {self.helpers.os_info()}")
logging.debug(f"Applicable Args: {current_platform_args}")
logging.debug(f"Arg Values: {current_platform_values}")
logging.debug(f"Function args: {args}")
logging.debug(f"Current platform args: {current_platform_args}")
logging.debug(f"Current platform values: {current_platform_values}")
logging.debug("=====================================================================")
# Populate static dictionary for each enabled argument
static_dict = {}
@@ -757,30 +774,26 @@ class AMDSMICommands():
logging.debug("Failed to get ras block features for gpu %s | %s", gpu_id, e.get_error_info())
static_dict["ras"] = ras_dict
if 'partition' in current_platform_args:
if args.partition:
try:
compute_partition = amdsmi_interface.amdsmi_get_gpu_compute_partition(args.gpu)
except amdsmi_exception.AmdSmiLibraryException as e:
compute_partition = "N/A"
logging.debug("Failed to get compute partition info for gpu %s | %s", gpu_id, e.get_error_info())
try:
memory_partition = amdsmi_interface.amdsmi_get_gpu_memory_partition(args.gpu)
except amdsmi_exception.AmdSmiLibraryException as e:
memory_partition = "N/A"
logging.debug("Failed to get memory partition info for gpu %s | %s", gpu_id, e.get_error_info())
try:
kfd_info = amdsmi_interface.amdsmi_get_gpu_kfd_info(args.gpu)
partition_id = kfd_info['current_partition_id']
except amdsmi_exception.AmdSmiLibraryException as e:
partition_id = "N/A"
logging.debug("Failed to get partition ID for gpu %s | %s", gpu_id, e.get_error_info())
static_dict['partition'] = {"accelerator_partition": compute_partition,
"memory_partition": memory_partition,
"partition_id": partition_id}
if args.partition:
try:
compute_partition = amdsmi_interface.amdsmi_get_gpu_compute_partition(args.gpu)
except amdsmi_exception.AmdSmiLibraryException as e:
compute_partition = "N/A"
logging.debug("Failed to get compute partition info for gpu %s | %s", gpu_id, e.get_error_info())
try:
memory_partition = amdsmi_interface.amdsmi_get_gpu_memory_partition(args.gpu)
except amdsmi_exception.AmdSmiLibraryException as e:
memory_partition = "N/A"
logging.debug("Failed to get memory partition info for gpu %s | %s", gpu_id, e.get_error_info())
try:
kfd_info = amdsmi_interface.amdsmi_get_gpu_kfd_info(args.gpu)
partition_id = kfd_info['current_partition_id']
except amdsmi_exception.AmdSmiLibraryException as e:
partition_id = "N/A"
logging.debug("Failed to get partition ID for gpu %s | %s", gpu_id, e.get_error_info())
static_dict['partition'] = {"accelerator_partition": compute_partition,
"memory_partition": memory_partition,
"partition_id": partition_id}
if 'soc_pstate' in current_platform_args:
if args.soc_pstate:
try:
+57 -21
View File
@@ -69,7 +69,7 @@ class AMDSMIParser(argparse.ArgumentParser):
"""
def __init__(self, version, list, static, firmware, bad_pages, metric,
process, profile, event, topology, set_value, reset, monitor,
xgmi, partition, ras, default):
xgmi, partition, ras, default, sys_argv=None):
# Helper variables
self.helpers = AMDSMIHelpers()
@@ -118,25 +118,58 @@ class AMDSMIParser(argparse.ArgumentParser):
'reset', 'monitor', 'dmon', 'xgmi', 'partition', 'ras', 'default']
# Add all subparsers
self._add_version_parser(self.subparsers, version)
self._add_list_parser(self.subparsers, list)
self._add_static_parser(self.subparsers, static)
self._add_firmware_parser(self.subparsers, firmware)
self._add_bad_pages_parser(self.subparsers, bad_pages)
self._add_metric_parser(self.subparsers, metric)
self._add_process_parser(self.subparsers, process)
self._add_profile_parser(self.subparsers, profile)
self._add_event_parser(self.subparsers, event)
self._add_topology_parser(self.subparsers, topology)
self._add_set_value_parser(self.subparsers, set_value)
self._add_reset_parser(self.subparsers, reset)
self._add_monitor_parser(self.subparsers, monitor)
self._add_xgmi_parser(self.subparsers, xgmi)
self._add_partition_parser(self.subparsers, partition)
self._add_ras_parser(self.subparsers, ras)
# the default command
self._add_default_parser(self.subparsers, default)
if sys_argv is not None:
if any(arg in sys_argv for arg in ['--help', '-h']):
self._add_version_parser(self.subparsers, version)
self._add_list_parser(self.subparsers, list)
self._add_static_parser(self.subparsers, static)
self._add_firmware_parser(self.subparsers, firmware)
self._add_bad_pages_parser(self.subparsers, bad_pages)
self._add_metric_parser(self.subparsers, metric)
self._add_process_parser(self.subparsers, process)
self._add_profile_parser(self.subparsers, profile)
self._add_event_parser(self.subparsers, event)
self._add_topology_parser(self.subparsers, topology)
self._add_set_value_parser(self.subparsers, set_value)
self._add_reset_parser(self.subparsers, reset)
self._add_monitor_parser(self.subparsers, monitor)
self._add_xgmi_parser(self.subparsers, xgmi)
self._add_partition_parser(self.subparsers, partition)
self._add_ras_parser(self.subparsers, ras)
elif any(arg in sys_argv for arg in ['version']):
self._add_version_parser(self.subparsers, version)
elif any(arg in sys_argv for arg in ['list']):
self._add_list_parser(self.subparsers, list)
elif any(arg in sys_argv for arg in ['static']):
self._add_static_parser(self.subparsers, static)
elif any(arg in sys_argv for arg in ['firmware', 'ucode']):
self._add_firmware_parser(self.subparsers, firmware)
elif any(arg in sys_argv for arg in ['bad-pages']):
self._add_bad_pages_parser(self.subparsers, bad_pages)
elif any(arg in sys_argv for arg in ['metric']):
self._add_metric_parser(self.subparsers, metric)
elif any(arg in sys_argv for arg in ['process']):
self._add_process_parser(self.subparsers, process)
elif any(arg in sys_argv for arg in ['profile']):
self._add_profile_parser(self.subparsers, profile)
elif any(arg in sys_argv for arg in ['event']):
self._add_event_parser(self.subparsers, event)
elif any(arg in sys_argv for arg in ['topology']):
self._add_topology_parser(self.subparsers, topology)
elif any(arg in sys_argv for arg in ['set', 'reset']):
self._add_set_value_parser(self.subparsers, set_value)
self._add_reset_parser(self.subparsers, reset)
elif any(arg in sys_argv for arg in ['monitor', 'dmon']):
self._add_monitor_parser(self.subparsers, monitor)
elif any(arg in sys_argv for arg in ['xgmi']):
self._add_xgmi_parser(self.subparsers, xgmi)
elif any(arg in sys_argv for arg in ['partition']):
self._add_partition_parser(self.subparsers, partition)
elif any(arg in sys_argv for arg in ['ras']):
self._add_ras_parser(self.subparsers, ras)
else:
# If no subcommand is given, add the default parser
self._add_default_parser(self.subparsers, default)
def _not_negative_int(self, int_value, sub_arg=None):
# Argument type validator
@@ -733,7 +766,10 @@ class AMDSMIParser(argparse.ArgumentParser):
# Might be able to remove Sudo requirement in ROCm 7.0
ras_help = "Displays RAS features information;\n\tSudo may be required for some features"
numa_help = "All numa node information" # Linux Baremetal only
partition_help = "Partition information"
partition_help = "Partition information:\n\t" \
"No longer available in default output.\n\tArgument is required to display." \
"\n\tEx. `amd-smi static -p` or use" \
"\n\t`amd-smi partition -c -m`/`sudo amd-smi partition -a`"
# Options arguments help text for Hypervisors
dfc_help = "All DFC FW table information"
+1 -11
View File
@@ -6816,18 +6816,9 @@ rsmi_dev_partition_id_get(uint32_t dv_ind, uint32_t *partition_id) {
return RSMI_STATUS_INVALID_ARGS;
}
DEVICE_MUTEX
std::string strCompPartition = "UNKNOWN";
const uint32_t PARTITION_LEN = 10;
char compute_partition[PARTITION_LEN];
compute_partition[0] = '\0';
rsmi_status_t ret = rsmi_dev_compute_partition_get(dv_ind, compute_partition, PARTITION_LEN);
if (ret == RSMI_STATUS_SUCCESS) {
strCompPartition.clear();
strCompPartition = compute_partition;
}
uint64_t pci_id = UINT64_MAX;
*partition_id = UINT32_MAX;
ret = rsmi_dev_pci_id_get(dv_ind, &pci_id);
rsmi_status_t ret = rsmi_dev_pci_id_get(dv_ind, &pci_id);
if (ret == RSMI_STATUS_SUCCESS) {
*partition_id = static_cast<uint32_t>((pci_id >> 28) & 0xf);
}
@@ -6869,7 +6860,6 @@ rsmi_dev_partition_id_get(uint32_t dv_ind, uint32_t *partition_id) {
<< " | ======= end ======= "
<< " | Success"
<< " | Device #: " << dv_ind
<< " | Compute Partition: " << strCompPartition
<< " | Type: partition_id"
<< " | Data: " << static_cast<int>(*partition_id)
<< " | Returning = "