From 88473b7fd0d34790bca09fcc3607ea79e3a837e0 Mon Sep 17 00:00:00 2001 From: "Poag, Charis" Date: Mon, 7 Jul 2025 11:21:46 -0500 Subject: [PATCH] [SWDEV-533305] Remove partition info from `amd-smi static` (-p/--partition still available) + CLI API call cleanup (#529) Updates: - Separate extra APIs calls from amd-smi CLI to target specific CLI commands that need them. - Remove extra current_compute_partition SYSFS calls from amd-smi static. - Remove the partition information from the default `amd-smi static` CLI command. - Users must now use the `-p` argument to view partition information with `amd-smi static`. - The help text for the `partition` argument has been updated to reflect this change. - The partition information can still be accessed using the `amd-smi partition -c -m` or `sudo amd-smi partition -a` commands. --------- Signed-off-by: Charis Poag --- CHANGELOG.md | 10 +++++ amdsmi_cli/amdsmi_cli.py | 3 +- amdsmi_cli/amdsmi_commands.py | 71 ++++++++++++++++++------------- amdsmi_cli/amdsmi_parser.py | 78 +++++++++++++++++++++++++---------- rocm_smi/src/rocm_smi.cc | 12 +----- 5 files changed, 112 insertions(+), 62 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3bf3868545..338c2799ec 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -268,6 +268,16 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/pr ### Optimized +- **Reduced amd-smi's CLI's API calls needed to be called before reading or (re)setting GPU features**. + - Now when users call any amd-smi CLI command, we have reduced the APIs needed to be called. Previously, + when a user would read a GPU's status, (for example) we would poll for other information helpful for our sets/reset + CLI calls. This change will increase overall run-time performance of the CLI tool. + +- **Removed partition information from the default `amd-smi static` CLI command**. + - Users can still retrieve the same data by calling `amd-smi`, `amd-smi static -p`, or `amd-smi partition -c -m`/`sudo amd-smi partition -a`. + ***Reason for this change***: + Reading current_compute_partition may momentarily wake the GPU up. This is due to reading XCD registers, which is expected behavior. Changing partitions is not a trivial operation, `current_compute_partition` SYSFS controls this action. + - **Optimized CLI command `amd-smi topology` in partition mode**. - Reduced the number of `amdsmi_topo_get_p2p_status` API calls to one fourth. diff --git a/amdsmi_cli/amdsmi_cli.py b/amdsmi_cli/amdsmi_cli.py index 553858ea28..fe8ed5151d 100755 --- a/amdsmi_cli/amdsmi_cli.py +++ b/amdsmi_cli/amdsmi_cli.py @@ -96,7 +96,8 @@ if __name__ == "__main__": amd_smi_commands.xgmi, amd_smi_commands.partition, amd_smi_commands.ras, - amd_smi_commands.default) + amd_smi_commands.default, + sys_argv=sys.argv) try: try: argcomplete.autocomplete(amd_smi_parser) diff --git a/amdsmi_cli/amdsmi_commands.py b/amdsmi_cli/amdsmi_commands.py index eeafcd1567..1134f0af4d 100644 --- a/amdsmi_cli/amdsmi_commands.py +++ b/amdsmi_cli/amdsmi_commands.py @@ -404,13 +404,27 @@ class AMDSMICommands(): if args.clock == []: args.clock = True - # Store args that are applicable to the current platform + # Store args that are applicable to the current platform (default arguments) current_platform_args = ["asic", "bus", "vbios", "driver", "ras", "vram", "cache", "board", "process_isolation", - "clock", "partition"] + "clock"] current_platform_values = [args.asic, args.bus, args.vbios, args.driver, args.ras, args.vram, args.cache, args.board, args.process_isolation, - args.clock, args.partition] + args.clock] + + # amd-smi static default arguments: + # Exclude args that are not applicable to the current platform, + # but allow output if argument is passed. + # + # Note: Partition is a special case, it is no longer an amd-smi static + # default argument. + # Reason: Reading current_compute_partition may momentarily wake the + # GPU up. This is due to reading XCD registers, which is expected + # behavior. Changing partitions is not a trivial operation, + # current_compute_partition SYSFS controls this action. + if args.partition: + current_platform_args += ["partition"] + current_platform_values += [args.partition] if not self.group_check_printed: self.helpers.check_required_groups() @@ -453,9 +467,12 @@ class AMDSMICommands(): # Get gpu_id for logging gpu_id = self.helpers.get_gpu_id_from_device_handle(args.gpu) + logging.debug("=====================================================================") logging.debug(f"Static Arg information for GPU {gpu_id} on {self.helpers.os_info()}") - logging.debug(f"Applicable Args: {current_platform_args}") - logging.debug(f"Arg Values: {current_platform_values}") + logging.debug(f"Function args: {args}") + logging.debug(f"Current platform args: {current_platform_args}") + logging.debug(f"Current platform values: {current_platform_values}") + logging.debug("=====================================================================") # Populate static dictionary for each enabled argument static_dict = {} @@ -757,30 +774,26 @@ class AMDSMICommands(): logging.debug("Failed to get ras block features for gpu %s | %s", gpu_id, e.get_error_info()) static_dict["ras"] = ras_dict - if 'partition' in current_platform_args: - if args.partition: - try: - compute_partition = amdsmi_interface.amdsmi_get_gpu_compute_partition(args.gpu) - except amdsmi_exception.AmdSmiLibraryException as e: - compute_partition = "N/A" - logging.debug("Failed to get compute partition info for gpu %s | %s", gpu_id, e.get_error_info()) - - try: - memory_partition = amdsmi_interface.amdsmi_get_gpu_memory_partition(args.gpu) - except amdsmi_exception.AmdSmiLibraryException as e: - memory_partition = "N/A" - logging.debug("Failed to get memory partition info for gpu %s | %s", gpu_id, e.get_error_info()) - - try: - kfd_info = amdsmi_interface.amdsmi_get_gpu_kfd_info(args.gpu) - partition_id = kfd_info['current_partition_id'] - except amdsmi_exception.AmdSmiLibraryException as e: - partition_id = "N/A" - logging.debug("Failed to get partition ID for gpu %s | %s", gpu_id, e.get_error_info()) - - static_dict['partition'] = {"accelerator_partition": compute_partition, - "memory_partition": memory_partition, - "partition_id": partition_id} + if args.partition: + try: + compute_partition = amdsmi_interface.amdsmi_get_gpu_compute_partition(args.gpu) + except amdsmi_exception.AmdSmiLibraryException as e: + compute_partition = "N/A" + logging.debug("Failed to get compute partition info for gpu %s | %s", gpu_id, e.get_error_info()) + try: + memory_partition = amdsmi_interface.amdsmi_get_gpu_memory_partition(args.gpu) + except amdsmi_exception.AmdSmiLibraryException as e: + memory_partition = "N/A" + logging.debug("Failed to get memory partition info for gpu %s | %s", gpu_id, e.get_error_info()) + try: + kfd_info = amdsmi_interface.amdsmi_get_gpu_kfd_info(args.gpu) + partition_id = kfd_info['current_partition_id'] + except amdsmi_exception.AmdSmiLibraryException as e: + partition_id = "N/A" + logging.debug("Failed to get partition ID for gpu %s | %s", gpu_id, e.get_error_info()) + static_dict['partition'] = {"accelerator_partition": compute_partition, + "memory_partition": memory_partition, + "partition_id": partition_id} if 'soc_pstate' in current_platform_args: if args.soc_pstate: try: diff --git a/amdsmi_cli/amdsmi_parser.py b/amdsmi_cli/amdsmi_parser.py index 3986845590..c451bb7f79 100644 --- a/amdsmi_cli/amdsmi_parser.py +++ b/amdsmi_cli/amdsmi_parser.py @@ -69,7 +69,7 @@ class AMDSMIParser(argparse.ArgumentParser): """ def __init__(self, version, list, static, firmware, bad_pages, metric, process, profile, event, topology, set_value, reset, monitor, - xgmi, partition, ras, default): + xgmi, partition, ras, default, sys_argv=None): # Helper variables self.helpers = AMDSMIHelpers() @@ -118,25 +118,58 @@ class AMDSMIParser(argparse.ArgumentParser): 'reset', 'monitor', 'dmon', 'xgmi', 'partition', 'ras', 'default'] # Add all subparsers - self._add_version_parser(self.subparsers, version) - self._add_list_parser(self.subparsers, list) - self._add_static_parser(self.subparsers, static) - self._add_firmware_parser(self.subparsers, firmware) - self._add_bad_pages_parser(self.subparsers, bad_pages) - self._add_metric_parser(self.subparsers, metric) - self._add_process_parser(self.subparsers, process) - self._add_profile_parser(self.subparsers, profile) - self._add_event_parser(self.subparsers, event) - self._add_topology_parser(self.subparsers, topology) - self._add_set_value_parser(self.subparsers, set_value) - self._add_reset_parser(self.subparsers, reset) - self._add_monitor_parser(self.subparsers, monitor) - self._add_xgmi_parser(self.subparsers, xgmi) - self._add_partition_parser(self.subparsers, partition) - self._add_ras_parser(self.subparsers, ras) - - # the default command - self._add_default_parser(self.subparsers, default) + if sys_argv is not None: + if any(arg in sys_argv for arg in ['--help', '-h']): + self._add_version_parser(self.subparsers, version) + self._add_list_parser(self.subparsers, list) + self._add_static_parser(self.subparsers, static) + self._add_firmware_parser(self.subparsers, firmware) + self._add_bad_pages_parser(self.subparsers, bad_pages) + self._add_metric_parser(self.subparsers, metric) + self._add_process_parser(self.subparsers, process) + self._add_profile_parser(self.subparsers, profile) + self._add_event_parser(self.subparsers, event) + self._add_topology_parser(self.subparsers, topology) + self._add_set_value_parser(self.subparsers, set_value) + self._add_reset_parser(self.subparsers, reset) + self._add_monitor_parser(self.subparsers, monitor) + self._add_xgmi_parser(self.subparsers, xgmi) + self._add_partition_parser(self.subparsers, partition) + self._add_ras_parser(self.subparsers, ras) + elif any(arg in sys_argv for arg in ['version']): + self._add_version_parser(self.subparsers, version) + elif any(arg in sys_argv for arg in ['list']): + self._add_list_parser(self.subparsers, list) + elif any(arg in sys_argv for arg in ['static']): + self._add_static_parser(self.subparsers, static) + elif any(arg in sys_argv for arg in ['firmware', 'ucode']): + self._add_firmware_parser(self.subparsers, firmware) + elif any(arg in sys_argv for arg in ['bad-pages']): + self._add_bad_pages_parser(self.subparsers, bad_pages) + elif any(arg in sys_argv for arg in ['metric']): + self._add_metric_parser(self.subparsers, metric) + elif any(arg in sys_argv for arg in ['process']): + self._add_process_parser(self.subparsers, process) + elif any(arg in sys_argv for arg in ['profile']): + self._add_profile_parser(self.subparsers, profile) + elif any(arg in sys_argv for arg in ['event']): + self._add_event_parser(self.subparsers, event) + elif any(arg in sys_argv for arg in ['topology']): + self._add_topology_parser(self.subparsers, topology) + elif any(arg in sys_argv for arg in ['set', 'reset']): + self._add_set_value_parser(self.subparsers, set_value) + self._add_reset_parser(self.subparsers, reset) + elif any(arg in sys_argv for arg in ['monitor', 'dmon']): + self._add_monitor_parser(self.subparsers, monitor) + elif any(arg in sys_argv for arg in ['xgmi']): + self._add_xgmi_parser(self.subparsers, xgmi) + elif any(arg in sys_argv for arg in ['partition']): + self._add_partition_parser(self.subparsers, partition) + elif any(arg in sys_argv for arg in ['ras']): + self._add_ras_parser(self.subparsers, ras) + else: + # If no subcommand is given, add the default parser + self._add_default_parser(self.subparsers, default) def _not_negative_int(self, int_value, sub_arg=None): # Argument type validator @@ -733,7 +766,10 @@ class AMDSMIParser(argparse.ArgumentParser): # Might be able to remove Sudo requirement in ROCm 7.0 ras_help = "Displays RAS features information;\n\tSudo may be required for some features" numa_help = "All numa node information" # Linux Baremetal only - partition_help = "Partition information" + partition_help = "Partition information:\n\t" \ + "No longer available in default output.\n\tArgument is required to display." \ + "\n\tEx. `amd-smi static -p` or use" \ + "\n\t`amd-smi partition -c -m`/`sudo amd-smi partition -a`" # Options arguments help text for Hypervisors dfc_help = "All DFC FW table information" diff --git a/rocm_smi/src/rocm_smi.cc b/rocm_smi/src/rocm_smi.cc index ec03a85238..8b24aa46a8 100644 --- a/rocm_smi/src/rocm_smi.cc +++ b/rocm_smi/src/rocm_smi.cc @@ -6816,18 +6816,9 @@ rsmi_dev_partition_id_get(uint32_t dv_ind, uint32_t *partition_id) { return RSMI_STATUS_INVALID_ARGS; } DEVICE_MUTEX - std::string strCompPartition = "UNKNOWN"; - const uint32_t PARTITION_LEN = 10; - char compute_partition[PARTITION_LEN]; - compute_partition[0] = '\0'; - rsmi_status_t ret = rsmi_dev_compute_partition_get(dv_ind, compute_partition, PARTITION_LEN); - if (ret == RSMI_STATUS_SUCCESS) { - strCompPartition.clear(); - strCompPartition = compute_partition; - } uint64_t pci_id = UINT64_MAX; *partition_id = UINT32_MAX; - ret = rsmi_dev_pci_id_get(dv_ind, &pci_id); + rsmi_status_t ret = rsmi_dev_pci_id_get(dv_ind, &pci_id); if (ret == RSMI_STATUS_SUCCESS) { *partition_id = static_cast((pci_id >> 28) & 0xf); } @@ -6869,7 +6860,6 @@ rsmi_dev_partition_id_get(uint32_t dv_ind, uint32_t *partition_id) { << " | ======= end ======= " << " | Success" << " | Device #: " << dv_ind - << " | Compute Partition: " << strCompPartition << " | Type: partition_id" << " | Data: " << static_cast(*partition_id) << " | Returning = "