[SWDEV-533305] Remove partition info from amd-smi static (-p/--partition still available) + CLI API call cleanup (#529)
Updates: - Separate extra APIs calls from amd-smi CLI to target specific CLI commands that need them. - Remove extra current_compute_partition SYSFS calls from amd-smi static. - Remove the partition information from the default `amd-smi static` CLI command. - Users must now use the `-p` argument to view partition information with `amd-smi static`. - The help text for the `partition` argument has been updated to reflect this change. - The partition information can still be accessed using the `amd-smi partition -c -m` or `sudo amd-smi partition -a` commands. --------- Signed-off-by: Charis Poag <Charis.Poag@amd.com>
This commit is contained in:
@@ -268,6 +268,16 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/pr
|
||||
|
||||
### Optimized
|
||||
|
||||
- **Reduced amd-smi's CLI's API calls needed to be called before reading or (re)setting GPU features**.
|
||||
- Now when users call any amd-smi CLI command, we have reduced the APIs needed to be called. Previously,
|
||||
when a user would read a GPU's status, (for example) we would poll for other information helpful for our sets/reset
|
||||
CLI calls. This change will increase overall run-time performance of the CLI tool.
|
||||
|
||||
- **Removed partition information from the default `amd-smi static` CLI command**.
|
||||
- Users can still retrieve the same data by calling `amd-smi`, `amd-smi static -p`, or `amd-smi partition -c -m`/`sudo amd-smi partition -a`.
|
||||
***Reason for this change***:
|
||||
Reading current_compute_partition may momentarily wake the GPU up. This is due to reading XCD registers, which is expected behavior. Changing partitions is not a trivial operation, `current_compute_partition` SYSFS controls this action.
|
||||
|
||||
- **Optimized CLI command `amd-smi topology` in partition mode**.
|
||||
- Reduced the number of `amdsmi_topo_get_p2p_status` API calls to one fourth.
|
||||
|
||||
|
||||
@@ -96,7 +96,8 @@ if __name__ == "__main__":
|
||||
amd_smi_commands.xgmi,
|
||||
amd_smi_commands.partition,
|
||||
amd_smi_commands.ras,
|
||||
amd_smi_commands.default)
|
||||
amd_smi_commands.default,
|
||||
sys_argv=sys.argv)
|
||||
try:
|
||||
try:
|
||||
argcomplete.autocomplete(amd_smi_parser)
|
||||
|
||||
@@ -404,13 +404,27 @@ class AMDSMICommands():
|
||||
if args.clock == []:
|
||||
args.clock = True
|
||||
|
||||
# Store args that are applicable to the current platform
|
||||
# Store args that are applicable to the current platform (default arguments)
|
||||
current_platform_args = ["asic", "bus", "vbios", "driver", "ras",
|
||||
"vram", "cache", "board", "process_isolation",
|
||||
"clock", "partition"]
|
||||
"clock"]
|
||||
current_platform_values = [args.asic, args.bus, args.vbios, args.driver, args.ras,
|
||||
args.vram, args.cache, args.board, args.process_isolation,
|
||||
args.clock, args.partition]
|
||||
args.clock]
|
||||
|
||||
# amd-smi static default arguments:
|
||||
# Exclude args that are not applicable to the current platform,
|
||||
# but allow output if argument is passed.
|
||||
#
|
||||
# Note: Partition is a special case, it is no longer an amd-smi static
|
||||
# default argument.
|
||||
# Reason: Reading current_compute_partition may momentarily wake the
|
||||
# GPU up. This is due to reading XCD registers, which is expected
|
||||
# behavior. Changing partitions is not a trivial operation,
|
||||
# current_compute_partition SYSFS controls this action.
|
||||
if args.partition:
|
||||
current_platform_args += ["partition"]
|
||||
current_platform_values += [args.partition]
|
||||
|
||||
if not self.group_check_printed:
|
||||
self.helpers.check_required_groups()
|
||||
@@ -453,9 +467,12 @@ class AMDSMICommands():
|
||||
# Get gpu_id for logging
|
||||
gpu_id = self.helpers.get_gpu_id_from_device_handle(args.gpu)
|
||||
|
||||
logging.debug("=====================================================================")
|
||||
logging.debug(f"Static Arg information for GPU {gpu_id} on {self.helpers.os_info()}")
|
||||
logging.debug(f"Applicable Args: {current_platform_args}")
|
||||
logging.debug(f"Arg Values: {current_platform_values}")
|
||||
logging.debug(f"Function args: {args}")
|
||||
logging.debug(f"Current platform args: {current_platform_args}")
|
||||
logging.debug(f"Current platform values: {current_platform_values}")
|
||||
logging.debug("=====================================================================")
|
||||
|
||||
# Populate static dictionary for each enabled argument
|
||||
static_dict = {}
|
||||
@@ -757,30 +774,26 @@ class AMDSMICommands():
|
||||
logging.debug("Failed to get ras block features for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
static_dict["ras"] = ras_dict
|
||||
if 'partition' in current_platform_args:
|
||||
if args.partition:
|
||||
try:
|
||||
compute_partition = amdsmi_interface.amdsmi_get_gpu_compute_partition(args.gpu)
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
compute_partition = "N/A"
|
||||
logging.debug("Failed to get compute partition info for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
try:
|
||||
memory_partition = amdsmi_interface.amdsmi_get_gpu_memory_partition(args.gpu)
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
memory_partition = "N/A"
|
||||
logging.debug("Failed to get memory partition info for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
try:
|
||||
kfd_info = amdsmi_interface.amdsmi_get_gpu_kfd_info(args.gpu)
|
||||
partition_id = kfd_info['current_partition_id']
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
partition_id = "N/A"
|
||||
logging.debug("Failed to get partition ID for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
static_dict['partition'] = {"accelerator_partition": compute_partition,
|
||||
"memory_partition": memory_partition,
|
||||
"partition_id": partition_id}
|
||||
if args.partition:
|
||||
try:
|
||||
compute_partition = amdsmi_interface.amdsmi_get_gpu_compute_partition(args.gpu)
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
compute_partition = "N/A"
|
||||
logging.debug("Failed to get compute partition info for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
try:
|
||||
memory_partition = amdsmi_interface.amdsmi_get_gpu_memory_partition(args.gpu)
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
memory_partition = "N/A"
|
||||
logging.debug("Failed to get memory partition info for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
try:
|
||||
kfd_info = amdsmi_interface.amdsmi_get_gpu_kfd_info(args.gpu)
|
||||
partition_id = kfd_info['current_partition_id']
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
partition_id = "N/A"
|
||||
logging.debug("Failed to get partition ID for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
static_dict['partition'] = {"accelerator_partition": compute_partition,
|
||||
"memory_partition": memory_partition,
|
||||
"partition_id": partition_id}
|
||||
if 'soc_pstate' in current_platform_args:
|
||||
if args.soc_pstate:
|
||||
try:
|
||||
|
||||
+57
-21
@@ -69,7 +69,7 @@ class AMDSMIParser(argparse.ArgumentParser):
|
||||
"""
|
||||
def __init__(self, version, list, static, firmware, bad_pages, metric,
|
||||
process, profile, event, topology, set_value, reset, monitor,
|
||||
xgmi, partition, ras, default):
|
||||
xgmi, partition, ras, default, sys_argv=None):
|
||||
|
||||
# Helper variables
|
||||
self.helpers = AMDSMIHelpers()
|
||||
@@ -118,25 +118,58 @@ class AMDSMIParser(argparse.ArgumentParser):
|
||||
'reset', 'monitor', 'dmon', 'xgmi', 'partition', 'ras', 'default']
|
||||
|
||||
# Add all subparsers
|
||||
self._add_version_parser(self.subparsers, version)
|
||||
self._add_list_parser(self.subparsers, list)
|
||||
self._add_static_parser(self.subparsers, static)
|
||||
self._add_firmware_parser(self.subparsers, firmware)
|
||||
self._add_bad_pages_parser(self.subparsers, bad_pages)
|
||||
self._add_metric_parser(self.subparsers, metric)
|
||||
self._add_process_parser(self.subparsers, process)
|
||||
self._add_profile_parser(self.subparsers, profile)
|
||||
self._add_event_parser(self.subparsers, event)
|
||||
self._add_topology_parser(self.subparsers, topology)
|
||||
self._add_set_value_parser(self.subparsers, set_value)
|
||||
self._add_reset_parser(self.subparsers, reset)
|
||||
self._add_monitor_parser(self.subparsers, monitor)
|
||||
self._add_xgmi_parser(self.subparsers, xgmi)
|
||||
self._add_partition_parser(self.subparsers, partition)
|
||||
self._add_ras_parser(self.subparsers, ras)
|
||||
|
||||
# the default command
|
||||
self._add_default_parser(self.subparsers, default)
|
||||
if sys_argv is not None:
|
||||
if any(arg in sys_argv for arg in ['--help', '-h']):
|
||||
self._add_version_parser(self.subparsers, version)
|
||||
self._add_list_parser(self.subparsers, list)
|
||||
self._add_static_parser(self.subparsers, static)
|
||||
self._add_firmware_parser(self.subparsers, firmware)
|
||||
self._add_bad_pages_parser(self.subparsers, bad_pages)
|
||||
self._add_metric_parser(self.subparsers, metric)
|
||||
self._add_process_parser(self.subparsers, process)
|
||||
self._add_profile_parser(self.subparsers, profile)
|
||||
self._add_event_parser(self.subparsers, event)
|
||||
self._add_topology_parser(self.subparsers, topology)
|
||||
self._add_set_value_parser(self.subparsers, set_value)
|
||||
self._add_reset_parser(self.subparsers, reset)
|
||||
self._add_monitor_parser(self.subparsers, monitor)
|
||||
self._add_xgmi_parser(self.subparsers, xgmi)
|
||||
self._add_partition_parser(self.subparsers, partition)
|
||||
self._add_ras_parser(self.subparsers, ras)
|
||||
elif any(arg in sys_argv for arg in ['version']):
|
||||
self._add_version_parser(self.subparsers, version)
|
||||
elif any(arg in sys_argv for arg in ['list']):
|
||||
self._add_list_parser(self.subparsers, list)
|
||||
elif any(arg in sys_argv for arg in ['static']):
|
||||
self._add_static_parser(self.subparsers, static)
|
||||
elif any(arg in sys_argv for arg in ['firmware', 'ucode']):
|
||||
self._add_firmware_parser(self.subparsers, firmware)
|
||||
elif any(arg in sys_argv for arg in ['bad-pages']):
|
||||
self._add_bad_pages_parser(self.subparsers, bad_pages)
|
||||
elif any(arg in sys_argv for arg in ['metric']):
|
||||
self._add_metric_parser(self.subparsers, metric)
|
||||
elif any(arg in sys_argv for arg in ['process']):
|
||||
self._add_process_parser(self.subparsers, process)
|
||||
elif any(arg in sys_argv for arg in ['profile']):
|
||||
self._add_profile_parser(self.subparsers, profile)
|
||||
elif any(arg in sys_argv for arg in ['event']):
|
||||
self._add_event_parser(self.subparsers, event)
|
||||
elif any(arg in sys_argv for arg in ['topology']):
|
||||
self._add_topology_parser(self.subparsers, topology)
|
||||
elif any(arg in sys_argv for arg in ['set', 'reset']):
|
||||
self._add_set_value_parser(self.subparsers, set_value)
|
||||
self._add_reset_parser(self.subparsers, reset)
|
||||
elif any(arg in sys_argv for arg in ['monitor', 'dmon']):
|
||||
self._add_monitor_parser(self.subparsers, monitor)
|
||||
elif any(arg in sys_argv for arg in ['xgmi']):
|
||||
self._add_xgmi_parser(self.subparsers, xgmi)
|
||||
elif any(arg in sys_argv for arg in ['partition']):
|
||||
self._add_partition_parser(self.subparsers, partition)
|
||||
elif any(arg in sys_argv for arg in ['ras']):
|
||||
self._add_ras_parser(self.subparsers, ras)
|
||||
else:
|
||||
# If no subcommand is given, add the default parser
|
||||
self._add_default_parser(self.subparsers, default)
|
||||
|
||||
def _not_negative_int(self, int_value, sub_arg=None):
|
||||
# Argument type validator
|
||||
@@ -733,7 +766,10 @@ class AMDSMIParser(argparse.ArgumentParser):
|
||||
# Might be able to remove Sudo requirement in ROCm 7.0
|
||||
ras_help = "Displays RAS features information;\n\tSudo may be required for some features"
|
||||
numa_help = "All numa node information" # Linux Baremetal only
|
||||
partition_help = "Partition information"
|
||||
partition_help = "Partition information:\n\t" \
|
||||
"No longer available in default output.\n\tArgument is required to display." \
|
||||
"\n\tEx. `amd-smi static -p` or use" \
|
||||
"\n\t`amd-smi partition -c -m`/`sudo amd-smi partition -a`"
|
||||
|
||||
# Options arguments help text for Hypervisors
|
||||
dfc_help = "All DFC FW table information"
|
||||
|
||||
@@ -6816,18 +6816,9 @@ rsmi_dev_partition_id_get(uint32_t dv_ind, uint32_t *partition_id) {
|
||||
return RSMI_STATUS_INVALID_ARGS;
|
||||
}
|
||||
DEVICE_MUTEX
|
||||
std::string strCompPartition = "UNKNOWN";
|
||||
const uint32_t PARTITION_LEN = 10;
|
||||
char compute_partition[PARTITION_LEN];
|
||||
compute_partition[0] = '\0';
|
||||
rsmi_status_t ret = rsmi_dev_compute_partition_get(dv_ind, compute_partition, PARTITION_LEN);
|
||||
if (ret == RSMI_STATUS_SUCCESS) {
|
||||
strCompPartition.clear();
|
||||
strCompPartition = compute_partition;
|
||||
}
|
||||
uint64_t pci_id = UINT64_MAX;
|
||||
*partition_id = UINT32_MAX;
|
||||
ret = rsmi_dev_pci_id_get(dv_ind, &pci_id);
|
||||
rsmi_status_t ret = rsmi_dev_pci_id_get(dv_ind, &pci_id);
|
||||
if (ret == RSMI_STATUS_SUCCESS) {
|
||||
*partition_id = static_cast<uint32_t>((pci_id >> 28) & 0xf);
|
||||
}
|
||||
@@ -6869,7 +6860,6 @@ rsmi_dev_partition_id_get(uint32_t dv_ind, uint32_t *partition_id) {
|
||||
<< " | ======= end ======= "
|
||||
<< " | Success"
|
||||
<< " | Device #: " << dv_ind
|
||||
<< " | Compute Partition: " << strCompPartition
|
||||
<< " | Type: partition_id"
|
||||
<< " | Data: " << static_cast<int>(*partition_id)
|
||||
<< " | Returning = "
|
||||
|
||||
Reference in New Issue
Block a user