From 25ef420407a93ea29a6f8444a5e5d95033b55bc9 Mon Sep 17 00:00:00 2001 From: Maisam Arif Date: Wed, 24 Apr 2024 06:11:32 -0500 Subject: [PATCH] Updated monitor --pcie to use gpu_metrics pcie bandwidth Signed-off-by: Maisam Arif Change-Id: Id37aebc0297317edcd0f459a4817f56a6030d902 --- CHANGELOG.md | 9 +++++++++ amdsmi_cli/amdsmi_commands.py | 36 ++++++----------------------------- amdsmi_cli/amdsmi_parser.py | 4 ++-- 3 files changed, 17 insertions(+), 32 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4d09bb5ea9..c0afea2e29 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,15 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/]( ### Changed +- **Updated `amd-smi monitor --pcie` output** +The previous source for the PCIe bandwidth monitor output was a legacy file that is no longer supported and was causing delays in the monitor command. The output no longer reports separate TX/RX values; it now reports the instantaneous PCIe bandwidth from gpu_metrics. Updated output: + +```shell +$ amd-smi monitor --pcie +GPU PCIE_BW + 0 26 Mb/s +``` + - **Updated `amd-smi metric --ecc-blocks` output** The ecc blocks arguement was outputing blocks without counters available, updated the filtering show blocks that counters are available for: diff --git a/amdsmi_cli/amdsmi_commands.py b/amdsmi_cli/amdsmi_commands.py index 7d157ee969..492cba6572 100644 --- a/amdsmi_cli/amdsmi_commands.py +++ b/amdsmi_cli/amdsmi_commands.py @@ -4285,38 +4285,14 @@ class AMDSMICommands(): self.logger.table_header += 'VRAM_TOTAL'.rjust(12) if args.pcie: try: - pcie_bw = amdsmi_interface.amdsmi_get_gpu_pci_throughput(args.gpu) - sent = pcie_bw['sent'] * pcie_bw['max_pkt_sz'] - received = pcie_bw['received'] * pcie_bw['max_pkt_sz'] - - bw_unit = "Mb/s" - packet_size_unit = "B" - if sent > 0: - sent = sent // 1024 // 1024 - if received > 0: - received = received // 1024 // 1024 - - if self.logger.is_human_readable_format(): - sent = f"{sent} {bw_unit}" - received = f"{received} {bw_unit}" - 
pcie_bw['max_pkt_sz'] = f"{pcie_bw['max_pkt_sz']} {packet_size_unit}" - if self.logger.is_json_format(): - sent = {"value" : sent, - "unit" : bw_unit} - received = {"value" : received, - "unit" : bw_unit} - pcie_bw['max_pkt_sz'] = {"value" : pcie_bw['max_pkt_sz'], - "unit" : packet_size_unit} - - monitor_values['pcie_tx'] = sent - monitor_values['pcie_rx'] = received + pcie_info = amdsmi_interface.amdsmi_get_pcie_info(args.gpu)['pcie_metric'] + pcie_bw_unit = 'Mb/s' + monitor_values['pcie_bw'] = self.helpers.unit_format(self.logger, pcie_info['pcie_bandwidth'], pcie_bw_unit) except amdsmi_exception.AmdSmiLibraryException as e: - monitor_values['pcie_tx'] = "N/A" - monitor_values['pcie_rx'] = "N/A" - logging.debug("Failed to get pci throughput on gpu %s | %s", gpu_id, e.get_error_info()) + monitor_values['pcie_bw'] = "N/A" + logging.debug("Failed to get pci bandwidth on gpu %s | %s", gpu_id, e.get_error_info()) - self.logger.table_header += 'PCIE_TX'.rjust(10) - self.logger.table_header += 'PCIE_RX'.rjust(10) + self.logger.table_header += 'PCIE_BW'.rjust(10) self.logger.store_output(args.gpu, 'values', monitor_values) diff --git a/amdsmi_cli/amdsmi_parser.py b/amdsmi_cli/amdsmi_parser.py index f1dae73d29..4b11188b03 100644 --- a/amdsmi_cli/amdsmi_parser.py +++ b/amdsmi_cli/amdsmi_parser.py @@ -1110,7 +1110,7 @@ class AMDSMIParser(argparse.ArgumentParser): throttle_help = "Monitor thermal throttle status" ecc_help = "Monitor ECC single bit, ECC double bit, and PCIe replay error counts" mem_usage_help = "Monitor memory usage in MB" - pcie_throughput_help = "Monitor PCIe Tx/Rx in MB/s" + pcie_bandwidth_help = "Monitor PCIe bandwidth in Mb/s" # Create monitor subparser monitor_parser = subparsers.add_parser('monitor', help=monitor_help, description=monitor_subcommand_help) @@ -1133,7 +1133,7 @@ class AMDSMIParser(argparse.ArgumentParser): monitor_parser.add_argument('-s', '--throttle-status', action='store_true', required=False, help=throttle_help) 
monitor_parser.add_argument('-e', '--ecc', action='store_true', required=False, help=ecc_help) monitor_parser.add_argument('-v', '--vram-usage', action='store_true', required=False, help=mem_usage_help) - monitor_parser.add_argument('-r', '--pcie', action='store_true', required=False, help=pcie_throughput_help) + monitor_parser.add_argument('-r', '--pcie', action='store_true', required=False, help=pcie_bandwidth_help) def _add_rocm_smi_parser(self, subparsers, func):