Updated monitor --pcie to use gpu_metrics pcie bandwidth

Signed-off-by: Maisam Arif <maisarif@amd.com>
Change-Id: Id37aebc0297317edcd0f459a4817f56a6030d902
This commit is contained in:
Maisam Arif
2024-04-24 06:11:32 -05:00
committed by Maisam Arif
parent a0d0210761
commit 25ef420407
3 changed files with 17 additions and 32 deletions
+9
View File
@@ -12,6 +12,15 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/](
### Changed
- **Updated `amd-smi monitor --pcie` output**
The source for the PCIe bandwidth monitor output was a legacy file that is no longer supported and was causing delays in the monitor command. The output no longer reports TX/RX; it now reports the instantaneous bandwidth from gpu_metrics instead. Updated output:
```shell
$ amd-smi monitor --pcie
GPU PCIE_BW
0 26 Mb/s
```
- **Updated `amd-smi metric --ecc-blocks` output**
The ECC blocks argument was outputting blocks without counters available; updated the filtering to show only blocks for which counters are available:
+6 -30
View File
@@ -4285,38 +4285,14 @@ class AMDSMICommands():
self.logger.table_header += 'VRAM_TOTAL'.rjust(12)
if args.pcie:
try:
pcie_bw = amdsmi_interface.amdsmi_get_gpu_pci_throughput(args.gpu)
sent = pcie_bw['sent'] * pcie_bw['max_pkt_sz']
received = pcie_bw['received'] * pcie_bw['max_pkt_sz']
bw_unit = "Mb/s"
packet_size_unit = "B"
if sent > 0:
sent = sent // 1024 // 1024
if received > 0:
received = received // 1024 // 1024
if self.logger.is_human_readable_format():
sent = f"{sent} {bw_unit}"
received = f"{received} {bw_unit}"
pcie_bw['max_pkt_sz'] = f"{pcie_bw['max_pkt_sz']} {packet_size_unit}"
if self.logger.is_json_format():
sent = {"value" : sent,
"unit" : bw_unit}
received = {"value" : received,
"unit" : bw_unit}
pcie_bw['max_pkt_sz'] = {"value" : pcie_bw['max_pkt_sz'],
"unit" : packet_size_unit}
monitor_values['pcie_tx'] = sent
monitor_values['pcie_rx'] = received
pcie_info = amdsmi_interface.amdsmi_get_pcie_info(args.gpu)['pcie_metric']
pcie_bw_unit = 'Mb/s'
monitor_values['pcie_bw'] = self.helpers.unit_format(self.logger, pcie_info['pcie_bandwidth'], pcie_bw_unit)
except amdsmi_exception.AmdSmiLibraryException as e:
monitor_values['pcie_tx'] = "N/A"
monitor_values['pcie_rx'] = "N/A"
logging.debug("Failed to get pci throughput on gpu %s | %s", gpu_id, e.get_error_info())
monitor_values['pcie_bw'] = "N/A"
logging.debug("Failed to get pci bandwidth on gpu %s | %s", gpu_id, e.get_error_info())
self.logger.table_header += 'PCIE_TX'.rjust(10)
self.logger.table_header += 'PCIE_RX'.rjust(10)
self.logger.table_header += 'PCIE_BW'.rjust(10)
self.logger.store_output(args.gpu, 'values', monitor_values)
+2 -2
View File
@@ -1110,7 +1110,7 @@ class AMDSMIParser(argparse.ArgumentParser):
throttle_help = "Monitor thermal throttle status"
ecc_help = "Monitor ECC single bit, ECC double bit, and PCIe replay error counts"
mem_usage_help = "Monitor memory usage in MB"
pcie_throughput_help = "Monitor PCIe Tx/Rx in MB/s"
pcie_bandwidth_help = "Monitor PCIe bandwidth in Mb/s"
# Create monitor subparser
monitor_parser = subparsers.add_parser('monitor', help=monitor_help, description=monitor_subcommand_help)
@@ -1133,7 +1133,7 @@ class AMDSMIParser(argparse.ArgumentParser):
monitor_parser.add_argument('-s', '--throttle-status', action='store_true', required=False, help=throttle_help)
monitor_parser.add_argument('-e', '--ecc', action='store_true', required=False, help=ecc_help)
monitor_parser.add_argument('-v', '--vram-usage', action='store_true', required=False, help=mem_usage_help)
monitor_parser.add_argument('-r', '--pcie', action='store_true', required=False, help=pcie_throughput_help)
monitor_parser.add_argument('-r', '--pcie', action='store_true', required=False, help=pcie_bandwidth_help)
def _add_rocm_smi_parser(self, subparsers, func):