[SWDEV-482412] Optimized PCIe Bandwidth gpu_metrics calls

Signed-off-by: Maisam Arif <Maisam.Arif@amd.com>
Change-Id: Ib37d232b94a080e9b490dd065628d2567aaf4642


[ROCm/amdsmi commit: 787d4462fa]
Этот коммит содержится в:
Maisam Arif
2024-09-11 23:26:30 -05:00
родитель ae9d7a098c
Коммит cabbdf44cb
7 изменённых файлов: 123 добавлений и 103 удалений
+4 -1
Просмотреть файл
@@ -274,7 +274,8 @@ GPU: 1
### Optimizations
- N/A
- **Adjusted ordering of gpu_metrics calls to ensure that pcie_bw values remain stable in `amd-smi metric` & `amd-smi monitor`**.
With this change, additional padding was added to the PCIE_BW column of `amd-smi monitor --pcie`.
### Resolved issues
@@ -331,6 +332,8 @@ GPU POWER GPU_TEMP MEM_TEMP VRAM_USED VRAM_TOTAL
31 227 W 51 °C 49 °C 283 MB 196300 MB
```
- **Fixed incorrect implementation of the Python API `amdsmi_get_gpu_metrics_header_info()`**.
### Known issues
- N/A
+111 -94
Просмотреть файл
@@ -361,7 +361,6 @@ class AMDSMICommands():
static_dict["asic"] = "N/A"
logging.debug("Failed to get asic info for gpu %s | %s", gpu_id, e.get_error_info())
# static["asic"] = "N/A"
try:
subsystem_id = amdsmi_interface.amdsmi_get_gpu_subsystem_id(args.gpu)
if static_dict["asic"] != "N/A":
@@ -1299,13 +1298,19 @@ class AMDSMICommands():
# Get gpu_id for logging
gpu_id = self.helpers.get_gpu_id_from_device_handle(args.gpu)
# Put the metrics table in the debug logs
try:
gpu_metric_debug_info = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)
gpu_metric_str = json.dumps(gpu_metric_debug_info, indent=4)
logging.debug("GPU Metrics table for %s | %s", gpu_id, gpu_metric_str)
except amdsmi_exception.AmdSmiLibraryException as e:
logging.debug("Unabled to load GPU Metrics table for %s | %s", gpu_id, e.err_info)
if args.loglevel == "DEBUG":
try:
# Get GPU Metrics table version
gpu_metric_version_info = amdsmi_interface.amdsmi_get_gpu_metrics_header_info(args.gpu)
gpu_metric_version_str = json.dumps(gpu_metric_version_info, indent=4)
logging.debug("GPU Metrics table Version for GPU %s | %s", gpu_id, gpu_metric_version_str)
# Get GPU Metrics table
gpu_metric_debug_info = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)
gpu_metric_str = json.dumps(gpu_metric_debug_info, indent=4)
logging.debug("GPU Metrics table for GPU %s | %s", gpu_id, gpu_metric_str)
except amdsmi_exception.AmdSmiLibraryException as e:
logging.debug("Unabled to load GPU Metrics table for %s | %s", gpu_id, e.err_info)
logging.debug(f"Metric Arg information for GPU {gpu_id} on {self.helpers.os_info()}")
logging.debug(f"Args: {current_platform_args}")
@@ -1319,6 +1324,88 @@ class AMDSMICommands():
# Add timestamp and store values for specified arguments
values_dict = {}
# Populate the pcie_dict first due to multiple gpu metrics calls incorrectly increasing bandwidth
if "pcie" in current_platform_args:
if args.pcie:
pcie_dict = {"width": "N/A",
"speed": "N/A",
"bandwidth": "N/A",
"replay_count" : "N/A",
"l0_to_recovery_count" : "N/A",
"replay_roll_over_count" : "N/A",
"nak_sent_count" : "N/A",
"nak_received_count" : "N/A",
"current_bandwidth_sent": "N/A",
"current_bandwidth_received": "N/A",
"max_packet_size": "N/A"}
try:
pcie_metric = amdsmi_interface.amdsmi_get_pcie_info(args.gpu)['pcie_metric']
logging.debug("PCIE Metric for %s | %s", gpu_id, pcie_metric)
pcie_dict['width'] = pcie_metric['pcie_width']
if pcie_metric['pcie_speed'] != "N/A":
if pcie_metric['pcie_speed'] % 1000 != 0:
pcie_speed_GTs_value = round(pcie_metric['pcie_speed'] / 1000, 1)
else:
pcie_speed_GTs_value = round(pcie_metric['pcie_speed'] / 1000)
pcie_dict['speed'] = pcie_speed_GTs_value
pcie_dict['bandwidth'] = pcie_metric['pcie_bandwidth']
pcie_dict['replay_count'] = pcie_metric['pcie_replay_count']
pcie_dict['l0_to_recovery_count'] = pcie_metric['pcie_l0_to_recovery_count']
pcie_dict['replay_roll_over_count'] = pcie_metric['pcie_replay_roll_over_count']
pcie_dict['nak_received_count'] = pcie_metric['pcie_nak_received_count']
pcie_dict['nak_sent_count'] = pcie_metric['pcie_nak_sent_count']
pcie_speed_unit = 'GT/s'
pcie_bw_unit = 'Mb/s'
if self.logger.is_human_readable_format():
if pcie_dict['speed'] != "N/A":
pcie_dict['speed'] = f"{pcie_dict['speed']} {pcie_speed_unit}"
if pcie_dict['bandwidth'] != "N/A":
pcie_dict['bandwidth'] = f"{pcie_dict['bandwidth']} {pcie_bw_unit}"
if self.logger.is_json_format():
if pcie_dict['speed'] != "N/A":
pcie_dict['speed'] = {"value" : pcie_dict['speed'],
"unit" : pcie_speed_unit}
if pcie_dict['bandwidth'] != "N/A":
pcie_dict['bandwidth'] = {"value" : pcie_dict['bandwidth'],
"unit" : pcie_bw_unit}
except amdsmi_exception.AmdSmiLibraryException as e:
logging.debug("Failed to get pcie link status for gpu %s | %s", gpu_id, e.get_error_info())
try:
pcie_bw = amdsmi_interface.amdsmi_get_gpu_pci_throughput(args.gpu)
sent = pcie_bw['sent'] * pcie_bw['max_pkt_sz']
received = pcie_bw['received'] * pcie_bw['max_pkt_sz']
bw_unit = "Mb/s"
packet_size_unit = "B"
if sent > 0:
sent = sent // 1024 // 1024
if received > 0:
received = received // 1024 // 1024
if self.logger.is_human_readable_format():
sent = f"{sent} {bw_unit}"
received = f"{received} {bw_unit}"
pcie_bw['max_pkt_sz'] = f"{pcie_bw['max_pkt_sz']} {packet_size_unit}"
if self.logger.is_json_format():
sent = {"value" : sent,
"unit" : bw_unit}
received = {"value" : received,
"unit" : bw_unit}
pcie_bw['max_pkt_sz'] = {"value" : pcie_bw['max_pkt_sz'],
"unit" : packet_size_unit}
pcie_dict['current_bandwidth_sent'] = sent
pcie_dict['current_bandwidth_received'] = received
pcie_dict['max_packet_size'] = pcie_bw['max_pkt_sz']
except amdsmi_exception.AmdSmiLibraryException as e:
logging.debug("Failed to get pcie bandwidth for gpu %s | %s", gpu_id, e.get_error_info())
if "usage" in current_platform_args:
if args.usage:
try:
@@ -1648,89 +1735,12 @@ class AMDSMICommands():
"unit" : temp_unit_json}
values_dict['temperature'] = temperatures
# Since pcie bw may increase with frequent metrics calls, the values are populated first and only added to the output here
if "pcie" in current_platform_args:
if args.pcie:
pcie_dict = {"width": "N/A",
"speed": "N/A",
"bandwidth": "N/A",
"replay_count" : "N/A",
"l0_to_recovery_count" : "N/A",
"replay_roll_over_count" : "N/A",
"nak_sent_count" : "N/A",
"nak_received_count" : "N/A",
"current_bandwidth_sent": "N/A",
"current_bandwidth_received": "N/A",
"max_packet_size": "N/A"}
try:
pcie_metric = amdsmi_interface.amdsmi_get_pcie_info(args.gpu)['pcie_metric']
logging.debug("PCIE Metric for %s | %s", gpu_id, pcie_metric)
pcie_dict['width'] = pcie_metric['pcie_width']
if pcie_metric['pcie_speed'] != "N/A":
if pcie_metric['pcie_speed'] % 1000 != 0:
pcie_speed_GTs_value = round(pcie_metric['pcie_speed'] / 1000, 1)
else:
pcie_speed_GTs_value = round(pcie_metric['pcie_speed'] / 1000)
pcie_dict['speed'] = pcie_speed_GTs_value
pcie_dict['bandwidth'] = pcie_metric['pcie_bandwidth']
pcie_dict['replay_count'] = pcie_metric['pcie_replay_count']
pcie_dict['l0_to_recovery_count'] = pcie_metric['pcie_l0_to_recovery_count']
pcie_dict['replay_roll_over_count'] = pcie_metric['pcie_replay_roll_over_count']
pcie_dict['nak_received_count'] = pcie_metric['pcie_nak_received_count']
pcie_dict['nak_sent_count'] = pcie_metric['pcie_nak_sent_count']
pcie_speed_unit = 'GT/s'
pcie_bw_unit = 'Mb/s'
if self.logger.is_human_readable_format():
if pcie_dict['speed'] != "N/A":
pcie_dict['speed'] = f"{pcie_dict['speed']} {pcie_speed_unit}"
if pcie_dict['bandwidth'] != "N/A":
pcie_dict['bandwidth'] = f"{pcie_dict['bandwidth']} {pcie_bw_unit}"
if self.logger.is_json_format():
if pcie_dict['speed'] != "N/A":
pcie_dict['speed'] = {"value" : pcie_dict['speed'],
"unit" : pcie_speed_unit}
if pcie_dict['bandwidth'] != "N/A":
pcie_dict['bandwidth'] = {"value" : pcie_dict['bandwidth'],
"unit" : pcie_bw_unit}
except amdsmi_exception.AmdSmiLibraryException as e:
logging.debug("Failed to get pcie link status for gpu %s | %s", gpu_id, e.get_error_info())
try:
pcie_bw = amdsmi_interface.amdsmi_get_gpu_pci_throughput(args.gpu)
sent = pcie_bw['sent'] * pcie_bw['max_pkt_sz']
received = pcie_bw['received'] * pcie_bw['max_pkt_sz']
bw_unit = "Mb/s"
packet_size_unit = "B"
if sent > 0:
sent = sent // 1024 // 1024
if received > 0:
received = received // 1024 // 1024
if self.logger.is_human_readable_format():
sent = f"{sent} {bw_unit}"
received = f"{received} {bw_unit}"
pcie_bw['max_pkt_sz'] = f"{pcie_bw['max_pkt_sz']} {packet_size_unit}"
if self.logger.is_json_format():
sent = {"value" : sent,
"unit" : bw_unit}
received = {"value" : received,
"unit" : bw_unit}
pcie_bw['max_pkt_sz'] = {"value" : pcie_bw['max_pkt_sz'],
"unit" : packet_size_unit}
pcie_dict['current_bandwidth_sent'] = sent
pcie_dict['current_bandwidth_received'] = received
pcie_dict['max_packet_size'] = pcie_bw['max_pkt_sz']
except amdsmi_exception.AmdSmiLibraryException as e:
logging.debug("Failed to get pcie bandwidth for gpu %s | %s", gpu_id, e.get_error_info())
values_dict['pcie'] = pcie_dict
if "ecc" in current_platform_args:
if args.ecc:
ecc_count = {}
@@ -4360,6 +4370,15 @@ class AMDSMICommands():
self.logger.store_output(args.gpu, 'timestamp', int(time.time()))
self.logger.table_header = 'TIMESTAMP'.rjust(10) + ' ' + self.logger.table_header
# Store the pcie_bw values first, since repeated gpu_metrics calls may increase the reported bandwidth
if args.pcie:
try:
pcie_info = amdsmi_interface.amdsmi_get_pcie_info(args.gpu)['pcie_metric']
except amdsmi_exception.AmdSmiLibraryException as e:
pcie_info = "N/A"
logging.debug("Failed to get pci bandwidth on gpu %s | %s", gpu_id, e.get_error_info())
# Resume regular ordering of values
if args.power_usage:
try:
gpu_metrics_info = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)
@@ -4601,15 +4620,13 @@ class AMDSMICommands():
self.logger.table_header += 'VRAM_USED'.rjust(11)
self.logger.table_header += 'VRAM_TOTAL'.rjust(12)
if args.pcie:
try:
pcie_info = amdsmi_interface.amdsmi_get_pcie_info(args.gpu)['pcie_metric']
if pcie_info != "N/A":
pcie_bw_unit = 'Mb/s'
monitor_values['pcie_bw'] = self.helpers.unit_format(self.logger, pcie_info['pcie_bandwidth'], pcie_bw_unit)
except amdsmi_exception.AmdSmiLibraryException as e:
monitor_values['pcie_bw'] = "N/A"
logging.debug("Failed to get pci bandwidth on gpu %s | %s", gpu_id, e.get_error_info())
else:
monitor_values['pcie_bw'] = pcie_info
self.logger.table_header += 'PCIE_BW'.rjust(10)
self.logger.table_header += 'PCIE_BW'.rjust(12)
self.logger.store_output(args.gpu, 'values', monitor_values)
+1 -1
Просмотреть файл
@@ -121,7 +121,7 @@ class AMDSMILogger():
table_values += string_value.rjust(7)
elif key in ('gfx_clock', 'mem_clock', 'encoder_clock', 'decoder_clock', 'vram_used'):
table_values += string_value.rjust(11)
elif key == 'vram_total' or 'ecc' in key:
elif key == 'vram_total' or 'ecc' in key or key == 'pcie_bw':
table_values += string_value.rjust(12)
elif key in ['pcie_replay']:
table_values += string_value.rjust(13)
+1 -1
Просмотреть файл
@@ -810,7 +810,7 @@ except AmdSmiException as e:
### amdsmi_get_pcie_info
Description: Returns the pcie metric and static information for the given GPU.
Description: Returns the pcie metric and static information for the given GPU. For accurate PCIe bandwidth measurements, it is recommended to call this function once per 1000 ms.
It is not supported on virtual machine guests.
Input parameters:
+1 -1
Просмотреть файл
@@ -810,7 +810,7 @@ except AmdSmiException as e:
### amdsmi_get_pcie_info
Description: Returns the pcie metric and static information for the given GPU.
Description: Returns the pcie metric and static information for the given GPU. For accurate PCIe bandwidth measurements, it is recommended to call this function once per 1000 ms.
It is not supported on virtual machine guests.
Input parameters:
+4 -4
Просмотреть файл
@@ -4073,12 +4073,12 @@ def amdsmi_get_gpu_metrics_header_info(
header_info = amdsmi_wrapper.amd_metrics_table_header_t()
_check_res(
amdsmi_wrapper.amdsmi_get_gpu_metrics_header_info(
ctypes.byref(header_info)
processor_handle, ctypes.byref(header_info)
)
)
return {
"structure_size": header_info.structure_size.value,
"format_revision": header_info.format_revision.value,
"content_revision": header_info.content_revision.value
"structure_size": header_info.structure_size,
"format_revision": header_info.format_revision,
"content_revision": header_info.content_revision
}
+1 -1
Просмотреть файл
@@ -45,7 +45,7 @@ signal.signal(signal.SIGINT, signal_handler)
signal.signal(signal.SIGTERM, signal_handler)
atexit.register(amdsmi_shut_down)
gpus = amdsmi_get_socket_handles()
gpus = amdsmi_get_processor_handles()
cpus = amdsmi_get_cpusocket_handles()
print(f"gpus variable populated with:{gpus}")