[SWDEV-482412] Optimized PCIe Bandwidth gpu_metrics calls
Signed-off-by: Maisam Arif <Maisam.Arif@amd.com>
Change-Id: Ib37d232b94a080e9b490dd065628d2567aaf4642
[ROCm/amdsmi commit: 787d4462fa]
Этот коммит содержится в:
@@ -274,7 +274,8 @@ GPU: 1
|
||||
|
||||
### Optimizations
|
||||
|
||||
- N/A
|
||||
- **Adjusted ordering of gpu_metrics calls to ensure that pcie_bw values remain stable in `amd-smi metric` & `amd-smi monitor`**.
|
||||
With this change, additional padding was added to the PCIE_BW column of `amd-smi monitor --pcie`.
|
||||
|
||||
### Resolved issues
|
||||
|
||||
@@ -331,6 +332,8 @@ GPU POWER GPU_TEMP MEM_TEMP VRAM_USED VRAM_TOTAL
|
||||
31 227 W 51 °C 49 °C 283 MB 196300 MB
|
||||
```
|
||||
|
||||
- **Fixed incorrect implementation of the Python API `amdsmi_get_gpu_metrics_header_info()`**.
|
||||
|
||||
### Known issues
|
||||
|
||||
- N/A
|
||||
|
||||
@@ -361,7 +361,6 @@ class AMDSMICommands():
|
||||
static_dict["asic"] = "N/A"
|
||||
logging.debug("Failed to get asic info for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
# static["asic"] = "N/A"
|
||||
try:
|
||||
subsystem_id = amdsmi_interface.amdsmi_get_gpu_subsystem_id(args.gpu)
|
||||
if static_dict["asic"] != "N/A":
|
||||
@@ -1299,13 +1298,19 @@ class AMDSMICommands():
|
||||
# Get gpu_id for logging
|
||||
gpu_id = self.helpers.get_gpu_id_from_device_handle(args.gpu)
|
||||
|
||||
# Put the metrics table in the debug logs
|
||||
try:
|
||||
gpu_metric_debug_info = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)
|
||||
gpu_metric_str = json.dumps(gpu_metric_debug_info, indent=4)
|
||||
logging.debug("GPU Metrics table for %s | %s", gpu_id, gpu_metric_str)
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
logging.debug("Unabled to load GPU Metrics table for %s | %s", gpu_id, e.err_info)
|
||||
if args.loglevel == "DEBUG":
|
||||
try:
|
||||
# Get GPU Metrics table version
|
||||
gpu_metric_version_info = amdsmi_interface.amdsmi_get_gpu_metrics_header_info(args.gpu)
|
||||
gpu_metric_version_str = json.dumps(gpu_metric_version_info, indent=4)
|
||||
logging.debug("GPU Metrics table Version for GPU %s | %s", gpu_id, gpu_metric_version_str)
|
||||
|
||||
# Get GPU Metrics table
|
||||
gpu_metric_debug_info = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)
|
||||
gpu_metric_str = json.dumps(gpu_metric_debug_info, indent=4)
|
||||
logging.debug("GPU Metrics table for GPU %s | %s", gpu_id, gpu_metric_str)
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
logging.debug("Unabled to load GPU Metrics table for %s | %s", gpu_id, e.err_info)
|
||||
|
||||
logging.debug(f"Metric Arg information for GPU {gpu_id} on {self.helpers.os_info()}")
|
||||
logging.debug(f"Args: {current_platform_args}")
|
||||
@@ -1319,6 +1324,88 @@ class AMDSMICommands():
|
||||
# Add timestamp and store values for specified arguments
|
||||
values_dict = {}
|
||||
|
||||
# Populate the pcie_dict first due to multiple gpu metrics calls incorrectly increasing bandwidth
|
||||
if "pcie" in current_platform_args:
|
||||
if args.pcie:
|
||||
pcie_dict = {"width": "N/A",
|
||||
"speed": "N/A",
|
||||
"bandwidth": "N/A",
|
||||
"replay_count" : "N/A",
|
||||
"l0_to_recovery_count" : "N/A",
|
||||
"replay_roll_over_count" : "N/A",
|
||||
"nak_sent_count" : "N/A",
|
||||
"nak_received_count" : "N/A",
|
||||
"current_bandwidth_sent": "N/A",
|
||||
"current_bandwidth_received": "N/A",
|
||||
"max_packet_size": "N/A"}
|
||||
|
||||
try:
|
||||
pcie_metric = amdsmi_interface.amdsmi_get_pcie_info(args.gpu)['pcie_metric']
|
||||
logging.debug("PCIE Metric for %s | %s", gpu_id, pcie_metric)
|
||||
|
||||
pcie_dict['width'] = pcie_metric['pcie_width']
|
||||
|
||||
if pcie_metric['pcie_speed'] != "N/A":
|
||||
if pcie_metric['pcie_speed'] % 1000 != 0:
|
||||
pcie_speed_GTs_value = round(pcie_metric['pcie_speed'] / 1000, 1)
|
||||
else:
|
||||
pcie_speed_GTs_value = round(pcie_metric['pcie_speed'] / 1000)
|
||||
pcie_dict['speed'] = pcie_speed_GTs_value
|
||||
|
||||
pcie_dict['bandwidth'] = pcie_metric['pcie_bandwidth']
|
||||
pcie_dict['replay_count'] = pcie_metric['pcie_replay_count']
|
||||
pcie_dict['l0_to_recovery_count'] = pcie_metric['pcie_l0_to_recovery_count']
|
||||
pcie_dict['replay_roll_over_count'] = pcie_metric['pcie_replay_roll_over_count']
|
||||
pcie_dict['nak_received_count'] = pcie_metric['pcie_nak_received_count']
|
||||
pcie_dict['nak_sent_count'] = pcie_metric['pcie_nak_sent_count']
|
||||
|
||||
pcie_speed_unit = 'GT/s'
|
||||
pcie_bw_unit = 'Mb/s'
|
||||
if self.logger.is_human_readable_format():
|
||||
if pcie_dict['speed'] != "N/A":
|
||||
pcie_dict['speed'] = f"{pcie_dict['speed']} {pcie_speed_unit}"
|
||||
if pcie_dict['bandwidth'] != "N/A":
|
||||
pcie_dict['bandwidth'] = f"{pcie_dict['bandwidth']} {pcie_bw_unit}"
|
||||
if self.logger.is_json_format():
|
||||
if pcie_dict['speed'] != "N/A":
|
||||
pcie_dict['speed'] = {"value" : pcie_dict['speed'],
|
||||
"unit" : pcie_speed_unit}
|
||||
if pcie_dict['bandwidth'] != "N/A":
|
||||
pcie_dict['bandwidth'] = {"value" : pcie_dict['bandwidth'],
|
||||
"unit" : pcie_bw_unit}
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
logging.debug("Failed to get pcie link status for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
try:
|
||||
pcie_bw = amdsmi_interface.amdsmi_get_gpu_pci_throughput(args.gpu)
|
||||
sent = pcie_bw['sent'] * pcie_bw['max_pkt_sz']
|
||||
received = pcie_bw['received'] * pcie_bw['max_pkt_sz']
|
||||
|
||||
bw_unit = "Mb/s"
|
||||
packet_size_unit = "B"
|
||||
if sent > 0:
|
||||
sent = sent // 1024 // 1024
|
||||
if received > 0:
|
||||
received = received // 1024 // 1024
|
||||
|
||||
if self.logger.is_human_readable_format():
|
||||
sent = f"{sent} {bw_unit}"
|
||||
received = f"{received} {bw_unit}"
|
||||
pcie_bw['max_pkt_sz'] = f"{pcie_bw['max_pkt_sz']} {packet_size_unit}"
|
||||
if self.logger.is_json_format():
|
||||
sent = {"value" : sent,
|
||||
"unit" : bw_unit}
|
||||
received = {"value" : received,
|
||||
"unit" : bw_unit}
|
||||
pcie_bw['max_pkt_sz'] = {"value" : pcie_bw['max_pkt_sz'],
|
||||
"unit" : packet_size_unit}
|
||||
|
||||
pcie_dict['current_bandwidth_sent'] = sent
|
||||
pcie_dict['current_bandwidth_received'] = received
|
||||
pcie_dict['max_packet_size'] = pcie_bw['max_pkt_sz']
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
logging.debug("Failed to get pcie bandwidth for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
if "usage" in current_platform_args:
|
||||
if args.usage:
|
||||
try:
|
||||
@@ -1648,89 +1735,12 @@ class AMDSMICommands():
|
||||
"unit" : temp_unit_json}
|
||||
|
||||
values_dict['temperature'] = temperatures
|
||||
|
||||
# Since pcie bw may increase with frequent metrics calls, we add it to the output here, but populate the values first
|
||||
if "pcie" in current_platform_args:
|
||||
if args.pcie:
|
||||
pcie_dict = {"width": "N/A",
|
||||
"speed": "N/A",
|
||||
"bandwidth": "N/A",
|
||||
"replay_count" : "N/A",
|
||||
"l0_to_recovery_count" : "N/A",
|
||||
"replay_roll_over_count" : "N/A",
|
||||
"nak_sent_count" : "N/A",
|
||||
"nak_received_count" : "N/A",
|
||||
"current_bandwidth_sent": "N/A",
|
||||
"current_bandwidth_received": "N/A",
|
||||
"max_packet_size": "N/A"}
|
||||
|
||||
try:
|
||||
pcie_metric = amdsmi_interface.amdsmi_get_pcie_info(args.gpu)['pcie_metric']
|
||||
logging.debug("PCIE Metric for %s | %s", gpu_id, pcie_metric)
|
||||
|
||||
pcie_dict['width'] = pcie_metric['pcie_width']
|
||||
|
||||
if pcie_metric['pcie_speed'] != "N/A":
|
||||
if pcie_metric['pcie_speed'] % 1000 != 0:
|
||||
pcie_speed_GTs_value = round(pcie_metric['pcie_speed'] / 1000, 1)
|
||||
else:
|
||||
pcie_speed_GTs_value = round(pcie_metric['pcie_speed'] / 1000)
|
||||
pcie_dict['speed'] = pcie_speed_GTs_value
|
||||
|
||||
pcie_dict['bandwidth'] = pcie_metric['pcie_bandwidth']
|
||||
pcie_dict['replay_count'] = pcie_metric['pcie_replay_count']
|
||||
pcie_dict['l0_to_recovery_count'] = pcie_metric['pcie_l0_to_recovery_count']
|
||||
pcie_dict['replay_roll_over_count'] = pcie_metric['pcie_replay_roll_over_count']
|
||||
pcie_dict['nak_received_count'] = pcie_metric['pcie_nak_received_count']
|
||||
pcie_dict['nak_sent_count'] = pcie_metric['pcie_nak_sent_count']
|
||||
|
||||
pcie_speed_unit = 'GT/s'
|
||||
pcie_bw_unit = 'Mb/s'
|
||||
if self.logger.is_human_readable_format():
|
||||
if pcie_dict['speed'] != "N/A":
|
||||
pcie_dict['speed'] = f"{pcie_dict['speed']} {pcie_speed_unit}"
|
||||
if pcie_dict['bandwidth'] != "N/A":
|
||||
pcie_dict['bandwidth'] = f"{pcie_dict['bandwidth']} {pcie_bw_unit}"
|
||||
if self.logger.is_json_format():
|
||||
if pcie_dict['speed'] != "N/A":
|
||||
pcie_dict['speed'] = {"value" : pcie_dict['speed'],
|
||||
"unit" : pcie_speed_unit}
|
||||
if pcie_dict['bandwidth'] != "N/A":
|
||||
pcie_dict['bandwidth'] = {"value" : pcie_dict['bandwidth'],
|
||||
"unit" : pcie_bw_unit}
|
||||
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
logging.debug("Failed to get pcie link status for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
try:
|
||||
pcie_bw = amdsmi_interface.amdsmi_get_gpu_pci_throughput(args.gpu)
|
||||
sent = pcie_bw['sent'] * pcie_bw['max_pkt_sz']
|
||||
received = pcie_bw['received'] * pcie_bw['max_pkt_sz']
|
||||
|
||||
bw_unit = "Mb/s"
|
||||
packet_size_unit = "B"
|
||||
if sent > 0:
|
||||
sent = sent // 1024 // 1024
|
||||
if received > 0:
|
||||
received = received // 1024 // 1024
|
||||
|
||||
if self.logger.is_human_readable_format():
|
||||
sent = f"{sent} {bw_unit}"
|
||||
received = f"{received} {bw_unit}"
|
||||
pcie_bw['max_pkt_sz'] = f"{pcie_bw['max_pkt_sz']} {packet_size_unit}"
|
||||
if self.logger.is_json_format():
|
||||
sent = {"value" : sent,
|
||||
"unit" : bw_unit}
|
||||
received = {"value" : received,
|
||||
"unit" : bw_unit}
|
||||
pcie_bw['max_pkt_sz'] = {"value" : pcie_bw['max_pkt_sz'],
|
||||
"unit" : packet_size_unit}
|
||||
|
||||
pcie_dict['current_bandwidth_sent'] = sent
|
||||
pcie_dict['current_bandwidth_received'] = received
|
||||
pcie_dict['max_packet_size'] = pcie_bw['max_pkt_sz']
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
logging.debug("Failed to get pcie bandwidth for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
values_dict['pcie'] = pcie_dict
|
||||
|
||||
if "ecc" in current_platform_args:
|
||||
if args.ecc:
|
||||
ecc_count = {}
|
||||
@@ -4360,6 +4370,15 @@ class AMDSMICommands():
|
||||
self.logger.store_output(args.gpu, 'timestamp', int(time.time()))
|
||||
self.logger.table_header = 'TIMESTAMP'.rjust(10) + ' ' + self.logger.table_header
|
||||
|
||||
# Store the pcie_bw values due to possible increase in bandwidth due to repeated gpu_metrics calls
|
||||
if args.pcie:
|
||||
try:
|
||||
pcie_info = amdsmi_interface.amdsmi_get_pcie_info(args.gpu)['pcie_metric']
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
pcie_info = "N/A"
|
||||
logging.debug("Failed to get pci bandwidth on gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
# Resume regular ordering of values
|
||||
if args.power_usage:
|
||||
try:
|
||||
gpu_metrics_info = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)
|
||||
@@ -4601,15 +4620,13 @@ class AMDSMICommands():
|
||||
self.logger.table_header += 'VRAM_USED'.rjust(11)
|
||||
self.logger.table_header += 'VRAM_TOTAL'.rjust(12)
|
||||
if args.pcie:
|
||||
try:
|
||||
pcie_info = amdsmi_interface.amdsmi_get_pcie_info(args.gpu)['pcie_metric']
|
||||
if pcie_info != "N/A":
|
||||
pcie_bw_unit = 'Mb/s'
|
||||
monitor_values['pcie_bw'] = self.helpers.unit_format(self.logger, pcie_info['pcie_bandwidth'], pcie_bw_unit)
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
monitor_values['pcie_bw'] = "N/A"
|
||||
logging.debug("Failed to get pci bandwidth on gpu %s | %s", gpu_id, e.get_error_info())
|
||||
else:
|
||||
monitor_values['pcie_bw'] = pcie_info
|
||||
|
||||
self.logger.table_header += 'PCIE_BW'.rjust(10)
|
||||
self.logger.table_header += 'PCIE_BW'.rjust(12)
|
||||
|
||||
self.logger.store_output(args.gpu, 'values', monitor_values)
|
||||
|
||||
|
||||
@@ -121,7 +121,7 @@ class AMDSMILogger():
|
||||
table_values += string_value.rjust(7)
|
||||
elif key in ('gfx_clock', 'mem_clock', 'encoder_clock', 'decoder_clock', 'vram_used'):
|
||||
table_values += string_value.rjust(11)
|
||||
elif key == 'vram_total' or 'ecc' in key:
|
||||
elif key == 'vram_total' or 'ecc' in key or key == 'pcie_bw':
|
||||
table_values += string_value.rjust(12)
|
||||
elif key in ['pcie_replay']:
|
||||
table_values += string_value.rjust(13)
|
||||
|
||||
@@ -810,7 +810,7 @@ except AmdSmiException as e:
|
||||
|
||||
### amdsmi_get_pcie_info
|
||||
|
||||
Description: Returns the pcie metric and static information for the given GPU.
|
||||
Description: Returns the pcie metric and static information for the given GPU. For accurate PCIe bandwidth measurements, it is recommended to call this function once per 1000 ms.
|
||||
It is not supported on virtual machine guest
|
||||
|
||||
Input parameters:
|
||||
|
||||
@@ -810,7 +810,7 @@ except AmdSmiException as e:
|
||||
|
||||
### amdsmi_get_pcie_info
|
||||
|
||||
Description: Returns the pcie metric and static information for the given GPU.
|
||||
Description: Returns the pcie metric and static information for the given GPU. For accurate PCIe bandwidth measurements, it is recommended to call this function once per 1000 ms.
|
||||
It is not supported on virtual machine guest
|
||||
|
||||
Input parameters:
|
||||
|
||||
@@ -4073,12 +4073,12 @@ def amdsmi_get_gpu_metrics_header_info(
|
||||
header_info = amdsmi_wrapper.amd_metrics_table_header_t()
|
||||
_check_res(
|
||||
amdsmi_wrapper.amdsmi_get_gpu_metrics_header_info(
|
||||
ctypes.byref(header_info)
|
||||
processor_handle, ctypes.byref(header_info)
|
||||
)
|
||||
)
|
||||
|
||||
return {
|
||||
"structure_size": header_info.structure_size.value,
|
||||
"format_revision": header_info.format_revision.value,
|
||||
"content_revision": header_info.content_revision.value
|
||||
"structure_size": header_info.structure_size,
|
||||
"format_revision": header_info.format_revision,
|
||||
"content_revision": header_info.content_revision
|
||||
}
|
||||
|
||||
@@ -45,7 +45,7 @@ signal.signal(signal.SIGINT, signal_handler)
|
||||
signal.signal(signal.SIGTERM, signal_handler)
|
||||
atexit.register(amdsmi_shut_down)
|
||||
|
||||
gpus = amdsmi_get_socket_handles()
|
||||
gpus = amdsmi_get_processor_handles()
|
||||
cpus = amdsmi_get_cpusocket_handles()
|
||||
|
||||
print(f"gpus variable populated with:{gpus}")
|
||||
|
||||
Ссылка в новой задаче
Block a user