diff --git a/projects/amdsmi/CHANGELOG.md b/projects/amdsmi/CHANGELOG.md index 40fc986d49..3099351e99 100644 --- a/projects/amdsmi/CHANGELOG.md +++ b/projects/amdsmi/CHANGELOG.md @@ -274,7 +274,8 @@ GPU: 1 ### Optimizations -- N/A +- **Adjusted ordering of gpu_metrics calls to ensure that pcie_bw values remain stable in `amd-smi metric` & `amd-smi monitor`**. +With this change, additional padding was added to the PCIE_BW column of `amd-smi monitor --pcie`. ### Resolved issues @@ -331,6 +332,8 @@ GPU POWER GPU_TEMP MEM_TEMP VRAM_USED VRAM_TOTAL 31 227 W 51 °C 49 °C 283 MB 196300 MB ``` +- **Fixed incorrect implementation of the Python API `amdsmi_get_gpu_metrics_header_info()`**. + ### Known issues - N/A diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py index c3b1a158da..82964f4776 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py @@ -361,7 +361,6 @@ class AMDSMICommands(): static_dict["asic"] = "N/A" logging.debug("Failed to get asic info for gpu %s | %s", gpu_id, e.get_error_info()) -# static["asic"] = "N/A" try: subsystem_id = amdsmi_interface.amdsmi_get_gpu_subsystem_id(args.gpu) if static_dict["asic"] != "N/A": @@ -1299,13 +1298,19 @@ class AMDSMICommands(): # Get gpu_id for logging gpu_id = self.helpers.get_gpu_id_from_device_handle(args.gpu) - # Put the metrics table in the debug logs - try: - gpu_metric_debug_info = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu) - gpu_metric_str = json.dumps(gpu_metric_debug_info, indent=4) - logging.debug("GPU Metrics table for %s | %s", gpu_id, gpu_metric_str) - except amdsmi_exception.AmdSmiLibraryException as e: - logging.debug("Unabled to load GPU Metrics table for %s | %s", gpu_id, e.err_info) + if args.loglevel == "DEBUG": + try: + # Get GPU Metrics table version + gpu_metric_version_info = amdsmi_interface.amdsmi_get_gpu_metrics_header_info(args.gpu) + gpu_metric_version_str = 
json.dumps(gpu_metric_version_info, indent=4) + logging.debug("GPU Metrics table Version for GPU %s | %s", gpu_id, gpu_metric_version_str) + + # Get GPU Metrics table + gpu_metric_debug_info = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu) + gpu_metric_str = json.dumps(gpu_metric_debug_info, indent=4) + logging.debug("GPU Metrics table for GPU %s | %s", gpu_id, gpu_metric_str) + except amdsmi_exception.AmdSmiLibraryException as e: + logging.debug("Unabled to load GPU Metrics table for %s | %s", gpu_id, e.err_info) logging.debug(f"Metric Arg information for GPU {gpu_id} on {self.helpers.os_info()}") logging.debug(f"Args: {current_platform_args}") @@ -1319,6 +1324,88 @@ class AMDSMICommands(): # Add timestamp and store values for specified arguments values_dict = {} + # Populate the pcie_dict first due to multiple gpu metrics calls incorrectly increasing bandwidth + if "pcie" in current_platform_args: + if args.pcie: + pcie_dict = {"width": "N/A", + "speed": "N/A", + "bandwidth": "N/A", + "replay_count" : "N/A", + "l0_to_recovery_count" : "N/A", + "replay_roll_over_count" : "N/A", + "nak_sent_count" : "N/A", + "nak_received_count" : "N/A", + "current_bandwidth_sent": "N/A", + "current_bandwidth_received": "N/A", + "max_packet_size": "N/A"} + + try: + pcie_metric = amdsmi_interface.amdsmi_get_pcie_info(args.gpu)['pcie_metric'] + logging.debug("PCIE Metric for %s | %s", gpu_id, pcie_metric) + + pcie_dict['width'] = pcie_metric['pcie_width'] + + if pcie_metric['pcie_speed'] != "N/A": + if pcie_metric['pcie_speed'] % 1000 != 0: + pcie_speed_GTs_value = round(pcie_metric['pcie_speed'] / 1000, 1) + else: + pcie_speed_GTs_value = round(pcie_metric['pcie_speed'] / 1000) + pcie_dict['speed'] = pcie_speed_GTs_value + + pcie_dict['bandwidth'] = pcie_metric['pcie_bandwidth'] + pcie_dict['replay_count'] = pcie_metric['pcie_replay_count'] + pcie_dict['l0_to_recovery_count'] = pcie_metric['pcie_l0_to_recovery_count'] + pcie_dict['replay_roll_over_count'] = 
pcie_metric['pcie_replay_roll_over_count'] + pcie_dict['nak_received_count'] = pcie_metric['pcie_nak_received_count'] + pcie_dict['nak_sent_count'] = pcie_metric['pcie_nak_sent_count'] + + pcie_speed_unit = 'GT/s' + pcie_bw_unit = 'Mb/s' + if self.logger.is_human_readable_format(): + if pcie_dict['speed'] != "N/A": + pcie_dict['speed'] = f"{pcie_dict['speed']} {pcie_speed_unit}" + if pcie_dict['bandwidth'] != "N/A": + pcie_dict['bandwidth'] = f"{pcie_dict['bandwidth']} {pcie_bw_unit}" + if self.logger.is_json_format(): + if pcie_dict['speed'] != "N/A": + pcie_dict['speed'] = {"value" : pcie_dict['speed'], + "unit" : pcie_speed_unit} + if pcie_dict['bandwidth'] != "N/A": + pcie_dict['bandwidth'] = {"value" : pcie_dict['bandwidth'], + "unit" : pcie_bw_unit} + except amdsmi_exception.AmdSmiLibraryException as e: + logging.debug("Failed to get pcie link status for gpu %s | %s", gpu_id, e.get_error_info()) + + try: + pcie_bw = amdsmi_interface.amdsmi_get_gpu_pci_throughput(args.gpu) + sent = pcie_bw['sent'] * pcie_bw['max_pkt_sz'] + received = pcie_bw['received'] * pcie_bw['max_pkt_sz'] + + bw_unit = "Mb/s" + packet_size_unit = "B" + if sent > 0: + sent = sent // 1024 // 1024 + if received > 0: + received = received // 1024 // 1024 + + if self.logger.is_human_readable_format(): + sent = f"{sent} {bw_unit}" + received = f"{received} {bw_unit}" + pcie_bw['max_pkt_sz'] = f"{pcie_bw['max_pkt_sz']} {packet_size_unit}" + if self.logger.is_json_format(): + sent = {"value" : sent, + "unit" : bw_unit} + received = {"value" : received, + "unit" : bw_unit} + pcie_bw['max_pkt_sz'] = {"value" : pcie_bw['max_pkt_sz'], + "unit" : packet_size_unit} + + pcie_dict['current_bandwidth_sent'] = sent + pcie_dict['current_bandwidth_received'] = received + pcie_dict['max_packet_size'] = pcie_bw['max_pkt_sz'] + except amdsmi_exception.AmdSmiLibraryException as e: + logging.debug("Failed to get pcie bandwidth for gpu %s | %s", gpu_id, e.get_error_info()) + if "usage" in current_platform_args: if 
args.usage: try: @@ -1648,89 +1735,12 @@ class AMDSMICommands(): "unit" : temp_unit_json} values_dict['temperature'] = temperatures + + # Since pcie bw may increase based on frequent metrics calls, we add it to the output here, but populate the values first if "pcie" in current_platform_args: if args.pcie: - pcie_dict = {"width": "N/A", - "speed": "N/A", - "bandwidth": "N/A", - "replay_count" : "N/A", - "l0_to_recovery_count" : "N/A", - "replay_roll_over_count" : "N/A", - "nak_sent_count" : "N/A", - "nak_received_count" : "N/A", - "current_bandwidth_sent": "N/A", - "current_bandwidth_received": "N/A", - "max_packet_size": "N/A"} - - try: - pcie_metric = amdsmi_interface.amdsmi_get_pcie_info(args.gpu)['pcie_metric'] - logging.debug("PCIE Metric for %s | %s", gpu_id, pcie_metric) - - pcie_dict['width'] = pcie_metric['pcie_width'] - - if pcie_metric['pcie_speed'] != "N/A": - if pcie_metric['pcie_speed'] % 1000 != 0: - pcie_speed_GTs_value = round(pcie_metric['pcie_speed'] / 1000, 1) - else: - pcie_speed_GTs_value = round(pcie_metric['pcie_speed'] / 1000) - pcie_dict['speed'] = pcie_speed_GTs_value - - pcie_dict['bandwidth'] = pcie_metric['pcie_bandwidth'] - pcie_dict['replay_count'] = pcie_metric['pcie_replay_count'] - pcie_dict['l0_to_recovery_count'] = pcie_metric['pcie_l0_to_recovery_count'] - pcie_dict['replay_roll_over_count'] = pcie_metric['pcie_replay_roll_over_count'] - pcie_dict['nak_received_count'] = pcie_metric['pcie_nak_received_count'] - pcie_dict['nak_sent_count'] = pcie_metric['pcie_nak_sent_count'] - - pcie_speed_unit = 'GT/s' - pcie_bw_unit = 'Mb/s' - if self.logger.is_human_readable_format(): - if pcie_dict['speed'] != "N/A": - pcie_dict['speed'] = f"{pcie_dict['speed']} {pcie_speed_unit}" - if pcie_dict['bandwidth'] != "N/A": - pcie_dict['bandwidth'] = f"{pcie_dict['bandwidth']} {pcie_bw_unit}" - if self.logger.is_json_format(): - if pcie_dict['speed'] != "N/A": - pcie_dict['speed'] = {"value" : pcie_dict['speed'], - "unit" : pcie_speed_unit} - 
if pcie_dict['bandwidth'] != "N/A": - pcie_dict['bandwidth'] = {"value" : pcie_dict['bandwidth'], - "unit" : pcie_bw_unit} - - except amdsmi_exception.AmdSmiLibraryException as e: - logging.debug("Failed to get pcie link status for gpu %s | %s", gpu_id, e.get_error_info()) - - try: - pcie_bw = amdsmi_interface.amdsmi_get_gpu_pci_throughput(args.gpu) - sent = pcie_bw['sent'] * pcie_bw['max_pkt_sz'] - received = pcie_bw['received'] * pcie_bw['max_pkt_sz'] - - bw_unit = "Mb/s" - packet_size_unit = "B" - if sent > 0: - sent = sent // 1024 // 1024 - if received > 0: - received = received // 1024 // 1024 - - if self.logger.is_human_readable_format(): - sent = f"{sent} {bw_unit}" - received = f"{received} {bw_unit}" - pcie_bw['max_pkt_sz'] = f"{pcie_bw['max_pkt_sz']} {packet_size_unit}" - if self.logger.is_json_format(): - sent = {"value" : sent, - "unit" : bw_unit} - received = {"value" : received, - "unit" : bw_unit} - pcie_bw['max_pkt_sz'] = {"value" : pcie_bw['max_pkt_sz'], - "unit" : packet_size_unit} - - pcie_dict['current_bandwidth_sent'] = sent - pcie_dict['current_bandwidth_received'] = received - pcie_dict['max_packet_size'] = pcie_bw['max_pkt_sz'] - except amdsmi_exception.AmdSmiLibraryException as e: - logging.debug("Failed to get pcie bandwidth for gpu %s | %s", gpu_id, e.get_error_info()) - values_dict['pcie'] = pcie_dict + if "ecc" in current_platform_args: if args.ecc: ecc_count = {} @@ -4360,6 +4370,15 @@ class AMDSMICommands(): self.logger.store_output(args.gpu, 'timestamp', int(time.time())) self.logger.table_header = 'TIMESTAMP'.rjust(10) + ' ' + self.logger.table_header + # Store the pcie_bw values due to possible increase in bandwidth due to repeated gpu_metrics calls + if args.pcie: + try: + pcie_info = amdsmi_interface.amdsmi_get_pcie_info(args.gpu)['pcie_metric'] + except amdsmi_exception.AmdSmiLibraryException as e: + pcie_info = "N/A" + logging.debug("Failed to get pci bandwidth on gpu %s | %s", gpu_id, e.get_error_info()) + + # Resume regular 
ordering of values if args.power_usage: try: gpu_metrics_info = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu) @@ -4601,15 +4620,13 @@ class AMDSMICommands(): self.logger.table_header += 'VRAM_USED'.rjust(11) self.logger.table_header += 'VRAM_TOTAL'.rjust(12) if args.pcie: - try: - pcie_info = amdsmi_interface.amdsmi_get_pcie_info(args.gpu)['pcie_metric'] + if pcie_info != "N/A": pcie_bw_unit = 'Mb/s' monitor_values['pcie_bw'] = self.helpers.unit_format(self.logger, pcie_info['pcie_bandwidth'], pcie_bw_unit) - except amdsmi_exception.AmdSmiLibraryException as e: - monitor_values['pcie_bw'] = "N/A" - logging.debug("Failed to get pci bandwidth on gpu %s | %s", gpu_id, e.get_error_info()) + else: + monitor_values['pcie_bw'] = pcie_info - self.logger.table_header += 'PCIE_BW'.rjust(10) + self.logger.table_header += 'PCIE_BW'.rjust(12) self.logger.store_output(args.gpu, 'values', monitor_values) diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_logger.py b/projects/amdsmi/amdsmi_cli/amdsmi_logger.py index 12fdd0faf8..08e0070852 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_logger.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_logger.py @@ -121,7 +121,7 @@ class AMDSMILogger(): table_values += string_value.rjust(7) elif key in ('gfx_clock', 'mem_clock', 'encoder_clock', 'decoder_clock', 'vram_used'): table_values += string_value.rjust(11) - elif key == 'vram_total' or 'ecc' in key: + elif key == 'vram_total' or 'ecc' in key or key == 'pcie_bw': table_values += string_value.rjust(12) elif key in ['pcie_replay']: table_values += string_value.rjust(13) diff --git a/projects/amdsmi/docs/how-to/using-amdsmi-for-python.md b/projects/amdsmi/docs/how-to/using-amdsmi-for-python.md index 7a454b1025..6cec1385a2 100644 --- a/projects/amdsmi/docs/how-to/using-amdsmi-for-python.md +++ b/projects/amdsmi/docs/how-to/using-amdsmi-for-python.md @@ -810,7 +810,7 @@ except AmdSmiException as e: ### amdsmi_get_pcie_info -Description: Returns the pcie metric and static information for the 
given GPU. +Description: Returns the pcie metric and static information for the given GPU. For accurate PCIe Bandwidth measurements, it is recommended to use this function once per 1000ms. It is not supported on virtual machine guest Input parameters: diff --git a/projects/amdsmi/py-interface/README.md b/projects/amdsmi/py-interface/README.md index 7a454b1025..6cec1385a2 100644 --- a/projects/amdsmi/py-interface/README.md +++ b/projects/amdsmi/py-interface/README.md @@ -810,7 +810,7 @@ except AmdSmiException as e: ### amdsmi_get_pcie_info -Description: Returns the pcie metric and static information for the given GPU. +Description: Returns the pcie metric and static information for the given GPU. For accurate PCIe Bandwidth measurements, it is recommended to use this function once per 1000ms. It is not supported on virtual machine guest Input parameters: diff --git a/projects/amdsmi/py-interface/amdsmi_interface.py b/projects/amdsmi/py-interface/amdsmi_interface.py index 07188e7902..212571f2bd 100644 --- a/projects/amdsmi/py-interface/amdsmi_interface.py +++ b/projects/amdsmi/py-interface/amdsmi_interface.py @@ -4073,12 +4073,12 @@ def amdsmi_get_gpu_metrics_header_info( header_info = amdsmi_wrapper.amd_metrics_table_header_t() _check_res( amdsmi_wrapper.amdsmi_get_gpu_metrics_header_info( - ctypes.byref(header_info) + processor_handle, ctypes.byref(header_info) ) ) return { - "structure_size": header_info.structure_size.value, - "format_revision": header_info.format_revision.value, - "content_revision": header_info.content_revision.value + "structure_size": header_info.structure_size, + "format_revision": header_info.format_revision, + "content_revision": header_info.content_revision } diff --git a/projects/amdsmi/tools/amdsmi_quick_start.py b/projects/amdsmi/tools/amdsmi_quick_start.py index caa3bece26..fb08e6d1e6 100644 --- a/projects/amdsmi/tools/amdsmi_quick_start.py +++ b/projects/amdsmi/tools/amdsmi_quick_start.py @@ -45,7 +45,7 @@ signal.signal(signal.SIGINT, 
signal_handler) signal.signal(signal.SIGTERM, signal_handler) atexit.register(amdsmi_shut_down) -gpus = amdsmi_get_socket_handles() +gpus = amdsmi_get_processor_handles() cpus = amdsmi_get_cpusocket_handles() print(f"gpus variable populated with:{gpus}")