From 2eff0b3764f8ad2cd672b5cb9c22be50d471fbfd Mon Sep 17 00:00:00 2001 From: "Kanangot Balakrishnan, Bindhiya" Date: Fri, 30 May 2025 16:51:11 -0500 Subject: [PATCH] [SWDEV-530633] Use gpu_metric speed and BW for xgmi (#366) The xgmi command was showing the PCIe bit rate and bandwidth instead of the XGMI values. Corrected the API to get the XGMI data from the GPU metrics. Added a Python API for amdsmi_get_link_metrics. Modified the amdsmi_link_metrics_t struct. Added a check so that the xgmi command skips GPUs whose partition id is non-zero. --------- Signed-off-by: Bindhiya Kanangot Balakrishnan Signed-off-by: Maisam Arif --- amdsmi_cli/amdsmi_commands.py | 69 +++++++++++++++++++++++--------- docs/reference/amdsmi-py-api.md | 43 ++++++++++++++++++++ include/amd_smi/amdsmi.h | 14 +++---- py-interface/__init__.py | 1 + py-interface/amdsmi_interface.py | 35 ++++++++++++++++ py-interface/amdsmi_wrapper.py | 4 +- src/amd_smi/amd_smi.cc | 12 ++++-- 7 files changed, 147 insertions(+), 31 deletions(-) diff --git a/amdsmi_cli/amdsmi_commands.py b/amdsmi_cli/amdsmi_commands.py index 5274bfe6d6..f693786832 100644 --- a/amdsmi_cli/amdsmi_commands.py +++ b/amdsmi_cli/amdsmi_commands.py @@ -40,7 +40,7 @@ class AMDSMICommands(): Each command function will interact with AMDSMILogger to handle displaying the output to the specified format and destination. """ - + def __init__(self, format='human_readable', destination='stdout') -> None: self.helpers = AMDSMIHelpers() self.logger = AMDSMILogger(format=format, destination=destination) @@ -2024,7 +2024,7 @@ class AMDSMICommands(): except KeyError as e: logging.debug("Failed to get current_socclk for gpu %s | %s", gpu_id, e) - + # Populate the max and min clock values from sysfs. # Min and Max values are per clock type, not per clock engine. 
# Populate the deep sleep value from amdsmi_get_clock_info @@ -2075,7 +2075,7 @@ class AMDSMICommands(): # Iterate through the maximum number of VCLK clocks supported for index in range(amdsmi_interface.AMDSMI_MAX_NUM_CLKS): vclk_index = f"vclk_{index}" # Construct the index key for the clock - + # Check if the current clock value is not "N/A" if clocks[vclk_index]["clk"] != "N/A": # Format and assign the minimum clock value for the current VCLK @@ -4480,7 +4480,7 @@ class AMDSMICommands(): future_set_count = self.helpers.get_set_count() if current_set_count == future_set_count-1: self.logger.store_output(args.gpu, 'accelerator_partition', f"Successfully set accelerator partition to {user_requested_partition_args}") - + except amdsmi_exception.AmdSmiLibraryException as e: if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: raise PermissionError('Command requires elevation') from e @@ -5454,7 +5454,7 @@ class AMDSMICommands(): self.logger.table_header += 'MEM%'.rjust(7) - # don't populate mem clock on default output + # don't populate mem clock on default output if not args.default_output: try: mem_clock = gpu_metrics_info['current_uclk'] @@ -5875,6 +5875,17 @@ class AMDSMICommands(): # Populate the possible gpus and their bdfs xgmi_values = [] for gpu in args.gpu: + partition_id = -1 + try: + kfd_info = amdsmi_interface.amdsmi_get_gpu_kfd_info(gpu) + partition_id = kfd_info['current_partition_id'] + except amdsmi_exception.AmdSmiLibraryException as e: + logging.debug("Failed to get kfd info for gpu %s | %s", gpu, e.get_error_info()) + + if partition_id != 0: + logging.debug(f"Skipping xgmi command due to non zero partition {gpu} - {partition_id}") + continue + logging.debug("check1 device_handle: %s", gpu) gpu_id = self.helpers.get_gpu_id_from_device_handle(gpu) gpu_bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(gpu) @@ -5906,14 +5917,9 @@ class AMDSMICommands(): } try: - pcie_static = 
amdsmi_interface.amdsmi_get_pcie_info(src_gpu)['pcie_static'] - if pcie_static['max_pcie_speed'] % 1000 != 0: - pcie_speed_GTs_value = round(pcie_static['max_pcie_speed'] / 1000, 1) - else: - pcie_speed_GTs_value = round(pcie_static['max_pcie_speed'] / 1000) - - bitrate = pcie_speed_GTs_value - max_bandwidth = bitrate * pcie_static['max_pcie_width'] + xgmi_metrics_info = amdsmi_interface.amdsmi_get_link_metrics(src_gpu) + bitrate = xgmi_metrics_info['bit_rate'] + max_bandwidth = xgmi_metrics_info['max_bandwidth'] except amdsmi_exception.AmdSmiLibraryException as e: bitrate = "N/A" max_bandwidth = "N/A" @@ -5935,7 +5941,18 @@ class AMDSMICommands(): xgmi_dict['link_metrics']['max_bandwidth'] = max_bandwidth # Populate link metrics + link_num = 0 for dest_gpu in args.gpu: + partition_id = -1 + try: + kfd_info = amdsmi_interface.amdsmi_get_gpu_kfd_info(dest_gpu) + partition_id = kfd_info['current_partition_id'] + except amdsmi_exception.AmdSmiLibraryException as e: + logging.debug("Failed to get kfd info for gpu %s | %s", dest_gpu, e.get_error_info()) + + if partition_id != 0: + continue + dest_gpu_id = self.helpers.get_gpu_id_from_device_handle(dest_gpu) dest_gpu_bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(dest_gpu) dest_link_dict = { @@ -5954,10 +5971,10 @@ class AMDSMICommands(): try: # Get the read write relative to the source gpu - metrics_info = amdsmi_interface.amdsmi_get_gpu_metrics_info(src_gpu) - read = metrics_info['xgmi_read_data_acc'][dest_gpu_id] - write = metrics_info['xgmi_write_data_acc'][dest_gpu_id] - except amdsmi_exception.AmdSmiLibraryException as e: + read = xgmi_metrics_info['links'][link_num]['read'] + write = xgmi_metrics_info['links'][link_num]['write'] + link_num += 1 + except (KeyError, amdsmi_exception.AmdSmiLibraryException) as e: read = "N/A" write = "N/A" logging.debug("Failed to get read data for %s to %s | %s", @@ -6087,7 +6104,21 @@ class AMDSMICommands(): self.logger.print_output(multiple_device_enabled=True, tabular=True) 
self.logger.clear_multiple_devices_output() if self.logger.is_human_readable_format(): - print("\n* U:Up D:Down X:Disabled".ljust(13)) + # Populate the legend output + legend_parts = [ + "\n\nLegend:", + " SELF = Current GPU", + " N/A = Not supported", + " U / D / X = Link is Up / Down / Disabled", + " Read / Write = GPU Metric Accumulated Read / Write" + ] + legend_output = "\n".join(legend_parts) + + if self.logger.destination == 'stdout': + print(legend_output) + else: + with self.logger.destination.open('a', encoding="utf-8") as output_file: + output_file.write(legend_output + '\n') def partition(self, args, multiple_devices=False, gpu=None, current=None, memory=None, accelerator=None): @@ -6385,7 +6416,7 @@ class AMDSMICommands(): continue resource_index = 0 - for p in range(0, num_profiles): + for p in range(0, num_profiles): for r in range(0, num_resource_profiles): resource_type = partition_config_dict['profiles'][p]['resources'][r]['resource_type'] resource_instances = partition_config_dict['profiles'][p]['resources'][r]['partition_resource'] diff --git a/docs/reference/amdsmi-py-api.md b/docs/reference/amdsmi-py-api.md index 16cf3f78cc..7ee53236ea 100644 --- a/docs/reference/amdsmi-py-api.md +++ b/docs/reference/amdsmi-py-api.md @@ -3819,6 +3819,49 @@ except AmdSmiException as e: print(e) ``` +### amdsmi_get_link_metrics + +Description: Returns XGMI link metrics information for the given GPU. + +Input parameters: + +* `processor_handle` — The device handle for which to query link metrics. 
+ +Output: Dictionary with fields + +Field | Description +---|--- +`num_links` | Number of XGMI links reported +`bit_rate` | XGMI link bit rate in Gb/s +`max_bandwidth` | Maximum XGMI bandwidth in Gb/s +`links` | List of dictionaries, one per XGMI link, each with: +`bdf` | BDF string for the destination (currently a placeholder value; the destination BDF is not yet reported) +`link_type` | Link type +`read` | Accumulated read data for this link in KB +`write` | Accumulated write data for this link in KB + +Exceptions that can be thrown by `amdsmi_get_link_metrics` function: + +* `AmdSmiLibraryException` +* `AmdSmiParameterException` + +Example: + +```python +try: + devices = amdsmi_get_processor_handles() + if len(devices) == 0: + print("No GPUs on machine") + else: + for device in devices: + link_metrics = amdsmi_get_link_metrics(device) + print(link_metrics['bit_rate']) + print(link_metrics['max_bandwidth']) + for idx, link in enumerate(link_metrics['links']): + print(f"{idx}: {link['bdf']}, {link['link_type']}, {link['read']} KB, {link['write']} KB") +except AmdSmiException as e: + print(e) +``` ### amdsmi_topo_get_link_type Description: Retrieve the hops and the connection type between 2 GPUs diff --git a/include/amd_smi/amdsmi.h b/include/amd_smi/amdsmi.h index e8353c4ba3..386bcb801b 100644 --- a/include/amd_smi/amdsmi.h +++ b/include/amd_smi/amdsmi.h @@ -967,14 +967,14 @@ typedef struct { * @cond @tag{gpu_bm_linux} @endcond */ typedef struct { - uint32_t num_links; //!< number of links + uint32_t num_links; //!< number of links + uint32_t bit_rate; //!< current link speed in Gb/s + uint32_t max_bandwidth; //!< max bandwidth of the link in Gb/s struct _links { - amdsmi_bdf_t bdf; - uint32_t bit_rate; //!< current link speed in Gb/s - uint32_t max_bandwidth; //!< max bandwidth of the link in Gb/s - amdsmi_link_type_t link_type; //!< type of the link - uint64_t read; //!< total data received for each link in 
KB + amdsmi_bdf_t bdf; //!< bdf of the destination gpu + amdsmi_link_type_t link_type; //!< type of the link + uint64_t read; //!< total data received for each link in KB + uint64_t write; //!< total data transfered for each link in KB uint64_t reserved[2]; } links[AMDSMI_MAX_NUM_XGMI_PHYSICAL_LINK]; uint64_t reserved[7]; diff --git a/py-interface/__init__.py b/py-interface/__init__.py index 95a898213b..8e0cd92c04 100644 --- a/py-interface/__init__.py +++ b/py-interface/__init__.py @@ -221,6 +221,7 @@ from .amdsmi_interface import amdsmi_get_gpu_subsystem_name from .amdsmi_interface import amdsmi_topo_get_numa_node_number from .amdsmi_interface import amdsmi_topo_get_link_weight from .amdsmi_interface import amdsmi_get_minmax_bandwidth_between_processors +from .amdsmi_interface import amdsmi_get_link_metrics from .amdsmi_interface import amdsmi_topo_get_link_type from .amdsmi_interface import amdsmi_topo_get_p2p_status from .amdsmi_interface import amdsmi_is_P2P_accessible diff --git a/py-interface/amdsmi_interface.py b/py-interface/amdsmi_interface.py index 6e7f2a3b01..01837fdd7b 100644 --- a/py-interface/amdsmi_interface.py +++ b/py-interface/amdsmi_interface.py @@ -3099,6 +3099,41 @@ def amdsmi_get_minmax_bandwidth_between_processors( return {"min_bandwidth": min_bandwidth.value, "max_bandwidth": max_bandwidth.value} +def amdsmi_get_link_metrics(processor_handle: amdsmi_wrapper.amdsmi_processor_handle): + if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): + raise AmdSmiParameterException( + processor_handle, amdsmi_wrapper.amdsmi_processor_handle + ) + + link_metrics = amdsmi_wrapper.amdsmi_link_metrics_t() + _check_res( + amdsmi_wrapper.amdsmi_get_link_metrics( + processor_handle, ctypes.byref(link_metrics) + ) + ) + + bdf = amdsmi_wrapper.amdsmi_bdf_t() + # TODO: Dummy BDF - to be replaced with destination BDF from xgmi_port_num when available + bdf.struct_amdsmi_bdf_t = amdsmi_wrapper.struct_amdsmi_bdf_t(0xFFFF, 0xFF, 0xFF, 0xF) + + 
links = [] + for i in range(AMDSMI_MAX_NUM_XGMI_LINKS): + link = link_metrics.links[i] + links.append({ + "bdf": _format_bdf(bdf), + "link_type": link.link_type, + "read": link.read, + "write": link.write, + }) + + return { + "num_links": AMDSMI_MAX_NUM_XGMI_LINKS, + "bit_rate": link_metrics.bit_rate, + "max_bandwidth": link_metrics.max_bandwidth, + "links": links + } + + def amdsmi_topo_get_link_type( processor_handle_src: amdsmi_wrapper.amdsmi_processor_handle, processor_handle_dst: amdsmi_wrapper.amdsmi_processor_handle, diff --git a/py-interface/amdsmi_wrapper.py b/py-interface/amdsmi_wrapper.py index f8ff10291a..c2a70fef5c 100644 --- a/py-interface/amdsmi_wrapper.py +++ b/py-interface/amdsmi_wrapper.py @@ -1153,8 +1153,6 @@ class struct__links(Structure): struct__links._pack_ = 1 # source:False struct__links._fields_ = [ ('bdf', amdsmi_bdf_t), - ('bit_rate', ctypes.c_uint32), - ('max_bandwidth', ctypes.c_uint32), ('link_type', amdsmi_link_type_t), ('PADDING_0', ctypes.c_ubyte * 4), ('read', ctypes.c_uint64), @@ -1165,6 +1163,8 @@ struct__links._fields_ = [ struct_amdsmi_link_metrics_t._pack_ = 1 # source:False struct_amdsmi_link_metrics_t._fields_ = [ ('num_links', ctypes.c_uint32), + ('bit_rate', ctypes.c_uint32), + ('max_bandwidth', ctypes.c_uint32), ('PADDING_0', ctypes.c_ubyte * 4), ('links', struct__links * 64), ('reserved', ctypes.c_uint64 * 7), diff --git a/src/amd_smi/amd_smi.cc b/src/amd_smi/amd_smi.cc index 9465ecc948..c25cb8ad5f 100644 --- a/src/amd_smi/amd_smi.cc +++ b/src/amd_smi/amd_smi.cc @@ -2119,16 +2119,22 @@ amdsmi_status_t amdsmi_get_link_metrics(amdsmi_processor_handle processor_handle if (link_metrics == nullptr) return AMDSMI_STATUS_INVAL; amdsmi_gpu_metrics_t metric_info = {}; + link_metrics->max_bandwidth = std::numeric_limits::max(); + amdsmi_status_t status = amdsmi_get_gpu_metrics_info( processor_handle, &metric_info); if (status != AMDSMI_STATUS_SUCCESS) return status; link_metrics->num_links = AMDSMI_MAX_NUM_XGMI_LINKS; - for 
(unsigned int i = 0; i < link_metrics->num_links; i++) { + + link_metrics->bit_rate = metric_info.xgmi_link_speed; + if ((metric_info.xgmi_link_speed != std::numeric_limits::max()) && + (metric_info.xgmi_link_width != std::numeric_limits::max())) + link_metrics->max_bandwidth = metric_info.xgmi_link_speed * metric_info.xgmi_link_width; + + for (unsigned int i = 0; i < AMDSMI_MAX_NUM_XGMI_LINKS; i++) { link_metrics->links[i].read = metric_info.xgmi_read_data_acc[i]; link_metrics->links[i].write = metric_info.xgmi_write_data_acc[i]; - link_metrics->links[i].bit_rate = metric_info.xgmi_link_speed; - link_metrics->links[i].max_bandwidth = metric_info.xgmi_link_width; link_metrics->links[i].link_type = AMDSMI_LINK_TYPE_XGMI; }