diff --git a/projects/amdsmi/CHANGELOG.md b/projects/amdsmi/CHANGELOG.md index 0339827317..542fd3527d 100644 --- a/projects/amdsmi/CHANGELOG.md +++ b/projects/amdsmi/CHANGELOG.md @@ -8,6 +8,9 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/pr ### Added +- **Added `GPU LINK PORT STATUS` table to `amd-smi xgmi` command**. + - The `amd-smi xgmi -s` or `amd-smi xgmi --source-status` will show `GPU LINK PORT STATUS` table. + - **Added the following API's to amdsmi_interface.py**. - amdsmi_get_cpu_handle() - amdsmi_get_esmi_err_msg() diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py index 88aa580fcd..88df3fa1fb 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py @@ -6388,13 +6388,14 @@ class AMDSMICommands(): self.logger.print_output(multiple_device_enabled=False, watching_output=watching_output, tabular=True, dual_csv_output=dual_csv_output) - def xgmi(self, args, multiple_devices=False, gpu=None, metric=None, xgmi_link_status=None): + def xgmi(self, args, multiple_devices=False, gpu=None, metric=None, xgmi_source_status=None, xgmi_link_status=None): """ Get topology information for target gpus params: args - argparser args to pass to subcommand multiple_devices (bool) - True if checking for multiple devices gpu (device_handle) - device_handle for target device metric (bool) - Value override for args.metric + xgmi_source_status (bool) - Value override for args.xgmi_source_status xgmi_link_status (bool) - Value override for args.xgmi_link_status return: @@ -6409,6 +6410,8 @@ class AMDSMICommands(): args.metric = metric if xgmi_link_status: args.link_status = xgmi_link_status + if xgmi_source_status: + args.source_status = xgmi_source_status # Handle No GPU passed if args.gpu == None: @@ -6418,9 +6421,10 @@ class AMDSMICommands(): args.gpu = [args.gpu] # Handle all args being false - if not any([args.metric, args.link_status]): + if not any([args.metric, args.link_status, args.source_status]): args.metric = True args.link_status = True + args.source_status = True # Clear the table header self.logger.table_header = ''.rjust(7) @@ -6443,8 +6447,16 @@ class AMDSMICommands(): xgmi_values.append({"gpu" : gpu_id, "bdf" : gpu_bdf}) # Populate header with just bdfs - self.logger.table_header += gpu_bdf.rjust(13) + self.logger.table_header += f"GPU{gpu_id}".rjust(13) + # Cache processor handles for each BDF + src_gpu_handles = {} + for dict in xgmi_values: + try: + src_gpu_handles[dict['bdf']] = amdsmi_interface.amdsmi_get_processor_handle_from_bdf(dict['bdf']) + except amdsmi_exception.AmdSmiLibraryException as e: + logging.debug("Failed to get processor handle for %s | %s", dict['bdf'], e.get_error_info()) + src_gpu_handles[dict['bdf']] = None if args.metric: # prepend link metrics header to the table header link_metrics_header = " " + "bdf".ljust(14) + \ @@ -6456,7 +6468,7 @@ class AMDSMICommands(): for xgmi_dict in xgmi_values: src_gpu_id = xgmi_dict['gpu'] src_gpu_bdf = xgmi_dict['bdf'] - src_gpu = amdsmi_interface.amdsmi_get_processor_handle_from_bdf(src_gpu_bdf) + src_gpu = src_gpu_handles.get(src_gpu_bdf) logging.debug("check2 device_handle: %s", src_gpu) # This should be the same order as the check1 @@ -6596,24 +6608,24 @@ class AMDSMICommands(): if self.logger.is_json_format(): self.logger.store_xgmi_metric_json_output.append(xgmi_values) - if not args.link_status: + if not any([args.link_status, args.source_status]): self.logger.combine_arrays_to_json() elif not self.logger.is_human_readable_format(): self.logger.print_output(multiple_device_enabled=True) - if args.link_status: + if args.source_status: # Header modification self.logger.table_header = ''.rjust(7) current_header = " ".ljust(7) + \ "bdf".ljust(14) + \ - "link_status".ljust(20) + "port_num".ljust(20) self.logger.table_header = current_header + self.logger.table_header.strip() # Process each GPU tabular_output = [] for xgmi_dict in xgmi_values: src_gpu_id = xgmi_dict['gpu'] src_gpu_bdf = xgmi_dict['bdf'] - src_gpu = amdsmi_interface.amdsmi_get_processor_handle_from_bdf(src_gpu_bdf) + src_gpu = src_gpu_handles.get(src_gpu_bdf) # Populate link statuses status_row = [] @@ -6639,14 +6651,81 @@ class AMDSMICommands(): if self.logger.is_human_readable_format(): xgmi_dict['link_status'] = tabular_output self.logger.multiple_device_output= tabular_output - self.logger.table_title = "\nXGMI LINK STATUS" + self.logger.table_title = "\nGPU LINK PORT STATUS" if not self.logger.is_json_format(): self.logger.print_output(multiple_device_enabled=True, tabular=True) self.logger.clear_multiple_devices_output() if self.logger.is_json_format(): self.logger.combine_arrays_to_json() + + if args.link_status: + # XGMI LINK STATUS for src_gpu to dest_gpu + header = [" ".ljust(8), "bdf".ljust(15)] + [f"GPU{d['gpu']}".ljust(14) for d in xgmi_values] + self.logger.table_header = "".join(header) + self.logger.table_title = "\nXGMI LINK STATUS" + + src_link_status_map = {} + for gpu_dict in xgmi_values: + src_gpu_id = gpu_dict['gpu'] + src_gpu_bdf = gpu_dict['bdf'] + src_gpu = src_gpu_handles.get(src_gpu_bdf) + try: + link_status = amdsmi_interface.amdsmi_get_gpu_xgmi_link_status(src_gpu) + src_link_status_map[src_gpu_bdf] = link_status['status'] + except amdsmi_exception.AmdSmiLibraryException: + src_link_status_map[src_gpu_bdf] = ["N/A"] * amdsmi_interface.AMDSMI_MAX_NUM_XGMI_LINKS + + tabular_output = [] + for src_xgmi_dict in xgmi_values: + src_gpu_id = src_xgmi_dict['gpu'] + src_gpu_bdf = src_xgmi_dict['bdf'] + src_gpu = src_gpu_handles.get(src_gpu_bdf) + try: + xgmi_metrics_info = amdsmi_interface.amdsmi_get_link_metrics(src_gpu) + except amdsmi_exception.AmdSmiLibraryException: + xgmi_metrics_info = {"links": []} + # First column: GPU# + tab + bdf, then status for each dest bdf + row_dict = {"": f"GPU{src_gpu_id}\t{src_gpu_bdf}".ljust(20)} + # Cache GPU handles for destination GPUs + dest_gpu_handles = {dest_xgmi_dict['bdf']: + amdsmi_interface.amdsmi_get_processor_handle_from_bdf(dest_xgmi_dict['bdf']) + for dest_xgmi_dict in xgmi_values} + for dest_xgmi_dict in xgmi_values: + dest_gpu_bdf = dest_xgmi_dict['bdf'] + dest_gpu = dest_gpu_handles[dest_gpu_bdf] + + # Find all link indexes in xgmi_metrics_info for this destination + link_indexes = [] + for idx, link in enumerate(xgmi_metrics_info['links']): + if link['bdf'] == dest_gpu_bdf: + link_indexes.append(idx) + + # Use the found link index to get the status if valid + if link_indexes and len(link_indexes) <= len(src_link_status_map.get(src_gpu_bdf, [])): + statuses = [] + for link_idx in link_indexes: + if link_idx < len(src_link_status_map[src_gpu_bdf]): + statuses.append(str(src_link_status_map[src_gpu_bdf][link_idx])) + + # Join multiple statuses with "/" + if statuses: + status = "/".join(statuses) + else: + status = "N/A" + elif dest_gpu_bdf == src_gpu_bdf: + status = "SELF" + else: + status = "N/A" + + row_dict[dest_gpu_bdf.ljust(14)] = str(status).ljust(14) + tabular_output.append(row_dict) + + self.logger.multiple_device_output = tabular_output + self.logger.print_output(multiple_device_enabled=True, tabular=True) + self.logger.clear_multiple_devices_output() + if self.logger.is_human_readable_format(): - # Populate the legend output + # Populate the legend output legend_parts = [ "\n\nLegend:", " SELF = Current GPU", diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_parser.py b/projects/amdsmi/amdsmi_cli/amdsmi_parser.py index cd255870c3..78610908f9 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_parser.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_parser.py @@ -1437,6 +1437,7 @@ class AMDSMIParser(argparse.ArgumentParser): # Help text for Arguments only on Guest and BM platforms metrics_help = "Metric XGMI information" + xgmi_source_status_help = "Source GPU XGMI Link information" xgmi_link_status_help = "XGMI Link Status information" # Create xgmi subparser @@ -1447,6 +1448,7 @@ class AMDSMIParser(argparse.ArgumentParser): # Optional Args xgmi_parser.add_argument('-m', '--metric', action='store_true', required=False, help=metrics_help) + xgmi_parser.add_argument('-s', '--source-status', action='store_true', required=False, help=xgmi_source_status_help) xgmi_parser.add_argument('-l', '--link-status', action='store_true', required=False, help=xgmi_link_status_help) # Add Universal Arguments