diff --git a/projects/amdsmi/CHANGELOG.md b/projects/amdsmi/CHANGELOG.md index f606276c4a..6a5fcc9906 100644 --- a/projects/amdsmi/CHANGELOG.md +++ b/projects/amdsmi/CHANGELOG.md @@ -6,6 +6,10 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/]( ## amd_smi_lib for ROCm 6.1.1 +### Added + +- N/A + ### Changed - **Updated metrics --clocks** @@ -143,6 +147,142 @@ GPU: 0 ... ``` +- **Updated `amd-smi topology --json` to align with host/guest** +Topology's `--json` output now is changed to align with output reported bt host/guest systems. Additionally, users can select/filter specific topology details as desired (refer to `amd-smi topology -h` for full list). See examples shown below. + +*Previous format:* +```shell +$ amd-smi topology --json +[ + { + "gpu": 0, + "link_accessibility": { + "gpu_0": "ENABLED", + "gpu_1": "DISABLED" + }, + "weight": { + "gpu_0": 0, + "gpu_1": 40 + }, + "hops": { + "gpu_0": 0, + "gpu_1": 2 + }, + "link_type": { + "gpu_0": "SELF", + "gpu_1": "PCIE" + }, + "numa_bandwidth": { + "gpu_0": "N/A", + "gpu_1": "N/A" + } + }, + { + "gpu": 1, + "link_accessibility": { + "gpu_0": "DISABLED", + "gpu_1": "ENABLED" + }, + "weight": { + "gpu_0": 40, + "gpu_1": 0 + }, + "hops": { + "gpu_0": 2, + "gpu_1": 0 + }, + "link_type": { + "gpu_0": "PCIE", + "gpu_1": "SELF" + }, + "numa_bandwidth": { + "gpu_0": "N/A", + "gpu_1": "N/A" + } + } +] +``` + +*New format:* +```shell +$ amd-smi topology --json +[ + { + "gpu": 0, + "bdf": "0000:01:00.0", + "links": [ + { + "gpu": 0, + "bdf": "0000:01:00.0", + "weight": 0, + "link_status": "ENABLED", + "link_type": "SELF", + "num_hops": 0, + "bandwidth": "N/A", + "fb_sharing": "ENABLED" + }, + { + "gpu": 1, + "bdf": "0001:01:00.0", + "weight": 15, + "link_status": "ENABLED", + "link_type": "XGMI", + "num_hops": 1, + "bandwidth": "50000-100000", + "fb_sharing": "ENABLED" + }, + ... + ] + }, + ... +] +``` +```shell +$ /opt/rocm/bin/amd-smi topology -a -t --json +[ + { + "gpu": 0, + "bdf": "0000:08:00.0", + "links": [ + { + "gpu": 0, + "bdf": "0000:08:00.0", + "link_status": "ENABLED", + "link_type": "SELF" + }, + { + "gpu": 1, + "bdf": "0000:44:00.0", + "link_status": "DISABLED", + "link_type": "PCIE" + } + ] + }, + { + "gpu": 1, + "bdf": "0000:44:00.0", + "links": [ + { + "gpu": 0, + "bdf": "0000:08:00.0", + "link_status": "DISABLED", + "link_type": "PCIE" + }, + { + "gpu": 1, + "bdf": "0000:44:00.0", + "link_status": "ENABLED", + "link_type": "SELF" + } + ] + } +] +``` + +### Optimizations + +- N/A + ### Fixed - **Fix for GPU reset error on non-amdgpu cards** diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py index b9bc3a5e34..431b23faf9 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py @@ -2763,20 +2763,115 @@ class AMDSMICommands(): # Populate the possible gpus topo_values = [] - for gpu in args.gpu: - gpu_id = self.helpers.get_gpu_id_from_device_handle(gpu) - topo_values.append({"gpu" : gpu_id}) - gpu_bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(gpu) - self.logger.table_header += gpu_bdf.rjust(13) + for src_gpu_index, src_gpu in enumerate(args.gpu): + src_gpu_id = self.helpers.get_gpu_id_from_device_handle(src_gpu) + topo_values.append({"gpu" : src_gpu_id}) + src_gpu_bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(src_gpu) + topo_values[src_gpu_index]['bdf'] = src_gpu_bdf + self.logger.table_header += src_gpu_bdf.rjust(13) + + if not self.logger.is_json_format(): + continue # below is for JSON format only + + ########################## + # JSON formatting start # + ########################## + links = [] + # create json obj for data alignment + # dest_gpu_links = { + # "gpu": GPU # + # "bdf": BDF identification + # "weight": 0 - self (current node); weight >= 0 correlated with hops (GPU-CPU, GPU-GPU, GPU-CPU-CPU-GPU, etc..) + # "link_status": "ENABLED" - devices linked; "DISABLED" - devices not linked + # "link_type": "SELF" - current node, "PCIE", "XGMI", "N/A" - no link,"UNKNOWN" - unidentified link type + # "num_hops": num_hops - # of hops between devices + # "bandwidth": numa_bw - The NUMA "minimum bandwidth-maximum bandwidth" beween src and dest nodes + # "N/A" - self node or not connected devices + # "fb_sharing": "ENABLED/DISABLED" - same output as defined in link_status. Devices in a hive setup should + # all have sharing enabled. + # } + + for dest_gpu_index, dest_gpu in enumerate(args.gpu): + link_type = "SELF" + if src_gpu != dest_gpu: + link_type = amdsmi_interface.amdsmi_topo_get_link_type(src_gpu, dest_gpu)['type'] + if isinstance(link_type, int): + if link_type == amdsmi_interface.amdsmi_wrapper.AMDSMI_IOLINK_TYPE_UNDEFINED: + link_type = "UNKNOWN" + elif link_type == amdsmi_interface.amdsmi_wrapper.AMDSMI_IOLINK_TYPE_PCIEXPRESS: + link_type = "PCIE" + elif link_type == amdsmi_interface.amdsmi_wrapper.AMDSMI_IOLINK_TYPE_XGMI: + link_type = "XGMI" + else: + link_type = "N/A" + + numa_bw = "N/A" + if src_gpu != dest_gpu: + try: + bw_dict = amdsmi_interface.amdsmi_get_minmax_bandwidth_between_processors(src_gpu, dest_gpu) + numa_bw = f"{bw_dict['min_bandwidth']}-{bw_dict['max_bandwidth']}" + except amdsmi_exception.AmdSmiLibraryException as e: + logging.debug("Failed to get min max bandwidth for %s to %s | %s", + self.helpers.get_gpu_id_from_device_handle(src_gpu), + self.helpers.get_gpu_id_from_device_handle(dest_gpu), + e.get_error_info()) + + weight = 0 + num_hops = 0 + if src_gpu != dest_gpu: + weight = amdsmi_interface.amdsmi_topo_get_link_weight(src_gpu, dest_gpu) + num_hops = amdsmi_interface.amdsmi_topo_get_link_type(src_gpu, dest_gpu)['hops'] + link_status = amdsmi_interface.amdsmi_is_P2P_accessible(src_gpu, dest_gpu) + if link_status: + link_status = "ENABLED" + else: + link_status = "DISABLED" + + # fb_sharing in BM - in a hive configuration, this is + # link_status = amdsmi_is_P2P_accessible(src,dest) + dest_gpu_links = { + "gpu": self.helpers.get_gpu_id_from_device_handle(dest_gpu), + "bdf": amdsmi_interface.amdsmi_get_gpu_device_bdf(dest_gpu), + "weight": weight, + "link_status": link_status, + "link_type": link_type, + "num_hops": num_hops, + "bandwidth": numa_bw, + "fb_sharing": link_status + } + if not args.access: # currently includes fb_sharing + del dest_gpu_links['link_status'] + del dest_gpu_links['fb_sharing'] + if not args.weight: + del dest_gpu_links['weight'] + if not args.link_type: + del dest_gpu_links['link_type'] + if not args.hops: + del dest_gpu_links['num_hops'] + if not args.numa_bw: + del dest_gpu_links['bandwidth'] + links.append(dest_gpu_links) + isEndOfDest = dest_gpu_index+1 == len(args.gpu) + isEndOfSrc = src_gpu_index+1 == len(args.gpu) + if isEndOfDest: + topo_values[src_gpu_index]['links'] = links + continue + if isEndOfSrc: + self.logger.multiple_device_output = topo_values + self.logger.print_output(multiple_device_enabled=True, tabular=True) + return + ########################## + # JSON formatting end # + ########################## if args.access: tabular_output = [] for src_gpu_index, src_gpu in enumerate(args.gpu): - gpu_bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(src_gpu) + src_gpu_bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(src_gpu) if self.logger.is_human_readable_format(): - tabular_output_dict = {'gpu' : f"{gpu_bdf} "} + tabular_output_dict = {'gpu' : f"{src_gpu_bdf} "} else: - tabular_output_dict = {'gpu' : gpu_bdf} + tabular_output_dict = {'gpu' : src_gpu_bdf} src_gpu_links = {} for dest_gpu in args.gpu: dest_gpu_id = self.helpers.get_gpu_id_from_device_handle(dest_gpu) @@ -2808,11 +2903,11 @@ class AMDSMICommands(): if args.weight: tabular_output = [] for src_gpu_index, src_gpu in enumerate(args.gpu): - gpu_bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(src_gpu) + src_gpu_bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(src_gpu) if self.logger.is_human_readable_format(): - tabular_output_dict = {'gpu' : f"{gpu_bdf} "} + tabular_output_dict = {'gpu' : f"{src_gpu_bdf} "} else: - tabular_output_dict = {'gpu' : gpu_bdf} + tabular_output_dict = {'gpu' : src_gpu_bdf} src_gpu_weight = {} for dest_gpu in args.gpu: dest_gpu_id = self.helpers.get_gpu_id_from_device_handle(dest_gpu) @@ -2845,11 +2940,11 @@ class AMDSMICommands(): if args.hops: tabular_output = [] for src_gpu_index, src_gpu in enumerate(args.gpu): - gpu_bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(src_gpu) + src_gpu_bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(src_gpu) if self.logger.is_human_readable_format(): - tabular_output_dict = {'gpu' : f"{gpu_bdf} "} + tabular_output_dict = {'gpu' : f"{src_gpu_bdf} "} else: - tabular_output_dict = {'gpu' : gpu_bdf} + tabular_output_dict = {'gpu' : src_gpu_bdf} src_gpu_hops = {} for dest_gpu in args.gpu: dest_gpu_id = self.helpers.get_gpu_id_from_device_handle(dest_gpu) @@ -2882,11 +2977,11 @@ class AMDSMICommands(): if args.link_type: tabular_output = [] for src_gpu_index, src_gpu in enumerate(args.gpu): - gpu_bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(src_gpu) + src_gpu_bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(src_gpu) if self.logger.is_human_readable_format(): - tabular_output_dict = {'gpu' : f"{gpu_bdf} "} + tabular_output_dict = {'gpu' : f"{src_gpu_bdf} "} else: - tabular_output_dict = {'gpu' : gpu_bdf} + tabular_output_dict = {'gpu' : src_gpu_bdf} src_gpu_link_type = {} for dest_gpu in args.gpu: dest_gpu_id = self.helpers.get_gpu_id_from_device_handle(dest_gpu) @@ -2924,11 +3019,11 @@ class AMDSMICommands(): if args.numa_bw: tabular_output = [] for src_gpu_index, src_gpu in enumerate(args.gpu): - gpu_bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(src_gpu) + src_gpu_bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(src_gpu) if self.logger.is_human_readable_format(): - tabular_output_dict = {'gpu' : f"{gpu_bdf} "} + tabular_output_dict = {'gpu' : f"{src_gpu_bdf} "} else: - tabular_output_dict = {'gpu' : gpu_bdf} + tabular_output_dict = {'gpu' : src_gpu_bdf} src_gpu_link_type = {} for dest_gpu in args.gpu: dest_gpu_id = self.helpers.get_gpu_id_from_device_handle(dest_gpu)