diff --git a/amdsmi_cli/amdsmi_commands.py b/amdsmi_cli/amdsmi_commands.py
index 37fbfff22c..ae22125a57 100644
--- a/amdsmi_cli/amdsmi_commands.py
+++ b/amdsmi_cli/amdsmi_commands.py
@@ -147,7 +147,7 @@ class AMDSMICommands():
 
     def static(self, args, multiple_devices=False, gpu=None, asic=None,
                 bus=None, vbios=None, limit=None, driver=None, caps=None,
-                ras=None, board=None):
+                ras=None, board=None, numa=None):
         """Get Static information for target gpu
 
         Args:
@@ -162,6 +162,7 @@ class AMDSMICommands():
             caps (bool, optional): Value override for args.caps. Defaults to None.
             ras (bool, optional): Value override for args.ras. Defaults to None.
             board (bool, optional): Value override for args.board. Defaults to None.
+            numa (bool, optional): Value override for args.numa. Defaults to None.
 
         Raises:
             IndexError: Index error if gpu list is empty
@@ -188,6 +189,8 @@ class AMDSMICommands():
             args.ras = ras
         if board:
             args.board = board
+        if numa:
+            args.numa = numa
 
         # Handle No GPU passed
         if args.gpu is None:
@@ -200,8 +203,10 @@ class AMDSMICommands():
             args.gpu = device_handle
 
         # If all arguments are False, it means that no argument was passed and the entire static should be printed
-        if not any([args.asic, args.bus, args.vbios, args.limit, args.driver, args.caps, args.ras, args.board]):
-            args.asic = args.bus = args.vbios = args.limit = args.driver = args.caps = args.ras = args.board = self.all_arguments = True
+        if not any([args.asic, args.bus, args.vbios, args.limit, args.driver,
+                    args.caps, args.ras, args.board, args.numa]):
+            args.asic = args.bus = args.vbios = args.limit = args.driver = \
+                args.caps = args.ras = args.board = args.numa = self.all_arguments = True
 
         static_dict = {}
 
@@ -362,6 +367,23 @@ class AMDSMICommands():
                 static_dict['caps'] = e.get_error_info()
                 if not self.all_arguments:
                     raise e
+        if args.numa:
+            try:
+                numa_node_number = amdsmi_interface.amdsmi_topo_get_numa_node_number(args.gpu)
+            except amdsmi_exception.AmdSmiLibraryException as e:
+                numa_node_number = e.get_error_info()
+                if not self.all_arguments:
+                    raise e
+
+            try:
+                numa_affinity = amdsmi_interface.amdsmi_topo_get_numa_affinity(args.gpu)
+            except amdsmi_exception.AmdSmiLibraryException as e:
+                numa_affinity = e.get_error_info()
+                if not self.all_arguments:
+                    raise e
+
+            static_dict['numa'] = {'node' : numa_node_number,
+                                   'affinity' : numa_affinity}
 
         multiple_devices_csv_override = False
         # Convert and store output by pid for csv format
@@ -1284,7 +1306,7 @@ class AMDSMICommands():
 
 
     def topology(self, args, multiple_devices=False, gpu=None, access=None,
-                weight=None, hops=None, type=None, numa=None, numa_bw=None):
+                weight=None, hops=None, link_type=None, numa=None, numa_bw=None):
         """ Get topology information for target gpus
             The compatibility mode for this will only be in amdsmi & rocm-smi
             params:
@@ -1309,8 +1331,8 @@ class AMDSMICommands():
             args.weight = weight
         if hops:
             args.hops = hops
-        if type:
-            args.type = type
+        if link_type:
+            args.link_type = link_type
         if numa:
             args.numa = numa
         if numa_bw:
@@ -1320,56 +1342,137 @@ class AMDSMICommands():
         if args.gpu is None:
             args.gpu = self.device_handles
 
-        # Handle multiple GPUs
-        handled_multiple_gpus, device_handle = self.helpers.handle_gpus(args, self.logger, self.topology)
-        if handled_multiple_gpus:
-            return # This function is recursive
+        if not isinstance(args.gpu, list):
+            args.gpu = [args.gpu]
+
+        # Topology output is a GPU-by-GPU matrix, so all selected GPUs are
+        # handled together in the loops below; self.helpers.handle_gpus()
+        # re-invokes the subcommand once per device, which cannot express
+        # source/destination pairs.
 
         # Handle all args being false
-        if not any([args.access, args.weight, args.hops, args.type, args.numa, args.numa_bw]):
-            args.access = args.weight = args.hops = args.type = args.numa = args.numa_bw = True
+        if not any([args.access, args.weight, args.hops, args.link_type, args.numa, args.numa_bw]):
+            args.access = args.weight = args.hops = args.link_type = args.numa = args.numa_bw = True
+
+        # Populate the possible gpus
+        topo_values = []
+        for gpu in args.gpu:
+            gpu_id = self.helpers.get_gpu_id_from_device_handle(gpu)
+            topo_values.append({"gpu" : gpu_id})
 
-        topo_dict = {}
         if args.access:
-            topo_dict['access'] = amdsmi_exception.AmdSmiLibraryException(amdsmi_exception.AmdSmiRetCode.NOT_IMPLEMENTED).err_info
+            for src_gpu_index, src_gpu in enumerate(args.gpu):
+                src_gpu_links = {}
+                for dest_gpu in args.gpu:
+                    dest_gpu_id = self.helpers.get_gpu_id_from_device_handle(dest_gpu)
+
+                    try:
+                        dest_gpu_link_status = amdsmi_interface.amdsmi_is_P2P_accessible(src_gpu, dest_gpu)
+                        src_gpu_links[f'gpu_{dest_gpu_id}'] = bool(dest_gpu_link_status)
+                    except amdsmi_exception.AmdSmiLibraryException as e:
+                        src_gpu_links[f'gpu_{dest_gpu_id}'] = e.get_error_info()
+
+                topo_values[src_gpu_index]['link_accessibility'] = src_gpu_links
 
         if args.weight:
-            topo_dict['weight'] = amdsmi_exception.AmdSmiLibraryException(amdsmi_exception.AmdSmiRetCode.NOT_IMPLEMENTED).err_info
+            for src_gpu_index, src_gpu in enumerate(args.gpu):
+                src_gpu_weight = {}
+                for dest_gpu in args.gpu:
+                    dest_gpu_id = self.helpers.get_gpu_id_from_device_handle(dest_gpu)
+
+                    if src_gpu == dest_gpu:
+                        src_gpu_weight[f'gpu_{dest_gpu_id}'] = 0
+                        continue
+
+                    try:
+                        dest_gpu_link_weight = amdsmi_interface.amdsmi_topo_get_link_weight(src_gpu, dest_gpu)
+                        src_gpu_weight[f'gpu_{dest_gpu_id}'] = dest_gpu_link_weight
+                    except amdsmi_exception.AmdSmiLibraryException as e:
+                        src_gpu_weight[f'gpu_{dest_gpu_id}'] = e.get_error_info()
+
+                topo_values[src_gpu_index]['weight'] = src_gpu_weight
 
         if args.hops:
-            topo_dict['hops'] = amdsmi_exception.AmdSmiLibraryException(amdsmi_exception.AmdSmiRetCode.NOT_IMPLEMENTED).err_info
+            for src_gpu_index, src_gpu in enumerate(args.gpu):
+                src_gpu_hops = {}
+                for dest_gpu in args.gpu:
+                    dest_gpu_id = self.helpers.get_gpu_id_from_device_handle(dest_gpu)
 
-        if args.type:
-            topo_dict['type'] = amdsmi_exception.AmdSmiLibraryException(amdsmi_exception.AmdSmiRetCode.NOT_IMPLEMENTED).err_info
+                    if src_gpu == dest_gpu:
+                        src_gpu_hops[f'gpu_{dest_gpu_id}'] = 0
+                        continue
 
-        if args.numa:
-            topo_dict['numa'] = amdsmi_exception.AmdSmiLibraryException(amdsmi_exception.AmdSmiRetCode.NOT_IMPLEMENTED).err_info
+                    try:
+                        dest_gpu_hops = amdsmi_interface.amdsmi_topo_get_link_type(src_gpu, dest_gpu)['hops']
+                        src_gpu_hops[f'gpu_{dest_gpu_id}'] = dest_gpu_hops
+                    except amdsmi_exception.AmdSmiLibraryException as e:
+                        src_gpu_hops[f'gpu_{dest_gpu_id}'] = e.get_error_info()
 
-            # numa_numbers = c_uint32()
-            # for device in deviceList:
-            #     ret = rocmsmi.rsmi_get_numa_node_number(device, byref(numa_numbers))
-            #     if rsmi_ret_ok(ret, device):
-            #         printLog(device, "(Topology) Numa Node", numa_numbers.value)
-            #     else:
-            #         printErrLog(device, "Cannot read Numa Node")
+                topo_values[src_gpu_index]['hops'] = src_gpu_hops
+
+        if args.link_type:
+            for src_gpu_index, src_gpu in enumerate(args.gpu):
+                src_gpu_link_type = {}
+                for dest_gpu in args.gpu:
+                    dest_gpu_id = self.helpers.get_gpu_id_from_device_handle(dest_gpu)
+
+                    if src_gpu == dest_gpu:
+                        src_gpu_link_type[f'gpu_{dest_gpu_id}'] = 0
+                        continue
+
+                    try:
+                        link_type = amdsmi_interface.amdsmi_topo_get_link_type(src_gpu, dest_gpu)['type']
+                        if isinstance(link_type, int):
+                            if link_type == 1:
+                                src_gpu_link_type[f'gpu_{dest_gpu_id}'] = "PCIE"
+                            elif link_type == 2:
+                                src_gpu_link_type[f'gpu_{dest_gpu_id}'] = "XGMI"
+                            else:
+                                src_gpu_link_type[f'gpu_{dest_gpu_id}'] = "XXXX"
+                    except amdsmi_exception.AmdSmiLibraryException as e:
+                        src_gpu_link_type[f'gpu_{dest_gpu_id}'] = e.get_error_info()
+
+                topo_values[src_gpu_index]['link_type'] = src_gpu_link_type
 
-            #     ret = rocmsmi.rsmi_numa_affinity_get(device, byref(numa_numbers))
-            #     if rsmi_ret_ok(ret):
-            #         printLog(device, "(Topology) Numa Affinity", numa_numbers.value)
-            #     else:
-            #         printErrLog(device, 'Cannot read Numa Affinity')
 
         if args.numa_bw:
-            topo_dict['numa_bw'] = amdsmi_exception.AmdSmiLibraryException(amdsmi_exception.AmdSmiRetCode.NOT_IMPLEMENTED).err_info
+            for src_gpu_index, src_gpu in enumerate(args.gpu):
+                src_gpu_link_type = {}
+                for dest_gpu in args.gpu:
+                    dest_gpu_id = self.helpers.get_gpu_id_from_device_handle(dest_gpu)
+                    if src_gpu == dest_gpu:
+                        src_gpu_link_type[f'gpu_{dest_gpu_id}'] = 'N/A'
+                        continue
 
-        # Store values in logger.output
-        self.logger.store_output(args.gpu, 'values', topo_dict)
+                    try:
+                        link_type = amdsmi_interface.amdsmi_topo_get_link_type(src_gpu, dest_gpu)['type']
+                        if isinstance(link_type, int):
+                            if link_type != 2:
+                                src_gpu_link_type[f'gpu_{dest_gpu_id}'] = 'N/A'
+                                continue
+                    except amdsmi_exception.AmdSmiLibraryException as e:
+                        src_gpu_link_type[f'gpu_{dest_gpu_id}'] = e.get_error_info()
 
-        if multiple_devices:
-            self.logger.store_multiple_device_output()
-            return # Skip printing when there are multiple devices
+                    try:
+                        bandwidth = amdsmi_interface.amdsmi_get_minmax_bandwidth(src_gpu, dest_gpu)
+                        min_bw = bandwidth['min_bandwidth']
+                        max_bw = bandwidth['max_bandwidth']
 
-        self.logger.print_output()
+                        src_gpu_link_type[f'gpu_{dest_gpu_id}'] = f'{min_bw}-{max_bw}'
+                    except amdsmi_exception.AmdSmiLibraryException as e:
+                        src_gpu_link_type[f'gpu_{dest_gpu_id}'] = e.get_error_info()
+
+                topo_values[src_gpu_index]['numa_bandwidth'] = src_gpu_link_type
+
+        self.logger.multiple_device_output = topo_values
+
+        if self.logger.is_csv_format():
+            new_output = []
+            for elem in self.logger.multiple_device_output:
+                new_output.append(self.logger.flatten_dict(elem, topology_override=True))
+            self.logger.multiple_device_output = new_output
+
+        self.logger.print_output(multiple_device_enabled=True)
 
 
     def set_value(self, args, multiple_devices=False, gpu=None, clock=None, sclk=None, mclk=None,
diff --git a/amdsmi_cli/amdsmi_helpers.py b/amdsmi_cli/amdsmi_helpers.py
index 054c3cae47..7b957d5d4a 100644
--- a/amdsmi_cli/amdsmi_helpers.py
+++ b/amdsmi_cli/amdsmi_helpers.py
@@ -229,7 +229,7 @@ class AMDSMIHelpers():
                 for device_handle in args.gpu:
                     # Handle multiple_devices to print all output at once
                     subcommand(args, multiple_devices=True, gpu=device_handle)
-                logger.print_output(multiple_devices_enabled=True)
+                logger.print_output(multiple_device_enabled=True)
                 return True, args.gpu
             elif len(args.gpu) == 1:
                 args.gpu = args.gpu[0]
diff --git a/amdsmi_cli/amdsmi_logger.py b/amdsmi_cli/amdsmi_logger.py
index fc6a86599d..e8e2e7fd8c 100644
--- a/amdsmi_cli/amdsmi_logger.py
+++ b/amdsmi_cli/amdsmi_logger.py
@@ -145,7 +145,7 @@ class AMDSMILogger():
         return clean_yaml_output
 
 
-    def flatten_dict(self, target_dict):
+    def flatten_dict(self, target_dict, topology_override=False):
         """This will flatten a dictionary out to a single level of key value stores
             removing key's with dictionaries and wrapping each value to in a list
             ex:
@@ -178,7 +178,7 @@ class AMDSMILogger():
         for key, value in target_dict.items():
             if isinstance(value, dict):
                 # Check number of items in the dict
-                if len(value.values()) > 1:
+                if len(value.values()) > 1 or topology_override:
                     value_with_parent_key = {}
                     for parent_key, child_dict in value.items():
                         if isinstance(child_dict, dict):
@@ -189,7 +189,10 @@ class AMDSMILogger():
                                 for child_key, value1 in child_dict.items():
                                     value_with_parent_key[parent_key + '_' + child_key] = value1
                         else:
-                            value_with_parent_key[parent_key] = child_dict
+                            if topology_override:
+                                value_with_parent_key[key + '_' + parent_key] = child_dict
+                            else:
+                                value_with_parent_key[parent_key] = child_dict
                     value = value_with_parent_key
 
             if self.is_gpuvsmi_compatibility():
diff --git a/amdsmi_cli/amdsmi_parser.py b/amdsmi_cli/amdsmi_parser.py
index 026c3d6dd2..e62d817700 100644
--- a/amdsmi_cli/amdsmi_parser.py
+++ b/amdsmi_cli/amdsmi_parser.py
@@ -295,6 +295,7 @@ class AMDSMIParser(argparse.ArgumentParser):
         # Options arguments help text for Hypervisors and Baremetal
         ras_help = "Displays RAS features information"
         board_help = "All board information" # Linux Baremetal only
+        numa_help = "All numa node information" # Linux Baremetal only
 
         # Options arguments help text for Hypervisors
         dfc_help = "All DFC FW table information"
@@ -324,6 +325,7 @@ class AMDSMIParser(argparse.ArgumentParser):
         static_parser.add_argument('-r', '--ras', action='store_true', required=False, help=ras_help)
         if self.helpers.is_linux():
             static_parser.add_argument('-B', '--board', action='store_true', required=False, help=board_help)
+            static_parser.add_argument('-u', '--numa', action='store_true', required=False, help=numa_help)
 
         # Options to only display on a Hypervisor
         if self.helpers.is_hypervisor():
@@ -573,7 +575,7 @@ class AMDSMIParser(argparse.ArgumentParser):
         access_help = "Displays link accessibility between GPUs"
         weight_help = "Displays relative weight between GPUs"
         hops_help = "Displays the number of hops between GPUs"
-        type_help = "Displays the link type between GPUs"
+        link_type_help = "Displays the link type between GPUs"
         numa_help = "Display the HW Topology Information for numa nodes"
         numa_bw_help = "Display max and min bandwidth between nodes"
 
@@ -591,7 +593,7 @@ class AMDSMIParser(argparse.ArgumentParser):
         topology_parser.add_argument('-a', '--access', action='store_true', required=False, help=access_help)
         topology_parser.add_argument('-w', '--weight', action='store_true', required=False, help=weight_help)
         topology_parser.add_argument('-o', '--hops', action='store_true', required=False, help=hops_help)
-        topology_parser.add_argument('-t', '--type', action='store_true', required=False, help=type_help)
+        topology_parser.add_argument('-t', '--link-type', action='store_true', required=False, help=link_type_help)
         topology_parser.add_argument('-n', '--numa', action='store_true', required=False, help=numa_help)
         topology_parser.add_argument('-b', '--numa-bw', action='store_true', required=False, help=numa_bw_help)
 