SWDEV-392033: Added Topology Command

Change-Id: Ib1d007aee9937e3062d0e9c9898ca9198a585132
Signed-off-by: Maisam Arif <Maisam.Arif@amd.com>
Šī revīzija ir iekļauta:
Maisam Arif
2023-04-21 15:10:38 -05:00
vecāks 543c573cc7
revīzija 39da929fe4
4 mainīti faili ar 153 papildinājumiem un 45 dzēšanām
+142 -39
Parādīt failu
@@ -147,7 +147,7 @@ class AMDSMICommands():
def static(self, args, multiple_devices=False, gpu=None, asic=None,
bus=None, vbios=None, limit=None, driver=None, caps=None,
ras=None, board=None):
ras=None, board=None, numa=None):
"""Get Static information for target gpu
Args:
@@ -162,6 +162,7 @@ class AMDSMICommands():
caps (bool, optional): Value override for args.caps. Defaults to None.
ras (bool, optional): Value override for args.ras. Defaults to None.
board (bool, optional): Value override for args.board. Defaults to None.
numa (bool, optional): Value override for args.numa. Defaults to None.
Raises:
IndexError: Index error if gpu list is empty
@@ -188,6 +189,8 @@ class AMDSMICommands():
args.ras = ras
if board:
args.board = board
if numa:
args.numa = numa
# Handle No GPU passed
if args.gpu is None:
@@ -200,8 +203,10 @@ class AMDSMICommands():
args.gpu = device_handle
# If all arguments are False, it means that no argument was passed and the entire static should be printed
if not any([args.asic, args.bus, args.vbios, args.limit, args.driver, args.caps, args.ras, args.board]):
args.asic = args.bus = args.vbios = args.limit = args.driver = args.caps = args.ras = args.board = self.all_arguments = True
if not any([args.asic, args.bus, args.vbios, args.limit, args.driver,
args.caps, args.ras, args.board, args.numa]):
args.asic = args.bus = args.vbios = args.limit = args.driver = \
args.caps = args.ras = args.board = args.numa = self.all_arguments = True
static_dict = {}
@@ -362,6 +367,23 @@ class AMDSMICommands():
static_dict['caps'] = e.get_error_info()
if not self.all_arguments:
raise e
if args.numa:
try:
numa_node_number = amdsmi_interface.amdsmi_topo_get_numa_node_number(args.gpu)
except amdsmi_exception.AmdSmiLibraryException as e:
numa_node_number = e.get_error_info()
if not self.all_arguments:
raise e
try:
numa_affinity = amdsmi_interface.amdsmi_topo_get_numa_affinity(args.gpu)
except amdsmi_exception.AmdSmiLibraryException as e:
numa_affinity = e.get_error_info()
if not self.all_arguments:
raise e
static_dict['numa'] = {'node' : numa_node_number,
'affinity' : numa_affinity}
multiple_devices_csv_override = False
# Convert and store output by pid for csv format
@@ -1284,7 +1306,7 @@ class AMDSMICommands():
def topology(self, args, multiple_devices=False, gpu=None, access=None,
weight=None, hops=None, type=None, numa=None, numa_bw=None):
weight=None, hops=None, link_type=None, numa=None, numa_bw=None):
""" Get topology information for target gpus
The compatibility mode for this will only be in amdsmi & rocm-smi
params:
@@ -1309,8 +1331,8 @@ class AMDSMICommands():
args.weight = weight
if hops:
args.hops = hops
if type:
args.type = type
if link_type:
args.link_type = link_type
if numa:
args.numa = numa
if numa_bw:
@@ -1320,56 +1342,137 @@ class AMDSMICommands():
if args.gpu is None:
args.gpu = self.device_handles
# Handle multiple GPUs
handled_multiple_gpus, device_handle = self.helpers.handle_gpus(args, self.logger, self.topology)
if handled_multiple_gpus:
return # This function is recursive
if not isinstance(args.gpu, list):
args.gpu = [args.gpu]
# # Handle multiple GPUs
# handled_multiple_gpus, device_handle = self.helpers.handle_gpus(args, self.logger, self.topology)
# if handled_multiple_gpus:
# return # This function is recursive
# Handle all args being false
if not any([args.access, args.weight, args.hops, args.type, args.numa, args.numa_bw]):
args.access = args.weight = args.hops = args.type = args.numa = args.numa_bw = True
if not any([args.access, args.weight, args.hops, args.link_type, args.numa, args.numa_bw]):
args.access = args.weight = args.hops = args.link_type = args.numa = args.numa_bw = True
# Populate the possible gpus
topo_values = []
for gpu in args.gpu:
gpu_id = self.helpers.get_gpu_id_from_device_handle(gpu)
topo_values.append({"gpu" : gpu_id})
topo_dict = {}
if args.access:
topo_dict['access'] = amdsmi_exception.AmdSmiLibraryException(amdsmi_exception.AmdSmiRetCode.NOT_IMPLEMENTED).err_info
for src_gpu_index, src_gpu in enumerate(args.gpu):
src_gpu_links = {}
for dest_gpu in args.gpu:
dest_gpu_id = self.helpers.get_gpu_id_from_device_handle(src_gpu)
try:
dest_gpu_link_status = amdsmi_interface.amdsmi_is_P2P_accessible(src_gpu, dest_gpu)
src_gpu_links[f'gpu_{dest_gpu_id}'] = bool(dest_gpu_link_status)
except amdsmi_exception.AmdSmiLibraryException as e:
src_gpu_links[f'gpu_{dest_gpu_id}'] = e.get_error_info()
topo_values[src_gpu_index]['link_accessibility'] = src_gpu_links
if args.weight:
topo_dict['weight'] = amdsmi_exception.AmdSmiLibraryException(amdsmi_exception.AmdSmiRetCode.NOT_IMPLEMENTED).err_info
for src_gpu_index, src_gpu in enumerate(args.gpu):
src_gpu_weight = {}
for dest_gpu in args.gpu:
dest_gpu_id = self.helpers.get_gpu_id_from_device_handle(src_gpu)
if src_gpu == dest_gpu:
src_gpu_weight[f'gpu_{dest_gpu_id}'] = 0
continue
try:
dest_gpu_link_weight = amdsmi_interface.amdsmi_topo_get_link_weight(src_gpu, dest_gpu)
src_gpu_weight[f'gpu_{dest_gpu_id}'] = dest_gpu_link_weight
except amdsmi_exception.AmdSmiLibraryException as e:
src_gpu_weight[f'gpu_{dest_gpu_id}'] = e.get_error_info()
topo_values[src_gpu_index]['weight'] = src_gpu_weight
if args.hops:
topo_dict['hops'] = amdsmi_exception.AmdSmiLibraryException(amdsmi_exception.AmdSmiRetCode.NOT_IMPLEMENTED).err_info
for src_gpu_index, src_gpu in enumerate(args.gpu):
src_gpu_hops = {}
for dest_gpu in args.gpu:
dest_gpu_id = self.helpers.get_gpu_id_from_device_handle(src_gpu)
if args.type:
topo_dict['type'] = amdsmi_exception.AmdSmiLibraryException(amdsmi_exception.AmdSmiRetCode.NOT_IMPLEMENTED).err_info
if src_gpu == dest_gpu:
src_gpu_hops[f'gpu_{dest_gpu_id}'] = 0
continue
if args.numa:
topo_dict['numa'] = amdsmi_exception.AmdSmiLibraryException(amdsmi_exception.AmdSmiRetCode.NOT_IMPLEMENTED).err_info
try:
dest_gpu_hops = amdsmi_interface.amdsmi_topo_get_link_type(src_gpu, dest_gpu)['hops']
src_gpu_hops[f'gpu_{dest_gpu_id}'] = dest_gpu_hops
except amdsmi_exception.AmdSmiLibraryException as e:
src_gpu_hops[f'gpu_{dest_gpu_id}'] = e.get_error_info()
# numa_numbers = c_uint32()
# for device in deviceList:
# ret = rocmsmi.rsmi_get_numa_node_number(device, byref(numa_numbers))
# if rsmi_ret_ok(ret, device):
# printLog(device, "(Topology) Numa Node", numa_numbers.value)
# else:
# printErrLog(device, "Cannot read Numa Node")
topo_values[src_gpu_index]['hops'] = src_gpu_hops
if args.link_type:
for src_gpu_index, src_gpu in enumerate(args.gpu):
src_gpu_link_type = {}
for dest_gpu in args.gpu:
dest_gpu_id = self.helpers.get_gpu_id_from_device_handle(src_gpu)
if src_gpu == dest_gpu:
src_gpu_link_type[f'gpu_{dest_gpu_id}'] = 0
continue
try:
link_type = amdsmi_interface.amdsmi_topo_get_link_type(src_gpu, dest_gpu)['type']
if isinstance(link_type, int):
if link_type == 1:
src_gpu_link_type[f'gpu_{dest_gpu_id}'] = "PCIE"
elif link_type == 2:
src_gpu_link_type[f'gpu_{dest_gpu_id}'] = "XMGI"
else:
src_gpu_link_type[f'gpu_{dest_gpu_id}'] = "XXXX"
except amdsmi_exception.AmdSmiLibraryException as e:
src_gpu_link_type[f'gpu_{dest_gpu_id}'] = e.get_error_info()
topo_values[src_gpu_index]['link_type'] = src_gpu_link_type
# ret = rocmsmi.rsmi_numa_affinity_get(device, byref(numa_numbers))
# if rsmi_ret_ok(ret):
# printLog(device, "(Topology) Numa Affinity", numa_numbers.value)
# else:
# printErrLog(device, 'Cannot read Numa Affinity')
if args.numa_bw:
topo_dict['numa_bw'] = amdsmi_exception.AmdSmiLibraryException(amdsmi_exception.AmdSmiRetCode.NOT_IMPLEMENTED).err_info
for src_gpu_index, src_gpu in enumerate(args.gpu):
src_gpu_link_type = {}
for dest_gpu in args.gpu:
dest_gpu_id = self.helpers.get_gpu_id_from_device_handle(src_gpu)
if src_gpu == dest_gpu:
src_gpu_link_type[f'gpu_{dest_gpu_id}'] = 'N/A'
continue
# Store values in logger.output
self.logger.store_output(args.gpu, 'values', topo_dict)
try:
link_type = amdsmi_interface.amdsmi_topo_get_link_type(src_gpu, dest_gpu)['type']
if isinstance(link_type, int):
if link_type != 2:
non_xgmi = True
src_gpu_link_type[f'gpu_{dest_gpu_id}'] = 'N/A'
continue
except amdsmi_exception.AmdSmiLibraryException as e:
src_gpu_link_type[f'gpu_{dest_gpu_id}'] = e.get_error_info()
if multiple_devices:
self.logger.store_multiple_device_output()
return # Skip printing when there are multiple devices
try:
min_bw = amdsmi_interface.amdsmi_get_minmax_bandwidth(src_gpu, dest_gpu)['min_bandwidth']
max_bw = amdsmi_interface.amdsmi_get_minmax_bandwidth(src_gpu, dest_gpu)['max_bandwidth']
self.logger.print_output()
src_gpu_link_type[f'gpu_{dest_gpu_id}'] = f'{min_bw}-{max_bw}'
except amdsmi_exception.AmdSmiLibraryException as e:
src_gpu_link_type[f'gpu_{dest_gpu_id}'] = e.get_error_info()
topo_values[src_gpu_index]['numa_bandwidth'] = src_gpu_link_type
self.logger.multiple_device_output = topo_values
if self.logger.is_csv_format():
new_output = []
for elem in self.logger.multiple_device_output:
new_output.append(self.logger.flatten_dict(elem, topology_override=True))
self.logger.multiple_device_output = new_output
self.logger.print_output(multiple_device_enabled=True)
def set_value(self, args, multiple_devices=False, gpu=None, clock=None, sclk=None, mclk=None,
+1 -1
Parādīt failu
@@ -229,7 +229,7 @@ class AMDSMIHelpers():
for device_handle in args.gpu:
# Handle multiple_devices to print all output at once
subcommand(args, multiple_devices=True, gpu=device_handle)
logger.print_output(multiple_devices_enabled=True)
logger.print_output(multiple_device_enabled=True)
return True, args.gpu
elif len(args.gpu) == 1:
args.gpu = args.gpu[0]
+6 -3
Parādīt failu
@@ -145,7 +145,7 @@ class AMDSMILogger():
return clean_yaml_output
def flatten_dict(self, target_dict):
def flatten_dict(self, target_dict, topology_override=False):
"""This will flatten a dictionary out to a single level of key-value pairs,
removing keys whose values are dictionaries and wrapping each value in a list
ex:
@@ -178,7 +178,7 @@ class AMDSMILogger():
for key, value in target_dict.items():
if isinstance(value, dict):
# Check number of items in the dict
if len(value.values()) > 1:
if len(value.values()) > 1 or topology_override:
value_with_parent_key = {}
for parent_key, child_dict in value.items():
if isinstance(child_dict, dict):
@@ -189,7 +189,10 @@ class AMDSMILogger():
for child_key, value1 in child_dict.items():
value_with_parent_key[parent_key + '_' + child_key] = value1
else:
value_with_parent_key[parent_key] = child_dict
if topology_override:
value_with_parent_key[key + '_' + parent_key] = child_dict
else:
value_with_parent_key[parent_key] = child_dict
value = value_with_parent_key
if self.is_gpuvsmi_compatibility():
+4 -2
Parādīt failu
@@ -295,6 +295,7 @@ class AMDSMIParser(argparse.ArgumentParser):
# Options arguments help text for Hypervisors and Baremetal
ras_help = "Displays RAS features information"
board_help = "All board information" # Linux Baremetal only
numa_help = "All numa node information" # Linux Baremetal only
# Options arguments help text for Hypervisors
dfc_help = "All DFC FW table information"
@@ -324,6 +325,7 @@ class AMDSMIParser(argparse.ArgumentParser):
static_parser.add_argument('-r', '--ras', action='store_true', required=False, help=ras_help)
if self.helpers.is_linux():
static_parser.add_argument('-B', '--board', action='store_true', required=False, help=board_help)
static_parser.add_argument('-u', '--numa', action='store_true', required=False, help=numa_help)
# Options to only display on a Hypervisor
if self.helpers.is_hypervisor():
@@ -573,7 +575,7 @@ class AMDSMIParser(argparse.ArgumentParser):
access_help = "Displays link accessibility between GPUs"
weight_help = "Displays relative weight between GPUs"
hops_help = "Displays the number of hops between GPUs"
type_help = "Displays the link type between GPUs"
link_type_help = "Displays the link type between GPUs"
numa_help = "Display the HW Topology Information for numa nodes"
numa_bw_help = "Display max and min bandwidth between nodes"
@@ -591,7 +593,7 @@ class AMDSMIParser(argparse.ArgumentParser):
topology_parser.add_argument('-a', '--access', action='store_true', required=False, help=access_help)
topology_parser.add_argument('-w', '--weight', action='store_true', required=False, help=weight_help)
topology_parser.add_argument('-o', '--hops', action='store_true', required=False, help=hops_help)
topology_parser.add_argument('-t', '--type', action='store_true', required=False, help=type_help)
topology_parser.add_argument('-t', '--link-type', action='store_true', required=False, help=link_type_help)
topology_parser.add_argument('-n', '--numa', action='store_true', required=False, help=numa_help)
topology_parser.add_argument('-b', '--numa-bw', action='store_true', required=False, help=numa_bw_help)