SWDEV-392033: Added Topology Command

Change-Id: Ib1d007aee9937e3062d0e9c9898ca9198a585132
Signed-off-by: Maisam Arif <Maisam.Arif@amd.com>
Šī revīzija ir iekļauta:
@@ -147,7 +147,7 @@ class AMDSMICommands():
|
||||
|
||||
def static(self, args, multiple_devices=False, gpu=None, asic=None,
|
||||
bus=None, vbios=None, limit=None, driver=None, caps=None,
|
||||
ras=None, board=None):
|
||||
ras=None, board=None, numa=None):
|
||||
"""Get Static information for target gpu
|
||||
|
||||
Args:
|
||||
@@ -162,6 +162,7 @@ class AMDSMICommands():
|
||||
caps (bool, optional): Value override for args.caps. Defaults to None.
|
||||
ras (bool, optional): Value override for args.ras. Defaults to None.
|
||||
board (bool, optional): Value override for args.board. Defaults to None.
|
||||
numa (bool, optional): Value override for args.numa. Defaults to None.
|
||||
|
||||
Raises:
|
||||
IndexError: Index error if gpu list is empty
|
||||
@@ -188,6 +189,8 @@ class AMDSMICommands():
|
||||
args.ras = ras
|
||||
if board:
|
||||
args.board = board
|
||||
if numa:
|
||||
args.numa = numa
|
||||
|
||||
# Handle No GPU passed
|
||||
if args.gpu is None:
|
||||
@@ -200,8 +203,10 @@ class AMDSMICommands():
|
||||
args.gpu = device_handle
|
||||
|
||||
# If all arguments are False, it means that no argument was passed and the entire static should be printed
|
||||
if not any([args.asic, args.bus, args.vbios, args.limit, args.driver, args.caps, args.ras, args.board]):
|
||||
args.asic = args.bus = args.vbios = args.limit = args.driver = args.caps = args.ras = args.board = self.all_arguments = True
|
||||
if not any([args.asic, args.bus, args.vbios, args.limit, args.driver,
|
||||
args.caps, args.ras, args.board, args.numa]):
|
||||
args.asic = args.bus = args.vbios = args.limit = args.driver = \
|
||||
args.caps = args.ras = args.board = args.numa = self.all_arguments = True
|
||||
|
||||
static_dict = {}
|
||||
|
||||
@@ -362,6 +367,23 @@ class AMDSMICommands():
|
||||
static_dict['caps'] = e.get_error_info()
|
||||
if not self.all_arguments:
|
||||
raise e
|
||||
if args.numa:
|
||||
try:
|
||||
numa_node_number = amdsmi_interface.amdsmi_topo_get_numa_node_number(args.gpu)
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
numa_node_number = e.get_error_info()
|
||||
if not self.all_arguments:
|
||||
raise e
|
||||
|
||||
try:
|
||||
numa_affinity = amdsmi_interface.amdsmi_topo_get_numa_affinity(args.gpu)
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
numa_affinity = e.get_error_info()
|
||||
if not self.all_arguments:
|
||||
raise e
|
||||
|
||||
static_dict['numa'] = {'node' : numa_node_number,
|
||||
'affinity' : numa_affinity}
|
||||
|
||||
multiple_devices_csv_override = False
|
||||
# Convert and store output by pid for csv format
|
||||
@@ -1284,7 +1306,7 @@ class AMDSMICommands():
|
||||
|
||||
|
||||
def topology(self, args, multiple_devices=False, gpu=None, access=None,
|
||||
weight=None, hops=None, type=None, numa=None, numa_bw=None):
|
||||
weight=None, hops=None, link_type=None, numa=None, numa_bw=None):
|
||||
""" Get topology information for target gpus
|
||||
The compatibility mode for this will only be in amdsmi & rocm-smi
|
||||
params:
|
||||
@@ -1309,8 +1331,8 @@ class AMDSMICommands():
|
||||
args.weight = weight
|
||||
if hops:
|
||||
args.hops = hops
|
||||
if type:
|
||||
args.type = type
|
||||
if link_type:
|
||||
args.link_type = link_type
|
||||
if numa:
|
||||
args.numa = numa
|
||||
if numa_bw:
|
||||
@@ -1320,56 +1342,137 @@ class AMDSMICommands():
|
||||
if args.gpu is None:
|
||||
args.gpu = self.device_handles
|
||||
|
||||
# Handle multiple GPUs
|
||||
handled_multiple_gpus, device_handle = self.helpers.handle_gpus(args, self.logger, self.topology)
|
||||
if handled_multiple_gpus:
|
||||
return # This function is recursive
|
||||
if not isinstance(args.gpu, list):
|
||||
args.gpu = [args.gpu]
|
||||
|
||||
# # Handle multiple GPUs
|
||||
# handled_multiple_gpus, device_handle = self.helpers.handle_gpus(args, self.logger, self.topology)
|
||||
# if handled_multiple_gpus:
|
||||
# return # This function is recursive
|
||||
|
||||
# Handle all args being false
|
||||
if not any([args.access, args.weight, args.hops, args.type, args.numa, args.numa_bw]):
|
||||
args.access = args.weight = args.hops = args.type = args.numa = args.numa_bw = True
|
||||
if not any([args.access, args.weight, args.hops, args.link_type, args.numa, args.numa_bw]):
|
||||
args.access = args.weight = args.hops = args.link_type = args.numa = args.numa_bw = True
|
||||
|
||||
# Populate the possible gpus
|
||||
topo_values = []
|
||||
for gpu in args.gpu:
|
||||
gpu_id = self.helpers.get_gpu_id_from_device_handle(gpu)
|
||||
topo_values.append({"gpu" : gpu_id})
|
||||
|
||||
topo_dict = {}
|
||||
if args.access:
|
||||
topo_dict['access'] = amdsmi_exception.AmdSmiLibraryException(amdsmi_exception.AmdSmiRetCode.NOT_IMPLEMENTED).err_info
|
||||
for src_gpu_index, src_gpu in enumerate(args.gpu):
|
||||
src_gpu_links = {}
|
||||
for dest_gpu in args.gpu:
|
||||
dest_gpu_id = self.helpers.get_gpu_id_from_device_handle(src_gpu)
|
||||
|
||||
try:
|
||||
dest_gpu_link_status = amdsmi_interface.amdsmi_is_P2P_accessible(src_gpu, dest_gpu)
|
||||
src_gpu_links[f'gpu_{dest_gpu_id}'] = bool(dest_gpu_link_status)
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
src_gpu_links[f'gpu_{dest_gpu_id}'] = e.get_error_info()
|
||||
|
||||
topo_values[src_gpu_index]['link_accessibility'] = src_gpu_links
|
||||
|
||||
if args.weight:
|
||||
topo_dict['weight'] = amdsmi_exception.AmdSmiLibraryException(amdsmi_exception.AmdSmiRetCode.NOT_IMPLEMENTED).err_info
|
||||
for src_gpu_index, src_gpu in enumerate(args.gpu):
|
||||
src_gpu_weight = {}
|
||||
for dest_gpu in args.gpu:
|
||||
dest_gpu_id = self.helpers.get_gpu_id_from_device_handle(src_gpu)
|
||||
|
||||
if src_gpu == dest_gpu:
|
||||
src_gpu_weight[f'gpu_{dest_gpu_id}'] = 0
|
||||
continue
|
||||
|
||||
try:
|
||||
dest_gpu_link_weight = amdsmi_interface.amdsmi_topo_get_link_weight(src_gpu, dest_gpu)
|
||||
src_gpu_weight[f'gpu_{dest_gpu_id}'] = dest_gpu_link_weight
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
src_gpu_weight[f'gpu_{dest_gpu_id}'] = e.get_error_info()
|
||||
|
||||
topo_values[src_gpu_index]['weight'] = src_gpu_weight
|
||||
|
||||
if args.hops:
|
||||
topo_dict['hops'] = amdsmi_exception.AmdSmiLibraryException(amdsmi_exception.AmdSmiRetCode.NOT_IMPLEMENTED).err_info
|
||||
for src_gpu_index, src_gpu in enumerate(args.gpu):
|
||||
src_gpu_hops = {}
|
||||
for dest_gpu in args.gpu:
|
||||
dest_gpu_id = self.helpers.get_gpu_id_from_device_handle(src_gpu)
|
||||
|
||||
if args.type:
|
||||
topo_dict['type'] = amdsmi_exception.AmdSmiLibraryException(amdsmi_exception.AmdSmiRetCode.NOT_IMPLEMENTED).err_info
|
||||
if src_gpu == dest_gpu:
|
||||
src_gpu_hops[f'gpu_{dest_gpu_id}'] = 0
|
||||
continue
|
||||
|
||||
if args.numa:
|
||||
topo_dict['numa'] = amdsmi_exception.AmdSmiLibraryException(amdsmi_exception.AmdSmiRetCode.NOT_IMPLEMENTED).err_info
|
||||
try:
|
||||
dest_gpu_hops = amdsmi_interface.amdsmi_topo_get_link_type(src_gpu, dest_gpu)['hops']
|
||||
src_gpu_hops[f'gpu_{dest_gpu_id}'] = dest_gpu_hops
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
src_gpu_hops[f'gpu_{dest_gpu_id}'] = e.get_error_info()
|
||||
|
||||
# numa_numbers = c_uint32()
|
||||
# for device in deviceList:
|
||||
# ret = rocmsmi.rsmi_get_numa_node_number(device, byref(numa_numbers))
|
||||
# if rsmi_ret_ok(ret, device):
|
||||
# printLog(device, "(Topology) Numa Node", numa_numbers.value)
|
||||
# else:
|
||||
# printErrLog(device, "Cannot read Numa Node")
|
||||
topo_values[src_gpu_index]['hops'] = src_gpu_hops
|
||||
|
||||
if args.link_type:
|
||||
for src_gpu_index, src_gpu in enumerate(args.gpu):
|
||||
src_gpu_link_type = {}
|
||||
for dest_gpu in args.gpu:
|
||||
dest_gpu_id = self.helpers.get_gpu_id_from_device_handle(src_gpu)
|
||||
|
||||
if src_gpu == dest_gpu:
|
||||
src_gpu_link_type[f'gpu_{dest_gpu_id}'] = 0
|
||||
continue
|
||||
|
||||
try:
|
||||
link_type = amdsmi_interface.amdsmi_topo_get_link_type(src_gpu, dest_gpu)['type']
|
||||
if isinstance(link_type, int):
|
||||
if link_type == 1:
|
||||
src_gpu_link_type[f'gpu_{dest_gpu_id}'] = "PCIE"
|
||||
elif link_type == 2:
|
||||
src_gpu_link_type[f'gpu_{dest_gpu_id}'] = "XMGI"
|
||||
else:
|
||||
src_gpu_link_type[f'gpu_{dest_gpu_id}'] = "XXXX"
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
src_gpu_link_type[f'gpu_{dest_gpu_id}'] = e.get_error_info()
|
||||
|
||||
topo_values[src_gpu_index]['link_type'] = src_gpu_link_type
|
||||
|
||||
# ret = rocmsmi.rsmi_numa_affinity_get(device, byref(numa_numbers))
|
||||
# if rsmi_ret_ok(ret):
|
||||
# printLog(device, "(Topology) Numa Affinity", numa_numbers.value)
|
||||
# else:
|
||||
# printErrLog(device, 'Cannot read Numa Affinity')
|
||||
if args.numa_bw:
|
||||
topo_dict['numa_bw'] = amdsmi_exception.AmdSmiLibraryException(amdsmi_exception.AmdSmiRetCode.NOT_IMPLEMENTED).err_info
|
||||
for src_gpu_index, src_gpu in enumerate(args.gpu):
|
||||
src_gpu_link_type = {}
|
||||
for dest_gpu in args.gpu:
|
||||
dest_gpu_id = self.helpers.get_gpu_id_from_device_handle(src_gpu)
|
||||
|
||||
if src_gpu == dest_gpu:
|
||||
src_gpu_link_type[f'gpu_{dest_gpu_id}'] = 'N/A'
|
||||
continue
|
||||
|
||||
# Store values in logger.output
|
||||
self.logger.store_output(args.gpu, 'values', topo_dict)
|
||||
try:
|
||||
link_type = amdsmi_interface.amdsmi_topo_get_link_type(src_gpu, dest_gpu)['type']
|
||||
if isinstance(link_type, int):
|
||||
if link_type != 2:
|
||||
non_xgmi = True
|
||||
src_gpu_link_type[f'gpu_{dest_gpu_id}'] = 'N/A'
|
||||
continue
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
src_gpu_link_type[f'gpu_{dest_gpu_id}'] = e.get_error_info()
|
||||
|
||||
if multiple_devices:
|
||||
self.logger.store_multiple_device_output()
|
||||
return # Skip printing when there are multiple devices
|
||||
try:
|
||||
min_bw = amdsmi_interface.amdsmi_get_minmax_bandwidth(src_gpu, dest_gpu)['min_bandwidth']
|
||||
max_bw = amdsmi_interface.amdsmi_get_minmax_bandwidth(src_gpu, dest_gpu)['max_bandwidth']
|
||||
|
||||
self.logger.print_output()
|
||||
src_gpu_link_type[f'gpu_{dest_gpu_id}'] = f'{min_bw}-{max_bw}'
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
src_gpu_link_type[f'gpu_{dest_gpu_id}'] = e.get_error_info()
|
||||
|
||||
topo_values[src_gpu_index]['numa_bandwidth'] = src_gpu_link_type
|
||||
|
||||
self.logger.multiple_device_output = topo_values
|
||||
|
||||
if self.logger.is_csv_format():
|
||||
new_output = []
|
||||
for elem in self.logger.multiple_device_output:
|
||||
new_output.append(self.logger.flatten_dict(elem, topology_override=True))
|
||||
self.logger.multiple_device_output = new_output
|
||||
|
||||
self.logger.print_output(multiple_device_enabled=True)
|
||||
|
||||
|
||||
def set_value(self, args, multiple_devices=False, gpu=None, clock=None, sclk=None, mclk=None,
|
||||
|
||||
@@ -229,7 +229,7 @@ class AMDSMIHelpers():
|
||||
for device_handle in args.gpu:
|
||||
# Handle multiple_devices to print all output at once
|
||||
subcommand(args, multiple_devices=True, gpu=device_handle)
|
||||
logger.print_output(multiple_devices_enabled=True)
|
||||
logger.print_output(multiple_device_enabled=True)
|
||||
return True, args.gpu
|
||||
elif len(args.gpu) == 1:
|
||||
args.gpu = args.gpu[0]
|
||||
|
||||
@@ -145,7 +145,7 @@ class AMDSMILogger():
|
||||
return clean_yaml_output
|
||||
|
||||
|
||||
def flatten_dict(self, target_dict):
|
||||
def flatten_dict(self, target_dict, topology_override=False):
|
||||
"""This will flatten a dictionary out to a single level of key value stores
|
||||
removing key's with dictionaries and wrapping each value to in a list
|
||||
ex:
|
||||
@@ -178,7 +178,7 @@ class AMDSMILogger():
|
||||
for key, value in target_dict.items():
|
||||
if isinstance(value, dict):
|
||||
# Check number of items in the dict
|
||||
if len(value.values()) > 1:
|
||||
if len(value.values()) > 1 or topology_override:
|
||||
value_with_parent_key = {}
|
||||
for parent_key, child_dict in value.items():
|
||||
if isinstance(child_dict, dict):
|
||||
@@ -189,7 +189,10 @@ class AMDSMILogger():
|
||||
for child_key, value1 in child_dict.items():
|
||||
value_with_parent_key[parent_key + '_' + child_key] = value1
|
||||
else:
|
||||
value_with_parent_key[parent_key] = child_dict
|
||||
if topology_override:
|
||||
value_with_parent_key[key + '_' + parent_key] = child_dict
|
||||
else:
|
||||
value_with_parent_key[parent_key] = child_dict
|
||||
value = value_with_parent_key
|
||||
|
||||
if self.is_gpuvsmi_compatibility():
|
||||
|
||||
@@ -295,6 +295,7 @@ class AMDSMIParser(argparse.ArgumentParser):
|
||||
# Options arguments help text for Hypervisors and Baremetal
|
||||
ras_help = "Displays RAS features information"
|
||||
board_help = "All board information" # Linux Baremetal only
|
||||
numa_help = "All numa node information" # Linux Baremetal only
|
||||
|
||||
# Options arguments help text for Hypervisors
|
||||
dfc_help = "All DFC FW table information"
|
||||
@@ -324,6 +325,7 @@ class AMDSMIParser(argparse.ArgumentParser):
|
||||
static_parser.add_argument('-r', '--ras', action='store_true', required=False, help=ras_help)
|
||||
if self.helpers.is_linux():
|
||||
static_parser.add_argument('-B', '--board', action='store_true', required=False, help=board_help)
|
||||
static_parser.add_argument('-u', '--numa', action='store_true', required=False, help=numa_help)
|
||||
|
||||
# Options to only display on a Hypervisor
|
||||
if self.helpers.is_hypervisor():
|
||||
@@ -573,7 +575,7 @@ class AMDSMIParser(argparse.ArgumentParser):
|
||||
access_help = "Displays link accessibility between GPUs"
|
||||
weight_help = "Displays relative weight between GPUs"
|
||||
hops_help = "Displays the number of hops between GPUs"
|
||||
type_help = "Displays the link type between GPUs"
|
||||
link_type_help = "Displays the link type between GPUs"
|
||||
numa_help = "Display the HW Topology Information for numa nodes"
|
||||
numa_bw_help = "Display max and min bandwidth between nodes"
|
||||
|
||||
@@ -591,7 +593,7 @@ class AMDSMIParser(argparse.ArgumentParser):
|
||||
topology_parser.add_argument('-a', '--access', action='store_true', required=False, help=access_help)
|
||||
topology_parser.add_argument('-w', '--weight', action='store_true', required=False, help=weight_help)
|
||||
topology_parser.add_argument('-o', '--hops', action='store_true', required=False, help=hops_help)
|
||||
topology_parser.add_argument('-t', '--type', action='store_true', required=False, help=type_help)
|
||||
topology_parser.add_argument('-t', '--link-type', action='store_true', required=False, help=link_type_help)
|
||||
topology_parser.add_argument('-n', '--numa', action='store_true', required=False, help=numa_help)
|
||||
topology_parser.add_argument('-b', '--numa-bw', action='store_true', required=False, help=numa_bw_help)
|
||||
|
||||
|
||||
Atsaukties uz šo jaunā problēmā
Block a user