SWDEV-445668 - Align topology JSON

Updates:
    - [CLI] Updated json output to provide format
      similar to host
      eg.
      [
    {
        "gpu": 0,
        "bdf": "0000:01:00.0",
        "links": [
            {
                "gpu": 0,
                "bdf": "0000:01:00.0",
                "weight": 0,
                "link_status": "ENABLED",
                "link_type": "SELF",
                "num_hops": 0,
                "bandwidth": "N/A",
                "fb_sharing": "ENABLED"
            },
            {
                "gpu": 1,
                "bdf": "0001:01:00.0",
                "weight": 15,
                "link_status": "ENABLED",
                "link_type": "XGMI",
                "num_hops": 1,
                "bandwidth": "50000-100000",
                "fb_sharing": "ENABLED"
            },
        ...
        ]
    },
    {
    ...

Change-Id: I63217f63a4d6ebc23a8a84eaac9dbb7aff5f4cb4
Signed-off-by: Charis Poag <Charis.Poag@amd.com>


[ROCm/amdsmi commit: 08a3e76b26]
Этот коммит содержится в:
Charis Poag
2024-03-27 17:10:59 -05:00
родитель 9e2b1d8a09
Коммит da77ac3975
2 изменённых файлов: 255 добавлений и 20 удалений
+140
Просмотреть файл
@@ -6,6 +6,10 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/](
## amd_smi_lib for ROCm 6.1.1
### Added
- N/A
### Changed
- **Updated metrics --clocks**
@@ -143,6 +147,142 @@ GPU: 0
...
```
- **Updated `amd-smi topology --json` to align with host/guest**
Topology's `--json` output now is changed to align with output reported bt host/guest systems. Additionally, users can select/filter specific topology details as desired (refer to `amd-smi topology -h` for full list). See examples shown below.
*Previous format:*
```shell
$ amd-smi topology --json
[
{
"gpu": 0,
"link_accessibility": {
"gpu_0": "ENABLED",
"gpu_1": "DISABLED"
},
"weight": {
"gpu_0": 0,
"gpu_1": 40
},
"hops": {
"gpu_0": 0,
"gpu_1": 2
},
"link_type": {
"gpu_0": "SELF",
"gpu_1": "PCIE"
},
"numa_bandwidth": {
"gpu_0": "N/A",
"gpu_1": "N/A"
}
},
{
"gpu": 1,
"link_accessibility": {
"gpu_0": "DISABLED",
"gpu_1": "ENABLED"
},
"weight": {
"gpu_0": 40,
"gpu_1": 0
},
"hops": {
"gpu_0": 2,
"gpu_1": 0
},
"link_type": {
"gpu_0": "PCIE",
"gpu_1": "SELF"
},
"numa_bandwidth": {
"gpu_0": "N/A",
"gpu_1": "N/A"
}
}
]
```
*New format:*
```shell
$ amd-smi topology --json
[
{
"gpu": 0,
"bdf": "0000:01:00.0",
"links": [
{
"gpu": 0,
"bdf": "0000:01:00.0",
"weight": 0,
"link_status": "ENABLED",
"link_type": "SELF",
"num_hops": 0,
"bandwidth": "N/A",
"fb_sharing": "ENABLED"
},
{
"gpu": 1,
"bdf": "0001:01:00.0",
"weight": 15,
"link_status": "ENABLED",
"link_type": "XGMI",
"num_hops": 1,
"bandwidth": "50000-100000",
"fb_sharing": "ENABLED"
},
...
]
},
...
]
```
```shell
$ /opt/rocm/bin/amd-smi topology -a -t --json
[
{
"gpu": 0,
"bdf": "0000:08:00.0",
"links": [
{
"gpu": 0,
"bdf": "0000:08:00.0",
"link_status": "ENABLED",
"link_type": "SELF"
},
{
"gpu": 1,
"bdf": "0000:44:00.0",
"link_status": "DISABLED",
"link_type": "PCIE"
}
]
},
{
"gpu": 1,
"bdf": "0000:44:00.0",
"links": [
{
"gpu": 0,
"bdf": "0000:08:00.0",
"link_status": "DISABLED",
"link_type": "PCIE"
},
{
"gpu": 1,
"bdf": "0000:44:00.0",
"link_status": "ENABLED",
"link_type": "SELF"
}
]
}
]
```
### Optimizations
- N/A
### Fixed
- **Fix for GPU reset error on non-amdgpu cards**
+115 -20
Просмотреть файл
@@ -2763,20 +2763,115 @@ class AMDSMICommands():
# Populate the possible gpus
topo_values = []
for gpu in args.gpu:
gpu_id = self.helpers.get_gpu_id_from_device_handle(gpu)
topo_values.append({"gpu" : gpu_id})
gpu_bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(gpu)
self.logger.table_header += gpu_bdf.rjust(13)
for src_gpu_index, src_gpu in enumerate(args.gpu):
src_gpu_id = self.helpers.get_gpu_id_from_device_handle(src_gpu)
topo_values.append({"gpu" : src_gpu_id})
src_gpu_bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(src_gpu)
topo_values[src_gpu_index]['bdf'] = src_gpu_bdf
self.logger.table_header += src_gpu_bdf.rjust(13)
if not self.logger.is_json_format():
continue # below is for JSON format only
##########################
# JSON formatting start #
##########################
links = []
# create json obj for data alignment
# dest_gpu_links = {
# "gpu": GPU #
# "bdf": BDF identification
# "weight": 0 - self (current node); weight >= 0 correlated with hops (GPU-CPU, GPU-GPU, GPU-CPU-CPU-GPU, etc..)
# "link_status": "ENABLED" - devices linked; "DISABLED" - devices not linked
# "link_type": "SELF" - current node, "PCIE", "XGMI", "N/A" - no link,"UNKNOWN" - unidentified link type
# "num_hops": num_hops - # of hops between devices
# "bandwidth": numa_bw - The NUMA "minimum bandwidth-maximum bandwidth" beween src and dest nodes
# "N/A" - self node or not connected devices
# "fb_sharing": "ENABLED/DISABLED" - same output as defined in link_status. Devices in a hive setup should
# all have sharing enabled.
# }
for dest_gpu_index, dest_gpu in enumerate(args.gpu):
link_type = "SELF"
if src_gpu != dest_gpu:
link_type = amdsmi_interface.amdsmi_topo_get_link_type(src_gpu, dest_gpu)['type']
if isinstance(link_type, int):
if link_type == amdsmi_interface.amdsmi_wrapper.AMDSMI_IOLINK_TYPE_UNDEFINED:
link_type = "UNKNOWN"
elif link_type == amdsmi_interface.amdsmi_wrapper.AMDSMI_IOLINK_TYPE_PCIEXPRESS:
link_type = "PCIE"
elif link_type == amdsmi_interface.amdsmi_wrapper.AMDSMI_IOLINK_TYPE_XGMI:
link_type = "XGMI"
else:
link_type = "N/A"
numa_bw = "N/A"
if src_gpu != dest_gpu:
try:
bw_dict = amdsmi_interface.amdsmi_get_minmax_bandwidth_between_processors(src_gpu, dest_gpu)
numa_bw = f"{bw_dict['min_bandwidth']}-{bw_dict['max_bandwidth']}"
except amdsmi_exception.AmdSmiLibraryException as e:
logging.debug("Failed to get min max bandwidth for %s to %s | %s",
self.helpers.get_gpu_id_from_device_handle(src_gpu),
self.helpers.get_gpu_id_from_device_handle(dest_gpu),
e.get_error_info())
weight = 0
num_hops = 0
if src_gpu != dest_gpu:
weight = amdsmi_interface.amdsmi_topo_get_link_weight(src_gpu, dest_gpu)
num_hops = amdsmi_interface.amdsmi_topo_get_link_type(src_gpu, dest_gpu)['hops']
link_status = amdsmi_interface.amdsmi_is_P2P_accessible(src_gpu, dest_gpu)
if link_status:
link_status = "ENABLED"
else:
link_status = "DISABLED"
# fb_sharing in BM - in a hive configuration, this is
# link_status = amdsmi_is_P2P_accessible(src,dest)
dest_gpu_links = {
"gpu": self.helpers.get_gpu_id_from_device_handle(dest_gpu),
"bdf": amdsmi_interface.amdsmi_get_gpu_device_bdf(dest_gpu),
"weight": weight,
"link_status": link_status,
"link_type": link_type,
"num_hops": num_hops,
"bandwidth": numa_bw,
"fb_sharing": link_status
}
if not args.access: # currently includes fb_sharing
del dest_gpu_links['link_status']
del dest_gpu_links['fb_sharing']
if not args.weight:
del dest_gpu_links['weight']
if not args.link_type:
del dest_gpu_links['link_type']
if not args.hops:
del dest_gpu_links['num_hops']
if not args.numa_bw:
del dest_gpu_links['bandwidth']
links.append(dest_gpu_links)
isEndOfDest = dest_gpu_index+1 == len(args.gpu)
isEndOfSrc = src_gpu_index+1 == len(args.gpu)
if isEndOfDest:
topo_values[src_gpu_index]['links'] = links
continue
if isEndOfSrc:
self.logger.multiple_device_output = topo_values
self.logger.print_output(multiple_device_enabled=True, tabular=True)
return
##########################
# JSON formatting end #
##########################
if args.access:
tabular_output = []
for src_gpu_index, src_gpu in enumerate(args.gpu):
gpu_bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(src_gpu)
src_gpu_bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(src_gpu)
if self.logger.is_human_readable_format():
tabular_output_dict = {'gpu' : f"{gpu_bdf} "}
tabular_output_dict = {'gpu' : f"{src_gpu_bdf} "}
else:
tabular_output_dict = {'gpu' : gpu_bdf}
tabular_output_dict = {'gpu' : src_gpu_bdf}
src_gpu_links = {}
for dest_gpu in args.gpu:
dest_gpu_id = self.helpers.get_gpu_id_from_device_handle(dest_gpu)
@@ -2808,11 +2903,11 @@ class AMDSMICommands():
if args.weight:
tabular_output = []
for src_gpu_index, src_gpu in enumerate(args.gpu):
gpu_bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(src_gpu)
src_gpu_bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(src_gpu)
if self.logger.is_human_readable_format():
tabular_output_dict = {'gpu' : f"{gpu_bdf} "}
tabular_output_dict = {'gpu' : f"{src_gpu_bdf} "}
else:
tabular_output_dict = {'gpu' : gpu_bdf}
tabular_output_dict = {'gpu' : src_gpu_bdf}
src_gpu_weight = {}
for dest_gpu in args.gpu:
dest_gpu_id = self.helpers.get_gpu_id_from_device_handle(dest_gpu)
@@ -2845,11 +2940,11 @@ class AMDSMICommands():
if args.hops:
tabular_output = []
for src_gpu_index, src_gpu in enumerate(args.gpu):
gpu_bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(src_gpu)
src_gpu_bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(src_gpu)
if self.logger.is_human_readable_format():
tabular_output_dict = {'gpu' : f"{gpu_bdf} "}
tabular_output_dict = {'gpu' : f"{src_gpu_bdf} "}
else:
tabular_output_dict = {'gpu' : gpu_bdf}
tabular_output_dict = {'gpu' : src_gpu_bdf}
src_gpu_hops = {}
for dest_gpu in args.gpu:
dest_gpu_id = self.helpers.get_gpu_id_from_device_handle(dest_gpu)
@@ -2882,11 +2977,11 @@ class AMDSMICommands():
if args.link_type:
tabular_output = []
for src_gpu_index, src_gpu in enumerate(args.gpu):
gpu_bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(src_gpu)
src_gpu_bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(src_gpu)
if self.logger.is_human_readable_format():
tabular_output_dict = {'gpu' : f"{gpu_bdf} "}
tabular_output_dict = {'gpu' : f"{src_gpu_bdf} "}
else:
tabular_output_dict = {'gpu' : gpu_bdf}
tabular_output_dict = {'gpu' : src_gpu_bdf}
src_gpu_link_type = {}
for dest_gpu in args.gpu:
dest_gpu_id = self.helpers.get_gpu_id_from_device_handle(dest_gpu)
@@ -2924,11 +3019,11 @@ class AMDSMICommands():
if args.numa_bw:
tabular_output = []
for src_gpu_index, src_gpu in enumerate(args.gpu):
gpu_bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(src_gpu)
src_gpu_bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(src_gpu)
if self.logger.is_human_readable_format():
tabular_output_dict = {'gpu' : f"{gpu_bdf} "}
tabular_output_dict = {'gpu' : f"{src_gpu_bdf} "}
else:
tabular_output_dict = {'gpu' : gpu_bdf}
tabular_output_dict = {'gpu' : src_gpu_bdf}
src_gpu_link_type = {}
for dest_gpu in args.gpu:
dest_gpu_id = self.helpers.get_gpu_id_from_device_handle(dest_gpu)