SWDEV-392033 - Topology gpu index fix

Fixed Topology csv output
	Added README.md for amd-smi

Change-Id: I05819b883af01c19383ee4f220923798fc7453e2
Signed-off-by: Maisam Arif <Maisam.Arif@amd.com>


[ROCm/amdsmi commit: 515adfb61f]
Este commit está contenido en:
Maisam Arif
2023-04-24 21:34:44 -05:00
padre bdc4abb376
commit 0c84413614
Se han modificado 5 ficheros con 377 adiciones y 43 borrados
+5 -2
Ver fichero
@@ -21,6 +21,7 @@ add_custom_command(
${PY_PACKAGE_DIR}/amdsmi_parser.py
${PY_PACKAGE_DIR}/amdsmi_cli_exceptions.py
${PY_PACKAGE_DIR}/BDF.py
${PY_PACKAGE_DIR}/README.md
DEPENDS amdsmi_cli
COMMAND mkdir -p ${PY_PACKAGE_DIR}/
COMMAND ln -Pf ${CMAKE_CURRENT_SOURCE_DIR}/__init__.py ${PY_PACKAGE_DIR}/
@@ -32,7 +33,8 @@ add_custom_command(
COMMAND ln -Pf ${CMAKE_CURRENT_SOURCE_DIR}/amdsmi_logger.py ${PY_PACKAGE_DIR}/
COMMAND ln -Pf ${CMAKE_CURRENT_SOURCE_DIR}/amdsmi_parser.py ${PY_PACKAGE_DIR}/
COMMAND ln -Pf ${CMAKE_CURRENT_SOURCE_DIR}/amdsmi_cli_exceptions.py ${PY_PACKAGE_DIR}/
COMMAND ln -Pf ${CMAKE_CURRENT_SOURCE_DIR}/BDF.py ${PY_PACKAGE_DIR}/)
COMMAND ln -Pf ${CMAKE_CURRENT_SOURCE_DIR}/BDF.py ${PY_PACKAGE_DIR}/
COMMAND ln -Pf ${CMAKE_CURRENT_SOURCE_DIR}/README.md ${PY_PACKAGE_DIR}/)
# The CLI requires the python amdsmi wrapper to be installed
add_custom_target(
@@ -47,7 +49,8 @@ add_custom_target(
${PY_PACKAGE_DIR}/amdsmi_logger.py
${PY_PACKAGE_DIR}/amdsmi_parser.py
${PY_PACKAGE_DIR}/amdsmi_cli_exceptions.py
${PY_PACKAGE_DIR}/BDF.py)
${PY_PACKAGE_DIR}/BDF.py
${PY_PACKAGE_DIR}/README.md)
install(
DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/${PY_PACKAGE_DIR}
+333
Ver fichero
@@ -0,0 +1,333 @@
# AMD System Management Interface
This tool acts as a command line interface for manipulating
and monitoring the amdgpu kernel, and is intended to replace
and deprecate the existing rocm_smi CLI tool & gpuv-smi tool.
It uses Ctypes to call the amd_smi_lib API.
Recommended: At least one AMD GPU with AMD driver installed
## Requirements
* python 3.7+ 64-bit
* driver must be loaded for amdsmi_init() to pass
## Installation
- Install amdgpu driver
- Install amd-smi-lib through your package manager
- cd /opt/<rocm_instance>/share/amd_smi
- pip install .
- /opt/<rocm_instance>/bin/amd-smi
### Example for Ubuntu 22.04 after installing the amdgpu driver
``` shell
apt install amd-smi-lib
cd /opt/rocm/share/amd_smi
pip install .
/opt/rocm/bin/amd-smi
```
Add /opt/rocm/bin to your PATH to run amd-smi directly from the command line
## Usage
amd-smi will report the version and current platform detected when running the command without arguments:
``` bash
amd-smi
usage: amd-smi [-h] ...
AMD System Management Interface | Version: 0.0.4 | Platform: Linux Baremetal
optional arguments:
-h, --help show this help message and exit
AMD-SMI Commands:
Descriptions:
version Display version information
discovery (list)
Display discovery information
static Gets static information about the specified GPU
firmware (ucode)
Gets firmware information about the specified GPU
bad-pages Gets bad page information about the specified GPU
metric Gets metric/performance information about the specified GPU
process Lists general process information running on the specified GPU
topology Displays topology information of the devices.
set Set options for devices.
reset Reset options for devices.
```
More detailed version information can be obtained by running `amd-smi version`
Each command will have detailed information via `amd-smi [command] --help`
## Commands
For convenience, here is the help output for each command
``` bash
amd-smi discovery --help
usage: amd-smi discovery [-h] [--json | --csv] [--file FILE]
[--loglevel {DEBUG,INFO,WARNING,ERROR,CRITICAL}]
[-g GPU [GPU ...]]
Lists all the devices on the system and the links between devices.
Lists all the sockets and for each socket, GPUs and/or CPUs associated to
that socket alongside some basic information for each device.
In virtualization environments, it can also list VFs associated to each
GPU with some basic information for each VF.
optional arguments:
-h, --help show this help message and exit
-g GPU [GPU ...], --gpu GPU [GPU ...] Select a GPU ID, BDF, or UUID from the possible choices:
ID:0 | BDF:0000:23:00.0 | UUID:ffffffff-ffff-ffff-ffff-ffffffffffff
Command Modifiers:
--json Displays output in JSON format (human readable by default).
--csv Displays output in CSV format (human readable by default).
--file FILE Saves output into a file on the provided path (stdout by default).
--loglevel {DEBUG,INFO,WARNING,ERROR,CRITICAL} Set the logging level for the parser commands
```
``` bash
amd-smi firmware --help
usage: amd-smi firmware [-h] [--json | --csv] [--file FILE]
[--loglevel {DEBUG,INFO,WARNING,ERROR,CRITICAL}]
[-g GPU [GPU ...]] [-f]
If no GPU is specified, return firmware information for all GPUs on the system.
Firmware Arguments:
-h, --help show this help message and exit
-g GPU [GPU ...], --gpu GPU [GPU ...] Select a GPU ID, BDF, or UUID from the possible choices:
ID:0 | BDF:0000:23:00.0 | UUID:ffffffff-ffff-ffff-ffff-ffffffffffff
-f, --ucode-list, --fw-list All FW list information
Command Modifiers:
--json Displays output in JSON format (human readable by default).
--csv Displays output in CSV format (human readable by default).
--file FILE Saves output into a file on the provided path (stdout by default).
--loglevel {DEBUG,INFO,WARNING,ERROR,CRITICAL} Set the logging level for the parser commands
```
```bash
amd-smi static --help
usage: amd-smi static [-h] [--json | --csv] [--file FILE]
[--loglevel {DEBUG,INFO,WARNING,ERROR,CRITICAL}] [-g GPU [GPU ...]]
[-a] [-b] [-V] [-l] [-d] [-c] [-r] [-B] [-u]
If no GPU is specified, returns static information for all GPUs on the system.
If no static argument is provided, all static information will be displayed.
Static Arguments:
-h, --help show this help message and exit
-g GPU [GPU ...], --gpu GPU [GPU ...] Select a GPU ID, BDF, or UUID from the possible choices:
ID:0 | BDF:0000:23:00.0 | UUID:ffffffff-ffff-ffff-ffff-ffffffffffff
-a, --asic All asic information
-b, --bus All bus information
-V, --vbios All video bios information (if available)
-l, --limit All limit metric values (i.e. power and thermal limits)
-d, --driver Displays driver version
-c, --caps All caps information
-r, --ras Displays RAS features information
-B, --board All board information
-u, --numa All numa node information
Command Modifiers:
--json Displays output in JSON format (human readable by default).
--csv Displays output in CSV format (human readable by default).
--file FILE Saves output into a file on the provided path (stdout by default).
--loglevel {DEBUG,INFO,WARNING,ERROR,CRITICAL} Set the logging level for the parser commands
```
```bash
amd-smi bad-pages --help
usage: amd-smi bad-pages [-h] [--json | --csv] [--file FILE]
[--loglevel {DEBUG,INFO,WARNING,ERROR,CRITICAL}]
[-g GPU [GPU ...]] [-p] [-r] [-u]
If no GPU is specified, return bad page information for all GPUs on the system.
Bad Pages Arguments:
-h, --help show this help message and exit
-g GPU [GPU ...], --gpu GPU [GPU ...] Select a GPU ID, BDF, or UUID from the possible choices:
ID:0 | BDF:0000:23:00.0 | UUID:ffffffff-ffff-ffff-ffff-ffffffffffff
-p, --pending Displays all pending retired pages
-r, --retired Displays retired pages
-u, --un-res Displays unreservable pages
Command Modifiers:
--json Displays output in JSON format (human readable by default).
--csv Displays output in CSV format (human readable by default).
--file FILE Saves output into a file on the provided path (stdout by default).
--loglevel {DEBUG,INFO,WARNING,ERROR,CRITICAL} Set the logging level for the parser commands
```
```bash
amd-smi metric --help
usage: amd-smi metric [-h] [--json | --csv] [--file FILE]
[--loglevel {DEBUG,INFO,WARNING,ERROR,CRITICAL}] [-g GPU [GPU ...]]
[-w loop_time] [-W total_loop_time] [-i number_of_iterations] [-u]
[-b] [-p] [-c] [-t] [-e] [-P] [-V] [-f] [-C] [-o] [-M] [-l] [-r]
[-x] [-E] [-m]
If no GPU is specified, returns metric information for all GPUs on the system.
If no metric argument is provided all metric information will be displayed.
Metric arguments:
-h, --help show this help message and exit
-g GPU [GPU ...], --gpu GPU [GPU ...] Select a GPU ID, BDF, or UUID from the possible choices:
ID:0 | BDF:0000:23:00.0 | UUID:ffffffff-ffff-ffff-ffff-ffffffffffff
-w loop_time, --watch loop_time Reprint the command in a loop of Interval seconds
-W total_loop_time, --watch_time total_loop_time The total time to watch the given command
-i number_of_iterations, --iterations number_of_iterations Total number of iterations to loop on the given command
-u, --usage Displays engine usage information
-b, --fb-usage Total and used framebuffer
-p, --power Current power usage
-c, --clock Average, max, and current clock frequencies
-t, --temperature Current temperatures
-e, --ecc Number of ECC errors
-P, --pcie Current PCIe speed and width
-V, --voltage Current GPU voltages
-f, --fan Current fan speed
-C, --voltage-curve Display voltage curve
-o, --overdrive Current GPU clock overdrive level
-M, --mem-overdrive Current memory clock overdrive level
-l, --perf-level Current DPM performance level
-r, --replay-count PCIe replay count
-x, --xgmi-err XGMI error information since last read
-E, --energy Amount of energy consumed
-m, --mem-usage Memory usage per block
Command Modifiers:
--json Displays output in JSON format (human readable by default).
--csv Displays output in CSV format (human readable by default).
--file FILE Saves output into a file on the provided path (stdout by default).
--loglevel {DEBUG,INFO,WARNING,ERROR,CRITICAL} Set the logging level for the parser commands
```
```bash
amd-smi process --help
usage: amd-smi process [-h] [--json | --csv] [--file FILE]
[--loglevel {DEBUG,INFO,WARNING,ERROR,CRITICAL}] [-g GPU [GPU ...]]
[-w loop_time] [-W total_loop_time] [-i number_of_iterations] [-G]
[-e] [-p PID] [-n NAME]
If no GPU is specified, returns information for all GPUs on the system.
If no process argument is provided all process information will be displayed.
Process arguments:
-h, --help show this help message and exit
-g GPU [GPU ...], --gpu GPU [GPU ...] Select a GPU ID, BDF, or UUID from the possible choices:
ID:0 | BDF:0000:23:00.0 | UUID:ffffffff-ffff-ffff-ffff-ffffffffffff
-w loop_time, --watch loop_time Reprint the command in a loop of Interval seconds
-W total_loop_time, --watch_time total_loop_time The total time to watch the given command
-i number_of_iterations, --iterations number_of_iterations Total number of iterations to loop on the given command
-G, --general pid, process name, memory usage
-e, --engine All engine usages
-p PID, --pid PID Gets all process information about the specified process based on Process ID
-n NAME, --name NAME Gets all process information about the specified process based on Process Name.
If multiple processes have the same name information is returned for all of them.
Command Modifiers:
--json Displays output in JSON format (human readable by default).
--csv Displays output in CSV format (human readable by default).
--file FILE Saves output into a file on the provided path (stdout by default).
--loglevel {DEBUG,INFO,WARNING,ERROR,CRITICAL} Set the logging level for the parser commands
```
```bash
amd-smi topology --help
usage: amd-smi topology [-h] [--json | --csv] [--file FILE]
[--loglevel {DEBUG,INFO,WARNING,ERROR,CRITICAL}]
[-g GPU [GPU ...]] [-a] [-w] [-o] [-t] [-b]
If no GPU is specified, returns information for all GPUs on the system.
If no topology argument is provided all topology information will be displayed.
Topology arguments:
-h, --help show this help message and exit
-g GPU [GPU ...], --gpu GPU [GPU ...] Select a GPU ID, BDF, or UUID from the possible choices:
ID:0 | BDF:0000:23:00.0 | UUID:ffffffff-ffff-ffff-ffff-ffffffffffff
-a, --access Displays link accessibility between GPUs
-w, --weight Displays relative weight between GPUs
-o, --hops Displays the number of hops between GPUs
-t, --link-type Displays the link type between GPUs
-b, --numa-bw Display max and min bandwidth between nodes
Command Modifiers:
--json Displays output in JSON format (human readable by default).
--csv Displays output in CSV format (human readable by default).
--file FILE Saves output into a file on the provided path (stdout by default).
--loglevel {DEBUG,INFO,WARNING,ERROR,CRITICAL} Set the logging level for the parser commands
```
```bash
amd-smi set --help
usage: amd-smi set [-h] [--json | --csv] [--file FILE]
[--loglevel {DEBUG,INFO,WARNING,ERROR,CRITICAL}] -g GPU [GPU ...]
[-c CLK_TYPE [CLK_LEVELS ...]] [-s CLK_LEVELS [CLK_LEVELS ...]]
[-m CLK_LEVELS [CLK_LEVELS ...]] [-p CLK_LEVELS [CLK_LEVELS ...]]
[-S SCLKLEVEL SCLK] [-M MCLKLEVEL MCLK] [-V POINT SCLK SVOLT]
[-r SCLKMIN SCLKMAX] [-R MCLKMIN MCLKMAX] [-f %] [-l LEVEL] [-o %]
[-O %] [-w WATTS] [-P SETPROFILE] [-d SCLKMAX]
A GPU must be specified to set a configuration.
A set argument must be provided; Multiple set arguments are accepted
Set Arguments:
-h, --help show this help message and exit
-g GPU [GPU ...], --gpu GPU [GPU ...] Select a GPU ID, BDF, or UUID from the possible choices:
ID:0 | BDF:0000:23:00.0 | UUID:ffffffff-ffff-ffff-ffff-ffffffffffff
-c CLK_TYPE [CLK_LEVELS ...], --clock CLK_TYPE [CLK_LEVELS ...] Sets clock frequency levels for specified clocks
-s CLK_LEVELS [CLK_LEVELS ...], --sclk CLK_LEVELS [CLK_LEVELS ...] Sets GPU clock frequency levels
-m CLK_LEVELS [CLK_LEVELS ...], --mclk CLK_LEVELS [CLK_LEVELS ...] Sets memory clock frequency levels
-p CLK_LEVELS [CLK_LEVELS ...], --pcie CLK_LEVELS [CLK_LEVELS ...] Sets PCIe Bandwith
-S SCLKLEVEL SCLK, --slevel SCLKLEVEL SCLK Change GPU clock frequency and voltage for a specific level
-M MCLKLEVEL MCLK, --mlevel MCLKLEVEL MCLK Change GPU memory frequency and voltage for a specific level
-V POINT SCLK SVOLT, --vc POINT SCLK SVOLT Change SCLK voltage curve for a specified point
-r SCLKMIN SCLKMAX, --srange SCLKMIN SCLKMAX Sets min and max SCLK speed
-R MCLKMIN MCLKMAX, --mrange MCLKMIN MCLKMAX Sets min and max MCLK speed
-f %, --fan % Sets GPU fan speed (0-255 or 0-100%)
-l LEVEL, --perflevel LEVEL Sets performance level
-o %, --overdrive % Set GPU overdrive (0-20%) ***DEPRECATED IN NEWER KERNEL VERSIONS (use --slevel instead)***
-O %, --memoverdrive % Set memory overclock overdrive level ***DEPRECATED IN NEWER KERNEL VERSIONS (use --mlevel instead)***
-w WATTS, --poweroverdrive WATTS Set the maximum GPU power using power overdrive in Watts
-P SETPROFILE, --profile SETPROFILE Set power profile level (#) or a quoted string of custom profile attributes
-d SCLKMAX, --perfdeterminism SCLKMAX Sets GPU clock frequency limit and performance level to determinism to get minimal performance variation
Command Modifiers:
--json Displays output in JSON format (human readable by default).
--csv Displays output in CSV format (human readable by default).
--file FILE Saves output into a file on the provided path (stdout by default).
--loglevel {DEBUG,INFO,WARNING,ERROR,CRITICAL} Set the logging level for the parser commands
```
```bash
amd-smi reset --help
usage: amd-smi reset [-h] [--json | --csv] [--file FILE]
[--loglevel {DEBUG,INFO,WARNING,ERROR,CRITICAL}] -g GPU [GPU ...]
[-G] [-c] [-f] [-p] [-o] [-x] [-d]
A GPU must be specified to reset a configuration.
A reset argument must be provided; Multiple reset arguments are accepted
Reset Arguments:
-h, --help show this help message and exit
-g GPU [GPU ...], --gpu GPU [GPU ...] Select a GPU ID, BDF, or UUID from the possible choices:
ID:0 | BDF:0000:23:00.0 | UUID:ffffffff-ffff-ffff-ffff-ffffffffffff
-G, --gpureset Reset the specified GPU
-c, --clocks Reset clocks and overdrive to default
-f, --fans Reset fans to automatic (driver) control
-p, --profile Reset power profile back to default
-o, --poweroverdrive Set the maximum GPU power back to the device default state
-x, --xgmierr Reset XGMI error counts
-d, --perfdeterminism Disable performance determinism
Command Modifiers:
--json Displays output in JSON format (human readable by default).
--csv Displays output in CSV format (human readable by default).
--file FILE Saves output into a file on the provided path (stdout by default).
--loglevel {DEBUG,INFO,WARNING,ERROR,CRITICAL} Set the logging level for the parser commands
```
## Disclaimer
The information contained herein is for informational purposes only, and is subject to change without notice. While every precaution has been taken in the preparation of this document, it may contain technical inaccuracies, omissions and typographical errors, and AMD is under no obligation to update or otherwise correct this information. Advanced Micro Devices, Inc. makes no representations or warranties with respect to the accuracy or completeness of the contents of this document, and assumes no liability of any kind, including the implied warranties of noninfringement, merchantability or fitness for particular purposes, with respect to the operation or use of AMD hardware, software or other products described herein.
AMD, the AMD Arrow logo, and combinations thereof are trademarks of Advanced Micro Devices, Inc. Other product names used in this publication are for identification purposes only and may be trademarks of their respective companies.
Copyright (c) 2014-2023 Advanced Micro Devices, Inc. All rights reserved.
+1 -2
Ver fichero
@@ -37,8 +37,7 @@ def _print_error(e, destination):
f = open(destination, "w")
f.write(e)
f.close()
print("Error occured. Result written to " +
str(destination) + " file")
print("Error occured. Result written to " + str(destination) + " file")
if __name__ == "__main__":
+31 -34
Ver fichero
@@ -1306,7 +1306,7 @@ class AMDSMICommands():
def topology(self, args, multiple_devices=False, gpu=None, access=None,
weight=None, hops=None, link_type=None, numa=None, numa_bw=None):
weight=None, hops=None, link_type=None, numa_bw=None):
""" Get topology information for target gpus
The compatibility mode for this will only be in amdsmi & rocm-smi
params:
@@ -1317,7 +1317,6 @@ class AMDSMICommands():
weight (bool) - Value override for args.weight
hops (bool) - Value override for args.hops
type (bool) - Value override for args.type
numa (bool) - Value override for args.numa
numa_bw (bool) - Value override for args.numa_bw
return:
Nothing
@@ -1333,8 +1332,6 @@ class AMDSMICommands():
args.hops = hops
if link_type:
args.link_type = link_type
if numa:
args.numa = numa
if numa_bw:
args.numa_bw = numa_bw
@@ -1345,14 +1342,9 @@ class AMDSMICommands():
if not isinstance(args.gpu, list):
args.gpu = [args.gpu]
# # Handle multiple GPUs
# handled_multiple_gpus, device_handle = self.helpers.handle_gpus(args, self.logger, self.topology)
# if handled_multiple_gpus:
# return # This function is recursive
# Handle all args being false
if not any([args.access, args.weight, args.hops, args.link_type, args.numa, args.numa_bw]):
args.access = args.weight = args.hops = args.link_type = args.numa = args.numa_bw = True
if not any([args.access, args.weight, args.hops, args.link_type, args.numa_bw]):
args.access = args.weight = args.hops = args.link_type= args.numa_bw = True
# Populate the possible gpus
topo_values = []
@@ -1364,13 +1356,14 @@ class AMDSMICommands():
for src_gpu_index, src_gpu in enumerate(args.gpu):
src_gpu_links = {}
for dest_gpu in args.gpu:
dest_gpu_id = self.helpers.get_gpu_id_from_device_handle(src_gpu)
dest_gpu_id = self.helpers.get_gpu_id_from_device_handle(dest_gpu)
dest_gpu_key = f'gpu_{dest_gpu_id}'
try:
dest_gpu_link_status = amdsmi_interface.amdsmi_is_P2P_accessible(src_gpu, dest_gpu)
src_gpu_links[f'gpu_{dest_gpu_id}'] = bool(dest_gpu_link_status)
src_gpu_links[dest_gpu_key] = bool(dest_gpu_link_status)
except amdsmi_exception.AmdSmiLibraryException as e:
src_gpu_links[f'gpu_{dest_gpu_id}'] = e.get_error_info()
src_gpu_links[dest_gpu_key] = e.get_error_info()
topo_values[src_gpu_index]['link_accessibility'] = src_gpu_links
@@ -1378,17 +1371,18 @@ class AMDSMICommands():
for src_gpu_index, src_gpu in enumerate(args.gpu):
src_gpu_weight = {}
for dest_gpu in args.gpu:
dest_gpu_id = self.helpers.get_gpu_id_from_device_handle(src_gpu)
dest_gpu_id = self.helpers.get_gpu_id_from_device_handle(dest_gpu)
dest_gpu_key = f'gpu_{dest_gpu_id}'
if src_gpu == dest_gpu:
src_gpu_weight[f'gpu_{dest_gpu_id}'] = 0
src_gpu_weight[dest_gpu_key] = 0
continue
try:
dest_gpu_link_weight = amdsmi_interface.amdsmi_topo_get_link_weight(src_gpu, dest_gpu)
src_gpu_weight[f'gpu_{dest_gpu_id}'] = dest_gpu_link_weight
src_gpu_weight[dest_gpu_key] = dest_gpu_link_weight
except amdsmi_exception.AmdSmiLibraryException as e:
src_gpu_weight[f'gpu_{dest_gpu_id}'] = e.get_error_info()
src_gpu_weight[dest_gpu_key] = e.get_error_info()
topo_values[src_gpu_index]['weight'] = src_gpu_weight
@@ -1396,17 +1390,18 @@ class AMDSMICommands():
for src_gpu_index, src_gpu in enumerate(args.gpu):
src_gpu_hops = {}
for dest_gpu in args.gpu:
dest_gpu_id = self.helpers.get_gpu_id_from_device_handle(src_gpu)
dest_gpu_id = self.helpers.get_gpu_id_from_device_handle(dest_gpu)
dest_gpu_key = f'gpu_{dest_gpu_id}'
if src_gpu == dest_gpu:
src_gpu_hops[f'gpu_{dest_gpu_id}'] = 0
src_gpu_hops[dest_gpu_key] = 0
continue
try:
dest_gpu_hops = amdsmi_interface.amdsmi_topo_get_link_type(src_gpu, dest_gpu)['hops']
src_gpu_hops[f'gpu_{dest_gpu_id}'] = dest_gpu_hops
src_gpu_hops[dest_gpu_key] = dest_gpu_hops
except amdsmi_exception.AmdSmiLibraryException as e:
src_gpu_hops[f'gpu_{dest_gpu_id}'] = e.get_error_info()
src_gpu_hops[dest_gpu_key] = e.get_error_info()
topo_values[src_gpu_index]['hops'] = src_gpu_hops
@@ -1414,23 +1409,24 @@ class AMDSMICommands():
for src_gpu_index, src_gpu in enumerate(args.gpu):
src_gpu_link_type = {}
for dest_gpu in args.gpu:
dest_gpu_id = self.helpers.get_gpu_id_from_device_handle(src_gpu)
dest_gpu_id = self.helpers.get_gpu_id_from_device_handle(dest_gpu)
dest_gpu_key = f'gpu_{dest_gpu_id}'
if src_gpu == dest_gpu:
src_gpu_link_type[f'gpu_{dest_gpu_id}'] = 0
src_gpu_link_type[dest_gpu_key] = 0
continue
try:
link_type = amdsmi_interface.amdsmi_topo_get_link_type(src_gpu, dest_gpu)['type']
if isinstance(link_type, int):
if link_type == 1:
src_gpu_link_type[f'gpu_{dest_gpu_id}'] = "PCIE"
src_gpu_link_type[dest_gpu_key] = "PCIE"
elif link_type == 2:
src_gpu_link_type[f'gpu_{dest_gpu_id}'] = "XMGI"
src_gpu_link_type[dest_gpu_key] = "XMGI"
else:
src_gpu_link_type[f'gpu_{dest_gpu_id}'] = "XXXX"
src_gpu_link_type[dest_gpu_key] = "XXXX"
except amdsmi_exception.AmdSmiLibraryException as e:
src_gpu_link_type[f'gpu_{dest_gpu_id}'] = e.get_error_info()
src_gpu_link_type[dest_gpu_key] = e.get_error_info()
topo_values[src_gpu_index]['link_type'] = src_gpu_link_type
@@ -1438,10 +1434,11 @@ class AMDSMICommands():
for src_gpu_index, src_gpu in enumerate(args.gpu):
src_gpu_link_type = {}
for dest_gpu in args.gpu:
dest_gpu_id = self.helpers.get_gpu_id_from_device_handle(src_gpu)
dest_gpu_id = self.helpers.get_gpu_id_from_device_handle(dest_gpu)
dest_gpu_key = f'gpu_{dest_gpu_id}'
if src_gpu == dest_gpu:
src_gpu_link_type[f'gpu_{dest_gpu_id}'] = 'N/A'
src_gpu_link_type[dest_gpu_key] = 'N/A'
continue
try:
@@ -1449,18 +1446,18 @@ class AMDSMICommands():
if isinstance(link_type, int):
if link_type != 2:
non_xgmi = True
src_gpu_link_type[f'gpu_{dest_gpu_id}'] = 'N/A'
src_gpu_link_type[dest_gpu_key] = 'N/A'
continue
except amdsmi_exception.AmdSmiLibraryException as e:
src_gpu_link_type[f'gpu_{dest_gpu_id}'] = e.get_error_info()
src_gpu_link_type[dest_gpu_key] = e.get_error_info()
try:
min_bw = amdsmi_interface.amdsmi_get_minmax_bandwidth(src_gpu, dest_gpu)['min_bandwidth']
max_bw = amdsmi_interface.amdsmi_get_minmax_bandwidth(src_gpu, dest_gpu)['max_bandwidth']
src_gpu_link_type[f'gpu_{dest_gpu_id}'] = f'{min_bw}-{max_bw}'
src_gpu_link_type[dest_gpu_key] = f'{min_bw}-{max_bw}'
except amdsmi_exception.AmdSmiLibraryException as e:
src_gpu_link_type[f'gpu_{dest_gpu_id}'] = e.get_error_info()
src_gpu_link_type[dest_gpu_key] = e.get_error_info()
topo_values[src_gpu_index]['numa_bandwidth'] = src_gpu_link_type
+7 -5
Ver fichero
@@ -52,6 +52,10 @@ class AMDSMIParser(argparse.ArgumentParser):
version_string = f"Version: {__version__}"
platform_string = f"Platform: {self.helpers.os_info()}"
program_name = 'amd-smi'
if 'gpuv-smi' in sys.argv[0]:
program_name = 'gpuv-smi'
# Adjust argument parser options
super().__init__(
formatter_class=lambda prog: argparse.RawTextHelpFormatter(prog,
@@ -59,14 +63,14 @@ class AMDSMIParser(argparse.ArgumentParser):
width=90),
description=f"AMD System Management Interface | {version_string} | {platform_string}",
add_help=True,
prog="amd-smi")
prog=program_name)
# Setup subparsers
subparsers = self.add_subparsers(
title="AMD-SMI Commands",
parser_class=argparse.ArgumentParser,
help="Descriptions:",
metavar="")
metavar='')
# Add all subparsers
self._add_version_parser(subparsers, version)
@@ -262,7 +266,7 @@ class AMDSMIParser(argparse.ArgumentParser):
discovery_subcommand_help = "Lists all the devices on the system and the links between devices.\
\nLists all the sockets and for each socket, GPUs and/or CPUs associated to\
\nthat socket alongside some basic information for each device.\
\nIn virtualization environment, it can also list VFs associated to each\
\nIn virtualization environments, it can also list VFs associated to each\
\nGPU with some basic information for each VF."
# Create discovery subparser
@@ -576,7 +580,6 @@ class AMDSMIParser(argparse.ArgumentParser):
weight_help = "Displays relative weight between GPUs"
hops_help = "Displays the number of hops between GPUs"
link_type_help = "Displays the link type between GPUs"
numa_help = "Display the HW Topology Information for numa nodes"
numa_bw_help = "Display max and min bandwidth between nodes"
# Create topology subparser
@@ -594,7 +597,6 @@ class AMDSMIParser(argparse.ArgumentParser):
topology_parser.add_argument('-w', '--weight', action='store_true', required=False, help=weight_help)
topology_parser.add_argument('-o', '--hops', action='store_true', required=False, help=hops_help)
topology_parser.add_argument('-t', '--link-type', action='store_true', required=False, help=link_type_help)
topology_parser.add_argument('-n', '--numa', action='store_true', required=False, help=numa_help)
topology_parser.add_argument('-b', '--numa-bw', action='store_true', required=False, help=numa_bw_help)