Add ASIC and Board information (#721)

Signed-off-by: josnarlo <Joseph.Narlo@amd.com>

[ROCm/amdsmi commit: b1eeff9992]
This commit is contained in:
Narlo, Joseph
2025-10-01 17:39:26 -05:00
committed by GitHub
parent 94e6ba68b4
commit 098aa488aa
2 changed files with 72 additions and 3 deletions
@@ -19,12 +19,13 @@
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
import json
import multiprocessing
import os
import sys
import threading
import unittest
import os
# Default path for AMDSMI_CLI_PATH is "/opt/rocm/libexec/amdsmi_cli/"
amdsmi_cli_path = os.environ.get("AMDSMI_CLI_PATH", "/opt/rocm/libexec/amdsmi_cli/")
@@ -44,7 +45,39 @@ class TestAmdSmiInit(unittest.TestCase):
class TestAmdSmiPythonInterface(unittest.TestCase):
max_num_physical_devices = amdsmi.amdsmi_interface.AMDSMI_MAX_NUM_XCP * amdsmi.amdsmi_interface.AMDSMI_MAX_DEVICES
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.verbose = verbose
self.max_num_physical_devices = amdsmi.amdsmi_interface.AMDSMI_MAX_NUM_XCP * amdsmi.amdsmi_interface.AMDSMI_MAX_DEVICES
global has_info_printed
if self.verbose and has_info_printed is False:
# Execute the following to print the asic and board info once per test run
has_info_printed = True
self.setUp()
processors = amdsmi.amdsmi_get_processor_handles()
self.assertGreaterEqual(len(processors), 1)
self.assertLessEqual(len(processors), self.max_num_physical_devices)
for i in range(0, len(processors)):
try:
# Print asic info
msg = f'asic info(gpu={i})'
ret = amdsmi.amdsmi_get_gpu_asic_info(processors[i])
print(msg)
print(json.dumps(ret, sort_keys=False, indent=4), flush=True)
except amdsmi.AmdSmiLibraryException as e:
raise e
for i in range(0, len(processors)):
try:
# Print board info
msg = f'board info(gpu={i})'
ret = amdsmi.amdsmi_get_gpu_board_info(processors[i])
print(msg)
print(json.dumps(ret, sort_keys=False, indent=4), flush=True)
except amdsmi.AmdSmiLibraryException as e:
raise e
self.tearDown()
return
def _check_exception(self, e):
error_code = e.get_error_code()
@@ -1335,6 +1368,7 @@ if __name__ == '__main__':
verbose=0
elif '-v' in sys.argv or '--verbose' in sys.argv:
verbose=2
has_info_printed = False
# If no -k or --keyword argument is given, print all available tests
if not ('-k' in sys.argv or '--keyword' in sys.argv):
@@ -31,6 +31,8 @@ try:
except ImportError:
raise ImportError("Could not import /opt/rocm/libexec/amdsmi_cli/amdsmi_cli.py")
has_info_printed = False
not_supported_error_codes = ['2', '3', '49']
not_supported_error_code_names = ['AMDSMI_STATUS_NOT_SUPPORTED', 'AMDSMI_STATUS_NOT_YET_IMPLEMENTED', 'AMDSMI_STATUS_NO_HSMP_MSG_SUP']
@@ -85,6 +87,39 @@ error_map = \
class TestAmdSmiPythonBDF(unittest.TestCase):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.verbose = verbose
self.max_num_physical_devices = amdsmi.amdsmi_interface.AMDSMI_MAX_NUM_XCP * amdsmi.amdsmi_interface.AMDSMI_MAX_DEVICES
global has_info_printed
if self.verbose and has_info_printed is False:
# Execute the following to print the asic and board info once per test run
has_info_printed = True
self.setUp()
processors = amdsmi.amdsmi_get_processor_handles()
self.assertGreaterEqual(len(processors), 1)
self.assertLessEqual(len(processors), self.max_num_physical_devices)
for i in range(0, len(processors)):
try:
# Print asic info
msg = f'asic info(gpu={i})'
ret = amdsmi.amdsmi_get_gpu_asic_info(processors[i])
self._print(msg, ret)
except amdsmi.AmdSmiLibraryException as e:
raise e
for i in range(0, len(processors)):
try:
# Print board info
msg = f'board info(gpu={i})'
ret = amdsmi.amdsmi_get_gpu_board_info(processors[i])
self._print(msg, ret)
except amdsmi.AmdSmiLibraryException as e:
raise e
self.tearDown()
return
valid_bdfs = {
"00:00.0": [0, 0, 0, 0],
"01:01.1": [0, 1, 1, 1],
@@ -452,7 +487,7 @@ class TestAmdSmiPythonBDF(unittest.TestCase):
self.assertEqual(None, result)
@classmethod
def _print(self, msg, data=None, cond=None):
def _print(self, msg, data=None):
if verbose == 2:
if not data:
print(msg, flush=True)