#!/usr/bin/env python3 # # Copyright (C) Advanced Micro Devices. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy of # this software and associated documentation files (the "Software"), to deal in # the Software without restriction, including without limitation the rights to # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of # the Software, and to permit persons to whom the Software is furnished to do so, # subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS # FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. import ctypes import inspect import json import os import sys import unittest amdsmi_path = os.environ.get("AMDSMI_PATH", "/opt/rocm/share/amd_smi") if not os.path.exists(amdsmi_path): raise FileNotFoundError(f"AMDSMI_PATH '{amdsmi_path}' does not exist. Please set the correct path in your environment.") sys.path.append(amdsmi_path) try: import amdsmi except ImportError as exc: raise ImportError(f'Could not import {amdsmi_path}') from exc not_supported_error_codes = \ [ ( '2', 'AMDSMI_STATUS_NOT_SUPPORTED'), ( '3', 'AMDSMI_STATUS_NOT_YET_IMPLEMENTED'), ('49', 'AMDSMI_STATUS_NO_HSMP_MSG_SUP') ] error_map = \ { '0': 'AMDSMI_STATUS_SUCCESS', '1': 'AMDSMI_STATUS_INVAL', '2': 'AMDSMI_STATUS_NOT_SUPPORTED', '3': 'AMDSMI_STATUS_NOT_YET_IMPLEMENTED', '4': 'AMDSMI_STATUS_FAIL_LOAD_MODULE', '5': 'AMDSMI_STATUS_FAIL_LOAD_SYMBOL', '6': 'AMDSMI_STATUS_DRM_ERROR', '7': 'AMDSMI_STATUS_API_FAILED', '8': 'AMDSMI_STATUS_TIMEOUT', '9': 'AMDSMI_STATUS_RETRY', '10': 'AMDSMI_STATUS_NO_PERM', '11': 'AMDSMI_STATUS_INTERRUPT', '12': 'AMDSMI_STATUS_IO', '13': 'AMDSMI_STATUS_ADDRESS_FAULT', '14': 'AMDSMI_STATUS_FILE_ERROR', '15': 'AMDSMI_STATUS_OUT_OF_RESOURCES', '16': 'AMDSMI_STATUS_INTERNAL_EXCEPTION', '17': 'AMDSMI_STATUS_INPUT_OUT_OF_BOUNDS', '18': 'AMDSMI_STATUS_INIT_ERROR', '19': 'AMDSMI_STATUS_REFCOUNT_OVERFLOW', '30': 'AMDSMI_STATUS_BUSY', '31': 'AMDSMI_STATUS_NOT_FOUND', '32': 'AMDSMI_STATUS_NOT_INIT', '33': 'AMDSMI_STATUS_NO_SLOT', '34': 'AMDSMI_STATUS_DRIVER_NOT_LOADED', '39': 'AMDSMI_STATUS_MORE_DATA', '40': 'AMDSMI_STATUS_NO_DATA', '41': 'AMDSMI_STATUS_INSUFFICIENT_SIZE', '42': 'AMDSMI_STATUS_UNEXPECTED_SIZE', '43': 'AMDSMI_STATUS_UNEXPECTED_DATA', '44': 'AMDSMI_STATUS_NON_AMD_CPU', '45': 'AMDSMI_STATUS_NO_ENERGY_DRV', '46': 'AMDSMI_STATUS_NO_MSR_DRV', '47': 'AMDSMI_STATUS_NO_HSMP_DRV', '48': 'AMDSMI_STATUS_NO_HSMP_SUP', '49': 'AMDSMI_STATUS_NO_HSMP_MSG_SUP', '50': 'AMDSMI_STATUS_HSMP_TIMEOUT', '51': 'AMDSMI_STATUS_NO_DRV', '52': 'AMDSMI_STATUS_FILE_NOT_FOUND', '53': 'AMDSMI_STATUS_ARG_PTR_NULL', '54': 'AMDSMI_STATUS_AMDGPU_RESTART_ERR', '55': 'AMDSMI_STATUS_SETTING_UNAVAILABLE', '56': 'AMDSMI_STATUS_CORRUPTED_EEPROM', '0xFFFFFFFE': 'AMDSMI_STATUS_MAP_ERROR', '0xFFFFFFFF': 'AMDSMI_STATUS_UNKNOWN_ERROR' } verbose=1 if '-q' in sys.argv or '--quiet' in sys.argv: verbose=0 elif '-v' in sys.argv or '--verbose' in sys.argv: verbose=2 has_info_printed = False class TestAmdSmiPythonBDF(unittest.TestCase): valid_bdfs = { "00:00.0": [0, 0, 0, 0], "01:01.1": [0, 1, 1, 1], "FF:1F.7": [0, 255, 31, 7], "FF:00.7": [0, 255, 0, 7], "11:01.2": [0, 17, 1, 2], "11:0a.2": [0, 17, 10, 2], "0000:FF:1F.7": [0, 255, 31, 7], "0001:ff:1F.7": [1, 255, 31, 7], "ffff:FF:1f.7": [65535, 255, 31, 7], } invalid_bdfs = { # invalid bdf strings, expect None None: None, "": None, "00:00:0": None, "00.00:0": None, "00:00.Z": None, "00:0Z.0": None, "0Z:00.0": None, "Z00:00.0": None, "A00:00.0": None, "0A00:00.0": None, "00:00.07": None, "00:00.8": None, "00:00.10": None, "00:00.11": None, "00:00.-1": None, "00:00.*-1": None, "00:00.123": None, "00:20.0": None, "00:45.0": None, "00:200.0": None, "00:002.0": None, "100:00.0": None, "0100:00.0": None, "00100:00.0": None, "0101:00.0": None, "00001:00.0": None, "10001:00.0": None, "45:0.0": None, ".00:00.0": None, "00.00.0": None, "00.0.0": None, "0.00.0": None, "000.00.0": None, "00 00 0": None, " 00:00.0": None, "00:00.0 ": None, "0000:00.00.0": None, "000:00:00.0": None, "00:00:00.1": None, "0:00:00.1": None, "0000 00 00 0": None, "-1-1:00:00.0": None, "AAAA:00:AA.0": None, "*1*1:00:00.0": None, "0000:00:00.07": None, "0000:00:00.8": None, "0000:00:00.10": None, "0000:00:00.11": None, "0000:00:00.-1": None, "0000:00:00.*-1": None, "0000:00:00.123": None, "0000:00:20.0": None, "0000:00:45.0": None, "0000:00:200.0": None, "0000:00:002.0": None, "0000:100:00.0": None, "0000:0100:00.0": None, "0000:00100:00.0": None, "0000:0101:00.0": None, "0000:00001:00.0": None, "0000:10001:00.0": None, "0000:45:0.0": None, ".0000.00:00.0": None, "0000.00.0.0": None, " 0000:00:00.0": None, "0000:00:00.0 ": None, } def test_parse_bdf(self): # go through all bdfs expectations = self.valid_bdfs.copy() expectations.update(self.invalid_bdfs) for bdf in expectations: expected = expectations[bdf] result = amdsmi.amdsmi_interface._parse_bdf(bdf) self.assertEqual(result, expected, "Expected {} for bdf {}, but got {}".format( expected, bdf, result)) @classmethod def _convert_bdf_to_long(cls, bdf): if len(bdf) == 12: return bdf if len(bdf) == 7: return "0000:" + bdf return None def test_format_bdf(self): # go through valid bdfs expectations = self.valid_bdfs.copy() for bdf_string in expectations: # use key as result and value as input bdf_list = expectations[bdf_string] smi_bdf = amdsmi.amdsmi_interface._make_amdsmi_bdf_from_list(bdf_list) expected = TestAmdSmiPythonBDF._convert_bdf_to_long(bdf_string) if expected: expected = expected.lower() if smi_bdf: result = amdsmi.amdsmi_interface._format_bdf(smi_bdf) else: result = "None" self.assertEqual(result, expected, "Expected {} for bdf {}, but got {}".format( expected, bdf_string, result)) def test_check_res(self): # expect retry error to raise SmiRetryException with self.assertRaises(amdsmi.AmdSmiRetryException) as retry_test: amdsmi.amdsmi_interface._check_res( (lambda: amdsmi.amdsmi_wrapper.AMDSMI_STATUS_RETRY)()) # except retry error to have AMDSMI_STATUS_RETRY error code self.assertEqual(retry_test.exception.get_error_code(), amdsmi.amdsmi_wrapper.AMDSMI_STATUS_RETRY) # expect timeout error to raise SmiTimeoutException with self.assertRaises(amdsmi.AmdSmiTimeoutException) as timeout_test: amdsmi.amdsmi_interface._check_res( (lambda: amdsmi.amdsmi_wrapper.AMDSMI_STATUS_TIMEOUT)()) # except timeout error to have AMDSMI_STATUS_RETRY error code self.assertEqual(timeout_test.exception.get_error_code(), amdsmi.amdsmi_wrapper.AMDSMI_STATUS_TIMEOUT) # expect invalid args error to raise AmdSmiLibraryException with self.assertRaises(amdsmi.AmdSmiLibraryException) as inval_test: amdsmi.amdsmi_interface._check_res( (lambda: amdsmi.amdsmi_wrapper.AMDSMI_STATUS_INVAL)()) # expect invalid args error to have AMDSMI_STATUS_INVAL error code self.assertEqual(inval_test.exception.get_error_code(), amdsmi.amdsmi_wrapper.AMDSMI_STATUS_INVAL) class TestAmdSmiPython(unittest.TestCase): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) global has_info_printed if verbose and has_info_printed is False: # Execute the following to print the asic and board info once # per test run has_info_printed = True self.setUp() for i, gpu in enumerate(self.processors): try: # Print asic info msg = f'asic info(gpu={i})' ret = amdsmi.amdsmi_get_gpu_asic_info(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: raise e for i, gpu in enumerate(self.processors): try: # Print board info msg = f'board info(gpu={i})' ret = amdsmi.amdsmi_get_gpu_board_info(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: raise e self.tearDown() PASS = 'AMDSMI_STATUS_SUCCESS' FAIL = 'AMDSMI_STATUS_INVAL' max_num_physical_devices = amdsmi.amdsmi_interface.AMDSMI_MAX_NUM_XCP * amdsmi.amdsmi_interface.AMDSMI_MAX_DEVICES # Tests marked wtih either of these flags will be skipped # and need to be implemented later. TODO_SKIP_FAIL = True TODO_SKIP_NOT_COMPLETE = True status_types = \ [ ('SUCCESS', amdsmi.AmdSmiStatus.SUCCESS, PASS), ('INVAL', amdsmi.AmdSmiStatus.INVAL, PASS), ('NOT_SUPPORTED', amdsmi.AmdSmiStatus.NOT_SUPPORTED, PASS), ('NOT_YET_IMPLEMENTED', amdsmi.AmdSmiStatus.NOT_YET_IMPLEMENTED, PASS), ('FAIL_LOAD_MODULE', amdsmi.AmdSmiStatus.FAIL_LOAD_MODULE, PASS), ('FAIL_LOAD_SYMBOL', amdsmi.AmdSmiStatus.FAIL_LOAD_SYMBOL, PASS), ('DRM_ERROR', amdsmi.AmdSmiStatus.DRM_ERROR, PASS), ('API_FAILED', amdsmi.AmdSmiStatus.API_FAILED, PASS), ('TIMEOUT', amdsmi.AmdSmiStatus.TIMEOUT, PASS), ('RETRY', amdsmi.AmdSmiStatus.RETRY, PASS), ('NO_PERM', amdsmi.AmdSmiStatus.NO_PERM, PASS), ('INTERRUPT', amdsmi.AmdSmiStatus.INTERRUPT, PASS), ('IO', amdsmi.AmdSmiStatus.IO, PASS), ('ADDRESS_FAULT', amdsmi.AmdSmiStatus.ADDRESS_FAULT, PASS), ('FILE_ERROR', amdsmi.AmdSmiStatus.FILE_ERROR, PASS), ('OUT_OF_RESOURCES', amdsmi.AmdSmiStatus.OUT_OF_RESOURCES, PASS), ('INTERNAL_EXCEPTION', amdsmi.AmdSmiStatus.INTERNAL_EXCEPTION, PASS), ('INPUT_OUT_OF_BOUNDS', amdsmi.AmdSmiStatus.INPUT_OUT_OF_BOUNDS, PASS), ('INIT_ERROR', amdsmi.AmdSmiStatus.INIT_ERROR, PASS), ('REFCOUNT_OVERFLOW', amdsmi.AmdSmiStatus.REFCOUNT_OVERFLOW, PASS), ('DIRECTORY_NOT_FOUND', amdsmi.AmdSmiStatus.DIRECTORY_NOT_FOUND, PASS), ('BUSY', amdsmi.AmdSmiStatus.BUSY, PASS), ('NOT_FOUND', amdsmi.AmdSmiStatus.NOT_FOUND, PASS), ('NOT_INIT', amdsmi.AmdSmiStatus.NOT_INIT, PASS), ('NO_SLOT', amdsmi.AmdSmiStatus.NO_SLOT, PASS), ('DRIVER_NOT_LOADED', amdsmi.AmdSmiStatus.DRIVER_NOT_LOADED, PASS), ('MORE_DATA', amdsmi.AmdSmiStatus.MORE_DATA, PASS), ('NO_DATA', amdsmi.AmdSmiStatus.NO_DATA, PASS), ('INSUFFICIENT_SIZE', amdsmi.AmdSmiStatus.INSUFFICIENT_SIZE, PASS), ('UNEXPECTED_SIZE', amdsmi.AmdSmiStatus.UNEXPECTED_SIZE, PASS), ('UNEXPECTED_DATA', amdsmi.AmdSmiStatus.UNEXPECTED_DATA, PASS), ('NON_AMD_CPU', amdsmi.AmdSmiStatus.NON_AMD_CPU, PASS), ('NO_ENERGY_DRV', amdsmi.AmdSmiStatus.NO_ENERGY_DRV, PASS), ('NO_MSR_DRV', amdsmi.AmdSmiStatus.NO_MSR_DRV, PASS), ('NO_HSMP_DRV', amdsmi.AmdSmiStatus.NO_HSMP_DRV, PASS), ('NO_HSMP_SUP', amdsmi.AmdSmiStatus.NO_HSMP_SUP, PASS), ('NO_HSMP_MSG_SUP', amdsmi.AmdSmiStatus.NO_HSMP_MSG_SUP, PASS), ('HSMP_TIMEOUT', amdsmi.AmdSmiStatus.HSMP_TIMEOUT, PASS), ('NO_DRV', amdsmi.AmdSmiStatus.NO_DRV, PASS), ('FILE_NOT_FOUND', amdsmi.AmdSmiStatus.FILE_NOT_FOUND, PASS), ('ARG_PTR_NULL', amdsmi.AmdSmiStatus.ARG_PTR_NULL, PASS), ('AMDGPU_RESTART_ERR', amdsmi.AmdSmiStatus.AMDGPU_RESTART_ERR, PASS), ('SETTING_UNAVAILABLE', amdsmi.AmdSmiStatus.SETTING_UNAVAILABLE, PASS), ('CORRUPTED_EEPROM', amdsmi.AmdSmiStatus.CORRUPTED_EEPROM, PASS), ('MAP_ERROR', amdsmi.AmdSmiStatus.MAP_ERROR, PASS), ('UNKNOWN_ERROR', amdsmi.AmdSmiStatus.UNKNOWN_ERROR, PASS) ] clk_types = \ [ ('SYS', amdsmi.AmdSmiClkType.SYS, PASS), ('GFX', amdsmi.AmdSmiClkType.GFX, PASS), ('DF', amdsmi.AmdSmiClkType.DF, PASS), ('DCEF', amdsmi.AmdSmiClkType.DCEF, [PASS, FAIL]), ('SOC', amdsmi.AmdSmiClkType.SOC, PASS), ('MEM', amdsmi.AmdSmiClkType.MEM, PASS), ('PCIE', amdsmi.AmdSmiClkType.PCIE, [PASS, FAIL]), ('VCLK0', amdsmi.AmdSmiClkType.VCLK0, PASS), ('VCLK1', amdsmi.AmdSmiClkType.VCLK1, PASS), ('DCLK0', amdsmi.AmdSmiClkType.DCLK0, PASS), ('DCLK1', amdsmi.AmdSmiClkType.DCLK1, PASS) ] clk_limit_types = \ [ ('MIN', amdsmi.AmdSmiClkLimitType.MIN, PASS), ('MAX', amdsmi.AmdSmiClkLimitType.MAX, PASS) ] io_bw_encodings = \ [ ('AGG_BW0', amdsmi.amdsmi_wrapper.AGG_BW0, PASS), ('RD_BW0', amdsmi.amdsmi_wrapper.RD_BW0, PASS), ('WR_BW0', amdsmi.amdsmi_wrapper.WR_BW0, PASS) ] event_groups = \ [ ('XGMI', amdsmi.AmdSmiEventGroup.XGMI, PASS), ('XGMI_DATA_OUT', amdsmi.AmdSmiEventGroup.XGMI_DATA_OUT, PASS), ('GRP_INVALID', amdsmi.AmdSmiEventGroup.GRP_INVALID, FAIL) ] gpu_blocks = \ [ ('INVALID', amdsmi.AmdSmiGpuBlock.INVALID, FAIL), ('UMC', amdsmi.AmdSmiGpuBlock.UMC, PASS), ('SDMA', amdsmi.AmdSmiGpuBlock.SDMA, PASS), ('GFX', amdsmi.AmdSmiGpuBlock.GFX, PASS), ('MMHUB', amdsmi.AmdSmiGpuBlock.MMHUB, PASS), ('ATHUB', amdsmi.AmdSmiGpuBlock.ATHUB, PASS), ('PCIE_BIF', amdsmi.AmdSmiGpuBlock.PCIE_BIF, PASS), ('HDP', amdsmi.AmdSmiGpuBlock.HDP, PASS), ('XGMI_WAFL', amdsmi.AmdSmiGpuBlock.XGMI_WAFL, PASS), ('DF', amdsmi.AmdSmiGpuBlock.DF, PASS), ('SMN', amdsmi.AmdSmiGpuBlock.SMN, PASS), ('SEM', amdsmi.AmdSmiGpuBlock.SEM, PASS), ('MP0', amdsmi.AmdSmiGpuBlock.MP0, PASS), ('MP1', amdsmi.AmdSmiGpuBlock.MP1, PASS), ('FUSE', amdsmi.AmdSmiGpuBlock.FUSE, PASS), ('MCA', amdsmi.AmdSmiGpuBlock.MCA, PASS), ('VCN', amdsmi.AmdSmiGpuBlock.VCN, PASS), ('JPEG', amdsmi.AmdSmiGpuBlock.JPEG, PASS), ('IH', amdsmi.AmdSmiGpuBlock.IH, PASS), ('MPIO', amdsmi.AmdSmiGpuBlock.MPIO, PASS), ('RESERVED', amdsmi.AmdSmiGpuBlock.RESERVED, FAIL) ] memory_types = \ [ ('VRAM', amdsmi.AmdSmiMemoryType.VRAM, PASS), ('VIS_VRAM', amdsmi.AmdSmiMemoryType.VIS_VRAM, PASS), ('GTT', amdsmi.AmdSmiMemoryType.GTT, PASS) ] processor_types = \ [ ('UNKNOWN', amdsmi.AmdSmiProcessorType.UNKNOWN, FAIL), ('AMD_GPU', amdsmi.AmdSmiProcessorType.AMD_GPU, PASS), ('AMD_CPU', amdsmi.AmdSmiProcessorType.AMD_CPU, PASS), ('NON_AMD_GPU', amdsmi.AmdSmiProcessorType.NON_AMD_GPU, PASS), ('NON_AMD_CPU', amdsmi.AmdSmiProcessorType.NON_AMD_CPU, PASS), ('AMD_CPU_CORE', amdsmi.AmdSmiProcessorType.AMD_CPU_CORE, PASS), ('AMD_APU', amdsmi.AmdSmiProcessorType.AMD_APU, PASS) ] reg_types = \ [ ('XGMI', amdsmi.AmdSmiRegType.XGMI, PASS), ('WAFL', amdsmi.AmdSmiRegType.WAFL, PASS), ('PCIE', amdsmi.AmdSmiRegType.PCIE, PASS), ('USR', amdsmi.AmdSmiRegType.USR, PASS), ('USR1', amdsmi.AmdSmiRegType.USR1, PASS) ] voltage_metrics = \ [ ('CURRENT', amdsmi.AmdSmiVoltageMetric.CURRENT, PASS), ('MAX', amdsmi.AmdSmiVoltageMetric.MAX, PASS), ('MIN_CRIT', amdsmi.AmdSmiVoltageMetric.MIN_CRIT, PASS), ('MIN', amdsmi.AmdSmiVoltageMetric.MIN, PASS), ('MAX_CRIT', amdsmi.AmdSmiVoltageMetric.MAX_CRIT, PASS), ('AVERAGE', amdsmi.AmdSmiVoltageMetric.AVERAGE, PASS), ('LOWEST', amdsmi.AmdSmiVoltageMetric.LOWEST, PASS), ('HIGHEST', amdsmi.AmdSmiVoltageMetric.HIGHEST, PASS) ] voltage_types = \ [ ('VDDGFX', amdsmi.AmdSmiVoltageType.VDDGFX, PASS), ('INVALID', amdsmi.AmdSmiVoltageType.INVALID, FAIL) ] link_types = \ [ ('AMDSMI_LINK_TYPE_INTERNAL', amdsmi.AmdSmiLinkType.AMDSMI_LINK_TYPE_INTERNAL, PASS), ('AMDSMI_LINK_TYPE_XGMI', amdsmi.AmdSmiLinkType.AMDSMI_LINK_TYPE_XGMI, PASS), ('AMDSMI_LINK_TYPE_PCIE', amdsmi.AmdSmiLinkType.AMDSMI_LINK_TYPE_PCIE, PASS), ('AMDSMI_LINK_TYPE_NOT_APPLICABLE', amdsmi.AmdSmiLinkType.AMDSMI_LINK_TYPE_NOT_APPLICABLE, FAIL), ('AMDSMI_LINK_TYPE_UNKNOWN', amdsmi.AmdSmiLinkType.AMDSMI_LINK_TYPE_UNKNOWN, FAIL) ] temperature_types = \ [ ('EDGE', amdsmi.AmdSmiTemperatureType.EDGE, PASS), ('HOTSPOT', amdsmi.AmdSmiTemperatureType.HOTSPOT, PASS), ('JUNCTION', amdsmi.AmdSmiTemperatureType.JUNCTION, PASS), ('VRAM', amdsmi.AmdSmiTemperatureType.VRAM, PASS), ('HBM_0', amdsmi.AmdSmiTemperatureType.HBM_0, PASS), ('HBM_1', amdsmi.AmdSmiTemperatureType.HBM_1, PASS), ('HBM_2', amdsmi.AmdSmiTemperatureType.HBM_2, PASS), ('HBM_3', amdsmi.AmdSmiTemperatureType.HBM_3, PASS), ('PLX', amdsmi.AmdSmiTemperatureType.PLX, PASS) ] temperature_metrics = \ [ ('CURRENT', amdsmi.AmdSmiTemperatureMetric.CURRENT, PASS), ('MAX', amdsmi.AmdSmiTemperatureMetric.MAX, PASS), ('MIN', amdsmi.AmdSmiTemperatureMetric.MIN, PASS), ('MAX_HYST', amdsmi.AmdSmiTemperatureMetric.MAX_HYST, PASS), ('MIN_HYST', amdsmi.AmdSmiTemperatureMetric.MIN_HYST, PASS), ('CRITICAL', amdsmi.AmdSmiTemperatureMetric.CRITICAL, PASS), ('CRITICAL_HYST', amdsmi.AmdSmiTemperatureMetric.CRITICAL_HYST, PASS), ('EMERGENCY', amdsmi.AmdSmiTemperatureMetric.EMERGENCY, PASS), ('EMERGENCY_HYST', amdsmi.AmdSmiTemperatureMetric.EMERGENCY_HYST, PASS), ('CRIT_MIN', amdsmi.AmdSmiTemperatureMetric.CRIT_MIN, PASS), ('CRIT_MIN_HYST', amdsmi.AmdSmiTemperatureMetric.CRIT_MIN_HYST, PASS), ('OFFSET', amdsmi.AmdSmiTemperatureMetric.OFFSET, PASS), ('LOWEST', amdsmi.AmdSmiTemperatureMetric.LOWEST, PASS), ('HIGHEST', amdsmi.AmdSmiTemperatureMetric.HIGHEST, PASS) ] utilization_counter_types = \ [ ('COARSE_GRAIN_GFX_ACTIVITY', amdsmi.AmdSmiUtilizationCounterType.COARSE_GRAIN_GFX_ACTIVITY, PASS), ('COARSE_GRAIN_MEM_ACTIVITY', amdsmi.AmdSmiUtilizationCounterType.COARSE_GRAIN_MEM_ACTIVITY, PASS), ('COARSE_DECODER_ACTIVITY', amdsmi.AmdSmiUtilizationCounterType.COARSE_DECODER_ACTIVITY, PASS), ('FINE_GRAIN_GFX_ACTIVITY', amdsmi.AmdSmiUtilizationCounterType.FINE_GRAIN_GFX_ACTIVITY, PASS), ('FINE_GRAIN_MEM_ACTIVITY', amdsmi.AmdSmiUtilizationCounterType.FINE_GRAIN_MEM_ACTIVITY, PASS), ('FINE_DECODER_ACTIVITY', amdsmi.AmdSmiUtilizationCounterType.FINE_DECODER_ACTIVITY, PASS), ('UTILIZATION_COUNTER_FIRST', amdsmi.AmdSmiUtilizationCounterType.UTILIZATION_COUNTER_FIRST, PASS), ('UTILIZATION_COUNTER_LAST', amdsmi.AmdSmiUtilizationCounterType.UTILIZATION_COUNTER_LAST, PASS) ] event_types = \ [ ('XGMI_0_NOP_TX', amdsmi.AmdSmiEventType.XGMI_0_NOP_TX, PASS), ('XGMI_0_REQUEST_TX', amdsmi.AmdSmiEventType.XGMI_0_REQUEST_TX, PASS), ('XGMI_0_RESPONSE_TX', amdsmi.AmdSmiEventType.XGMI_0_RESPONSE_TX, PASS), ('XGMI_0_BEATS_TX', amdsmi.AmdSmiEventType.XGMI_0_BEATS_TX, PASS), ('XGMI_1_NOP_TX', amdsmi.AmdSmiEventType.XGMI_1_NOP_TX, PASS), ('XGMI_1_REQUEST_TX', amdsmi.AmdSmiEventType.XGMI_1_REQUEST_TX, PASS), ('XGMI_1_RESPONSE_TX', amdsmi.AmdSmiEventType.XGMI_1_RESPONSE_TX, PASS), ('XGMI_1_BEATS_TX', amdsmi.AmdSmiEventType.XGMI_1_BEATS_TX, PASS), ('XGMI_DATA_OUT_0', amdsmi.AmdSmiEventType.XGMI_DATA_OUT_0, PASS), ('XGMI_DATA_OUT_1', amdsmi.AmdSmiEventType.XGMI_DATA_OUT_1, PASS), ('XGMI_DATA_OUT_2', amdsmi.AmdSmiEventType.XGMI_DATA_OUT_2, PASS), ('XGMI_DATA_OUT_3', amdsmi.AmdSmiEventType.XGMI_DATA_OUT_3, PASS), ('XGMI_DATA_OUT_4', amdsmi.AmdSmiEventType.XGMI_DATA_OUT_4, PASS), ('XGMI_DATA_OUT_5', amdsmi.AmdSmiEventType.XGMI_DATA_OUT_5, PASS) ] counter_commands = \ [ ('CMD_START', amdsmi.AmdSmiCounterCommand.CMD_START, PASS), ('CMD_STOP', amdsmi.AmdSmiCounterCommand.CMD_STOP, PASS) ] compute_partition_types = \ [ ('SPX', amdsmi.AmdSmiComputePartitionType.SPX, PASS), ('DPX', amdsmi.AmdSmiComputePartitionType.DPX, PASS), ('TPX', amdsmi.AmdSmiComputePartitionType.TPX, PASS), ('QPX', amdsmi.AmdSmiComputePartitionType.QPX, PASS), ('CPX', amdsmi.AmdSmiComputePartitionType.CPX, PASS), ('INVALID', amdsmi.AmdSmiComputePartitionType.INVALID, FAIL) ] memory_partition_types = \ [ ('NPS1', amdsmi.AmdSmiMemoryPartitionType.NPS1, PASS), ('NPS2', amdsmi.AmdSmiMemoryPartitionType.NPS2, PASS), ('NPS4', amdsmi.AmdSmiMemoryPartitionType.NPS4, PASS), ('NPS8', amdsmi.AmdSmiMemoryPartitionType.NPS8, PASS), ('UNKNOWN', amdsmi.AmdSmiMemoryPartitionType.UNKNOWN, FAIL) ] freq_inds = \ [ ('MIN', amdsmi.AmdSmiFreqInd.MIN, PASS), ('MAX', amdsmi.AmdSmiFreqInd.MAX, PASS), ('INVALID', amdsmi.AmdSmiFreqInd.INVALID, FAIL) ] dev_perf_levels = \ [ ('AUTO', amdsmi.AmdSmiDevPerfLevel.AUTO, PASS), ('LOW', amdsmi.AmdSmiDevPerfLevel.LOW, PASS), ('HIGH', amdsmi.AmdSmiDevPerfLevel.HIGH, PASS), ('MANUAL', amdsmi.AmdSmiDevPerfLevel.MANUAL, PASS), ('STABLE_STD', amdsmi.AmdSmiDevPerfLevel.STABLE_STD, PASS), ('STABLE_PEAK', amdsmi.AmdSmiDevPerfLevel.STABLE_PEAK, PASS), ('STABLE_MIN_MCLK', amdsmi.AmdSmiDevPerfLevel.STABLE_MIN_MCLK, PASS), ('STABLE_MIN_SCLK', amdsmi.AmdSmiDevPerfLevel.STABLE_MIN_SCLK, PASS), ('DETERMINISM', amdsmi.AmdSmiDevPerfLevel.DETERMINISM, PASS), ('UNKNOWN', amdsmi.AmdSmiDevPerfLevel.UNKNOWN, FAIL) ] power_profile_preset_masks = \ [ ('CUSTOM_MASK', amdsmi.AmdSmiPowerProfilePresetMasks.CUSTOM_MASK, PASS), ('VIDEO_MASK', amdsmi.AmdSmiPowerProfilePresetMasks.VIDEO_MASK, PASS), ('POWER_SAVING_MASK', amdsmi.AmdSmiPowerProfilePresetMasks.POWER_SAVING_MASK, PASS), ('COMPUTE_MASK', amdsmi.AmdSmiPowerProfilePresetMasks.COMPUTE_MASK, PASS), ('VR_MASK', amdsmi.AmdSmiPowerProfilePresetMasks.VR_MASK, PASS), ('THREE_D_FULL_SCR_MASK', amdsmi.AmdSmiPowerProfilePresetMasks.THREE_D_FULL_SCR_MASK, PASS), ('BOOTUP_DEFAULT', amdsmi.AmdSmiPowerProfilePresetMasks.BOOTUP_DEFAULT, PASS) ] def _print(self, msg, data=None): if verbose == 2: if data is None: print(msg, flush=True) elif any(data in value for value in not_supported_error_codes): print(f'{msg} {data}', flush=True) else: if isinstance(data, str) and data in error_map.values(): print(msg, end='') else: print(msg) if isinstance(data, dict) or isinstance(data, list): print(json.dumps(data, sort_keys=False, indent=4), flush=True) else: print(data) return def _print_func_name(self, msg): if verbose == 2: stk = inspect.stack() if stk[1].function == '_callSetUp': return print(msg, flush=True) print(f'## {stk[1].function}()', flush=True) return def get_error_code(self, e): error_code = e.get_error_code() return error_map[error_code] def _check_ret(self, msg, _e, expected_code=None, printit=True): error_code_int = int(_e.get_error_code()) error_code = str(error_code_int) if error_code in error_map: error_code_name = error_map[error_code] else: error_code_name = 'UNKNOWN_ERROR' # Check for when there are multiple passing conditions if isinstance(expected_code, list): for ec in expected_code: rc = self._check_ret(msg, _e, ec, False) # Do not print msg, otherwise multiple msgs printed if not rc: rc = self._check_ret(msg, _e, ec) # Call check again so msg is printed return rc # No expected results found print(f'{msg}\nTest FAILED with expected results {expected_code} but received {error_code_name}', flush=True) return True # Check for single passing condition if any(error_code in value for value in not_supported_error_codes): if verbose == 2 and printit: print(f'{msg}\nTest SKIPPED with result {error_code_name}', flush=True) elif error_code_name == expected_code: if verbose == 2 and printit: print(f'{msg}\nTest PASSED with expected result {expected_code}', flush=True) else: if verbose == 2 and printit: print(f'{msg}\nTest FAILED with expected result {expected_code} but received {error_code_name}', flush=True) return True return False def setUp(self): # Called before each test by unittest framework self.raise_exception = None amdsmi.amdsmi_init() self.processors = amdsmi.amdsmi_get_processor_handles() self.assertGreaterEqual(len(self.processors), 1) self.assertLessEqual(len(self.processors), self.max_num_physical_devices) return def tearDown(self): # Called after each test by unittest framework amdsmi.amdsmi_shut_down() return def test_clean_gpu_local_data(self): self._print_func_name('') for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: amdsmi.amdsmi_clean_gpu_local_data(gpu) self._print(msg, '') except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_cpu_apb_disable(self): self._print_func_name('') pstate = 0 for i, gpu in enumerate(self.processors): msg = f'### amdsmi_cpu_apb_disable(gpu={i}, pstate={pstate}):' try: amdsmi.amdsmi_cpu_apb_disable(gpu, pstate) self._print(msg, '') except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_cpu_apb_enable(self): self._print_func_name('') for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: amdsmi.amdsmi_cpu_apb_enable(gpu) self._print(msg, '') except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_first_online_core_on_cpu_socket(self): self._print_func_name('') if self.TODO_SKIP_FAIL: self.skipTest("Skipping test_first_online_core_on_cpu_socket as it fails (IO Error).") for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_first_online_core_on_cpu_socket(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_clk_freq(self): self._print_func_name('') if self.TODO_SKIP_FAIL: self.skipTest("Skipping test_get_clk_freq as it fails (MI350X, AMDSMI_STATUS_UNEXPECTED_DATA).") for i, gpu in enumerate(self.processors): for clk_type_name, clk_type, clk_cond in self.clk_types: msg = f'gpu({i}): Clock Type({clk_type_name}):' try: ret = amdsmi.amdsmi_get_clk_freq(gpu, clk_type) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, clk_cond): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_clock_info(self): self._print_func_name('') if self.TODO_SKIP_FAIL: self.skipTest("Skipping test_get_clock_info as it fails (MI350X, AMDSMI_STATUS_UNEXPECTED_DATA).") for i, gpu in enumerate(self.processors): for clk_type_name, clk_type, clk_cond in self.clk_types: msg = f'### test amdsmi_get_clock_info(gpu={i}, Clock Type={clk_type_name})' try: ret = amdsmi.amdsmi_get_clock_info(gpu, clk_type) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, clk_cond): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_cpu_cclk_limit(self): self._print_func_name('') for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_get_cpu_cclk_limit(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_cpu_core_current_freq_limit(self): self._print_func_name('') for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_get_cpu_core_current_freq_limit(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_cpu_core_energy(self): self._print_func_name('') if self.TODO_SKIP_FAIL: self.skipTest("Skipping test_get_cpu_core_energy as it fails (IO Error).") for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_get_cpu_core_energy(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_cpu_current_io_bandwidth(self): self._print_func_name('') for i, gpu in enumerate(self.processors): for encoding_name, encoding, encoding_cond in self.io_bw_encodings: msg = f'gpu({i}): encodeing({encoding_name}):' try: ret = amdsmi.amdsmi_get_cpu_current_io_bandwidth(gpu, encoding, encoding_name) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, encoding_cond): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_cpu_ddr_bw(self): self._print_func_name('') for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_get_cpu_ddr_bw(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_cpu_dimm_power_consumption(self): self._print_func_name('') # TODO Find better way to get dimm_addr dimm_addr = 0 for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_get_cpu_dimm_power_consumption(gpu, dimm_addr) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_cpu_dimm_temp_range_and_refresh_rate(self): self._print_func_name('') if self.TODO_SKIP_FAIL: self.skipTest("Skipping test_get_cpu_dimm_temp_range_and_refresh_rate as it fails.") # TODO Find better way to get dimm_addr dimm_addr = 0 for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_get_cpu_dimm_temp_range_and_refresh_rate(gpu, dimm_addr) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_cpu_dimm_thermal_sensor(self): self._print_func_name('') if self.TODO_SKIP_FAIL: self.skipTest("Skipping test_get_cpu_dimm_thermal_sensor as it fails.") # TODO Find better way to get dimm_addr dimm_addr = 0 for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_get_cpu_dimm_thermal_sensor(gpu, dimm_addr) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_cpu_family(self): self._print_func_name('') if self.TODO_SKIP_FAIL: self.skipTest("Skipping test_get_cpu_family as it fails (IO Error).") msg = '' try: ret = amdsmi.amdsmi_get_cpu_family() self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_cpu_fclk_mclk(self): self._print_func_name('') for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_get_cpu_fclk_mclk(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_cpu_handles(self): self._print_func_name('') msg = '' try: ret = amdsmi.amdsmi_get_cpu_handles() self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_cpu_hsmp_driver_version(self): self._print_func_name('') if self.TODO_SKIP_FAIL: self.skipTest("Skipping test_get_cpu_hsmp_driver_version as it fails (IO Error).") for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_get_cpu_hsmp_driver_version(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_cpu_hsmp_proto_ver(self): self._print_func_name('') if self.TODO_SKIP_FAIL: self.skipTest("Skipping test_get_cpu_hsmp_proto_ver as it fails (IO Error).") for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_get_cpu_hsmp_proto_ver(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_cpu_model(self): self._print_func_name('') if self.TODO_SKIP_FAIL: self.skipTest("Skipping test_get_cpu_model as it fails (IO Error).") msg = '' try: ret = amdsmi.amdsmi_get_cpu_model() self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_cpu_prochot_status(self): self._print_func_name('') for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_get_cpu_prochot_status(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_cpu_pwr_svi_telemetry_all_rails(self): self._print_func_name('') for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_get_cpu_pwr_svi_telemetry_all_rails(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_cpu_smu_fw_version(self): self._print_func_name('') for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_get_cpu_smu_fw_version(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_cpu_socket_c0_residency(self): self._print_func_name('') for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_get_cpu_socket_c0_residency(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_cpu_socket_current_active_freq_limit(self): self._print_func_name('') for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_get_cpu_socket_current_active_freq_limit(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_cpu_socket_energy(self): self._print_func_name('') if self.TODO_SKIP_FAIL: self.skipTest("Skipping test_get_cpu_socket_energy as it fails (IO Error).") for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_get_cpu_socket_energy(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_cpu_socket_freq_range(self): self._print_func_name('') for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_get_cpu_socket_freq_range(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_cpu_socket_lclk_dpm_level(self): self._print_func_name('') nbio_id = 0 for i, gpu in enumerate(self.processors): msg = f'gpu({i}): nbio_id({nbio_id}):' try: ret = amdsmi.amdsmi_get_cpu_socket_lclk_dpm_level(gpu, nbio_id) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_cpu_socket_power(self): self._print_func_name('') for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_get_cpu_socket_power(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_cpu_socket_power_cap_max(self): self._print_func_name('') for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_get_cpu_socket_power_cap_max(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_cpu_socket_temperature(self): self._print_func_name('') for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_get_cpu_socket_temperature(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_energy_count(self): self._print_func_name('') if self.TODO_SKIP_FAIL: self.skipTest("Skipping test_get_energy_count as it fails (MI350X, AMDSMI_STATUS_UNEXPECTED_DATA).") for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_get_energy_count(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_esmi_err_msg(self): self._print_func_name('') if self.TODO_SKIP_FAIL: self.skipTest("Skipping test_get_esmi_err_msg as it fails (Unknown Error).") for status_type_name, status_type, status_cond in self.status_types: msg = f'status(AMDSMI_STATUS_{status_type_name}):' try: ret = amdsmi.amdsmi_get_esmi_err_msg(status_type) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, status_cond): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_fw_info(self): self._print_func_name('') for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_get_fw_info(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_gpu_accelerator_partition_profile(self): self._print_func_name('') for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_get_gpu_accelerator_partition_profile(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_gpu_accelerator_partition_profile_config(self): self._print_func_name('') for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_get_gpu_accelerator_partition_profile_config(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_gpu_activity(self): self._print_func_name('') if self.TODO_SKIP_FAIL: self.skipTest("Skipping test_get_gpu_activity as it fails (MI350X, AMDSMI_STATUS_UNEXPECTED_DATA).") for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_get_gpu_activity(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_gpu_asic_info(self): self._print_func_name('') for i, gpu in enumerate(self.processors): msg = f'### test amdsmi_get_gpu_asic_info(gpu={i})' try: ret = amdsmi.amdsmi_get_gpu_asic_info(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_gpu_bad_page_info(self): self._print_func_name('') for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_get_gpu_bad_page_info(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_gpu_bad_page_threshold(self): self._print_func_name('') for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_get_gpu_bad_page_threshold(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_gpu_bdf_id(self): self._print_func_name('') for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_get_gpu_bdf_id(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_gpu_board_info(self): self._print_func_name('') for i, gpu in enumerate(self.processors): msg = f'### test amdsmi_get_gpu_board_info(gpu={i})' try: ret = amdsmi.amdsmi_get_gpu_board_info(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_gpu_cache_info(self): self._print_func_name('') for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_get_gpu_cache_info(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_gpu_compute_partition(self): self._print_func_name('') for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_get_gpu_compute_partition(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_gpu_compute_process_gpus(self): self._print_func_name('') if self.TODO_SKIP_NOT_COMPLETE: self.skipTest("Skipping test_get_gpu_compute_process_gpus as it is not complete (Inval Error).") # TODO Find better way to get pid pid = 0 msg = f'pid({pid}):' try: ret = amdsmi.amdsmi_get_gpu_compute_process_gpus(pid) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_gpu_compute_process_info(self): self._print_func_name('') msg = '' try: ret = amdsmi.amdsmi_get_gpu_compute_process_info() self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_gpu_compute_process_info_by_pid(self): self._print_func_name('') if self.TODO_SKIP_NOT_COMPLETE: self.skipTest("Skipping test_get_gpu_compute_process_info_by_pid as it not complete (Device not found).") # TODO Find better way to get pid pid = 0 msg = f'pid({pid}):' try: ret = amdsmi.amdsmi_get_gpu_compute_process_info_by_pid(pid) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_gpu_device_bdf(self): self._print_func_name('') for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_get_gpu_device_bdf(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_gpu_device_uuid(self): self._print_func_name('') for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_get_gpu_device_uuid(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_gpu_driver_info(self): self._print_func_name('') for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_get_gpu_driver_info(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_gpu_ecc_count(self): self._print_func_name('') for i, gpu in enumerate(self.processors): for gpu_block_name, gpu_block, gpu_block_cond in self.gpu_blocks: msg = f'gpu({i}): gpu_block({gpu_block_name})' try: ret = amdsmi.amdsmi_get_gpu_ecc_count(gpu, gpu_block) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, gpu_block_cond): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_gpu_ecc_enabled(self): self._print_func_name('') for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_get_gpu_ecc_enabled(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_gpu_ecc_status(self): self._print_func_name('') if self.TODO_SKIP_FAIL: self.skipTest("Skipping test_get_gpu_ecc_status as it fails.") for i, gpu in enumerate(self.processors): for gpu_block_name, gpu_block, gpu_block_cond in self.gpu_blocks: msg = f'gpu({i}): gpu_block({gpu_block_name})' try: ret = amdsmi.amdsmi_get_gpu_ecc_status(gpu, gpu_block) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, gpu_block_cond): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_gpu_enumeration_info(self): self._print_func_name('') for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_get_gpu_enumeration_info(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_gpu_fan_rpms(self): self._print_func_name('') for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_get_gpu_fan_rpms(gpu, 0) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_gpu_id(self): self._print_func_name('') for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_get_gpu_id(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_gpu_kfd_info(self): self._print_func_name('') for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_get_gpu_kfd_info(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_gpu_mem_overdrive_level(self): self._print_func_name('') for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_get_gpu_mem_overdrive_level(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_gpu_memory_partition(self): self._print_func_name('') for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_get_gpu_memory_partition(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_gpu_memory_partition_config(self): self._print_func_name('') if self.TODO_SKIP_FAIL: self.skipTest("Skipping test_get_gpu_memory_partition_config as it fails on MI300.") for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_get_gpu_memory_partition_config(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_gpu_memory_reserved_pages(self): self._print_func_name('') for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_get_gpu_memory_reserved_pages(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_gpu_memory_total(self): self._print_func_name('') for i, gpu in enumerate(self.processors): for memory_type_name, memory_type, memory_type_cond in self.memory_types: msg = f'gpu({i}): memory_type({memory_type_name})' try: ret = amdsmi.amdsmi_get_gpu_memory_total(gpu, memory_type) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, memory_type_cond): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_gpu_memory_usage(self): self._print_func_name('') for i, gpu in enumerate(self.processors): for memory_type_name, memory_type, memory_type_cond in self.memory_types: msg = f'gpu({i}): memory_type({memory_type_name})' try: ret = amdsmi.amdsmi_get_gpu_memory_usage(gpu, memory_type) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, memory_type_cond): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_gpu_metrics_header_info(self): self._print_func_name('') for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_get_gpu_metrics_header_info(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_gpu_metrics_info(self): self._print_func_name('') for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_get_gpu_metrics_info(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_gpu_partition_metrics_info(self): self._print_func_name('') for i, gpu in enumerate(self.processors): try: msg = f'gpu({i}): ' ret = amdsmi.amdsmi_get_gpu_partition_metrics_info(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception def test_get_gpu_od_volt_curve_regions(self): self._print_func_name('') num_region = 10 for i, gpu in enumerate(self.processors): msg = f'gpu({i}): num_region({num_region}):' try: ret = amdsmi.amdsmi_get_gpu_od_volt_curve_regions(gpu, num_region) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_gpu_od_volt_info(self): self._print_func_name('') for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_get_gpu_od_volt_info(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_gpu_overdrive_level(self): self._print_func_name('') for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_get_gpu_overdrive_level(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_gpu_pci_bandwidth(self): self._print_func_name('') if self.TODO_SKIP_FAIL: self.skipTest("Skipping test_get_gpu_pci_bandwidth as it fails (MI350X, AMDSMI_STATUS_UNEXPECTED_DATA).") for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_get_gpu_pci_bandwidth(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_gpu_pci_replay_counter(self): self._print_func_name('') # TODO Check test_get_gpu_pci_replay_counter for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_get_gpu_pci_replay_counter(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_gpu_pci_throughput(self): self._print_func_name('') for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_get_gpu_pci_throughput(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_gpu_perf_level(self): self._print_func_name('') for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_get_gpu_perf_level(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_gpu_pm_metrics_info(self): self._print_func_name('') for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_get_gpu_pm_metrics_info(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_gpu_power_profile_presets(self): self._print_func_name('') for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_get_gpu_power_profile_presets(gpu, 0) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_gpu_process_isolation(self): self._print_func_name('') for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_get_gpu_process_isolation(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_gpu_process_list(self): self._print_func_name('') for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_get_gpu_process_list(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_gpu_ras_block_features_enabled(self): self._print_func_name('') for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_get_gpu_ras_block_features_enabled(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_gpu_ras_feature_info(self): self._print_func_name('') for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_get_gpu_ras_feature_info(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_gpu_reg_table_info(self): self._print_func_name('') for i, gpu in enumerate(self.processors): for reg_type_name, reg_type, reg_type_cond in self.reg_types: msg = f'gpu({i}): reg_type({reg_type_name}):' try: ret = amdsmi.amdsmi_get_gpu_reg_table_info(gpu, reg_type) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, reg_type_cond): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_gpu_revision(self): self._print_func_name('') for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_get_gpu_revision(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_gpu_subsystem_id(self): self._print_func_name('') for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_get_gpu_subsystem_id(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_gpu_subsystem_name(self): self._print_func_name('') for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_get_gpu_subsystem_name(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_gpu_topo_numa_affinity(self): self._print_func_name('') for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_get_gpu_topo_numa_affinity(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_gpu_total_ecc_count(self): self._print_func_name('') for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_get_gpu_total_ecc_count(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_gpu_vbios_info(self): self._print_func_name('') for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_get_gpu_vbios_info(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_gpu_vendor_name(self): self._print_func_name('') for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_get_gpu_vendor_name(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_gpu_virtualization_mode(self): self._print_func_name('') for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_get_gpu_virtualization_mode(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_gpu_volt_metric(self): self._print_func_name('') for i, gpu in enumerate(self.processors): for voltage_type_name, voltage_type, voltage_type_cond in self.voltage_types: for voltage_metric_name, voltage_metric, voltage_metric_cond in self.voltage_metrics: msg = f'gpu({i}): voltage_type({voltage_type_name}) voltage_metric({voltage_metric_name}):' try: ret = amdsmi.amdsmi_get_gpu_volt_metric(gpu, voltage_type, voltage_metric) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if not voltage_type_cond == self.PASS: if self._check_ret(msg, e, voltage_type_cond): self.raise_exception = e elif not voltage_metric_cond == self.PASS: if self._check_ret(msg, e, voltage_metric_cond): self.raise_exception = e else: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_gpu_vram_info(self): self._print_func_name('') for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_get_gpu_vram_info(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_gpu_vram_usage(self): self._print_func_name('') for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_get_gpu_vram_usage(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_gpu_vram_vendor(self): self._print_func_name('') for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_get_gpu_vram_vendor(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_gpu_xcd_counter(self): self._print_func_name('') if self.TODO_SKIP_FAIL: self.skipTest("Skipping test_get_gpu_xcd_counter as it fails (MI350X, AMDSMI_STATUS_UNEXPECTED_DATA).") for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_get_gpu_xcd_counter(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_gpu_xgmi_link_status(self): self._print_func_name('') if self.TODO_SKIP_FAIL: self.skipTest("Skipping test_get_gpu_xgmi_link_status as it fails (MI350X, AMDSMI_STATUS_UNEXPECTED_DATA).") for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_get_gpu_xgmi_link_status(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_hsmp_metrics_table(self): self._print_func_name('') for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_get_hsmp_metrics_table(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_hsmp_metrics_table_version(self): self._print_func_name('') for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_get_hsmp_metrics_table_version(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_lib_version(self): self._print_func_name('') msg = '' try: ret = amdsmi.amdsmi_get_lib_version() self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_link_metrics(self): self._print_func_name('') if self.TODO_SKIP_FAIL: self.skipTest("Skipping test_get_link_metrics as it fails (MI350X, AMDSMI_STATUS_UNEXPECTED_DATA).") for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_get_link_metrics(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_link_topology_nearest(self): self._print_func_name('') for i, gpu in enumerate(self.processors): for link_type_name, link_type, link_type_cond in self.link_types: msg = f'gpu({i}): link_type({link_type_name})' try: ret = amdsmi.amdsmi_get_link_topology_nearest(gpu, link_type) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, link_type_cond): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_minmax_bandwidth_between_processors(self): self._print_func_name('') for i, gpu_i in enumerate(self.processors): for j, gpu_j in enumerate(self.processors): msg = f'gpu({i},{j}):' try: ret = amdsmi.amdsmi_get_minmax_bandwidth_between_processors(gpu_i, gpu_j) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if i == j: if self._check_ret(msg, e, self.FAIL): self.raise_exception = e else: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_pcie_info(self): self._print_func_name('') if self.TODO_SKIP_FAIL: self.skipTest("Skipping test_get_pcie_info as it fails (MI350X, AMDSMI_STATUS_UNEXPECTED_DATA).") for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_get_pcie_info(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_set_cpu_pcie_link_rate(self): self._print_func_name('') if self.TODO_SKIP_NOT_COMPLETE: self.skipTest("Skipping test_set_cpu_pcie_link_rate as it is not complete.") # TODO rate_ctrl = 0 rate_ctrl = 0 for i, gpu in enumerate(self.processors): msg = f'gpu({i}): rate_ctrl({rate_ctrl}):' try: ret = amdsmi.amdsmi_set_cpu_pcie_link_rate(gpu, rate_ctrl) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_power_info(self): self._print_func_name('') for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_get_power_info(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_processor_count_from_handles(self): self._print_func_name('') msg = 'gpu():' try: ret = amdsmi.amdsmi_get_processor_count_from_handles(self.processors) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_processor_handle_from_bdf(self): self._print_func_name('') for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: bdf = amdsmi.amdsmi_get_gpu_device_bdf(gpu) ret = amdsmi.amdsmi_get_processor_handle_from_bdf(bdf) if gpu.value != ret.value: msg += f'{msg}Expected: {gpu.value}, Received: {ret.value}' self.raise_exception = amdsmi.AmdSmiLibraryException(amdsmi.amdsmi_wrapper.AMDSMI_STATUS_INVAL) else: self._print(msg) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_processor_handles(self): self._print_func_name('') for i, gpu in enumerate(self.processors): self._print(f' {i:2d} processor_handles: {gpu}') if self.raise_exception: raise self.raise_exception return def test_get_processor_handles_by_type(self): self._print_func_name('') socket_ids = amdsmi.amdsmi_get_socket_handles() for index, socket_id in enumerate(socket_ids): for processor_name, processor_type, processor_cond in self.processor_types: msg = f'socket({index}): processor_type({processor_name}):' try: ret = amdsmi.amdsmi_get_processor_handles_by_type(socket_id, processor_type) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, processor_cond): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_processor_info(self): self._print_func_name('') for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_get_processor_info(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_processor_type(self): self._print_func_name('') for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_get_processor_type(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_socket_handles(self): self._print_func_name('') msg = '' try: ret = amdsmi.amdsmi_get_socket_handles() self._print(msg, [id(addr) for addr in ret]) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_socket_info(self): self._print_func_name('') sockets = amdsmi.amdsmi_get_socket_handles() self.assertGreaterEqual(len(sockets), 1) self.assertLessEqual(len(sockets), self.max_num_physical_devices) for i, socket in enumerate(sockets): msg = f'socket({i}):' try: ret = amdsmi.amdsmi_get_socket_info(socket) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_temp_metric(self): self._print_func_name('') if self.TODO_SKIP_FAIL: self.skipTest("Skipping test_get_temp_metric as it fails (MI350X, AMDSMI_STATUS_UNEXPECTED_DATA).") for i, gpu in enumerate(self.processors): for temperature_type_name, temperature_type, temperature_type_cond in self.temperature_types: for temperature_metric_name, temperature_metric, temperature_metric_cond in self.temperature_metrics: msg = f'gpu({i}): temperature_type=({temperature_type_name}) temperature_metric({temperature_metric_name}):' try: ret = amdsmi.amdsmi_get_temp_metric(gpu, temperature_type, temperature_metric) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if not temperature_type_cond == self.PASS: if self._check_ret(msg, e, temperature_type_cond): self.raise_exception = e elif not temperature_metric_cond == self.PASS: if self._check_ret(msg, e, temperature_metric_cond): self.raise_exception = e else: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_threads_per_core(self): self._print_func_name('') if self.TODO_SKIP_FAIL: self.skipTest("Skipping test_get_threads_per_core as it fails (IO Error).") # TODO threads_per_core msg = 'threads_per_core:' try: ret = amdsmi.amdsmi_get_threads_per_core() self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_utilization_count(self): self._print_func_name('') if self.TODO_SKIP_FAIL: self.skipTest("Skipping test_get_utilization_count as it fails (MI350X, AMDSMI_STATUS_UNEXPECTED_DATA).") for i, gpu in enumerate(self.processors): for utilization_counter_type_name, utilization_counter_type, utilization_counter_type_cond in self.utilization_counter_types: msg = f'gpu({i}): utilization_counter_type({utilization_counter_type_name}):' try: ret = amdsmi.amdsmi_get_utilization_count(gpu, [utilization_counter_type]) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, utilization_counter_type_cond): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_violation_status(self): self._print_func_name('') if self.TODO_SKIP_FAIL: self.skipTest("Skipping test_get_violation_status as it fails (MI350X, AMDSMI_STATUS_UNEXPECTED_DATA).") for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_get_violation_status(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_xgmi_info(self): self._print_func_name('') for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_get_xgmi_info(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_gpu_counter(self): self._print_func_name('') if self.TODO_SKIP_FAIL: self.skipTest("Skipping test_gpu_counter as it fails (Error opening file).") for i, gpu in enumerate(self.processors): for event_type_name, event_type, event_type_cond in self.event_types: msg = f'gpu({i}): event_type({event_type_name}):' # Create msg1 = f'{msg} Create counter:' try: event_handle = amdsmi.amdsmi_gpu_create_counter(gpu, event_type) self._print(msg1, event_handle) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg1, e, event_type_cond): self.raise_exception = e # if any exception occurs, skip the rest of the loop continue # Read msg1 = f'{msg} Read counter:' try: amdsmi.amdsmi_gpu_read_counter(event_handle) self._print(msg1) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg1, e, event_type_cond): self.raise_exception = e # Control for counter_command_name, counter_command, counter_commands_cond in self.counter_commands: msg1 = f'{msg} event_type({event_type_name}): counter_command({counter_command_name}):' try: amdsmi.amdsmi_gpu_control_counter(event_handle, counter_command) self._print(msg1, '') except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg1, e, counter_commands_cond): self.raise_exception = e # Destroy msg1 = f'{msg} Destroy counter:' try: amdsmi.amdsmi_gpu_destroy_counter(event_handle) self._print(msg1, '') except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg1, e, event_type_cond): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_gpu_counter_group_supported(self): self._print_func_name('') for i, gpu in enumerate(self.processors): for event_group_name, event_group, event_group_cond in self.event_groups: msg = f'gpu({i}): event_group({event_group_name}):' try: amdsmi.amdsmi_gpu_counter_group_supported(gpu, event_group) self._print(msg, '') except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, event_group_cond): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_gpu_available_counters(self): self._print_func_name('') for i, gpu in enumerate(self.processors): for event_group_name, event_group_type, event_group_cond in self.event_groups: msg = f'gpu({i}): event_group({event_group_name})' try: ret = amdsmi.amdsmi_get_gpu_available_counters(gpu, event_group_type) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, event_group_cond): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_gpu_validate_ras_eeprom(self): self._print_func_name('') if self.TODO_SKIP_FAIL: self.skipTest("Skipping test_gpu_validate_ras_eepromas it fails (File Error).") for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: amdsmi.amdsmi_gpu_validate_ras_eeprom(gpu) self._print(msg, '') except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_gpu_xgmi_error_status(self): self._print_func_name('') if self.TODO_SKIP_FAIL: self.skipTest("Skipping test_gpu_xgmi_error_status as it fails on MI300.") for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_gpu_xgmi_error_status(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_init(self): self._print_func_name('') msg = '' try: amdsmi.amdsmi_init() self._print(msg, '') except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_shut_down(self): self._print_func_name('') msg = '' try: amdsmi.amdsmi_shut_down() self._print(msg, '') except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_is_P2P_accessible(self): self._print_func_name('') for i, gpu_i in enumerate(self.processors): for j, gpu_j in enumerate(self.processors): msg = f'gpu({i},{j}):' try: ret = amdsmi.amdsmi_is_P2P_accessible(gpu_i, gpu_j) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_gpu_event(self): self._print_func_name('') if self.TODO_SKIP_FAIL: self.skipTest("Skipping test_gpu_event as it fails (File Error).") mask = 1 << (amdsmi.AmdSmiEvtNotificationType.GPU_PRE_RESET -1) | \ 1 << (amdsmi.AmdSmiEvtNotificationType.GPU_POST_RESET -1) timeout_ms = 1000 for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' # Init try: self._print(f'{msg} amdsmi_init_gpu_event_notification()') amdsmi.amdsmi_init_gpu_event_notification(gpu) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e # Skip remaining tests on any exception when initializing continue # Is Enabled try: self._print(f'{msg} amdsmi_is_gpu_power_management_enabled()') ret = amdsmi.amdsmi_is_gpu_power_management_enabled(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e # Set Mask try: self._print(f'{msg} amdsmi_set_gpu_event_notification_mask()') amdsmi.amdsmi_set_gpu_event_notification_mask(gpu, mask) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e # Get try: self._print(f'{msg} amdsmi_get_gpu_event_notification()') ret = amdsmi.amdsmi_get_gpu_event_notification(timeout_ms) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e # Stop try: self._print(f'{msg} amdsmi_stop_gpu_event_notification()') amdsmi.amdsmi_stop_gpu_event_notification(gpu) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_reset_gpu(self): self._print_func_name('') if self.TODO_SKIP_FAIL: self.skipTest("Skipping test_reset_gpu as it fails (MI350X, Hang).") for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: amdsmi.amdsmi_reset_gpu(gpu) self._print(msg, '') except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_reset_gpu_fan(self): self._print_func_name('') for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: amdsmi.amdsmi_reset_gpu_fan(gpu, 0) self._print(msg, '') except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_reset_gpu_xgmi_error(self): self._print_func_name('') if self.TODO_SKIP_FAIL: self.skipTest("Skipping test_reset_gpu_xgmi_error as it fails on MI300.") for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: amdsmi.amdsmi_reset_gpu_xgmi_error(gpu) self._print(msg, '') except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_set_clk_freq(self): self._print_func_name('') if self.TODO_SKIP_FAIL: self.skipTest("Skipping test_set_clk_freq as it fails (Perm failure).") for i, gpu in enumerate(self.processors): for clk_type_name, clk_type, clk_cond in self.clk_types: msg = f'gpu({i}): Get Clock Info({clk_type_name}):' try: ret = amdsmi.amdsmi_get_clk_freq(gpu, clk_type) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, clk_cond): self.raise_exception = e continue clk_freq_info = ret current = clk_freq_info['current'] num_supported = clk_freq_info['num_supported'] frequency = clk_freq_info['frequency'] if num_supported == 0: self._print(f'No supported frequencies for clk_type={clk_type_name}') continue found_error = False for index in range(0, num_supported): msg = f'\tgpu({i}):' try: freq_bitmask = frequency[index] msg = f'{msg} Set clk_type({clk_type_name}): freq_bitmask({freq_bitmask}):' amdsmi.amdsmi_set_clk_freq(gpu, clk_type_name, freq_bitmask) self._print(msg, '') except amdsmi.AmdSmiLibraryException as e: found_error = True if self._check_ret(msg, e, clk_cond): self.raise_exception = e if not found_error: amdsmi.amdsmi_set_clk_freq(gpu, clk_type_name, frequency[current]) if self.raise_exception: raise self.raise_exception return def test_cpu_core_boostlimit(self): for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' msg1 = f'{msg} amdsmi_get_cpu_core_boostlimit():' try: boost_limit = amdsmi.amdsmi_get_cpu_core_boostlimit(gpu) msg1 = f'{msg1} boost_limit={boost_limit}' self._print(msg1, '') except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg1, e, self.PASS): self.raise_exception = e continue msg1 = f'{msg} amdsmi_set_cpu_core_boostlimit():' try: amdsmi.amdsmi_set_cpu_core_boostlimit(gpu, boost_limit) self._print(msg1, '') except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg1, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_set_cpu_df_pstate_range(self): self._print_func_name('') if self.TODO_SKIP_NOT_COMPLETE: self.skipTest("Skipping test_set_cpu_df_pstate_range as it is not complete.") # TODO max_pstate = 0, min_pstate = 0 max_pstate = 0 min_pstate = 0 for i, gpu in enumerate(self.processors): msg = f'gpu({i}): max_pstate({max_pstate}) min_pstate({min_pstate}):' try: amdsmi.amdsmi_set_cpu_df_pstate_range(gpu, max_pstate, min_pstate) self._print(msg, '') except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_set_cpu_gmi3_link_width_range(self): self._print_func_name('') if self.TODO_SKIP_NOT_COMPLETE: self.skipTest("Skipping test_set_cpu_gmi3_link_width_range as it is not complete.") # TODO min_link_width = 0, max_link_width = 0 min_link_width = 0 max_link_width = 0 for i, gpu in enumerate(self.processors): msg = f'gpu({i}): min_link_width({min_link_width}) max_link_width({max_link_width}):' try: amdsmi.amdsmi_set_cpu_gmi3_link_width_range(gpu, min_link_width, max_link_width) self._print(msg, '') except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_set_cpu_pwr_efficiency_mode(self): self._print_func_name('') modes = [0, 1, 2] for i, gpu in enumerate(self.processors): for mode in modes: msg = f'gpu({i}): mode({mode}):' try: amdsmi.amdsmi_set_cpu_pwr_efficiency_mode(gpu, mode) self._print(msg, '') except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_cpu_socket_boostlimit(self): self._print_func_name('') if self.TODO_SKIP_NOT_COMPLETE: self.skipTest("Skipping test_cpu_socket_boostlimit as it is not complete.") # TODO boost_limit = 0 boost_limit = 0 for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' msg1 = f'{msg} boost_limit({boost_limit}):' try: amdsmi.amdsmi_set_cpu_socket_boostlimit(gpu, boost_limit) self._print(msg1, '') except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg1, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_set_cpu_socket_lclk_dpm_level(self): self._print_func_name('') if self.TODO_SKIP_NOT_COMPLETE: self.skipTest("Skipping test_set_cpu_socket_lclk_dpm_level as it is not complete.") # TODO nbio_id = 0, min_val = 0, max_val = 0 nbio_id = 0 min_val = 0 max_val = 0 for i, gpu in enumerate(self.processors): msg = f'gpu({i}): nbio_id({nbio_id}) min_val({min_val}) max_val({max_val}):' try: amdsmi.amdsmi_set_cpu_socket_lclk_dpm_level(gpu, nbio_id, min_val, max_val) self._print(msg, '') except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_cpu_socket_power_cap(self): self._print_func_name('') for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' msg1 = f'{msg} amdsmi_get_cpu_socket_power_cap():' try: power_cap = amdsmi.amdsmi_get_cpu_socket_power_cap(gpu) self._print(msg1, '') except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg1, e, self.PASS): self.raise_exception = e continue msg1 = f'{msg} power_cap={power_cap}' try: amdsmi.amdsmi_set_cpu_socket_power_cap(gpu, power_cap) self._print(msg1, '') except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg1, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_set_cpu_xgmi_width(self): self._print_func_name('') if self.TODO_SKIP_NOT_COMPLETE: self.skipTest("Skipping test_set_cpu_xgmi_width as it is not complete.") # TODO min_width = 0, max_width = 0 min_width = 0 max_width = 0 for i, gpu in enumerate(self.processors): msg = f'gpu({i}): min_width({min_width} max_width({max_width}): )' try: amdsmi.amdsmi_set_cpu_xgmi_width(gpu, min_width , max_width) self._print(msg, '') except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_set_gpu_accelerator_partition_profile(self): self._print_func_name('') if self.TODO_SKIP_NOT_COMPLETE: self.skipTest("Skipping test_set_gpu_accelerator_partition_profile as it is not complete.") # TODO profile_index = 0 profile_index = 0 for i, gpu in enumerate(self.processors): msg = f'gpu({i}): profile_index({profile_index}):' try: amdsmi.amdsmi_set_gpu_accelerator_partition_profile(gpu, profile_index) self._print(msg, '') except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_set_gpu_clk_limit(self): self._print_func_name('') if self.TODO_SKIP_NOT_COMPLETE: self.skipTest("Skipping test_set_gpu_clk_limit as it is not complete.") # TODO Find better way to set value value = 0 for i, gpu in enumerate(self.processors): for clk_type_name, clk_type, clk_cond in self.clk_types: for clk_limit_type_name, clk_limit_type, clk_limit_cond in self.clk_limit_types: msg = f'gpu({i}): value({value}) clock_type=({clk_type_name}) clock_limit_type({clk_limit_type_name}):' try: amdsmi.amdsmi_set_gpu_clk_limit(gpu, clk_type_name, clk_limit_type_name, value) self._print(msg, '') except amdsmi.AmdSmiLibraryException as e: if not clk_cond == self.PASS: self._check_ret(msg, e, clk_cond) self.raise_exception = e elif not clk_limit_type == self.PASS: self._check_ret(msg, e, clk_limit_type) self.raise_exception = e else: self._check_ret(msg, e, self.PASS) self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_set_gpu_clk_range(self): self._print_func_name('') # TODO Find better way to set min_clk_value, max_clk_value min_clk_value = 100 max_clk_value = 200 for i, gpu in enumerate(self.processors): for clk_type_name, clk_type, clk_cond in self.clk_types: msg = f'gpu({i}): min_clk_value({min_clk_value}) max_clk_value({max_clk_value}) clk_type({clk_type_name}):' try: amdsmi.amdsmi_set_gpu_clk_range(gpu, min_clk_value, max_clk_value, clk_type) self._print(msg, '') except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, clk_cond): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_set_gpu_compute_partition(self): self._print_func_name('') if self.TODO_SKIP_FAIL: self.skipTest("Skipping test_set_gpu_compute_partition as it fails on MI300.") for i, gpu in enumerate(self.processors): default_compute_partition_type = self.compute_partition_types[0][1] msg = f'gpu({i}): amdsmi_get_gpu_compute_partition()' try: default_compute_partition_name = amdsmi.amdsmi_get_gpu_compute_partition(gpu) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e continue for compute_partition_type_name, compute_partition_type, compute_partition_type_cond in self.compute_partition_types: if default_compute_partition_name == compute_partition_type_name: default_compute_partition_type = compute_partition_type msg = f'gpu({i}): compute_partition_type({compute_partition_type_name}):' try: amdsmi.amdsmi_set_gpu_compute_partition(gpu, compute_partition_type) self._print(msg, '') except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, compute_partition_type_cond): self.raise_exception = e msg = f'gpu({i}): amdsmi_set_gpu_compute_partition({default_compute_partition_name})' try: amdsmi.amdsmi_set_gpu_compute_partition(gpu, default_compute_partition_type) self._print(msg, '') except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e continue if self.raise_exception: raise self.raise_exception return def test_gpu_fan_speed(self): self._print_func_name('') for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' msg1 = f'{msg} amdsmi_get_gpu_fan_speed()' try: # Determine current fan speed fan_speed_current = amdsmi.amdsmi_get_gpu_fan_speed(gpu, 0) msg1 = f'{msg1} fan_speed={fan_speed_current}' self._print(msg1, '') except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg1, e, self.PASS): self.raise_exception = e continue try: # Determine max fan speed msg1 = f'{msg} amdsmi_get_gpu_fan_speed_max()' fan_speed_max = amdsmi.amdsmi_get_gpu_fan_speed_max(gpu, 0) msg1 = f'{msg1} fan_speed_max={fan_speed_max}' if fan_speed_current == fan_speed_max: fan_speed = int(fan_speed_max/2) else: fan_speed = fan_speed_max except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg1, e, self.PASS): self.raise_exception = e continue try: # Set fan speed msg1 = f'{msg} fan_speed({fan_speed}):' amdsmi.amdsmi_set_gpu_fan_speed(gpu, 0, fan_speed) self._print(msg1, '') except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg1, e, self.PASS): self.raise_exception = e try: # Set to original fan speed msg1 = f'{msg} fan_speed({fan_speed_current}):' amdsmi.amdsmi_set_gpu_fan_speed(gpu, 0, fan_speed_current) self._print(msg1, '') except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg1, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_set_gpu_memory_partition(self): self._print_func_name('') if self.TODO_SKIP_FAIL: self.skipTest("Skipping test_set_gpu_memory_partition as it fails (MI350X, AMDSMI_STATUS_UNEXPECTED_DATA).") for i, gpu in enumerate(self.processors): for memory_partition_type_name, memory_partition_type, memory_partition_type_cond in self.memory_partition_types: msg = f'gpu({i}): memory_partition_type({memory_partition_type_name}):' try: amdsmi.amdsmi_set_gpu_memory_partition(gpu, memory_partition_type) self._print(msg, '') except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, memory_partition_type_cond): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_set_gpu_memory_partition_mode(self): self._print_func_name('') if self.TODO_SKIP_FAIL: self.skipTest("Skipping test_set_gpu_memory_partition_mode as it fails (MI350X, AMDSMI_STATUS_UNEXPECTED_DATA).") for i, gpu in enumerate(self.processors): for memory_partition_type_name, memory_partition_type, memory_partition_type_cond in self.memory_partition_types: msg = f'gpu({i}): memory_partition_type({memory_partition_type_name}):' try: amdsmi.amdsmi_set_gpu_memory_partition_mode(gpu, memory_partition_type) self._print(msg, '') except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, memory_partition_type_cond): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_set_gpu_od_clk_info(self): self._print_func_name('') if self.TODO_SKIP_NOT_COMPLETE: self.skipTest("Skipping test_set_gpu_od_clk_info as it is not complete.") # TODO value = 0 value = 200 for i, gpu in enumerate(self.processors): for freq_ind_name, freq_ind, freq_ind_cond in self.freq_inds: for clk_type_name, clk_type, clk_cond in self.clk_types: msg = f'gpu({i}): freq_ind({freq_ind_name}) value({value}) clk_type({clk_type_name}):' try: amdsmi.amdsmi_set_gpu_od_clk_info(gpu, freq_ind, value, clk_type) self._print(msg, '') except amdsmi.AmdSmiLibraryException as e: if not freq_ind_cond == self.PASS: self._check_ret(msg, e, freq_ind_cond) self.raise_exception = e elif not clk_cond == self.PASS: self._check_ret(msg, e, clk_cond) self.raise_exception = e else: self._check_ret(msg, e, self.PASS) self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_set_gpu_od_volt_info(self): self._print_func_name('') if self.TODO_SKIP_NOT_COMPLETE: self.skipTest("Skipping test_set_gpu_od_volt_info as it is not complete.") # TODO vpoint = 0 clk_value = 0 volt_value = 0 vpoint = 0 clk_value = 0 volt_value = 0 for i, gpu in enumerate(self.processors): msg = f'gpu({i}): vpoint({vpoint}) clk_value({clk_value}) volt_value({volt_value}):' try: amdsmi.amdsmi_set_gpu_od_volt_info(gpu, vpoint, clk_value, volt_value) self._print(msg, '') except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_set_gpu_overdrive_level(self): self._print_func_name('') for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: # Find current overdrive value overdrive_value_current = amdsmi.amdsmi_get_gpu_overdrive_level(gpu) if overdrive_value_current != 1: overdrive_value = 1 else: overdrive_value = 2 # Set overdrive value msg = f'gpu({i}): overdrive_value({overdrive_value}):' amdsmi.amdsmi_set_gpu_overdrive_level(gpu, overdrive_value) self._print(msg, '') # Set back to original overdrive value amdsmi.amdsmi_set_gpu_overdrive_level(gpu, overdrive_value_current) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_set_gpu_pci_bandwidth(self): self._print_func_name('') if self.TODO_SKIP_FAIL: self.skipTest("Skipping test_set_gpu_pci_bandwidth as it fails (MI350X, AMDSMI_STATUS_UNEXPECTED_DATA).") for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: # Get current PCI bandwidth info bandwidth_info = amdsmi.amdsmi_get_gpu_pci_bandwidth(gpu) current_bandwidth_index = bandwidth_info['transfer_rate']['current'] if current_bandwidth_index > 0: bitmask = 1 << (current_bandwidth_index - 1) else: bitmask = 1 << (current_bandwidth_index) # Set PCI bandwidth msg = f'gpu({i}): bitmask({bitmask}):' amdsmi.amdsmi_set_gpu_pci_bandwidth(gpu, bitmask) self._print(msg, '') # Set back to original PCI bandwidth bitmask = 1 << (current_bandwidth_index) amdsmi.amdsmi_set_gpu_pci_bandwidth(gpu, bitmask) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_set_gpu_perf_determinism_mode(self): self._print_func_name('') if self.TODO_SKIP_NOT_COMPLETE: self.skipTest("Skipping test_set_gpu_perf_determinism_mode as it is not complete.") # TODO clk_value = 0 clk_value = 0 for i, gpu in enumerate(self.processors): msg = f'gpu({i}): clk_value({clk_value}):' try: amdsmi.amdsmi_set_gpu_perf_determinism_mode(gpu, clk_value) self._print(msg, '') except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_set_gpu_perf_level(self): self._print_func_name('') if self.TODO_SKIP_NOT_COMPLETE: self.skipTest("Skipping test_set_gpu_perf_level as it is not complete.") dev_perf_level_current = self.dev_perf_levels[0][1] for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: dev_perf_level_name_current = amdsmi.amdsmi_get_gpu_perf_level(gpu) items = dev_perf_level_name_current.split('_') dev_perf_level_name_current = items[-1] except amdsmi.AmdSmiLibraryException as e: self._print(msg, e) continue for dev_perf_level_name, dev_perf_level, dev_perf_level_cond in self.dev_perf_levels: msg = f'gpu({i}):' try: if dev_perf_level_name_current == dev_perf_level_name: dev_perf_level_current = dev_perf_level msg = f'{msg} dev_perf_level({dev_perf_level_name}):' amdsmi.amdsmi_set_gpu_perf_level(gpu, dev_perf_level) self._print(msg, '') except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, dev_perf_level_cond): self.raise_exception = e try: amdsmi.amdsmi_set_gpu_perf_level(gpu, dev_perf_level_current) except amdsmi.AmdSmiLibraryException as e: self._print(msg, e) continue if self.raise_exception: raise self.raise_exception return def test_set_gpu_power_profile(self): self._print_func_name('') for i, gpu in enumerate(self.processors): for power_profile_preset_mask_name, power_profile_preset_mask, power_profile_preset_masks_cond in self.power_profile_preset_masks: msg = f'gpu({i}): power_profile_preset_mask({power_profile_preset_mask_name}):' try: amdsmi.amdsmi_set_gpu_power_profile(gpu, 0, power_profile_preset_mask) self._print(msg, '') except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, power_profile_preset_masks_cond): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_set_gpu_process_isolation(self): self._print_func_name('') pisolates = [1, 0] for i, gpu in enumerate(self.processors): for pisolate in pisolates: msg = f'gpu({i}): pisolate({pisolate})' try: amdsmi.amdsmi_set_gpu_process_isolation(gpu, pisolate) self._print(msg) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_power_cap(self): '''test power cap''' self._print_func_name('') for i, gpu in enumerate(self.processors): # Get Power Cap Info msg = f'### test amdsmi_get_power_cap_info(gpu={i})' try: power_cap_info = amdsmi.amdsmi_get_power_cap_info(gpu) self._print(msg, power_cap_info) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e # Have to be able to get info before setting continue # Set to Average Power Cap try: cap = int((power_cap_info['max_power_cap'] + power_cap_info['min_power_cap']) / 2) msg = f'### test amdsmi_set_power_cap(gpu={i}, 0, cap={cap})' amdsmi.amdsmi_set_power_cap(gpu, 0, cap) self._print(msg, '') except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e # Restore Power Cap try: cap = power_cap_info['power_cap'] msg = f'### test amdsmi_set_power_cap(gpu={i}, 0, cap={cap})' amdsmi.amdsmi_set_power_cap(gpu, 0, cap) self._print(msg, '') except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_soc_pstate(self): self._print_func_name('') for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' # Get current policy info msg1 = f'{msg} amdsmi_get_soc_pstate' try: policy_info = amdsmi.amdsmi_get_soc_pstate(gpu) self._print(msg1, '') num_supported = policy_info['num_supported'] if not isinstance(num_supported, int): self._print('Cannot determine num_supported={num_supported}', '') continue policy_id_current = policy_info['current_id'] if not isinstance(policy_id_current, int): self._print('Cannot determine policy_id_current={policy_id_current}', '') continue policy_id_orig = policy_info['policies'][policy_id_current]['policy_id'] if not isinstance(policy_id_orig, int): self._print('Cannot determine orig policy_id={policy_id_orig}', '') continue index = 0 if num_supported >= 2: if policy_id_current != 0: index = 1 policy_id = policy_info['policies'][index]['policy_id'] except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg1, e, self.PASS): self.raise_exception = e continue # Set SOC Pstate policy msg1 = f'{msg} policy_id({policy_id}):' try: amdsmi.amdsmi_set_soc_pstate(gpu, policy_id) self._print(msg1, '') except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg1, e, self.PASS): self.raise_exception = e continue # Set back to original policy msg1 = f'{msg} policy_id({policy_id_orig}):' try: amdsmi.amdsmi_set_soc_pstate(gpu, policy_id_orig) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg1, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_xgmi_plpd(self): self._print_func_name('') if self.TODO_SKIP_FAIL: self.skipTest("Skipping test_set_xgmi_plpd as it fails on MI300.") for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' # Get current policy info msg1 = f'{msg} amdsmi_get_xgmi_plpd()' try: policy_info = amdsmi.amdsmi_get_xgmi_plpd(gpu) self._print(msg1, '') num_supported = policy_info['num_supported'] if not isinstance(num_supported, int): self._print('Cannot determine num_supported={num_supported}', '') continue policy_id_current = policy_info['current_id'] if not isinstance(policy_id_current, int): self._print('Cannot determine policy_id_current={policy_id_current}', '') continue policy_id_orig = policy_info['policies'][policy_id_current]['policy_id'] if not isinstance(policy_id_orig, int): self._print('Cannot determine orig policy_id={policy_id_orig}', '') continue index = 0 if num_supported >= 2: if policy_id_current != 0: index = 1 policy_id = policy_info['policies'][index]['policy_id'] if not isinstance(policy_id, int): self._print('Cannot determine policy_id={policy_id}', '') continue except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg1, e, self.PASS): self.raise_exception = e continue # Set policy msg1 = f'{msg} policy_id({policy_id}):' try: amdsmi.amdsmi_set_xgmi_plpd(gpu, policy_id) self._print(msg1, '') except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg1, e, self.PASS): self.raise_exception = e # Set back to original policy try: msg1 = f'{msg} policy_id({policy_id_orig}):' amdsmi.amdsmi_set_xgmi_plpd(gpu, policy_id_orig) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg1, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_status_code_to_string(self): self._print_func_name('') if self.TODO_SKIP_FAIL: self.skipTest("Skipping test_status_code_to_string as it fails (Unhashable type).") for error_num, error_name in error_map.items(): msg = f'{error_name}({error_num}):' try: ret = amdsmi.amdsmi_status_code_to_string(ctypes.c_uint32(int(error_num))) self._print(f'{msg} {ret}') except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_topo_get_link_type(self): self._print_func_name('') for i, gpu_i in enumerate(self.processors): for j, gpu_j in enumerate(self.processors): msg = f'gpu({i},{j}):' try: ret = amdsmi.amdsmi_topo_get_link_type(gpu_i, gpu_j) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_topo_get_link_weight(self): self._print_func_name('') for i, gpu_i in enumerate(self.processors): for j, gpu_j in enumerate(self.processors): msg = f'gpu({i},{j}):' try: ret = amdsmi.amdsmi_topo_get_link_weight(gpu_i, gpu_j) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_topo_get_numa_node_number(self): self._print_func_name('') for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_topo_get_numa_node_number(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_topo_get_p2p_status(self): self._print_func_name('') if self.TODO_SKIP_FAIL: self.skipTest("Skipping test_topo_get_p2p_status as it fails (Inval parameters).") for i, gpu_i in enumerate(self.processors): for j, gpu_j in enumerate(self.processors): msg = f'gpu({i},{j}):' try: ret = amdsmi.amdsmi_topo_get_p2p_status(gpu_i, gpu_j) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return def test_get_gpu_busy_percent(self): self._print_func_name('') for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: ret = amdsmi.amdsmi_get_gpu_busy_percent(gpu) self._print(msg, ret) except amdsmi.AmdSmiLibraryException as e: if self._check_ret(msg, e, self.PASS): self.raise_exception = e if self.raise_exception: raise self.raise_exception return if __name__ == '__main__': unittest.main()