Files
rocm-systems/tests/python_unittest/unit_tests.py
T
2025-10-27 14:43:31 -05:00

3440 строки
139 KiB
Python
Исполняемый файл

#!/usr/bin/env python3
#
# Copyright (C) Advanced Micro Devices. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of
# this software and associated documentation files (the "Software"), to deal in
# the Software without restriction, including without limitation the rights to
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
# the Software, and to permit persons to whom the Software is furnished to do so,
# subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
import ctypes
import inspect
import json
import os
import sys
import unittest
# Location of the amdsmi Python package; AMDSMI_PATH overrides the default.
amdsmi_path = os.environ.get("AMDSMI_PATH", "/opt/rocm/share/amd_smi")
if os.path.exists(amdsmi_path):
    # Extend the module search path so the import below can find the package.
    sys.path.append(amdsmi_path)
else:
    raise FileNotFoundError(f"AMDSMI_PATH '{amdsmi_path}' does not exist. Please set the correct path in your environment.")
try:
    import amdsmi
except ImportError as exc:
    # Re-raise with the searched path while keeping the original cause attached.
    raise ImportError(f'Could not import {amdsmi_path}') from exc
# (decimal code string, status name) pairs; _check_ret reports a matching
# status as SKIPPED instead of FAILED.
not_supported_error_codes = [
    ('2', 'AMDSMI_STATUS_NOT_SUPPORTED'),
    ('3', 'AMDSMI_STATUS_NOT_YET_IMPLEMENTED'),
    ('49', 'AMDSMI_STATUS_NO_HSMP_MSG_SUP'),
]
# Maps a status code, rendered as a string, to its AMDSMI_STATUS_* name.
# _check_ret looks codes up as str(int(code)), i.e. in decimal, so the two
# sentinel codes are listed in decimal as well as under their original hex
# spelling (kept for backward compatibility).
error_map = {
    '0': 'AMDSMI_STATUS_SUCCESS',
    '1': 'AMDSMI_STATUS_INVAL',
    '2': 'AMDSMI_STATUS_NOT_SUPPORTED',
    '3': 'AMDSMI_STATUS_NOT_YET_IMPLEMENTED',
    '4': 'AMDSMI_STATUS_FAIL_LOAD_MODULE',
    '5': 'AMDSMI_STATUS_FAIL_LOAD_SYMBOL',
    '6': 'AMDSMI_STATUS_DRM_ERROR',
    '7': 'AMDSMI_STATUS_API_FAILED',
    '8': 'AMDSMI_STATUS_TIMEOUT',
    '9': 'AMDSMI_STATUS_RETRY',
    '10': 'AMDSMI_STATUS_NO_PERM',
    '11': 'AMDSMI_STATUS_INTERRUPT',
    '12': 'AMDSMI_STATUS_IO',
    '13': 'AMDSMI_STATUS_ADDRESS_FAULT',
    '14': 'AMDSMI_STATUS_FILE_ERROR',
    '15': 'AMDSMI_STATUS_OUT_OF_RESOURCES',
    '16': 'AMDSMI_STATUS_INTERNAL_EXCEPTION',
    '17': 'AMDSMI_STATUS_INPUT_OUT_OF_BOUNDS',
    '18': 'AMDSMI_STATUS_INIT_ERROR',
    '19': 'AMDSMI_STATUS_REFCOUNT_OVERFLOW',
    # NOTE(review): the status_types table lists DIRECTORY_NOT_FOUND; the value
    # 20 is taken from amdsmi_status_t in amdsmi.h - confirm against the header.
    '20': 'AMDSMI_STATUS_DIRECTORY_NOT_FOUND',
    '30': 'AMDSMI_STATUS_BUSY',
    '31': 'AMDSMI_STATUS_NOT_FOUND',
    '32': 'AMDSMI_STATUS_NOT_INIT',
    '33': 'AMDSMI_STATUS_NO_SLOT',
    '34': 'AMDSMI_STATUS_DRIVER_NOT_LOADED',
    '39': 'AMDSMI_STATUS_MORE_DATA',
    '40': 'AMDSMI_STATUS_NO_DATA',
    '41': 'AMDSMI_STATUS_INSUFFICIENT_SIZE',
    '42': 'AMDSMI_STATUS_UNEXPECTED_SIZE',
    '43': 'AMDSMI_STATUS_UNEXPECTED_DATA',
    '44': 'AMDSMI_STATUS_NON_AMD_CPU',
    '45': 'AMDSMI_STATUS_NO_ENERGY_DRV',
    '46': 'AMDSMI_STATUS_NO_MSR_DRV',
    '47': 'AMDSMI_STATUS_NO_HSMP_DRV',
    '48': 'AMDSMI_STATUS_NO_HSMP_SUP',
    '49': 'AMDSMI_STATUS_NO_HSMP_MSG_SUP',
    '50': 'AMDSMI_STATUS_HSMP_TIMEOUT',
    '51': 'AMDSMI_STATUS_NO_DRV',
    '52': 'AMDSMI_STATUS_FILE_NOT_FOUND',
    '53': 'AMDSMI_STATUS_ARG_PTR_NULL',
    '54': 'AMDSMI_STATUS_AMDGPU_RESTART_ERR',
    '55': 'AMDSMI_STATUS_SETTING_UNAVAILABLE',
    '56': 'AMDSMI_STATUS_CORRUPTED_EEPROM',
    '0xFFFFFFFE': 'AMDSMI_STATUS_MAP_ERROR',
    '0xFFFFFFFF': 'AMDSMI_STATUS_UNKNOWN_ERROR',
    # Decimal spellings of the two sentinels above, reachable via str(int(code)).
    str(0xFFFFFFFE): 'AMDSMI_STATUS_MAP_ERROR',
    str(0xFFFFFFFF): 'AMDSMI_STATUS_UNKNOWN_ERROR',
}
# Verbosity derived from the command line: 0 = quiet, 1 = default, 2 = verbose.
# The quiet flags take precedence over the verbose flags.
if '-q' in sys.argv or '--quiet' in sys.argv:
    verbose = 0
elif '-v' in sys.argv or '--verbose' in sys.argv:
    verbose = 2
else:
    verbose = 1
# Set to True by TestAmdSmiPython.__init__ once the asic/board info was printed.
has_info_printed = False
class TestAmdSmiPythonBDF(unittest.TestCase):
    """Tests for the BDF parsing/formatting helpers and _check_res of amdsmi_interface."""

    # BDF string -> list expected from _parse_bdf. Going by the sample values
    # the list is [domain, bus, device, function]; short strings get domain 0.
    valid_bdfs = {
        "00:00.0": [0, 0, 0, 0],
        "01:01.1": [0, 1, 1, 1],
        "FF:1F.7": [0, 255, 31, 7],
        "FF:00.7": [0, 255, 0, 7],
        "11:01.2": [0, 17, 1, 2],
        "11:0a.2": [0, 17, 10, 2],
        "0000:FF:1F.7": [0, 255, 31, 7],
        "0001:ff:1F.7": [1, 255, 31, 7],
        "ffff:FF:1f.7": [65535, 255, 31, 7],
    }

    # Malformed BDF inputs; each one is expected to parse to None.
    invalid_bdfs = {
        bad_bdf: None
        for bad_bdf in (
            None,
            "",
            "00:00:0",
            "00.00:0",
            "00:00.Z",
            "00:0Z.0",
            "0Z:00.0",
            "Z00:00.0",
            "A00:00.0",
            "0A00:00.0",
            "00:00.07",
            "00:00.8",
            "00:00.10",
            "00:00.11",
            "00:00.-1",
            "00:00.*-1",
            "00:00.123",
            "00:20.0",
            "00:45.0",
            "00:200.0",
            "00:002.0",
            "100:00.0",
            "0100:00.0",
            "00100:00.0",
            "0101:00.0",
            "00001:00.0",
            "10001:00.0",
            "45:0.0",
            ".00:00.0",
            "00.00.0",
            "00.0.0",
            "0.00.0",
            "000.00.0",
            "00 00 0",
            " 00:00.0",
            "00:00.0 ",
            "0000:00.00.0",
            "000:00:00.0",
            "00:00:00.1",
            "0:00:00.1",
            "0000 00 00 0",
            "-1-1:00:00.0",
            "AAAA:00:AA.0",
            "*1*1:00:00.0",
            "0000:00:00.07",
            "0000:00:00.8",
            "0000:00:00.10",
            "0000:00:00.11",
            "0000:00:00.-1",
            "0000:00:00.*-1",
            "0000:00:00.123",
            "0000:00:20.0",
            "0000:00:45.0",
            "0000:00:200.0",
            "0000:00:002.0",
            "0000:100:00.0",
            "0000:0100:00.0",
            "0000:00100:00.0",
            "0000:0101:00.0",
            "0000:00001:00.0",
            "0000:10001:00.0",
            "0000:45:0.0",
            ".0000.00:00.0",
            "0000.00.0.0",
            " 0000:00:00.0",
            "0000:00:00.0 ",
        )
    }

    def test_parse_bdf(self):
        """Check _parse_bdf against every valid and every invalid sample."""
        cases = {**self.valid_bdfs, **self.invalid_bdfs}
        for bdf, expected in cases.items():
            result = amdsmi.amdsmi_interface._parse_bdf(bdf)
            failure_text = "Expected {} for bdf {}, but got {}".format(
                expected, bdf, result)
            self.assertEqual(result, expected, failure_text)

    @classmethod
    def _convert_bdf_to_long(cls, bdf):
        """Return the 12-character form of *bdf*, or None for any other length.

        A 7-character string gets the "0000:" prefix; a 12-character string is
        returned unchanged.
        """
        length = len(bdf)
        if length == 7:
            return "0000:" + bdf
        return bdf if length == 12 else None

    def test_format_bdf(self):
        """Round-trip each valid sample through _make_amdsmi_bdf_from_list and _format_bdf."""
        for bdf_string, bdf_list in self.valid_bdfs.items():
            # The dict value is the input here and the dict key the expected text.
            smi_bdf = amdsmi.amdsmi_interface._make_amdsmi_bdf_from_list(bdf_list)
            long_form = TestAmdSmiPythonBDF._convert_bdf_to_long(bdf_string)
            # The comparison is made against the lower-cased long form.
            expected = long_form.lower() if long_form else long_form
            result = amdsmi.amdsmi_interface._format_bdf(smi_bdf) if smi_bdf else "None"
            failure_text = "Expected {} for bdf {}, but got {}".format(
                expected, bdf_string, result)
            self.assertEqual(result, expected, failure_text)

    def test_check_res(self):
        """Check that _check_res raises the matching exception type and error code."""
        wrapper = amdsmi.amdsmi_wrapper
        check_res = amdsmi.amdsmi_interface._check_res

        # expect a retry status to raise AmdSmiRetryException
        with self.assertRaises(amdsmi.AmdSmiRetryException) as retry_test:
            check_res(wrapper.AMDSMI_STATUS_RETRY)
        # expect the exception to carry the AMDSMI_STATUS_RETRY error code
        self.assertEqual(retry_test.exception.get_error_code(),
                         wrapper.AMDSMI_STATUS_RETRY)

        # expect a timeout status to raise AmdSmiTimeoutException
        with self.assertRaises(amdsmi.AmdSmiTimeoutException) as timeout_test:
            check_res(wrapper.AMDSMI_STATUS_TIMEOUT)
        # expect the exception to carry the AMDSMI_STATUS_TIMEOUT error code
        self.assertEqual(timeout_test.exception.get_error_code(),
                         wrapper.AMDSMI_STATUS_TIMEOUT)

        # expect an invalid-args status to raise AmdSmiLibraryException
        with self.assertRaises(amdsmi.AmdSmiLibraryException) as inval_test:
            check_res(wrapper.AMDSMI_STATUS_INVAL)
        # expect the exception to carry the AMDSMI_STATUS_INVAL error code
        self.assertEqual(inval_test.exception.get_error_code(),
                         wrapper.AMDSMI_STATUS_INVAL)
class TestAmdSmiPython(unittest.TestCase):
def __init__(self, *args, **kwargs):
    """Create the test case and, once per run, print asic and board info.

    The info dump happens only while the module-level ``verbose`` is non-zero
    and ``has_info_printed`` is still False. It borrows setUp()/tearDown() to
    initialise and shut down the library around the queries.

    Raises:
        amdsmi.AmdSmiLibraryException: propagated unchanged when an info
            query fails; tearDown() still runs in that case.
    """
    global has_info_printed
    super().__init__(*args, **kwargs)
    if verbose and has_info_printed is False:
        # Flip the flag first so later instances never repeat the dump.
        has_info_printed = True
        self.setUp()
        # NOTE(review): _print only emits output at verbose == 2, so at
        # verbose == 1 the queries below run without printing anything.
        try:
            for i, gpu in enumerate(self.processors):
                # Print asic info
                msg = f'asic info(gpu={i})'
                ret = amdsmi.amdsmi_get_gpu_asic_info(gpu)
                self._print(msg, ret)
            for i, gpu in enumerate(self.processors):
                # Print board info
                msg = f'board info(gpu={i})'
                ret = amdsmi.amdsmi_get_gpu_board_info(gpu)
                self._print(msg, ret)
        finally:
            # Always undo setUp(), even when a query above raised.
            self.tearDown()
# Status names used as the "expected result" column of the tables below.
PASS = 'AMDSMI_STATUS_SUCCESS'
FAIL = 'AMDSMI_STATUS_INVAL'
# Upper bound on the processor-handle count that setUp() accepts.
max_num_physical_devices = (
    amdsmi.amdsmi_interface.AMDSMI_MAX_NUM_XCP
    * amdsmi.amdsmi_interface.AMDSMI_MAX_DEVICES
)
# Tests marked with either of these flags will be skipped
# and need to be implemented later.
TODO_SKIP_FAIL = True
TODO_SKIP_NOT_COMPLETE = True
# (label, AmdSmiStatus member, expected result) for each status; all expect PASS.
status_types = [
    ('SUCCESS', amdsmi.AmdSmiStatus.SUCCESS, PASS),
    ('INVAL', amdsmi.AmdSmiStatus.INVAL, PASS),
    ('NOT_SUPPORTED', amdsmi.AmdSmiStatus.NOT_SUPPORTED, PASS),
    ('NOT_YET_IMPLEMENTED', amdsmi.AmdSmiStatus.NOT_YET_IMPLEMENTED, PASS),
    ('FAIL_LOAD_MODULE', amdsmi.AmdSmiStatus.FAIL_LOAD_MODULE, PASS),
    ('FAIL_LOAD_SYMBOL', amdsmi.AmdSmiStatus.FAIL_LOAD_SYMBOL, PASS),
    ('DRM_ERROR', amdsmi.AmdSmiStatus.DRM_ERROR, PASS),
    ('API_FAILED', amdsmi.AmdSmiStatus.API_FAILED, PASS),
    ('TIMEOUT', amdsmi.AmdSmiStatus.TIMEOUT, PASS),
    ('RETRY', amdsmi.AmdSmiStatus.RETRY, PASS),
    ('NO_PERM', amdsmi.AmdSmiStatus.NO_PERM, PASS),
    ('INTERRUPT', amdsmi.AmdSmiStatus.INTERRUPT, PASS),
    ('IO', amdsmi.AmdSmiStatus.IO, PASS),
    ('ADDRESS_FAULT', amdsmi.AmdSmiStatus.ADDRESS_FAULT, PASS),
    ('FILE_ERROR', amdsmi.AmdSmiStatus.FILE_ERROR, PASS),
    ('OUT_OF_RESOURCES', amdsmi.AmdSmiStatus.OUT_OF_RESOURCES, PASS),
    ('INTERNAL_EXCEPTION', amdsmi.AmdSmiStatus.INTERNAL_EXCEPTION, PASS),
    ('INPUT_OUT_OF_BOUNDS', amdsmi.AmdSmiStatus.INPUT_OUT_OF_BOUNDS, PASS),
    ('INIT_ERROR', amdsmi.AmdSmiStatus.INIT_ERROR, PASS),
    ('REFCOUNT_OVERFLOW', amdsmi.AmdSmiStatus.REFCOUNT_OVERFLOW, PASS),
    ('DIRECTORY_NOT_FOUND', amdsmi.AmdSmiStatus.DIRECTORY_NOT_FOUND, PASS),
    ('BUSY', amdsmi.AmdSmiStatus.BUSY, PASS),
    ('NOT_FOUND', amdsmi.AmdSmiStatus.NOT_FOUND, PASS),
    ('NOT_INIT', amdsmi.AmdSmiStatus.NOT_INIT, PASS),
    ('NO_SLOT', amdsmi.AmdSmiStatus.NO_SLOT, PASS),
    ('DRIVER_NOT_LOADED', amdsmi.AmdSmiStatus.DRIVER_NOT_LOADED, PASS),
    ('MORE_DATA', amdsmi.AmdSmiStatus.MORE_DATA, PASS),
    ('NO_DATA', amdsmi.AmdSmiStatus.NO_DATA, PASS),
    ('INSUFFICIENT_SIZE', amdsmi.AmdSmiStatus.INSUFFICIENT_SIZE, PASS),
    ('UNEXPECTED_SIZE', amdsmi.AmdSmiStatus.UNEXPECTED_SIZE, PASS),
    ('UNEXPECTED_DATA', amdsmi.AmdSmiStatus.UNEXPECTED_DATA, PASS),
    ('NON_AMD_CPU', amdsmi.AmdSmiStatus.NON_AMD_CPU, PASS),
    ('NO_ENERGY_DRV', amdsmi.AmdSmiStatus.NO_ENERGY_DRV, PASS),
    ('NO_MSR_DRV', amdsmi.AmdSmiStatus.NO_MSR_DRV, PASS),
    ('NO_HSMP_DRV', amdsmi.AmdSmiStatus.NO_HSMP_DRV, PASS),
    ('NO_HSMP_SUP', amdsmi.AmdSmiStatus.NO_HSMP_SUP, PASS),
    ('NO_HSMP_MSG_SUP', amdsmi.AmdSmiStatus.NO_HSMP_MSG_SUP, PASS),
    ('HSMP_TIMEOUT', amdsmi.AmdSmiStatus.HSMP_TIMEOUT, PASS),
    ('NO_DRV', amdsmi.AmdSmiStatus.NO_DRV, PASS),
    ('FILE_NOT_FOUND', amdsmi.AmdSmiStatus.FILE_NOT_FOUND, PASS),
    ('ARG_PTR_NULL', amdsmi.AmdSmiStatus.ARG_PTR_NULL, PASS),
    ('AMDGPU_RESTART_ERR', amdsmi.AmdSmiStatus.AMDGPU_RESTART_ERR, PASS),
    ('SETTING_UNAVAILABLE', amdsmi.AmdSmiStatus.SETTING_UNAVAILABLE, PASS),
    ('CORRUPTED_EEPROM', amdsmi.AmdSmiStatus.CORRUPTED_EEPROM, PASS),
    ('MAP_ERROR', amdsmi.AmdSmiStatus.MAP_ERROR, PASS),
    ('UNKNOWN_ERROR', amdsmi.AmdSmiStatus.UNKNOWN_ERROR, PASS),
]
# (label, AmdSmiClkType member, expected result). A list in the third slot
# means any of the listed status names is acceptable to _check_ret.
clk_types = [
    ('SYS', amdsmi.AmdSmiClkType.SYS, PASS),
    ('GFX', amdsmi.AmdSmiClkType.GFX, PASS),
    ('DF', amdsmi.AmdSmiClkType.DF, PASS),
    ('DCEF', amdsmi.AmdSmiClkType.DCEF, [PASS, FAIL]),
    ('SOC', amdsmi.AmdSmiClkType.SOC, PASS),
    ('MEM', amdsmi.AmdSmiClkType.MEM, PASS),
    ('PCIE', amdsmi.AmdSmiClkType.PCIE, [PASS, FAIL]),
    ('VCLK0', amdsmi.AmdSmiClkType.VCLK0, PASS),
    ('VCLK1', amdsmi.AmdSmiClkType.VCLK1, PASS),
    ('DCLK0', amdsmi.AmdSmiClkType.DCLK0, PASS),
    ('DCLK1', amdsmi.AmdSmiClkType.DCLK1, PASS),
]
# (label, AmdSmiClkLimitType member, expected result).
clk_limit_types = [
    ('MIN', amdsmi.AmdSmiClkLimitType.MIN, PASS),
    ('MAX', amdsmi.AmdSmiClkLimitType.MAX, PASS),
]
# (label, amdsmi_wrapper bandwidth encoding constant, expected result);
# iterated by test_get_cpu_current_io_bandwidth.
io_bw_encodings = [
    ('AGG_BW0', amdsmi.amdsmi_wrapper.AGG_BW0, PASS),
    ('RD_BW0', amdsmi.amdsmi_wrapper.RD_BW0, PASS),
    ('WR_BW0', amdsmi.amdsmi_wrapper.WR_BW0, PASS),
]
# (label, AmdSmiEventGroup member, expected result).
event_groups = [
    ('XGMI', amdsmi.AmdSmiEventGroup.XGMI, PASS),
    ('XGMI_DATA_OUT', amdsmi.AmdSmiEventGroup.XGMI_DATA_OUT, PASS),
    ('GRP_INVALID', amdsmi.AmdSmiEventGroup.GRP_INVALID, FAIL),
]
# (label, AmdSmiGpuBlock member, expected result); INVALID and RESERVED expect FAIL.
gpu_blocks = [
    ('INVALID', amdsmi.AmdSmiGpuBlock.INVALID, FAIL),
    ('UMC', amdsmi.AmdSmiGpuBlock.UMC, PASS),
    ('SDMA', amdsmi.AmdSmiGpuBlock.SDMA, PASS),
    ('GFX', amdsmi.AmdSmiGpuBlock.GFX, PASS),
    ('MMHUB', amdsmi.AmdSmiGpuBlock.MMHUB, PASS),
    ('ATHUB', amdsmi.AmdSmiGpuBlock.ATHUB, PASS),
    ('PCIE_BIF', amdsmi.AmdSmiGpuBlock.PCIE_BIF, PASS),
    ('HDP', amdsmi.AmdSmiGpuBlock.HDP, PASS),
    ('XGMI_WAFL', amdsmi.AmdSmiGpuBlock.XGMI_WAFL, PASS),
    ('DF', amdsmi.AmdSmiGpuBlock.DF, PASS),
    ('SMN', amdsmi.AmdSmiGpuBlock.SMN, PASS),
    ('SEM', amdsmi.AmdSmiGpuBlock.SEM, PASS),
    ('MP0', amdsmi.AmdSmiGpuBlock.MP0, PASS),
    ('MP1', amdsmi.AmdSmiGpuBlock.MP1, PASS),
    ('FUSE', amdsmi.AmdSmiGpuBlock.FUSE, PASS),
    ('MCA', amdsmi.AmdSmiGpuBlock.MCA, PASS),
    ('VCN', amdsmi.AmdSmiGpuBlock.VCN, PASS),
    ('JPEG', amdsmi.AmdSmiGpuBlock.JPEG, PASS),
    ('IH', amdsmi.AmdSmiGpuBlock.IH, PASS),
    ('MPIO', amdsmi.AmdSmiGpuBlock.MPIO, PASS),
    ('RESERVED', amdsmi.AmdSmiGpuBlock.RESERVED, FAIL),
]
# (label, AmdSmiMemoryType member, expected result).
memory_types = [
    ('VRAM', amdsmi.AmdSmiMemoryType.VRAM, PASS),
    ('VIS_VRAM', amdsmi.AmdSmiMemoryType.VIS_VRAM, PASS),
    ('GTT', amdsmi.AmdSmiMemoryType.GTT, PASS),
]
# (label, AmdSmiProcessorType member, expected result); only UNKNOWN expects FAIL.
processor_types = [
    ('UNKNOWN', amdsmi.AmdSmiProcessorType.UNKNOWN, FAIL),
    ('AMD_GPU', amdsmi.AmdSmiProcessorType.AMD_GPU, PASS),
    ('AMD_CPU', amdsmi.AmdSmiProcessorType.AMD_CPU, PASS),
    ('NON_AMD_GPU', amdsmi.AmdSmiProcessorType.NON_AMD_GPU, PASS),
    ('NON_AMD_CPU', amdsmi.AmdSmiProcessorType.NON_AMD_CPU, PASS),
    ('AMD_CPU_CORE', amdsmi.AmdSmiProcessorType.AMD_CPU_CORE, PASS),
    ('AMD_APU', amdsmi.AmdSmiProcessorType.AMD_APU, PASS),
]
# (label, AmdSmiRegType member, expected result).
reg_types = [
    ('XGMI', amdsmi.AmdSmiRegType.XGMI, PASS),
    ('WAFL', amdsmi.AmdSmiRegType.WAFL, PASS),
    ('PCIE', amdsmi.AmdSmiRegType.PCIE, PASS),
    ('USR', amdsmi.AmdSmiRegType.USR, PASS),
    ('USR1', amdsmi.AmdSmiRegType.USR1, PASS),
]
# (label, AmdSmiVoltageMetric member, expected result).
voltage_metrics = [
    ('CURRENT', amdsmi.AmdSmiVoltageMetric.CURRENT, PASS),
    ('MAX', amdsmi.AmdSmiVoltageMetric.MAX, PASS),
    ('MIN_CRIT', amdsmi.AmdSmiVoltageMetric.MIN_CRIT, PASS),
    ('MIN', amdsmi.AmdSmiVoltageMetric.MIN, PASS),
    ('MAX_CRIT', amdsmi.AmdSmiVoltageMetric.MAX_CRIT, PASS),
    ('AVERAGE', amdsmi.AmdSmiVoltageMetric.AVERAGE, PASS),
    ('LOWEST', amdsmi.AmdSmiVoltageMetric.LOWEST, PASS),
    ('HIGHEST', amdsmi.AmdSmiVoltageMetric.HIGHEST, PASS),
]
# (label, AmdSmiVoltageType member, expected result).
voltage_types = [
    ('VDDGFX', amdsmi.AmdSmiVoltageType.VDDGFX, PASS),
    ('INVALID', amdsmi.AmdSmiVoltageType.INVALID, FAIL),
]
# (label, AmdSmiLinkType member, expected result).
link_types = [
    ('AMDSMI_LINK_TYPE_INTERNAL', amdsmi.AmdSmiLinkType.AMDSMI_LINK_TYPE_INTERNAL, PASS),
    ('AMDSMI_LINK_TYPE_XGMI', amdsmi.AmdSmiLinkType.AMDSMI_LINK_TYPE_XGMI, PASS),
    ('AMDSMI_LINK_TYPE_PCIE', amdsmi.AmdSmiLinkType.AMDSMI_LINK_TYPE_PCIE, PASS),
    ('AMDSMI_LINK_TYPE_NOT_APPLICABLE', amdsmi.AmdSmiLinkType.AMDSMI_LINK_TYPE_NOT_APPLICABLE, FAIL),
    ('AMDSMI_LINK_TYPE_UNKNOWN', amdsmi.AmdSmiLinkType.AMDSMI_LINK_TYPE_UNKNOWN, FAIL),
]
# (label, AmdSmiTemperatureType member, expected result).
temperature_types = [
    ('EDGE', amdsmi.AmdSmiTemperatureType.EDGE, PASS),
    ('HOTSPOT', amdsmi.AmdSmiTemperatureType.HOTSPOT, PASS),
    ('JUNCTION', amdsmi.AmdSmiTemperatureType.JUNCTION, PASS),
    ('VRAM', amdsmi.AmdSmiTemperatureType.VRAM, PASS),
    ('HBM_0', amdsmi.AmdSmiTemperatureType.HBM_0, PASS),
    ('HBM_1', amdsmi.AmdSmiTemperatureType.HBM_1, PASS),
    ('HBM_2', amdsmi.AmdSmiTemperatureType.HBM_2, PASS),
    ('HBM_3', amdsmi.AmdSmiTemperatureType.HBM_3, PASS),
    ('PLX', amdsmi.AmdSmiTemperatureType.PLX, PASS),
]
# (label, AmdSmiTemperatureMetric member, expected result).
temperature_metrics = [
    ('CURRENT', amdsmi.AmdSmiTemperatureMetric.CURRENT, PASS),
    ('MAX', amdsmi.AmdSmiTemperatureMetric.MAX, PASS),
    ('MIN', amdsmi.AmdSmiTemperatureMetric.MIN, PASS),
    ('MAX_HYST', amdsmi.AmdSmiTemperatureMetric.MAX_HYST, PASS),
    ('MIN_HYST', amdsmi.AmdSmiTemperatureMetric.MIN_HYST, PASS),
    ('CRITICAL', amdsmi.AmdSmiTemperatureMetric.CRITICAL, PASS),
    ('CRITICAL_HYST', amdsmi.AmdSmiTemperatureMetric.CRITICAL_HYST, PASS),
    ('EMERGENCY', amdsmi.AmdSmiTemperatureMetric.EMERGENCY, PASS),
    ('EMERGENCY_HYST', amdsmi.AmdSmiTemperatureMetric.EMERGENCY_HYST, PASS),
    ('CRIT_MIN', amdsmi.AmdSmiTemperatureMetric.CRIT_MIN, PASS),
    ('CRIT_MIN_HYST', amdsmi.AmdSmiTemperatureMetric.CRIT_MIN_HYST, PASS),
    ('OFFSET', amdsmi.AmdSmiTemperatureMetric.OFFSET, PASS),
    ('LOWEST', amdsmi.AmdSmiTemperatureMetric.LOWEST, PASS),
    ('HIGHEST', amdsmi.AmdSmiTemperatureMetric.HIGHEST, PASS),
]
# (label, AmdSmiUtilizationCounterType member, expected result).
utilization_counter_types = [
    ('COARSE_GRAIN_GFX_ACTIVITY', amdsmi.AmdSmiUtilizationCounterType.COARSE_GRAIN_GFX_ACTIVITY, PASS),
    ('COARSE_GRAIN_MEM_ACTIVITY', amdsmi.AmdSmiUtilizationCounterType.COARSE_GRAIN_MEM_ACTIVITY, PASS),
    ('COARSE_DECODER_ACTIVITY', amdsmi.AmdSmiUtilizationCounterType.COARSE_DECODER_ACTIVITY, PASS),
    ('FINE_GRAIN_GFX_ACTIVITY', amdsmi.AmdSmiUtilizationCounterType.FINE_GRAIN_GFX_ACTIVITY, PASS),
    ('FINE_GRAIN_MEM_ACTIVITY', amdsmi.AmdSmiUtilizationCounterType.FINE_GRAIN_MEM_ACTIVITY, PASS),
    ('FINE_DECODER_ACTIVITY', amdsmi.AmdSmiUtilizationCounterType.FINE_DECODER_ACTIVITY, PASS),
    ('UTILIZATION_COUNTER_FIRST', amdsmi.AmdSmiUtilizationCounterType.UTILIZATION_COUNTER_FIRST, PASS),
    ('UTILIZATION_COUNTER_LAST', amdsmi.AmdSmiUtilizationCounterType.UTILIZATION_COUNTER_LAST, PASS),
]
# (label, AmdSmiEventType member, expected result).
event_types = [
    ('XGMI_0_NOP_TX', amdsmi.AmdSmiEventType.XGMI_0_NOP_TX, PASS),
    ('XGMI_0_REQUEST_TX', amdsmi.AmdSmiEventType.XGMI_0_REQUEST_TX, PASS),
    ('XGMI_0_RESPONSE_TX', amdsmi.AmdSmiEventType.XGMI_0_RESPONSE_TX, PASS),
    ('XGMI_0_BEATS_TX', amdsmi.AmdSmiEventType.XGMI_0_BEATS_TX, PASS),
    ('XGMI_1_NOP_TX', amdsmi.AmdSmiEventType.XGMI_1_NOP_TX, PASS),
    ('XGMI_1_REQUEST_TX', amdsmi.AmdSmiEventType.XGMI_1_REQUEST_TX, PASS),
    ('XGMI_1_RESPONSE_TX', amdsmi.AmdSmiEventType.XGMI_1_RESPONSE_TX, PASS),
    ('XGMI_1_BEATS_TX', amdsmi.AmdSmiEventType.XGMI_1_BEATS_TX, PASS),
    ('XGMI_DATA_OUT_0', amdsmi.AmdSmiEventType.XGMI_DATA_OUT_0, PASS),
    ('XGMI_DATA_OUT_1', amdsmi.AmdSmiEventType.XGMI_DATA_OUT_1, PASS),
    ('XGMI_DATA_OUT_2', amdsmi.AmdSmiEventType.XGMI_DATA_OUT_2, PASS),
    ('XGMI_DATA_OUT_3', amdsmi.AmdSmiEventType.XGMI_DATA_OUT_3, PASS),
    ('XGMI_DATA_OUT_4', amdsmi.AmdSmiEventType.XGMI_DATA_OUT_4, PASS),
    ('XGMI_DATA_OUT_5', amdsmi.AmdSmiEventType.XGMI_DATA_OUT_5, PASS),
]
# (label, AmdSmiCounterCommand member, expected result).
counter_commands = [
    ('CMD_START', amdsmi.AmdSmiCounterCommand.CMD_START, PASS),
    ('CMD_STOP', amdsmi.AmdSmiCounterCommand.CMD_STOP, PASS),
]
# (label, AmdSmiComputePartitionType member, expected result).
compute_partition_types = [
    ('SPX', amdsmi.AmdSmiComputePartitionType.SPX, PASS),
    ('DPX', amdsmi.AmdSmiComputePartitionType.DPX, PASS),
    ('TPX', amdsmi.AmdSmiComputePartitionType.TPX, PASS),
    ('QPX', amdsmi.AmdSmiComputePartitionType.QPX, PASS),
    ('CPX', amdsmi.AmdSmiComputePartitionType.CPX, PASS),
    ('INVALID', amdsmi.AmdSmiComputePartitionType.INVALID, FAIL),
]
# (label, AmdSmiMemoryPartitionType member, expected result).
memory_partition_types = [
    ('NPS1', amdsmi.AmdSmiMemoryPartitionType.NPS1, PASS),
    ('NPS2', amdsmi.AmdSmiMemoryPartitionType.NPS2, PASS),
    ('NPS4', amdsmi.AmdSmiMemoryPartitionType.NPS4, PASS),
    ('NPS8', amdsmi.AmdSmiMemoryPartitionType.NPS8, PASS),
    ('UNKNOWN', amdsmi.AmdSmiMemoryPartitionType.UNKNOWN, FAIL),
]
# (label, AmdSmiFreqInd member, expected result).
freq_inds = [
    ('MIN', amdsmi.AmdSmiFreqInd.MIN, PASS),
    ('MAX', amdsmi.AmdSmiFreqInd.MAX, PASS),
    ('INVALID', amdsmi.AmdSmiFreqInd.INVALID, FAIL),
]
# (label, AmdSmiDevPerfLevel member, expected result).
dev_perf_levels = [
    ('AUTO', amdsmi.AmdSmiDevPerfLevel.AUTO, PASS),
    ('LOW', amdsmi.AmdSmiDevPerfLevel.LOW, PASS),
    ('HIGH', amdsmi.AmdSmiDevPerfLevel.HIGH, PASS),
    ('MANUAL', amdsmi.AmdSmiDevPerfLevel.MANUAL, PASS),
    ('STABLE_STD', amdsmi.AmdSmiDevPerfLevel.STABLE_STD, PASS),
    ('STABLE_PEAK', amdsmi.AmdSmiDevPerfLevel.STABLE_PEAK, PASS),
    ('STABLE_MIN_MCLK', amdsmi.AmdSmiDevPerfLevel.STABLE_MIN_MCLK, PASS),
    ('STABLE_MIN_SCLK', amdsmi.AmdSmiDevPerfLevel.STABLE_MIN_SCLK, PASS),
    ('DETERMINISM', amdsmi.AmdSmiDevPerfLevel.DETERMINISM, PASS),
    ('UNKNOWN', amdsmi.AmdSmiDevPerfLevel.UNKNOWN, FAIL),
]
# (label, AmdSmiPowerProfilePresetMasks member, expected result).
power_profile_preset_masks = [
    ('CUSTOM_MASK', amdsmi.AmdSmiPowerProfilePresetMasks.CUSTOM_MASK, PASS),
    ('VIDEO_MASK', amdsmi.AmdSmiPowerProfilePresetMasks.VIDEO_MASK, PASS),
    ('POWER_SAVING_MASK', amdsmi.AmdSmiPowerProfilePresetMasks.POWER_SAVING_MASK, PASS),
    ('COMPUTE_MASK', amdsmi.AmdSmiPowerProfilePresetMasks.COMPUTE_MASK, PASS),
    ('VR_MASK', amdsmi.AmdSmiPowerProfilePresetMasks.VR_MASK, PASS),
    ('THREE_D_FULL_SCR_MASK', amdsmi.AmdSmiPowerProfilePresetMasks.THREE_D_FULL_SCR_MASK, PASS),
    ('BOOTUP_DEFAULT', amdsmi.AmdSmiPowerProfilePresetMasks.BOOTUP_DEFAULT, PASS),
]
def _print(self, msg, data=None):
    """Print *msg* and, optionally, *data*; does nothing unless verbose == 2.

    Args:
        msg: Heading text.
        data: Optional payload. None prints the heading only. A value that is
            an element of a not_supported_error_codes tuple is printed on the
            heading line. Dicts and lists are dumped as indented JSON; any
            other value goes through plain print().
    """
    if verbose != 2:
        return
    if data is None:
        print(msg, flush=True)
        return
    if any(data in value for value in not_supported_error_codes):
        print(f'{msg} {data}', flush=True)
        return
    # A status name stays on the heading line, so suppress the newline.
    is_status_name = isinstance(data, str) and data in error_map.values()
    if is_status_name:
        print(msg, end='')
    else:
        print(msg)
    if isinstance(data, (dict, list)):
        print(json.dumps(data, sort_keys=False, indent=4), flush=True)
    else:
        print(data)
def _print_func_name(self, msg):
    """Print *msg* followed by a '## <caller>()' heading when verbose == 2.

    Nothing is printed when the immediate caller is named '_callSetUp'.
    """
    if verbose != 2:
        return
    # Frame 0 is this method; frame 1 is whoever called it.
    caller = inspect.stack()[1].function
    if caller == '_callSetUp':
        return
    print(msg, flush=True)
    print(f'## {caller}()', flush=True)
def get_error_code(self, e):
    """Return the AMDSMI_STATUS_* name for the error code carried by *e*.

    error_map is keyed by strings, so a code that is not already one of its
    keys (for example an int) is normalised to its decimal string, the same
    way _check_ret does, before the lookup.

    Args:
        e: Object exposing get_error_code().

    Returns:
        The mapped status name, or 'UNKNOWN_ERROR' when the code is not in
        error_map (the same fallback _check_ret uses).
    """
    error_code = e.get_error_code()
    # Fast path: the code is already spelled exactly like a key.
    if error_code in error_map:
        return error_map[error_code]
    try:
        key = str(int(error_code))
    except (TypeError, ValueError):
        return 'UNKNOWN_ERROR'
    return error_map.get(key, 'UNKNOWN_ERROR')
def _check_ret(self, msg, _e, expected_code=None, printit=True):
    """Classify the error carried by *_e* against the expected status.

    Args:
        msg: Heading printed in front of the verdict line.
        _e: Exception exposing get_error_code().
        expected_code: Expected status name, or a list of acceptable names.
        printit: When False, the single-code verdict is not printed.

    Returns:
        True when the test should fail. False when the status matches
        *expected_code* or is listed in not_supported_error_codes.
    """
    error_code = str(int(_e.get_error_code()))
    error_code_name = error_map.get(error_code, 'UNKNOWN_ERROR')

    # Several acceptable outcomes: probe each one silently first.
    if isinstance(expected_code, list):
        for candidate in expected_code:
            if not self._check_ret(msg, _e, candidate, False):
                # Repeat the accepted check with printing on, so exactly one
                # verdict line is emitted.
                return self._check_ret(msg, _e, candidate)
        # No expected results found
        print(f'{msg}\nTest FAILED with expected results {expected_code} but received {error_code_name}', flush=True)
        return True

    # Single acceptable outcome.
    chatty = verbose == 2 and printit
    if any(error_code in value for value in not_supported_error_codes):
        if chatty:
            print(f'{msg}\nTest SKIPPED with result {error_code_name}', flush=True)
        return False
    if error_code_name == expected_code:
        if chatty:
            print(f'{msg}\nTest PASSED with expected result {expected_code}', flush=True)
        return False
    if chatty:
        print(f'{msg}\nTest FAILED with expected result {expected_code} but received {error_code_name}', flush=True)
    return True
def setUp(self):
    """Initialise amdsmi and collect the processor handles.

    Called before each test by the unittest framework. Asserts that the
    handle count is between 1 and max_num_physical_devices.
    """
    # Slot for an exception a test wants to raise after its loop finishes.
    self.raise_exception = None
    amdsmi.amdsmi_init()
    self.processors = amdsmi.amdsmi_get_processor_handles()
    self.assertGreaterEqual(len(self.processors), 1)
    self.assertLessEqual(len(self.processors), self.max_num_physical_devices)
def tearDown(self):
    """Shut the amdsmi library down; called after each test by unittest."""
    amdsmi.amdsmi_shut_down()
def test_clean_gpu_local_data(self):
    """Call amdsmi_clean_gpu_local_data on every handle in self.processors.

    An error flagged by _check_ret is remembered and raised after the loop.
    """
    self._print_func_name('')
    for i, gpu in enumerate(self.processors):
        msg = f'gpu({i}):'
        try:
            amdsmi.amdsmi_clean_gpu_local_data(gpu)
        except amdsmi.AmdSmiLibraryException as err:
            if self._check_ret(msg, err, self.PASS):
                self.raise_exception = err
        else:
            self._print(msg, '')
    if self.raise_exception:
        raise self.raise_exception
def test_cpu_apb_disable(self):
    """Call amdsmi_cpu_apb_disable with pstate 0 on every handle.

    An error flagged by _check_ret is remembered and raised after the loop.
    """
    self._print_func_name('')
    pstate = 0
    for i, gpu in enumerate(self.processors):
        msg = f'### amdsmi_cpu_apb_disable(gpu={i}, pstate={pstate}):'
        try:
            amdsmi.amdsmi_cpu_apb_disable(gpu, pstate)
        except amdsmi.AmdSmiLibraryException as err:
            if self._check_ret(msg, err, self.PASS):
                self.raise_exception = err
        else:
            self._print(msg, '')
    if self.raise_exception:
        raise self.raise_exception
def test_cpu_apb_enable(self):
    """Call amdsmi_cpu_apb_enable on every handle in self.processors.

    An error flagged by _check_ret is remembered and raised after the loop.
    """
    self._print_func_name('')
    for i, gpu in enumerate(self.processors):
        msg = f'gpu({i}):'
        try:
            amdsmi.amdsmi_cpu_apb_enable(gpu)
        except amdsmi.AmdSmiLibraryException as err:
            if self._check_ret(msg, err, self.PASS):
                self.raise_exception = err
        else:
            self._print(msg, '')
    if self.raise_exception:
        raise self.raise_exception
def test_first_online_core_on_cpu_socket(self):
    """Query amdsmi_first_online_core_on_cpu_socket for every handle.

    Skipped while TODO_SKIP_FAIL is set. Otherwise an error flagged by
    _check_ret is remembered and raised after the loop.
    """
    self._print_func_name('')
    if self.TODO_SKIP_FAIL:
        self.skipTest("Skipping test_first_online_core_on_cpu_socket as it fails (IO Error).")
    for i, gpu in enumerate(self.processors):
        msg = f'gpu({i}):'
        try:
            result = amdsmi.amdsmi_first_online_core_on_cpu_socket(gpu)
        except amdsmi.AmdSmiLibraryException as err:
            if self._check_ret(msg, err, self.PASS):
                self.raise_exception = err
        else:
            self._print(msg, result)
    if self.raise_exception:
        raise self.raise_exception
def test_get_clk_freq(self):
    """Query amdsmi_get_clk_freq for every handle and every entry of clk_types.

    Skipped while TODO_SKIP_FAIL is set. Each clock type carries its own
    expected result, which is handed to _check_ret.
    """
    self._print_func_name('')
    if self.TODO_SKIP_FAIL:
        self.skipTest("Skipping test_get_clk_freq as it fails (MI350X, AMDSMI_STATUS_UNEXPECTED_DATA).")
    for i, gpu in enumerate(self.processors):
        for clk_type_name, clk_type, clk_cond in self.clk_types:
            msg = f'gpu({i}): Clock Type({clk_type_name}):'
            try:
                result = amdsmi.amdsmi_get_clk_freq(gpu, clk_type)
            except amdsmi.AmdSmiLibraryException as err:
                if self._check_ret(msg, err, clk_cond):
                    self.raise_exception = err
            else:
                self._print(msg, result)
    if self.raise_exception:
        raise self.raise_exception
def test_get_clock_info(self):
    """Query amdsmi_get_clock_info for every handle and every entry of clk_types.

    Skipped while TODO_SKIP_FAIL is set. Each clock type carries its own
    expected result, which is handed to _check_ret.
    """
    self._print_func_name('')
    if self.TODO_SKIP_FAIL:
        self.skipTest("Skipping test_get_clock_info as it fails (MI350X, AMDSMI_STATUS_UNEXPECTED_DATA).")
    for i, gpu in enumerate(self.processors):
        for clk_type_name, clk_type, clk_cond in self.clk_types:
            msg = f'### test amdsmi_get_clock_info(gpu={i}, Clock Type={clk_type_name})'
            try:
                result = amdsmi.amdsmi_get_clock_info(gpu, clk_type)
            except amdsmi.AmdSmiLibraryException as err:
                if self._check_ret(msg, err, clk_cond):
                    self.raise_exception = err
            else:
                self._print(msg, result)
    if self.raise_exception:
        raise self.raise_exception
def test_get_cpu_cclk_limit(self):
    """Query amdsmi_get_cpu_cclk_limit for every handle in self.processors.

    An error flagged by _check_ret is remembered and raised after the loop.
    """
    self._print_func_name('')
    for i, gpu in enumerate(self.processors):
        msg = f'gpu({i}):'
        try:
            result = amdsmi.amdsmi_get_cpu_cclk_limit(gpu)
        except amdsmi.AmdSmiLibraryException as err:
            if self._check_ret(msg, err, self.PASS):
                self.raise_exception = err
        else:
            self._print(msg, result)
    if self.raise_exception:
        raise self.raise_exception
def test_get_cpu_core_current_freq_limit(self):
    """Query amdsmi_get_cpu_core_current_freq_limit for every handle.

    An error flagged by _check_ret is remembered and raised after the loop.
    """
    self._print_func_name('')
    for i, gpu in enumerate(self.processors):
        msg = f'gpu({i}):'
        try:
            result = amdsmi.amdsmi_get_cpu_core_current_freq_limit(gpu)
        except amdsmi.AmdSmiLibraryException as err:
            if self._check_ret(msg, err, self.PASS):
                self.raise_exception = err
        else:
            self._print(msg, result)
    if self.raise_exception:
        raise self.raise_exception
def test_get_cpu_core_energy(self):
    """Query amdsmi_get_cpu_core_energy for every handle.

    Skipped while TODO_SKIP_FAIL is set. Otherwise an error flagged by
    _check_ret is remembered and raised after the loop.
    """
    self._print_func_name('')
    if self.TODO_SKIP_FAIL:
        self.skipTest("Skipping test_get_cpu_core_energy as it fails (IO Error).")
    for i, gpu in enumerate(self.processors):
        msg = f'gpu({i}):'
        try:
            result = amdsmi.amdsmi_get_cpu_core_energy(gpu)
        except amdsmi.AmdSmiLibraryException as err:
            if self._check_ret(msg, err, self.PASS):
                self.raise_exception = err
        else:
            self._print(msg, result)
    if self.raise_exception:
        raise self.raise_exception
def test_get_cpu_current_io_bandwidth(self):
    """Query amdsmi_get_cpu_current_io_bandwidth per handle and per io_bw_encodings entry.

    The encoding constant and its label are both passed to the API. Each
    entry's expected result is handed to _check_ret.
    """
    self._print_func_name('')
    for i, gpu in enumerate(self.processors):
        for encoding_name, encoding, encoding_cond in self.io_bw_encodings:
            msg = f'gpu({i}): encodeing({encoding_name}):'
            try:
                result = amdsmi.amdsmi_get_cpu_current_io_bandwidth(gpu, encoding, encoding_name)
            except amdsmi.AmdSmiLibraryException as err:
                if self._check_ret(msg, err, encoding_cond):
                    self.raise_exception = err
            else:
                self._print(msg, result)
    if self.raise_exception:
        raise self.raise_exception
def test_get_cpu_ddr_bw(self):
    """Query amdsmi_get_cpu_ddr_bw for every handle in self.processors.

    An error flagged by _check_ret is remembered and raised after the loop.
    """
    self._print_func_name('')
    for i, gpu in enumerate(self.processors):
        msg = f'gpu({i}):'
        try:
            result = amdsmi.amdsmi_get_cpu_ddr_bw(gpu)
        except amdsmi.AmdSmiLibraryException as err:
            if self._check_ret(msg, err, self.PASS):
                self.raise_exception = err
        else:
            self._print(msg, result)
    if self.raise_exception:
        raise self.raise_exception
def test_get_cpu_dimm_power_consumption(self):
    """Query amdsmi_get_cpu_dimm_power_consumption at DIMM address 0 for every handle.

    An error flagged by _check_ret is remembered and raised after the loop.
    """
    self._print_func_name('')
    # TODO Find better way to get dimm_addr
    dimm_addr = 0
    for i, gpu in enumerate(self.processors):
        msg = f'gpu({i}):'
        try:
            result = amdsmi.amdsmi_get_cpu_dimm_power_consumption(gpu, dimm_addr)
        except amdsmi.AmdSmiLibraryException as err:
            if self._check_ret(msg, err, self.PASS):
                self.raise_exception = err
        else:
            self._print(msg, result)
    if self.raise_exception:
        raise self.raise_exception
def test_get_cpu_dimm_temp_range_and_refresh_rate(self):
    """Query amdsmi_get_cpu_dimm_temp_range_and_refresh_rate at DIMM address 0.

    Skipped while TODO_SKIP_FAIL is set. Otherwise an error flagged by
    _check_ret is remembered and raised after the loop.
    """
    self._print_func_name('')
    if self.TODO_SKIP_FAIL:
        self.skipTest("Skipping test_get_cpu_dimm_temp_range_and_refresh_rate as it fails.")
    # TODO Find better way to get dimm_addr
    dimm_addr = 0
    for i, gpu in enumerate(self.processors):
        msg = f'gpu({i}):'
        try:
            result = amdsmi.amdsmi_get_cpu_dimm_temp_range_and_refresh_rate(gpu, dimm_addr)
        except amdsmi.AmdSmiLibraryException as err:
            if self._check_ret(msg, err, self.PASS):
                self.raise_exception = err
        else:
            self._print(msg, result)
    if self.raise_exception:
        raise self.raise_exception
def test_get_cpu_dimm_thermal_sensor(self):
    """Query amdsmi_get_cpu_dimm_thermal_sensor at DIMM address 0 for every handle.

    Skipped while TODO_SKIP_FAIL is set. Otherwise an error flagged by
    _check_ret is remembered and raised after the loop.
    """
    self._print_func_name('')
    if self.TODO_SKIP_FAIL:
        self.skipTest("Skipping test_get_cpu_dimm_thermal_sensor as it fails.")
    # TODO Find better way to get dimm_addr
    dimm_addr = 0
    for i, gpu in enumerate(self.processors):
        msg = f'gpu({i}):'
        try:
            result = amdsmi.amdsmi_get_cpu_dimm_thermal_sensor(gpu, dimm_addr)
        except amdsmi.AmdSmiLibraryException as err:
            if self._check_ret(msg, err, self.PASS):
                self.raise_exception = err
        else:
            self._print(msg, result)
    if self.raise_exception:
        raise self.raise_exception
def test_get_cpu_family(self):
    """Query amdsmi_get_cpu_family once (it takes no handle).

    Skipped while TODO_SKIP_FAIL is set. An error flagged by _check_ret is
    raised after the call.
    """
    self._print_func_name('')
    if self.TODO_SKIP_FAIL:
        self.skipTest("Skipping test_get_cpu_family as it fails (IO Error).")
    msg = ''
    try:
        result = amdsmi.amdsmi_get_cpu_family()
    except amdsmi.AmdSmiLibraryException as err:
        if self._check_ret(msg, err, self.PASS):
            self.raise_exception = err
    else:
        self._print(msg, result)
    if self.raise_exception:
        raise self.raise_exception
def test_get_cpu_fclk_mclk(self):
    """Query amdsmi_get_cpu_fclk_mclk for every handle in self.processors.

    An error flagged by _check_ret is remembered and raised after the loop.
    """
    self._print_func_name('')
    for i, gpu in enumerate(self.processors):
        msg = f'gpu({i}):'
        try:
            result = amdsmi.amdsmi_get_cpu_fclk_mclk(gpu)
        except amdsmi.AmdSmiLibraryException as err:
            if self._check_ret(msg, err, self.PASS):
                self.raise_exception = err
        else:
            self._print(msg, result)
    if self.raise_exception:
        raise self.raise_exception
def test_get_cpu_handles(self):
    """Query amdsmi_get_cpu_handles once (it takes no handle).

    An error flagged by _check_ret is raised after the call.
    """
    self._print_func_name('')
    msg = ''
    try:
        result = amdsmi.amdsmi_get_cpu_handles()
    except amdsmi.AmdSmiLibraryException as err:
        if self._check_ret(msg, err, self.PASS):
            self.raise_exception = err
    else:
        self._print(msg, result)
    if self.raise_exception:
        raise self.raise_exception
def test_get_cpu_hsmp_driver_version(self):
    """Query amdsmi_get_cpu_hsmp_driver_version for every handle.

    Skipped while TODO_SKIP_FAIL is set. Otherwise an error flagged by
    _check_ret is remembered and raised after the loop.
    """
    self._print_func_name('')
    if self.TODO_SKIP_FAIL:
        self.skipTest("Skipping test_get_cpu_hsmp_driver_version as it fails (IO Error).")
    for i, gpu in enumerate(self.processors):
        msg = f'gpu({i}):'
        try:
            result = amdsmi.amdsmi_get_cpu_hsmp_driver_version(gpu)
        except amdsmi.AmdSmiLibraryException as err:
            if self._check_ret(msg, err, self.PASS):
                self.raise_exception = err
        else:
            self._print(msg, result)
    if self.raise_exception:
        raise self.raise_exception
def test_get_cpu_hsmp_proto_ver(self):
    """Query amdsmi_get_cpu_hsmp_proto_ver for every handle.

    Skipped while TODO_SKIP_FAIL is set. Otherwise an error flagged by
    _check_ret is remembered and raised after the loop.
    """
    self._print_func_name('')
    if self.TODO_SKIP_FAIL:
        self.skipTest("Skipping test_get_cpu_hsmp_proto_ver as it fails (IO Error).")
    for i, gpu in enumerate(self.processors):
        msg = f'gpu({i}):'
        try:
            result = amdsmi.amdsmi_get_cpu_hsmp_proto_ver(gpu)
        except amdsmi.AmdSmiLibraryException as err:
            if self._check_ret(msg, err, self.PASS):
                self.raise_exception = err
        else:
            self._print(msg, result)
    if self.raise_exception:
        raise self.raise_exception
def test_get_cpu_model(self):
    """Query amdsmi_get_cpu_model once (it takes no handle).

    Skipped while TODO_SKIP_FAIL is set. An error flagged by _check_ret is
    raised after the call.
    """
    self._print_func_name('')
    if self.TODO_SKIP_FAIL:
        self.skipTest("Skipping test_get_cpu_model as it fails (IO Error).")
    msg = ''
    try:
        result = amdsmi.amdsmi_get_cpu_model()
    except amdsmi.AmdSmiLibraryException as err:
        if self._check_ret(msg, err, self.PASS):
            self.raise_exception = err
    else:
        self._print(msg, result)
    if self.raise_exception:
        raise self.raise_exception
def test_get_cpu_prochot_status(self):
    """Query amdsmi_get_cpu_prochot_status for every handle in self.processors.

    An error flagged by _check_ret is remembered and raised after the loop.
    """
    self._print_func_name('')
    for i, gpu in enumerate(self.processors):
        msg = f'gpu({i}):'
        try:
            result = amdsmi.amdsmi_get_cpu_prochot_status(gpu)
        except amdsmi.AmdSmiLibraryException as err:
            if self._check_ret(msg, err, self.PASS):
                self.raise_exception = err
        else:
            self._print(msg, result)
    if self.raise_exception:
        raise self.raise_exception
def test_get_cpu_pwr_svi_telemetry_all_rails(self):
    # Runs amdsmi.amdsmi_get_cpu_pwr_svi_telemetry_all_rails for each handle
    # in self.processors and prints every successful result via self._print.
    # NOTE(review): labels say 'gpu' although this is a CPU query; the handle
    # type held in self.processors is not visible here - confirm.
    self._print_func_name('')
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}):'
        try:
            result = amdsmi.amdsmi_get_cpu_pwr_svi_telemetry_all_rails(handle)
            self._print(label, result)
        except amdsmi.AmdSmiLibraryException as err:
            # Keep the exception only when _check_ret reports a truthy value.
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    # Re-raise whatever is stored once every handle has been tried.
    pending = self.raise_exception
    if pending:
        raise pending
def test_get_cpu_smu_fw_version(self):
    # Runs amdsmi.amdsmi_get_cpu_smu_fw_version for each handle in
    # self.processors and prints every successful result via self._print.
    # NOTE(review): labels say 'gpu' although this is a CPU query; the handle
    # type held in self.processors is not visible here - confirm.
    self._print_func_name('')
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}):'
        try:
            result = amdsmi.amdsmi_get_cpu_smu_fw_version(handle)
            self._print(label, result)
        except amdsmi.AmdSmiLibraryException as err:
            # Keep the exception only when _check_ret reports a truthy value.
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    # Re-raise whatever is stored once every handle has been tried.
    pending = self.raise_exception
    if pending:
        raise pending
def test_get_cpu_socket_c0_residency(self):
    # Runs amdsmi.amdsmi_get_cpu_socket_c0_residency for each handle in
    # self.processors and prints every successful result via self._print.
    # NOTE(review): labels say 'gpu' although this is a CPU query; the handle
    # type held in self.processors is not visible here - confirm.
    self._print_func_name('')
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}):'
        try:
            result = amdsmi.amdsmi_get_cpu_socket_c0_residency(handle)
            self._print(label, result)
        except amdsmi.AmdSmiLibraryException as err:
            # Keep the exception only when _check_ret reports a truthy value.
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    # Re-raise whatever is stored once every handle has been tried.
    pending = self.raise_exception
    if pending:
        raise pending
def test_get_cpu_socket_current_active_freq_limit(self):
    # Runs amdsmi.amdsmi_get_cpu_socket_current_active_freq_limit for each
    # handle in self.processors and prints every successful result.
    # NOTE(review): labels say 'gpu' although this is a CPU query; the handle
    # type held in self.processors is not visible here - confirm.
    self._print_func_name('')
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}):'
        try:
            result = amdsmi.amdsmi_get_cpu_socket_current_active_freq_limit(handle)
            self._print(label, result)
        except amdsmi.AmdSmiLibraryException as err:
            # Keep the exception only when _check_ret reports a truthy value.
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    # Re-raise whatever is stored once every handle has been tried.
    pending = self.raise_exception
    if pending:
        raise pending
def test_get_cpu_socket_energy(self):
    # Runs amdsmi.amdsmi_get_cpu_socket_energy for each handle in
    # self.processors; skipped up front when self.TODO_SKIP_FAIL is truthy.
    # NOTE(review): labels say 'gpu' although this is a CPU query; the handle
    # type held in self.processors is not visible here - confirm.
    self._print_func_name('')
    if self.TODO_SKIP_FAIL:
        self.skipTest("Skipping test_get_cpu_socket_energy as it fails (IO Error).")
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}):'
        try:
            result = amdsmi.amdsmi_get_cpu_socket_energy(handle)
            self._print(label, result)
        except amdsmi.AmdSmiLibraryException as err:
            # Keep the exception only when _check_ret reports a truthy value.
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    # Re-raise whatever is stored once every handle has been tried.
    pending = self.raise_exception
    if pending:
        raise pending
def test_get_cpu_socket_freq_range(self):
    # Runs amdsmi.amdsmi_get_cpu_socket_freq_range for each handle in
    # self.processors and prints every successful result via self._print.
    # NOTE(review): labels say 'gpu' although this is a CPU query; the handle
    # type held in self.processors is not visible here - confirm.
    self._print_func_name('')
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}):'
        try:
            result = amdsmi.amdsmi_get_cpu_socket_freq_range(handle)
            self._print(label, result)
        except amdsmi.AmdSmiLibraryException as err:
            # Keep the exception only when _check_ret reports a truthy value.
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    # Re-raise whatever is stored once every handle has been tried.
    pending = self.raise_exception
    if pending:
        raise pending
def test_get_cpu_socket_lclk_dpm_level(self):
    # Runs amdsmi.amdsmi_get_cpu_socket_lclk_dpm_level(handle, nbio_id) for
    # each handle in self.processors, always with nbio_id fixed at 0.
    # NOTE(review): labels say 'gpu' although this is a CPU query; the handle
    # type held in self.processors is not visible here - confirm.
    self._print_func_name('')
    nbio_id = 0
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}): nbio_id({nbio_id}):'
        try:
            result = amdsmi.amdsmi_get_cpu_socket_lclk_dpm_level(handle, nbio_id)
            self._print(label, result)
        except amdsmi.AmdSmiLibraryException as err:
            # Keep the exception only when _check_ret reports a truthy value.
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    # Re-raise whatever is stored once every handle has been tried.
    pending = self.raise_exception
    if pending:
        raise pending
def test_get_cpu_socket_power(self):
    # Runs amdsmi.amdsmi_get_cpu_socket_power for each handle in
    # self.processors and prints every successful result via self._print.
    # NOTE(review): labels say 'gpu' although this is a CPU query; the handle
    # type held in self.processors is not visible here - confirm.
    self._print_func_name('')
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}):'
        try:
            result = amdsmi.amdsmi_get_cpu_socket_power(handle)
            self._print(label, result)
        except amdsmi.AmdSmiLibraryException as err:
            # Keep the exception only when _check_ret reports a truthy value.
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    # Re-raise whatever is stored once every handle has been tried.
    pending = self.raise_exception
    if pending:
        raise pending
def test_get_cpu_socket_power_cap_max(self):
    # Runs amdsmi.amdsmi_get_cpu_socket_power_cap_max for each handle in
    # self.processors and prints every successful result via self._print.
    # NOTE(review): labels say 'gpu' although this is a CPU query; the handle
    # type held in self.processors is not visible here - confirm.
    self._print_func_name('')
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}):'
        try:
            result = amdsmi.amdsmi_get_cpu_socket_power_cap_max(handle)
            self._print(label, result)
        except amdsmi.AmdSmiLibraryException as err:
            # Keep the exception only when _check_ret reports a truthy value.
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    # Re-raise whatever is stored once every handle has been tried.
    pending = self.raise_exception
    if pending:
        raise pending
def test_get_cpu_socket_temperature(self):
    # Runs amdsmi.amdsmi_get_cpu_socket_temperature for each handle in
    # self.processors and prints every successful result via self._print.
    # NOTE(review): labels say 'gpu' although this is a CPU query; the handle
    # type held in self.processors is not visible here - confirm.
    self._print_func_name('')
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}):'
        try:
            result = amdsmi.amdsmi_get_cpu_socket_temperature(handle)
            self._print(label, result)
        except amdsmi.AmdSmiLibraryException as err:
            # Keep the exception only when _check_ret reports a truthy value.
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    # Re-raise whatever is stored once every handle has been tried.
    pending = self.raise_exception
    if pending:
        raise pending
def test_get_energy_count(self):
    # Runs amdsmi.amdsmi_get_energy_count for each handle in self.processors;
    # skipped up front (unittest skipTest) when self.TODO_SKIP_FAIL is truthy.
    self._print_func_name('')
    if self.TODO_SKIP_FAIL:
        self.skipTest("Skipping test_get_energy_count as it fails (MI350X, AMDSMI_STATUS_UNEXPECTED_DATA).")
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}):'
        try:
            result = amdsmi.amdsmi_get_energy_count(handle)
            self._print(label, result)
        except amdsmi.AmdSmiLibraryException as err:
            # Keep the exception only when _check_ret reports a truthy value.
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    # Re-raise whatever is stored once every handle has been tried.
    pending = self.raise_exception
    if pending:
        raise pending
def test_get_esmi_err_msg(self):
    # Runs amdsmi.amdsmi_get_esmi_err_msg for every (name, value, condition)
    # triple in self.status_types; the triple's condition is what gets handed
    # to self._check_ret when the call raises.
    # Skipped up front when self.TODO_SKIP_FAIL is truthy.
    self._print_func_name('')
    if self.TODO_SKIP_FAIL:
        self.skipTest("Skipping test_get_esmi_err_msg as it fails (Unknown Error).")
    for status_name, status_value, expected in self.status_types:
        label = f'status(AMDSMI_STATUS_{status_name}):'
        try:
            result = amdsmi.amdsmi_get_esmi_err_msg(status_value)
            self._print(label, result)
        except amdsmi.AmdSmiLibraryException as err:
            # Keep the exception only when _check_ret reports a truthy value.
            if self._check_ret(label, err, expected):
                self.raise_exception = err
    # Re-raise whatever is stored once every status type has been tried.
    pending = self.raise_exception
    if pending:
        raise pending
def test_get_fw_info(self):
    # Runs amdsmi.amdsmi_get_fw_info for each handle in self.processors and
    # prints every successful result via self._print.
    self._print_func_name('')
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}):'
        try:
            result = amdsmi.amdsmi_get_fw_info(handle)
            self._print(label, result)
        except amdsmi.AmdSmiLibraryException as err:
            # Keep the exception only when _check_ret reports a truthy value.
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    # Re-raise whatever is stored once every handle has been tried.
    pending = self.raise_exception
    if pending:
        raise pending
def test_get_gpu_accelerator_partition_profile(self):
    # Runs amdsmi.amdsmi_get_gpu_accelerator_partition_profile for each handle
    # in self.processors and prints every successful result via self._print.
    self._print_func_name('')
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}):'
        try:
            result = amdsmi.amdsmi_get_gpu_accelerator_partition_profile(handle)
            self._print(label, result)
        except amdsmi.AmdSmiLibraryException as err:
            # Keep the exception only when _check_ret reports a truthy value.
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    # Re-raise whatever is stored once every handle has been tried.
    pending = self.raise_exception
    if pending:
        raise pending
def test_get_gpu_accelerator_partition_profile_config(self):
    # Runs amdsmi.amdsmi_get_gpu_accelerator_partition_profile_config for each
    # handle in self.processors and prints every successful result.
    self._print_func_name('')
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}):'
        try:
            result = amdsmi.amdsmi_get_gpu_accelerator_partition_profile_config(handle)
            self._print(label, result)
        except amdsmi.AmdSmiLibraryException as err:
            # Keep the exception only when _check_ret reports a truthy value.
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    # Re-raise whatever is stored once every handle has been tried.
    pending = self.raise_exception
    if pending:
        raise pending
def test_get_gpu_activity(self):
    # Runs amdsmi.amdsmi_get_gpu_activity for each handle in self.processors;
    # skipped up front (unittest skipTest) when self.TODO_SKIP_FAIL is truthy.
    self._print_func_name('')
    if self.TODO_SKIP_FAIL:
        self.skipTest("Skipping test_get_gpu_activity as it fails (MI350X, AMDSMI_STATUS_UNEXPECTED_DATA).")
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}):'
        try:
            result = amdsmi.amdsmi_get_gpu_activity(handle)
            self._print(label, result)
        except amdsmi.AmdSmiLibraryException as err:
            # Keep the exception only when _check_ret reports a truthy value.
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    # Re-raise whatever is stored once every handle has been tried.
    pending = self.raise_exception
    if pending:
        raise pending
def test_get_gpu_asic_info(self):
    # Runs amdsmi.amdsmi_get_gpu_asic_info for each handle in self.processors
    # and prints every successful result via self._print.
    self._print_func_name('')
    for idx, handle in enumerate(self.processors):
        label = f'### test amdsmi_get_gpu_asic_info(gpu={idx})'
        try:
            result = amdsmi.amdsmi_get_gpu_asic_info(handle)
            self._print(label, result)
        except amdsmi.AmdSmiLibraryException as err:
            # Keep the exception only when _check_ret reports a truthy value.
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    # Re-raise whatever is stored once every handle has been tried.
    pending = self.raise_exception
    if pending:
        raise pending
def test_get_gpu_bad_page_info(self):
    # Runs amdsmi.amdsmi_get_gpu_bad_page_info for each handle in
    # self.processors and prints every successful result via self._print.
    self._print_func_name('')
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}):'
        try:
            result = amdsmi.amdsmi_get_gpu_bad_page_info(handle)
            self._print(label, result)
        except amdsmi.AmdSmiLibraryException as err:
            # Keep the exception only when _check_ret reports a truthy value.
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    # Re-raise whatever is stored once every handle has been tried.
    pending = self.raise_exception
    if pending:
        raise pending
def test_get_gpu_bad_page_threshold(self):
    # Runs amdsmi.amdsmi_get_gpu_bad_page_threshold for each handle in
    # self.processors and prints every successful result via self._print.
    self._print_func_name('')
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}):'
        try:
            result = amdsmi.amdsmi_get_gpu_bad_page_threshold(handle)
            self._print(label, result)
        except amdsmi.AmdSmiLibraryException as err:
            # Keep the exception only when _check_ret reports a truthy value.
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    # Re-raise whatever is stored once every handle has been tried.
    pending = self.raise_exception
    if pending:
        raise pending
def test_get_gpu_bdf_id(self):
    # Runs amdsmi.amdsmi_get_gpu_bdf_id for each handle in self.processors
    # and prints every successful result via self._print.
    self._print_func_name('')
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}):'
        try:
            result = amdsmi.amdsmi_get_gpu_bdf_id(handle)
            self._print(label, result)
        except amdsmi.AmdSmiLibraryException as err:
            # Keep the exception only when _check_ret reports a truthy value.
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    # Re-raise whatever is stored once every handle has been tried.
    pending = self.raise_exception
    if pending:
        raise pending
def test_get_gpu_board_info(self):
    # Runs amdsmi.amdsmi_get_gpu_board_info for each handle in self.processors
    # and prints every successful result via self._print.
    self._print_func_name('')
    for idx, handle in enumerate(self.processors):
        label = f'### test amdsmi_get_gpu_board_info(gpu={idx})'
        try:
            result = amdsmi.amdsmi_get_gpu_board_info(handle)
            self._print(label, result)
        except amdsmi.AmdSmiLibraryException as err:
            # Keep the exception only when _check_ret reports a truthy value.
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    # Re-raise whatever is stored once every handle has been tried.
    pending = self.raise_exception
    if pending:
        raise pending
def test_get_gpu_cache_info(self):
    # Runs amdsmi.amdsmi_get_gpu_cache_info for each handle in self.processors
    # and prints every successful result via self._print.
    self._print_func_name('')
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}):'
        try:
            result = amdsmi.amdsmi_get_gpu_cache_info(handle)
            self._print(label, result)
        except amdsmi.AmdSmiLibraryException as err:
            # Keep the exception only when _check_ret reports a truthy value.
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    # Re-raise whatever is stored once every handle has been tried.
    pending = self.raise_exception
    if pending:
        raise pending
def test_get_gpu_compute_partition(self):
    # Runs amdsmi.amdsmi_get_gpu_compute_partition for each handle in
    # self.processors and prints every successful result via self._print.
    self._print_func_name('')
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}):'
        try:
            result = amdsmi.amdsmi_get_gpu_compute_partition(handle)
            self._print(label, result)
        except amdsmi.AmdSmiLibraryException as err:
            # Keep the exception only when _check_ret reports a truthy value.
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    # Re-raise whatever is stored once every handle has been tried.
    pending = self.raise_exception
    if pending:
        raise pending
def test_get_gpu_compute_process_gpus(self):
    # Calls amdsmi.amdsmi_get_gpu_compute_process_gpus once with a hard-coded
    # pid of 0; skipped up front when self.TODO_SKIP_NOT_COMPLETE is truthy.
    self._print_func_name('')
    if self.TODO_SKIP_NOT_COMPLETE:
        self.skipTest("Skipping test_get_gpu_compute_process_gpus as it is not complete (Inval Error).")
    # TODO Find better way to get pid
    pid = 0
    label = f'pid({pid}):'
    try:
        result = amdsmi.amdsmi_get_gpu_compute_process_gpus(pid)
        self._print(label, result)
    except amdsmi.AmdSmiLibraryException as err:
        # Keep the exception only when _check_ret reports a truthy value.
        if self._check_ret(label, err, self.PASS):
            self.raise_exception = err
    # Re-raise whatever exception is stored on the instance, if any.
    pending = self.raise_exception
    if pending:
        raise pending
def test_get_gpu_compute_process_info(self):
    # Calls amdsmi.amdsmi_get_gpu_compute_process_info() once (no arguments)
    # and prints the result through self._print.
    self._print_func_name('')
    label = ''
    try:
        result = amdsmi.amdsmi_get_gpu_compute_process_info()
        self._print(label, result)
    except amdsmi.AmdSmiLibraryException as err:
        # Keep the exception only when _check_ret reports a truthy value.
        if self._check_ret(label, err, self.PASS):
            self.raise_exception = err
    # Re-raise whatever exception is stored on the instance, if any.
    pending = self.raise_exception
    if pending:
        raise pending
def test_get_gpu_compute_process_info_by_pid(self):
    # Calls amdsmi.amdsmi_get_gpu_compute_process_info_by_pid once with a
    # hard-coded pid of 0; skipped up front when self.TODO_SKIP_NOT_COMPLETE
    # is truthy.
    self._print_func_name('')
    if self.TODO_SKIP_NOT_COMPLETE:
        self.skipTest("Skipping test_get_gpu_compute_process_info_by_pid as it not complete (Device not found).")
    # TODO Find better way to get pid
    pid = 0
    label = f'pid({pid}):'
    try:
        result = amdsmi.amdsmi_get_gpu_compute_process_info_by_pid(pid)
        self._print(label, result)
    except amdsmi.AmdSmiLibraryException as err:
        # Keep the exception only when _check_ret reports a truthy value.
        if self._check_ret(label, err, self.PASS):
            self.raise_exception = err
    # Re-raise whatever exception is stored on the instance, if any.
    pending = self.raise_exception
    if pending:
        raise pending
def test_get_gpu_device_bdf(self):
    # Runs amdsmi.amdsmi_get_gpu_device_bdf for each handle in self.processors
    # and prints every successful result via self._print.
    self._print_func_name('')
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}):'
        try:
            result = amdsmi.amdsmi_get_gpu_device_bdf(handle)
            self._print(label, result)
        except amdsmi.AmdSmiLibraryException as err:
            # Keep the exception only when _check_ret reports a truthy value.
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    # Re-raise whatever is stored once every handle has been tried.
    pending = self.raise_exception
    if pending:
        raise pending
def test_get_gpu_device_uuid(self):
    # Runs amdsmi.amdsmi_get_gpu_device_uuid for each handle in
    # self.processors and prints every successful result via self._print.
    self._print_func_name('')
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}):'
        try:
            result = amdsmi.amdsmi_get_gpu_device_uuid(handle)
            self._print(label, result)
        except amdsmi.AmdSmiLibraryException as err:
            # Keep the exception only when _check_ret reports a truthy value.
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    # Re-raise whatever is stored once every handle has been tried.
    pending = self.raise_exception
    if pending:
        raise pending
def test_get_gpu_driver_info(self):
    # Runs amdsmi.amdsmi_get_gpu_driver_info for each handle in
    # self.processors and prints every successful result via self._print.
    self._print_func_name('')
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}):'
        try:
            result = amdsmi.amdsmi_get_gpu_driver_info(handle)
            self._print(label, result)
        except amdsmi.AmdSmiLibraryException as err:
            # Keep the exception only when _check_ret reports a truthy value.
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    # Re-raise whatever is stored once every handle has been tried.
    pending = self.raise_exception
    if pending:
        raise pending
def test_get_gpu_ecc_count(self):
    # Runs amdsmi.amdsmi_get_gpu_ecc_count for every combination of a handle
    # in self.processors and a (name, value, condition) triple in
    # self.gpu_blocks; the triple's condition is passed to self._check_ret
    # when the call raises.
    self._print_func_name('')
    for idx, handle in enumerate(self.processors):
        for block_name, block, expected in self.gpu_blocks:
            label = f'gpu({idx}): gpu_block({block_name})'
            try:
                result = amdsmi.amdsmi_get_gpu_ecc_count(handle, block)
                self._print(label, result)
            except amdsmi.AmdSmiLibraryException as err:
                # Keep the exception only when _check_ret reports truthy.
                if self._check_ret(label, err, expected):
                    self.raise_exception = err
    # Re-raise whatever is stored once every combination has been tried.
    pending = self.raise_exception
    if pending:
        raise pending
def test_get_gpu_ecc_enabled(self):
    # Runs amdsmi.amdsmi_get_gpu_ecc_enabled for each handle in
    # self.processors and prints every successful result via self._print.
    self._print_func_name('')
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}):'
        try:
            result = amdsmi.amdsmi_get_gpu_ecc_enabled(handle)
            self._print(label, result)
        except amdsmi.AmdSmiLibraryException as err:
            # Keep the exception only when _check_ret reports a truthy value.
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    # Re-raise whatever is stored once every handle has been tried.
    pending = self.raise_exception
    if pending:
        raise pending
def test_get_gpu_ecc_status(self):
    # Runs amdsmi.amdsmi_get_gpu_ecc_status for every combination of a handle
    # in self.processors and a (name, value, condition) triple in
    # self.gpu_blocks; the triple's condition is passed to self._check_ret.
    # Skipped up front when self.TODO_SKIP_FAIL is truthy.
    self._print_func_name('')
    if self.TODO_SKIP_FAIL:
        self.skipTest("Skipping test_get_gpu_ecc_status as it fails.")
    for idx, handle in enumerate(self.processors):
        for block_name, block, expected in self.gpu_blocks:
            label = f'gpu({idx}): gpu_block({block_name})'
            try:
                result = amdsmi.amdsmi_get_gpu_ecc_status(handle, block)
                self._print(label, result)
            except amdsmi.AmdSmiLibraryException as err:
                # Keep the exception only when _check_ret reports truthy.
                if self._check_ret(label, err, expected):
                    self.raise_exception = err
    # Re-raise whatever is stored once every combination has been tried.
    pending = self.raise_exception
    if pending:
        raise pending
def test_get_gpu_enumeration_info(self):
    # Runs amdsmi.amdsmi_get_gpu_enumeration_info for each handle in
    # self.processors and prints every successful result via self._print.
    self._print_func_name('')
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}):'
        try:
            result = amdsmi.amdsmi_get_gpu_enumeration_info(handle)
            self._print(label, result)
        except amdsmi.AmdSmiLibraryException as err:
            # Keep the exception only when _check_ret reports a truthy value.
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    # Re-raise whatever is stored once every handle has been tried.
    pending = self.raise_exception
    if pending:
        raise pending
def test_get_gpu_fan_rpms(self):
    # Runs amdsmi.amdsmi_get_gpu_fan_rpms(handle, 0) for each handle in
    # self.processors; the second argument is always the literal 0.
    self._print_func_name('')
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}):'
        try:
            result = amdsmi.amdsmi_get_gpu_fan_rpms(handle, 0)
            self._print(label, result)
        except amdsmi.AmdSmiLibraryException as err:
            # Keep the exception only when _check_ret reports a truthy value.
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    # Re-raise whatever is stored once every handle has been tried.
    pending = self.raise_exception
    if pending:
        raise pending
def test_get_gpu_id(self):
    # Runs amdsmi.amdsmi_get_gpu_id for each handle in self.processors and
    # prints every successful result via self._print.
    self._print_func_name('')
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}):'
        try:
            result = amdsmi.amdsmi_get_gpu_id(handle)
            self._print(label, result)
        except amdsmi.AmdSmiLibraryException as err:
            # Keep the exception only when _check_ret reports a truthy value.
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    # Re-raise whatever is stored once every handle has been tried.
    pending = self.raise_exception
    if pending:
        raise pending
def test_get_gpu_kfd_info(self):
    # Runs amdsmi.amdsmi_get_gpu_kfd_info for each handle in self.processors
    # and prints every successful result via self._print.
    self._print_func_name('')
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}):'
        try:
            result = amdsmi.amdsmi_get_gpu_kfd_info(handle)
            self._print(label, result)
        except amdsmi.AmdSmiLibraryException as err:
            # Keep the exception only when _check_ret reports a truthy value.
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    # Re-raise whatever is stored once every handle has been tried.
    pending = self.raise_exception
    if pending:
        raise pending
def test_get_gpu_mem_overdrive_level(self):
    # Runs amdsmi.amdsmi_get_gpu_mem_overdrive_level for each handle in
    # self.processors and prints every successful result via self._print.
    self._print_func_name('')
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}):'
        try:
            result = amdsmi.amdsmi_get_gpu_mem_overdrive_level(handle)
            self._print(label, result)
        except amdsmi.AmdSmiLibraryException as err:
            # Keep the exception only when _check_ret reports a truthy value.
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    # Re-raise whatever is stored once every handle has been tried.
    pending = self.raise_exception
    if pending:
        raise pending
def test_get_gpu_memory_partition(self):
    # Runs amdsmi.amdsmi_get_gpu_memory_partition for each handle in
    # self.processors and prints every successful result via self._print.
    self._print_func_name('')
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}):'
        try:
            result = amdsmi.amdsmi_get_gpu_memory_partition(handle)
            self._print(label, result)
        except amdsmi.AmdSmiLibraryException as err:
            # Keep the exception only when _check_ret reports a truthy value.
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    # Re-raise whatever is stored once every handle has been tried.
    pending = self.raise_exception
    if pending:
        raise pending
def test_get_gpu_memory_partition_config(self):
    # Runs amdsmi.amdsmi_get_gpu_memory_partition_config for each handle in
    # self.processors; skipped up front when self.TODO_SKIP_FAIL is truthy.
    self._print_func_name('')
    if self.TODO_SKIP_FAIL:
        self.skipTest("Skipping test_get_gpu_memory_partition_config as it fails on MI300.")
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}):'
        try:
            result = amdsmi.amdsmi_get_gpu_memory_partition_config(handle)
            self._print(label, result)
        except amdsmi.AmdSmiLibraryException as err:
            # Keep the exception only when _check_ret reports a truthy value.
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    # Re-raise whatever is stored once every handle has been tried.
    pending = self.raise_exception
    if pending:
        raise pending
def test_get_gpu_memory_reserved_pages(self):
    # Runs amdsmi.amdsmi_get_gpu_memory_reserved_pages for each handle in
    # self.processors and prints every successful result via self._print.
    self._print_func_name('')
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}):'
        try:
            result = amdsmi.amdsmi_get_gpu_memory_reserved_pages(handle)
            self._print(label, result)
        except amdsmi.AmdSmiLibraryException as err:
            # Keep the exception only when _check_ret reports a truthy value.
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    # Re-raise whatever is stored once every handle has been tried.
    pending = self.raise_exception
    if pending:
        raise pending
def test_get_gpu_memory_total(self):
    # Runs amdsmi.amdsmi_get_gpu_memory_total for every combination of a
    # handle in self.processors and a (name, value, condition) triple in
    # self.memory_types; the triple's condition is passed to self._check_ret
    # when the call raises.
    self._print_func_name('')
    for idx, handle in enumerate(self.processors):
        for mem_name, mem_type, expected in self.memory_types:
            label = f'gpu({idx}): memory_type({mem_name})'
            try:
                result = amdsmi.amdsmi_get_gpu_memory_total(handle, mem_type)
                self._print(label, result)
            except amdsmi.AmdSmiLibraryException as err:
                # Keep the exception only when _check_ret reports truthy.
                if self._check_ret(label, err, expected):
                    self.raise_exception = err
    # Re-raise whatever is stored once every combination has been tried.
    pending = self.raise_exception
    if pending:
        raise pending
def test_get_gpu_memory_usage(self):
    # Runs amdsmi.amdsmi_get_gpu_memory_usage for every combination of a
    # handle in self.processors and a (name, value, condition) triple in
    # self.memory_types; the triple's condition is passed to self._check_ret
    # when the call raises.
    self._print_func_name('')
    for idx, handle in enumerate(self.processors):
        for mem_name, mem_type, expected in self.memory_types:
            label = f'gpu({idx}): memory_type({mem_name})'
            try:
                result = amdsmi.amdsmi_get_gpu_memory_usage(handle, mem_type)
                self._print(label, result)
            except amdsmi.AmdSmiLibraryException as err:
                # Keep the exception only when _check_ret reports truthy.
                if self._check_ret(label, err, expected):
                    self.raise_exception = err
    # Re-raise whatever is stored once every combination has been tried.
    pending = self.raise_exception
    if pending:
        raise pending
def test_get_gpu_metrics_header_info(self):
    # Runs amdsmi.amdsmi_get_gpu_metrics_header_info for each handle in
    # self.processors and prints every successful result via self._print.
    self._print_func_name('')
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}):'
        try:
            result = amdsmi.amdsmi_get_gpu_metrics_header_info(handle)
            self._print(label, result)
        except amdsmi.AmdSmiLibraryException as err:
            # Keep the exception only when _check_ret reports a truthy value.
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    # Re-raise whatever is stored once every handle has been tried.
    pending = self.raise_exception
    if pending:
        raise pending
def test_get_gpu_metrics_info(self):
    # Runs amdsmi.amdsmi_get_gpu_metrics_info for each handle in
    # self.processors and prints every successful result via self._print.
    self._print_func_name('')
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}):'
        try:
            result = amdsmi.amdsmi_get_gpu_metrics_info(handle)
            self._print(label, result)
        except amdsmi.AmdSmiLibraryException as err:
            # Keep the exception only when _check_ret reports a truthy value.
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    # Re-raise whatever is stored once every handle has been tried.
    pending = self.raise_exception
    if pending:
        raise pending
def test_get_gpu_partition_metrics_info(self):
    # Runs amdsmi.amdsmi_get_gpu_partition_metrics_info for each handle in
    # self.processors and prints every successful result via self._print.
    self._print_func_name('')
    for idx, handle in enumerate(self.processors):
        # The label (note its trailing space) is plain string formatting, so
        # building it before the try block does not change what is caught.
        label = f'gpu({idx}): '
        try:
            result = amdsmi.amdsmi_get_gpu_partition_metrics_info(handle)
            self._print(label, result)
        except amdsmi.AmdSmiLibraryException as err:
            # Keep the exception only when _check_ret reports a truthy value.
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    # Re-raise whatever is stored once every handle has been tried.
    pending = self.raise_exception
    if pending:
        raise pending
def test_get_gpu_od_volt_curve_regions(self):
    # Runs amdsmi.amdsmi_get_gpu_od_volt_curve_regions(handle, num_region) for
    # each handle in self.processors, always with num_region fixed at 10.
    self._print_func_name('')
    num_region = 10
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}): num_region({num_region}):'
        try:
            result = amdsmi.amdsmi_get_gpu_od_volt_curve_regions(handle, num_region)
            self._print(label, result)
        except amdsmi.AmdSmiLibraryException as err:
            # Keep the exception only when _check_ret reports a truthy value.
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    # Re-raise whatever is stored once every handle has been tried.
    pending = self.raise_exception
    if pending:
        raise pending
def test_get_gpu_od_volt_info(self):
    # Runs amdsmi.amdsmi_get_gpu_od_volt_info for each handle in
    # self.processors and prints every successful result via self._print.
    self._print_func_name('')
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}):'
        try:
            result = amdsmi.amdsmi_get_gpu_od_volt_info(handle)
            self._print(label, result)
        except amdsmi.AmdSmiLibraryException as err:
            # Keep the exception only when _check_ret reports a truthy value.
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    # Re-raise whatever is stored once every handle has been tried.
    pending = self.raise_exception
    if pending:
        raise pending
def test_get_gpu_overdrive_level(self):
    # Runs amdsmi.amdsmi_get_gpu_overdrive_level for each handle in
    # self.processors and prints every successful result via self._print.
    self._print_func_name('')
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}):'
        try:
            result = amdsmi.amdsmi_get_gpu_overdrive_level(handle)
            self._print(label, result)
        except amdsmi.AmdSmiLibraryException as err:
            # Keep the exception only when _check_ret reports a truthy value.
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    # Re-raise whatever is stored once every handle has been tried.
    pending = self.raise_exception
    if pending:
        raise pending
def test_get_gpu_pci_bandwidth(self):
    # Runs amdsmi.amdsmi_get_gpu_pci_bandwidth for each handle in
    # self.processors; skipped up front when self.TODO_SKIP_FAIL is truthy.
    self._print_func_name('')
    if self.TODO_SKIP_FAIL:
        self.skipTest("Skipping test_get_gpu_pci_bandwidth as it fails (MI350X, AMDSMI_STATUS_UNEXPECTED_DATA).")
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}):'
        try:
            result = amdsmi.amdsmi_get_gpu_pci_bandwidth(handle)
            self._print(label, result)
        except amdsmi.AmdSmiLibraryException as err:
            # Keep the exception only when _check_ret reports a truthy value.
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    # Re-raise whatever is stored once every handle has been tried.
    pending = self.raise_exception
    if pending:
        raise pending
def test_get_gpu_pci_replay_counter(self):
    # Runs amdsmi.amdsmi_get_gpu_pci_replay_counter for each handle in
    # self.processors and prints every successful result via self._print.
    self._print_func_name('')
    # TODO Check test_get_gpu_pci_replay_counter
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}):'
        try:
            result = amdsmi.amdsmi_get_gpu_pci_replay_counter(handle)
            self._print(label, result)
        except amdsmi.AmdSmiLibraryException as err:
            # Keep the exception only when _check_ret reports a truthy value.
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    # Re-raise whatever is stored once every handle has been tried.
    pending = self.raise_exception
    if pending:
        raise pending
def test_get_gpu_pci_throughput(self):
    # Runs amdsmi.amdsmi_get_gpu_pci_throughput for each handle in
    # self.processors and prints every successful result via self._print.
    self._print_func_name('')
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}):'
        try:
            result = amdsmi.amdsmi_get_gpu_pci_throughput(handle)
            self._print(label, result)
        except amdsmi.AmdSmiLibraryException as err:
            # Keep the exception only when _check_ret reports a truthy value.
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    # Re-raise whatever is stored once every handle has been tried.
    pending = self.raise_exception
    if pending:
        raise pending
def test_get_gpu_perf_level(self):
    # Runs amdsmi.amdsmi_get_gpu_perf_level for each handle in self.processors
    # and prints every successful result via self._print.
    self._print_func_name('')
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}):'
        try:
            result = amdsmi.amdsmi_get_gpu_perf_level(handle)
            self._print(label, result)
        except amdsmi.AmdSmiLibraryException as err:
            # Keep the exception only when _check_ret reports a truthy value.
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    # Re-raise whatever is stored once every handle has been tried.
    pending = self.raise_exception
    if pending:
        raise pending
def test_get_gpu_pm_metrics_info(self):
    # Runs amdsmi.amdsmi_get_gpu_pm_metrics_info for each handle in
    # self.processors and prints every successful result via self._print.
    self._print_func_name('')
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}):'
        try:
            result = amdsmi.amdsmi_get_gpu_pm_metrics_info(handle)
            self._print(label, result)
        except amdsmi.AmdSmiLibraryException as err:
            # Keep the exception only when _check_ret reports a truthy value.
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    # Re-raise whatever is stored once every handle has been tried.
    pending = self.raise_exception
    if pending:
        raise pending
def test_get_gpu_power_profile_presets(self):
    # Runs amdsmi.amdsmi_get_gpu_power_profile_presets(handle, 0) for each
    # handle in self.processors; the second argument is always the literal 0.
    self._print_func_name('')
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}):'
        try:
            result = amdsmi.amdsmi_get_gpu_power_profile_presets(handle, 0)
            self._print(label, result)
        except amdsmi.AmdSmiLibraryException as err:
            # Keep the exception only when _check_ret reports a truthy value.
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    # Re-raise whatever is stored once every handle has been tried.
    pending = self.raise_exception
    if pending:
        raise pending
def test_get_gpu_process_isolation(self):
    # Runs amdsmi.amdsmi_get_gpu_process_isolation for each handle in
    # self.processors and prints every successful result via self._print.
    self._print_func_name('')
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}):'
        try:
            result = amdsmi.amdsmi_get_gpu_process_isolation(handle)
            self._print(label, result)
        except amdsmi.AmdSmiLibraryException as err:
            # Keep the exception only when _check_ret reports a truthy value.
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    # Re-raise whatever is stored once every handle has been tried.
    pending = self.raise_exception
    if pending:
        raise pending
def test_get_gpu_process_list(self):
    # Runs amdsmi.amdsmi_get_gpu_process_list for each handle in
    # self.processors and prints every successful result via self._print.
    self._print_func_name('')
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}):'
        try:
            result = amdsmi.amdsmi_get_gpu_process_list(handle)
            self._print(label, result)
        except amdsmi.AmdSmiLibraryException as err:
            # Keep the exception only when _check_ret reports a truthy value.
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    # Re-raise whatever is stored once every handle has been tried.
    pending = self.raise_exception
    if pending:
        raise pending
def test_get_gpu_ras_block_features_enabled(self):
    # Runs amdsmi.amdsmi_get_gpu_ras_block_features_enabled for each handle in
    # self.processors and prints every successful result via self._print.
    self._print_func_name('')
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}):'
        try:
            result = amdsmi.amdsmi_get_gpu_ras_block_features_enabled(handle)
            self._print(label, result)
        except amdsmi.AmdSmiLibraryException as err:
            # Keep the exception only when _check_ret reports a truthy value.
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    # Re-raise whatever is stored once every handle has been tried.
    pending = self.raise_exception
    if pending:
        raise pending
def test_get_gpu_ras_feature_info(self):
    # Runs amdsmi.amdsmi_get_gpu_ras_feature_info for each handle in
    # self.processors and prints every successful result via self._print.
    self._print_func_name('')
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}):'
        try:
            result = amdsmi.amdsmi_get_gpu_ras_feature_info(handle)
            self._print(label, result)
        except amdsmi.AmdSmiLibraryException as err:
            # Keep the exception only when _check_ret reports a truthy value.
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    # Re-raise whatever is stored once every handle has been tried.
    pending = self.raise_exception
    if pending:
        raise pending
def test_get_gpu_reg_table_info(self):
    # Runs amdsmi.amdsmi_get_gpu_reg_table_info for every combination of a
    # handle in self.processors and a (name, value, condition) triple in
    # self.reg_types; the triple's condition is passed to self._check_ret
    # when the call raises.
    self._print_func_name('')
    for idx, handle in enumerate(self.processors):
        for reg_name, reg_kind, expected in self.reg_types:
            label = f'gpu({idx}): reg_type({reg_name}):'
            try:
                result = amdsmi.amdsmi_get_gpu_reg_table_info(handle, reg_kind)
                self._print(label, result)
            except amdsmi.AmdSmiLibraryException as err:
                # Keep the exception only when _check_ret reports truthy.
                if self._check_ret(label, err, expected):
                    self.raise_exception = err
    # Re-raise whatever is stored once every combination has been tried.
    pending = self.raise_exception
    if pending:
        raise pending
def test_get_gpu_revision(self):
    # Runs amdsmi.amdsmi_get_gpu_revision for each handle in self.processors
    # and prints every successful result via self._print.
    self._print_func_name('')
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}):'
        try:
            result = amdsmi.amdsmi_get_gpu_revision(handle)
            self._print(label, result)
        except amdsmi.AmdSmiLibraryException as err:
            # Keep the exception only when _check_ret reports a truthy value.
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    # Re-raise whatever is stored once every handle has been tried.
    pending = self.raise_exception
    if pending:
        raise pending
def test_get_gpu_subsystem_id(self):
    # Runs amdsmi.amdsmi_get_gpu_subsystem_id for each handle in
    # self.processors and prints every successful result via self._print.
    self._print_func_name('')
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}):'
        try:
            result = amdsmi.amdsmi_get_gpu_subsystem_id(handle)
            self._print(label, result)
        except amdsmi.AmdSmiLibraryException as err:
            # Keep the exception only when _check_ret reports a truthy value.
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    # Re-raise whatever is stored once every handle has been tried.
    pending = self.raise_exception
    if pending:
        raise pending
def test_get_gpu_subsystem_name(self):
    # Runs amdsmi.amdsmi_get_gpu_subsystem_name for each handle in
    # self.processors and prints every successful result via self._print.
    self._print_func_name('')
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}):'
        try:
            result = amdsmi.amdsmi_get_gpu_subsystem_name(handle)
            self._print(label, result)
        except amdsmi.AmdSmiLibraryException as err:
            # Keep the exception only when _check_ret reports a truthy value.
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    # Re-raise whatever is stored once every handle has been tried.
    pending = self.raise_exception
    if pending:
        raise pending
def test_get_gpu_topo_numa_affinity(self):
    # Runs amdsmi.amdsmi_get_gpu_topo_numa_affinity for each handle in
    # self.processors and prints every successful result via self._print.
    self._print_func_name('')
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}):'
        try:
            result = amdsmi.amdsmi_get_gpu_topo_numa_affinity(handle)
            self._print(label, result)
        except amdsmi.AmdSmiLibraryException as err:
            # Keep the exception only when _check_ret reports a truthy value.
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    # Re-raise whatever is stored once every handle has been tried.
    pending = self.raise_exception
    if pending:
        raise pending
def test_get_gpu_total_ecc_count(self):
    # Runs amdsmi.amdsmi_get_gpu_total_ecc_count for each handle in
    # self.processors and prints every successful result via self._print.
    self._print_func_name('')
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}):'
        try:
            result = amdsmi.amdsmi_get_gpu_total_ecc_count(handle)
            self._print(label, result)
        except amdsmi.AmdSmiLibraryException as err:
            # Keep the exception only when _check_ret reports a truthy value.
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    # Re-raise whatever is stored once every handle has been tried.
    pending = self.raise_exception
    if pending:
        raise pending
def test_get_gpu_vbios_info(self):
    # Runs amdsmi.amdsmi_get_gpu_vbios_info for each handle in self.processors
    # and prints every successful result via self._print.
    self._print_func_name('')
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}):'
        try:
            result = amdsmi.amdsmi_get_gpu_vbios_info(handle)
            self._print(label, result)
        except amdsmi.AmdSmiLibraryException as err:
            # Keep the exception only when _check_ret reports a truthy value.
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    # Re-raise whatever is stored once every handle has been tried.
    pending = self.raise_exception
    if pending:
        raise pending
def test_get_gpu_vendor_name(self):
    # Runs amdsmi.amdsmi_get_gpu_vendor_name for each handle in
    # self.processors and prints every successful result via self._print.
    self._print_func_name('')
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}):'
        try:
            result = amdsmi.amdsmi_get_gpu_vendor_name(handle)
            self._print(label, result)
        except amdsmi.AmdSmiLibraryException as err:
            # Keep the exception only when _check_ret reports a truthy value.
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    # Re-raise whatever is stored once every handle has been tried.
    pending = self.raise_exception
    if pending:
        raise pending
def test_get_gpu_virtualization_mode(self):
    # Runs amdsmi.amdsmi_get_gpu_virtualization_mode for each handle in
    # self.processors and prints every successful result via self._print.
    self._print_func_name('')
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}):'
        try:
            result = amdsmi.amdsmi_get_gpu_virtualization_mode(handle)
            self._print(label, result)
        except amdsmi.AmdSmiLibraryException as err:
            # Keep the exception only when _check_ret reports a truthy value.
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    # Re-raise whatever is stored once every handle has been tried.
    pending = self.raise_exception
    if pending:
        raise pending
def test_get_gpu_volt_metric(self):
    # Runs amdsmi.amdsmi_get_gpu_volt_metric for every combination of a handle
    # in self.processors, a (name, value, condition) triple in
    # self.voltage_types and a triple in self.voltage_metrics.
    # When the call raises, the condition handed to self._check_ret is chosen
    # in this order: the voltage-type condition if it differs from self.PASS,
    # otherwise the voltage-metric condition if that differs from self.PASS,
    # otherwise self.PASS.
    self._print_func_name('')
    for idx, handle in enumerate(self.processors):
        for type_name, volt_type, type_cond in self.voltage_types:
            for metric_name, volt_metric, metric_cond in self.voltage_metrics:
                label = f'gpu({idx}): voltage_type({type_name}) voltage_metric({metric_name}):'
                try:
                    result = amdsmi.amdsmi_get_gpu_volt_metric(handle, volt_type, volt_metric)
                    self._print(label, result)
                except amdsmi.AmdSmiLibraryException as err:
                    # The voltage-type condition takes precedence over the
                    # voltage-metric condition.
                    if type_cond != self.PASS:
                        expected = type_cond
                    elif metric_cond != self.PASS:
                        expected = metric_cond
                    else:
                        expected = self.PASS
                    if self._check_ret(label, err, expected):
                        self.raise_exception = err
    # Re-raise whatever is stored once every combination has been tried.
    pending = self.raise_exception
    if pending:
        raise pending
def test_get_gpu_vram_info(self):
    # Runs amdsmi.amdsmi_get_gpu_vram_info for each handle in self.processors
    # and prints every successful result via self._print.
    self._print_func_name('')
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}):'
        try:
            result = amdsmi.amdsmi_get_gpu_vram_info(handle)
            self._print(label, result)
        except amdsmi.AmdSmiLibraryException as err:
            # Keep the exception only when _check_ret reports a truthy value.
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    # Re-raise whatever is stored once every handle has been tried.
    pending = self.raise_exception
    if pending:
        raise pending
def test_get_gpu_vram_usage(self):
    # Runs amdsmi.amdsmi_get_gpu_vram_usage for each handle in self.processors
    # and prints every successful result via self._print.
    self._print_func_name('')
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}):'
        try:
            result = amdsmi.amdsmi_get_gpu_vram_usage(handle)
            self._print(label, result)
        except amdsmi.AmdSmiLibraryException as err:
            # Keep the exception only when _check_ret reports a truthy value.
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    # Re-raise whatever is stored once every handle has been tried.
    pending = self.raise_exception
    if pending:
        raise pending
def test_get_gpu_vram_vendor(self):
    # Runs amdsmi.amdsmi_get_gpu_vram_vendor for each handle in
    # self.processors and prints every successful result via self._print.
    self._print_func_name('')
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}):'
        try:
            result = amdsmi.amdsmi_get_gpu_vram_vendor(handle)
            self._print(label, result)
        except amdsmi.AmdSmiLibraryException as err:
            # Keep the exception only when _check_ret reports a truthy value.
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    # Re-raise whatever is stored once every handle has been tried.
    pending = self.raise_exception
    if pending:
        raise pending
def test_get_gpu_xcd_counter(self):
    # Runs amdsmi.amdsmi_get_gpu_xcd_counter for each handle in
    # self.processors; skipped up front when self.TODO_SKIP_FAIL is truthy.
    self._print_func_name('')
    if self.TODO_SKIP_FAIL:
        self.skipTest("Skipping test_get_gpu_xcd_counter as it fails (MI350X, AMDSMI_STATUS_UNEXPECTED_DATA).")
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}):'
        try:
            result = amdsmi.amdsmi_get_gpu_xcd_counter(handle)
            self._print(label, result)
        except amdsmi.AmdSmiLibraryException as err:
            # Keep the exception only when _check_ret reports a truthy value.
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    # Re-raise whatever is stored once every handle has been tried.
    pending = self.raise_exception
    if pending:
        raise pending
def test_get_gpu_xgmi_link_status(self):
    # Runs amdsmi.amdsmi_get_gpu_xgmi_link_status for each handle in
    # self.processors; skipped up front when self.TODO_SKIP_FAIL is truthy.
    self._print_func_name('')
    if self.TODO_SKIP_FAIL:
        self.skipTest("Skipping test_get_gpu_xgmi_link_status as it fails (MI350X, AMDSMI_STATUS_UNEXPECTED_DATA).")
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}):'
        try:
            result = amdsmi.amdsmi_get_gpu_xgmi_link_status(handle)
            self._print(label, result)
        except amdsmi.AmdSmiLibraryException as err:
            # Keep the exception only when _check_ret reports a truthy value.
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    # Re-raise whatever is stored once every handle has been tried.
    pending = self.raise_exception
    if pending:
        raise pending
def test_get_hsmp_metrics_table(self):
    # Runs amdsmi.amdsmi_get_hsmp_metrics_table for each handle in
    # self.processors and prints every successful result via self._print.
    self._print_func_name('')
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}):'
        try:
            result = amdsmi.amdsmi_get_hsmp_metrics_table(handle)
            self._print(label, result)
        except amdsmi.AmdSmiLibraryException as err:
            # Keep the exception only when _check_ret reports a truthy value.
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    # Re-raise whatever is stored once every handle has been tried.
    pending = self.raise_exception
    if pending:
        raise pending
def test_get_hsmp_metrics_table_version(self):
    # Runs amdsmi.amdsmi_get_hsmp_metrics_table_version for each handle in
    # self.processors and prints every successful result via self._print.
    self._print_func_name('')
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}):'
        try:
            result = amdsmi.amdsmi_get_hsmp_metrics_table_version(handle)
            self._print(label, result)
        except amdsmi.AmdSmiLibraryException as err:
            # Keep the exception only when _check_ret reports a truthy value.
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    # Re-raise whatever is stored once every handle has been tried.
    pending = self.raise_exception
    if pending:
        raise pending
def test_get_lib_version(self):
    # Calls amdsmi.amdsmi_get_lib_version() once (no arguments) and prints the
    # result through self._print.
    self._print_func_name('')
    label = ''
    try:
        result = amdsmi.amdsmi_get_lib_version()
        self._print(label, result)
    except amdsmi.AmdSmiLibraryException as err:
        # Keep the exception only when _check_ret reports a truthy value.
        if self._check_ret(label, err, self.PASS):
            self.raise_exception = err
    # Re-raise whatever exception is stored on the instance, if any.
    pending = self.raise_exception
    if pending:
        raise pending
def test_get_link_metrics(self):
    # Runs amdsmi.amdsmi_get_link_metrics for each handle in self.processors;
    # skipped up front (unittest skipTest) when self.TODO_SKIP_FAIL is truthy.
    self._print_func_name('')
    if self.TODO_SKIP_FAIL:
        self.skipTest("Skipping test_get_link_metrics as it fails (MI350X, AMDSMI_STATUS_UNEXPECTED_DATA).")
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}):'
        try:
            result = amdsmi.amdsmi_get_link_metrics(handle)
            self._print(label, result)
        except amdsmi.AmdSmiLibraryException as err:
            # Keep the exception only when _check_ret reports a truthy value.
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    # Re-raise whatever is stored once every handle has been tried.
    pending = self.raise_exception
    if pending:
        raise pending
def test_get_link_topology_nearest(self):
    # Runs amdsmi.amdsmi_get_link_topology_nearest for every combination of a
    # handle in self.processors and a (name, value, condition) triple in
    # self.link_types; the triple's condition is passed to self._check_ret
    # when the call raises.
    self._print_func_name('')
    for idx, handle in enumerate(self.processors):
        for link_name, link_kind, expected in self.link_types:
            label = f'gpu({idx}): link_type({link_name})'
            try:
                result = amdsmi.amdsmi_get_link_topology_nearest(handle, link_kind)
                self._print(label, result)
            except amdsmi.AmdSmiLibraryException as err:
                # Keep the exception only when _check_ret reports truthy.
                if self._check_ret(label, err, expected):
                    self.raise_exception = err
    # Re-raise whatever is stored once every combination has been tried.
    pending = self.raise_exception
    if pending:
        raise pending
def test_get_minmax_bandwidth_between_processors(self):
    # Runs amdsmi.amdsmi_get_minmax_bandwidth_between_processors for every
    # ordered pair of handles in self.processors, including each handle
    # paired with itself.
    # When the call raises, self._check_ret is given self.FAIL for a
    # same-index pair and self.PASS for any other pair.
    self._print_func_name('')
    for src_idx, src in enumerate(self.processors):
        for dst_idx, dst in enumerate(self.processors):
            label = f'gpu({src_idx},{dst_idx}):'
            try:
                result = amdsmi.amdsmi_get_minmax_bandwidth_between_processors(src, dst)
                self._print(label, result)
            except amdsmi.AmdSmiLibraryException as err:
                expected = self.FAIL if src_idx == dst_idx else self.PASS
                if self._check_ret(label, err, expected):
                    self.raise_exception = err
    # Re-raise whatever is stored once every pair has been tried.
    pending = self.raise_exception
    if pending:
        raise pending
def test_get_pcie_info(self):
    # Print amdsmi_get_pcie_info() per handle; skipped while TODO_SKIP_FAIL
    # is set.
    self._print_func_name('')
    if self.TODO_SKIP_FAIL:
        self.skipTest("Skipping test_get_pcie_info as it fails (MI350X, AMDSMI_STATUS_UNEXPECTED_DATA).")
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}):'
        try:
            pcie = amdsmi.amdsmi_get_pcie_info(handle)
            self._print(label, pcie)
        except amdsmi.AmdSmiLibraryException as err:
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    if self.raise_exception:
        raise self.raise_exception
def test_set_cpu_pcie_link_rate(self):
    # Call amdsmi_set_cpu_pcie_link_rate() on every handle with a fixed
    # rate_ctrl; skipped while TODO_SKIP_NOT_COMPLETE is set.
    self._print_func_name('')
    if self.TODO_SKIP_NOT_COMPLETE:
        self.skipTest("Skipping test_set_cpu_pcie_link_rate as it is not complete.")
    # TODO rate_ctrl = 0
    rate_ctrl = 0
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}): rate_ctrl({rate_ctrl}):'
        try:
            result = amdsmi.amdsmi_set_cpu_pcie_link_rate(handle, rate_ctrl)
            self._print(label, result)
        except amdsmi.AmdSmiLibraryException as err:
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    if self.raise_exception:
        raise self.raise_exception
def test_get_power_info(self):
    # Print amdsmi_get_power_info() for each handle in self.processors.
    self._print_func_name('')
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}):'
        try:
            power = amdsmi.amdsmi_get_power_info(handle)
            self._print(label, power)
        except amdsmi.AmdSmiLibraryException as err:
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    if self.raise_exception:
        raise self.raise_exception
def test_get_processor_count_from_handles(self):
    # Pass the full self.processors list to
    # amdsmi_get_processor_count_from_handles() and print the result.
    self._print_func_name('')
    label = 'gpu():'
    try:
        counts = amdsmi.amdsmi_get_processor_count_from_handles(self.processors)
        self._print(label, counts)
    except amdsmi.AmdSmiLibraryException as err:
        if self._check_ret(label, err, self.PASS):
            self.raise_exception = err
    if self.raise_exception:
        raise self.raise_exception
def test_get_processor_handle_from_bdf(self):
    # Round-trip each handle through its BDF: read the BDF with
    # amdsmi_get_gpu_device_bdf(), look the handle up again with
    # amdsmi_get_processor_handle_from_bdf(), and compare the .value of the
    # two handles. A mismatch is recorded as an AMDSMI_STATUS_INVAL library
    # exception.
    self._print_func_name('')
    for i, gpu in enumerate(self.processors):
        msg = f'gpu({i}):'
        try:
            bdf = amdsmi.amdsmi_get_gpu_device_bdf(gpu)
            ret = amdsmi.amdsmi_get_processor_handle_from_bdf(bdf)
            if gpu.value != ret.value:
                # Build the mismatch text from the prefix exactly once and
                # print it, so the expected/received values actually show up
                # in the log next to the recorded failure.
                msg = f'{msg} Expected: {gpu.value}, Received: {ret.value}'
                self._print(msg)
                self.raise_exception = amdsmi.AmdSmiLibraryException(amdsmi.amdsmi_wrapper.AMDSMI_STATUS_INVAL)
            else:
                self._print(msg)
        except amdsmi.AmdSmiLibraryException as e:
            if self._check_ret(msg, e, self.PASS):
                self.raise_exception = e
    if self.raise_exception:
        raise self.raise_exception
    return
def test_get_processor_handles(self):
    # Print every entry of self.processors together with its position.
    self._print_func_name('')
    for position, handle in enumerate(self.processors):
        self._print(f' {position:2d} processor_handles: {handle}')
    if self.raise_exception:
        raise self.raise_exception
def test_get_processor_handles_by_type(self):
    # For each socket handle, request the processor handles of every type
    # listed in self.processor_types. The socket lookup itself is not
    # guarded, so an exception there propagates directly.
    self._print_func_name('')
    sockets = amdsmi.amdsmi_get_socket_handles()
    for socket_idx, socket in enumerate(sockets):
        for type_name, type_value, expected in self.processor_types:
            label = f'socket({socket_idx}): processor_type({type_name}):'
            try:
                handles = amdsmi.amdsmi_get_processor_handles_by_type(socket, type_value)
                self._print(label, handles)
            except amdsmi.AmdSmiLibraryException as err:
                if self._check_ret(label, err, expected):
                    self.raise_exception = err
    if self.raise_exception:
        raise self.raise_exception
def test_get_processor_info(self):
    # Print amdsmi_get_processor_info() for each handle in self.processors.
    self._print_func_name('')
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}):'
        try:
            info = amdsmi.amdsmi_get_processor_info(handle)
            self._print(label, info)
        except amdsmi.AmdSmiLibraryException as err:
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    if self.raise_exception:
        raise self.raise_exception
def test_get_processor_type(self):
    # Print amdsmi_get_processor_type() for each handle in self.processors.
    self._print_func_name('')
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}):'
        try:
            kind = amdsmi.amdsmi_get_processor_type(handle)
            self._print(label, kind)
        except amdsmi.AmdSmiLibraryException as err:
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    if self.raise_exception:
        raise self.raise_exception
def test_get_socket_handles(self):
    # Fetch the socket handles and print the Python id() of each returned
    # object rather than the objects themselves.
    self._print_func_name('')
    label = ''
    try:
        sockets = amdsmi.amdsmi_get_socket_handles()
        object_ids = [id(socket) for socket in sockets]
        self._print(label, object_ids)
    except amdsmi.AmdSmiLibraryException as err:
        if self._check_ret(label, err, self.PASS):
            self.raise_exception = err
    if self.raise_exception:
        raise self.raise_exception
def test_get_socket_info(self):
    # Check that the number of socket handles lies in
    # [1, self.max_num_physical_devices], then print amdsmi_get_socket_info()
    # for each of them.
    self._print_func_name('')
    socket_handles = amdsmi.amdsmi_get_socket_handles()
    self.assertGreaterEqual(len(socket_handles), 1)
    self.assertLessEqual(len(socket_handles), self.max_num_physical_devices)
    for idx, handle in enumerate(socket_handles):
        label = f'socket({idx}):'
        try:
            info = amdsmi.amdsmi_get_socket_info(handle)
            self._print(label, info)
        except amdsmi.AmdSmiLibraryException as err:
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    if self.raise_exception:
        raise self.raise_exception
def test_get_temp_metric(self):
    # Call amdsmi_get_temp_metric() for every (handle, temperature type,
    # temperature metric) combination; skipped while TODO_SKIP_FAIL is set.
    self._print_func_name('')
    if self.TODO_SKIP_FAIL:
        self.skipTest("Skipping test_get_temp_metric as it fails (MI350X, AMDSMI_STATUS_UNEXPECTED_DATA).")
    for idx, handle in enumerate(self.processors):
        for sensor_name, sensor, sensor_cond in self.temperature_types:
            for metric_name, metric, metric_cond in self.temperature_metrics:
                label = f'gpu({idx}): temperature_type=({sensor_name}) temperature_metric({metric_name}):'
                try:
                    reading = amdsmi.amdsmi_get_temp_metric(handle, sensor, metric)
                    self._print(label, reading)
                except amdsmi.AmdSmiLibraryException as err:
                    # The first non-PASS expectation wins: the temperature
                    # type's, then the metric's, otherwise PASS.
                    if not sensor_cond == self.PASS:
                        expected = sensor_cond
                    elif not metric_cond == self.PASS:
                        expected = metric_cond
                    else:
                        expected = self.PASS
                    if self._check_ret(label, err, expected):
                        self.raise_exception = err
    if self.raise_exception:
        raise self.raise_exception
def test_get_threads_per_core(self):
    # Print amdsmi_get_threads_per_core(); skipped while TODO_SKIP_FAIL is
    # set.
    self._print_func_name('')
    if self.TODO_SKIP_FAIL:
        self.skipTest("Skipping test_get_threads_per_core as it fails (IO Error).")
    # TODO threads_per_core
    label = 'threads_per_core:'
    try:
        threads = amdsmi.amdsmi_get_threads_per_core()
        self._print(label, threads)
    except amdsmi.AmdSmiLibraryException as err:
        if self._check_ret(label, err, self.PASS):
            self.raise_exception = err
    if self.raise_exception:
        raise self.raise_exception
def test_get_utilization_count(self):
    # For every handle, request each counter type from
    # self.utilization_counter_types individually (as a one-element list);
    # skipped while TODO_SKIP_FAIL is set.
    self._print_func_name('')
    if self.TODO_SKIP_FAIL:
        self.skipTest("Skipping test_get_utilization_count as it fails (MI350X, AMDSMI_STATUS_UNEXPECTED_DATA).")
    for idx, handle in enumerate(self.processors):
        for counter_name, counter, expected in self.utilization_counter_types:
            label = f'gpu({idx}): utilization_counter_type({counter_name}):'
            try:
                counts = amdsmi.amdsmi_get_utilization_count(handle, [counter])
                self._print(label, counts)
            except amdsmi.AmdSmiLibraryException as err:
                if self._check_ret(label, err, expected):
                    self.raise_exception = err
    if self.raise_exception:
        raise self.raise_exception
def test_get_violation_status(self):
    # Print amdsmi_get_violation_status() per handle; skipped while
    # TODO_SKIP_FAIL is set.
    self._print_func_name('')
    if self.TODO_SKIP_FAIL:
        self.skipTest("Skipping test_get_violation_status as it fails (MI350X, AMDSMI_STATUS_UNEXPECTED_DATA).")
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}):'
        try:
            status = amdsmi.amdsmi_get_violation_status(handle)
            self._print(label, status)
        except amdsmi.AmdSmiLibraryException as err:
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    if self.raise_exception:
        raise self.raise_exception
def test_get_xgmi_info(self):
    # Print amdsmi_get_xgmi_info() for each handle in self.processors.
    self._print_func_name('')
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}):'
        try:
            xgmi = amdsmi.amdsmi_get_xgmi_info(handle)
            self._print(label, xgmi)
        except amdsmi.AmdSmiLibraryException as err:
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    if self.raise_exception:
        raise self.raise_exception
def test_gpu_counter(self):
    # Exercise the counter life cycle for every (handle, event type) pair:
    # create -> read -> each command in self.counter_commands -> destroy.
    # Skipped while TODO_SKIP_FAIL is set.
    self._print_func_name('')
    if self.TODO_SKIP_FAIL:
        self.skipTest("Skipping test_gpu_counter as it fails (Error opening file).")
    for i, gpu in enumerate(self.processors):
        for event_type_name, event_type, event_type_cond in self.event_types:
            msg = f'gpu({i}): event_type({event_type_name}):'
            # Create
            msg1 = f'{msg} Create counter:'
            try:
                event_handle = amdsmi.amdsmi_gpu_create_counter(gpu, event_type)
                self._print(msg1, event_handle)
            except amdsmi.AmdSmiLibraryException as e:
                if self._check_ret(msg1, e, event_type_cond):
                    self.raise_exception = e
                # if any exception occurs, skip the rest of the loop:
                # without a handle there is nothing to read/control/destroy
                continue
            # Read
            msg1 = f'{msg} Read counter:'
            try:
                amdsmi.amdsmi_gpu_read_counter(event_handle)
                self._print(msg1)
            except amdsmi.AmdSmiLibraryException as e:
                if self._check_ret(msg1, e, event_type_cond):
                    self.raise_exception = e
            # Control
            for counter_command_name, counter_command, counter_commands_cond in self.counter_commands:
                # msg already names the event type, so only the command is
                # appended here (the label used to repeat event_type(...)).
                msg1 = f'{msg} counter_command({counter_command_name}):'
                try:
                    amdsmi.amdsmi_gpu_control_counter(event_handle, counter_command)
                    self._print(msg1, '')
                except amdsmi.AmdSmiLibraryException as e:
                    if self._check_ret(msg1, e, counter_commands_cond):
                        self.raise_exception = e
            # Destroy
            msg1 = f'{msg} Destroy counter:'
            try:
                amdsmi.amdsmi_gpu_destroy_counter(event_handle)
                self._print(msg1, '')
            except amdsmi.AmdSmiLibraryException as e:
                if self._check_ret(msg1, e, event_type_cond):
                    self.raise_exception = e
    if self.raise_exception:
        raise self.raise_exception
    return
def test_gpu_counter_group_supported(self):
    # Call amdsmi_gpu_counter_group_supported() for every handle and every
    # entry of self.event_groups; the return value is not used.
    self._print_func_name('')
    for idx, handle in enumerate(self.processors):
        for group_name, group, expected in self.event_groups:
            label = f'gpu({idx}): event_group({group_name}):'
            try:
                amdsmi.amdsmi_gpu_counter_group_supported(handle, group)
                self._print(label, '')
            except amdsmi.AmdSmiLibraryException as err:
                if self._check_ret(label, err, expected):
                    self.raise_exception = err
    if self.raise_exception:
        raise self.raise_exception
def test_get_gpu_available_counters(self):
    # Print amdsmi_get_gpu_available_counters() for every handle and every
    # entry of self.event_groups.
    self._print_func_name('')
    for idx, handle in enumerate(self.processors):
        for group_name, group, expected in self.event_groups:
            label = f'gpu({idx}): event_group({group_name})'
            try:
                available = amdsmi.amdsmi_get_gpu_available_counters(handle, group)
                self._print(label, available)
            except amdsmi.AmdSmiLibraryException as err:
                if self._check_ret(label, err, expected):
                    self.raise_exception = err
    if self.raise_exception:
        raise self.raise_exception
def test_gpu_validate_ras_eeprom(self):
    # Call amdsmi_gpu_validate_ras_eeprom() on every handle; skipped while
    # TODO_SKIP_FAIL is set.
    self._print_func_name('')
    if self.TODO_SKIP_FAIL:
        # Skip reason reads "..._eeprom as it fails" (a space was missing).
        self.skipTest("Skipping test_gpu_validate_ras_eeprom as it fails (File Error).")
    for i, gpu in enumerate(self.processors):
        msg = f'gpu({i}):'
        try:
            amdsmi.amdsmi_gpu_validate_ras_eeprom(gpu)
            self._print(msg, '')
        except amdsmi.AmdSmiLibraryException as e:
            if self._check_ret(msg, e, self.PASS):
                self.raise_exception = e
    if self.raise_exception:
        raise self.raise_exception
    return
def test_gpu_xgmi_error_status(self):
    # Print amdsmi_gpu_xgmi_error_status() per handle; skipped while
    # TODO_SKIP_FAIL is set.
    self._print_func_name('')
    if self.TODO_SKIP_FAIL:
        self.skipTest("Skipping test_gpu_xgmi_error_status as it fails on MI300.")
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}):'
        try:
            status = amdsmi.amdsmi_gpu_xgmi_error_status(handle)
            self._print(label, status)
        except amdsmi.AmdSmiLibraryException as err:
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    if self.raise_exception:
        raise self.raise_exception
def test_init(self):
    # Call amdsmi_init() with no arguments and report the outcome.
    self._print_func_name('')
    label = ''
    try:
        amdsmi.amdsmi_init()
        self._print(label, '')
    except amdsmi.AmdSmiLibraryException as err:
        if self._check_ret(label, err, self.PASS):
            self.raise_exception = err
    if self.raise_exception:
        raise self.raise_exception
def test_shut_down(self):
    # Call amdsmi_shut_down() and report the outcome.
    self._print_func_name('')
    label = ''
    try:
        amdsmi.amdsmi_shut_down()
        self._print(label, '')
    except amdsmi.AmdSmiLibraryException as err:
        if self._check_ret(label, err, self.PASS):
            self.raise_exception = err
    if self.raise_exception:
        raise self.raise_exception
def test_is_P2P_accessible(self):
    # Print amdsmi_is_P2P_accessible() for every ordered pair of handles,
    # self-pairs included.
    self._print_func_name('')
    for src_idx, src in enumerate(self.processors):
        for dst_idx, dst in enumerate(self.processors):
            label = f'gpu({src_idx},{dst_idx}):'
            try:
                accessible = amdsmi.amdsmi_is_P2P_accessible(src, dst)
                self._print(label, accessible)
            except amdsmi.AmdSmiLibraryException as err:
                if self._check_ret(label, err, self.PASS):
                    self.raise_exception = err
    if self.raise_exception:
        raise self.raise_exception
def test_gpu_event(self):
    # Walk each handle through the event-notification sequence:
    # init -> power-management query -> set mask -> get notification -> stop.
    # Skipped while TODO_SKIP_FAIL is set.
    self._print_func_name('')
    if self.TODO_SKIP_FAIL:
        self.skipTest("Skipping test_gpu_event as it fails (File Error).")
    # One bit per notification type, at position (type value - 1).
    pre_reset_bit = 1 << (amdsmi.AmdSmiEvtNotificationType.GPU_PRE_RESET - 1)
    post_reset_bit = 1 << (amdsmi.AmdSmiEvtNotificationType.GPU_POST_RESET - 1)
    mask = pre_reset_bit | post_reset_bit
    timeout_ms = 1000
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}):'
        # Init
        try:
            self._print(f'{label} amdsmi_init_gpu_event_notification()')
            amdsmi.amdsmi_init_gpu_event_notification(handle)
        except amdsmi.AmdSmiLibraryException as err:
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
            # Skip remaining tests on any exception when initializing
            continue
        # Is Enabled
        try:
            self._print(f'{label} amdsmi_is_gpu_power_management_enabled()')
            enabled = amdsmi.amdsmi_is_gpu_power_management_enabled(handle)
            self._print(label, enabled)
        except amdsmi.AmdSmiLibraryException as err:
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
        # Set Mask
        try:
            self._print(f'{label} amdsmi_set_gpu_event_notification_mask()')
            amdsmi.amdsmi_set_gpu_event_notification_mask(handle, mask)
        except amdsmi.AmdSmiLibraryException as err:
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
        # Get
        try:
            self._print(f'{label} amdsmi_get_gpu_event_notification()')
            events = amdsmi.amdsmi_get_gpu_event_notification(timeout_ms)
            self._print(label, events)
        except amdsmi.AmdSmiLibraryException as err:
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
        # Stop
        try:
            self._print(f'{label} amdsmi_stop_gpu_event_notification()')
            amdsmi.amdsmi_stop_gpu_event_notification(handle)
        except amdsmi.AmdSmiLibraryException as err:
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    if self.raise_exception:
        raise self.raise_exception
def test_reset_gpu(self):
    # Call amdsmi_reset_gpu() on every handle; skipped while TODO_SKIP_FAIL
    # is set.
    self._print_func_name('')
    if self.TODO_SKIP_FAIL:
        self.skipTest("Skipping test_reset_gpu as it fails (MI350X, Hang).")
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}):'
        try:
            amdsmi.amdsmi_reset_gpu(handle)
            self._print(label, '')
        except amdsmi.AmdSmiLibraryException as err:
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    if self.raise_exception:
        raise self.raise_exception
def test_reset_gpu_fan(self):
    # Call amdsmi_reset_gpu_fan(handle, 0) on every handle.
    self._print_func_name('')
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}):'
        try:
            amdsmi.amdsmi_reset_gpu_fan(handle, 0)
            self._print(label, '')
        except amdsmi.AmdSmiLibraryException as err:
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    if self.raise_exception:
        raise self.raise_exception
def test_reset_gpu_xgmi_error(self):
    # Call amdsmi_reset_gpu_xgmi_error() on every handle; skipped while
    # TODO_SKIP_FAIL is set.
    self._print_func_name('')
    if self.TODO_SKIP_FAIL:
        self.skipTest("Skipping test_reset_gpu_xgmi_error as it fails on MI300.")
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}):'
        try:
            amdsmi.amdsmi_reset_gpu_xgmi_error(handle)
            self._print(label, '')
        except amdsmi.AmdSmiLibraryException as err:
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    if self.raise_exception:
        raise self.raise_exception
def test_set_clk_freq(self):
    # For every (handle, clock type): read the clock info with
    # amdsmi_get_clk_freq(), call amdsmi_set_clk_freq() once per supported
    # entry of its 'frequency' list, and, when none of those calls raised,
    # issue one more set using frequency[current].
    # Skipped while TODO_SKIP_FAIL is set.
    self._print_func_name('')
    if self.TODO_SKIP_FAIL:
        self.skipTest("Skipping test_set_clk_freq as it fails (Perm failure).")
    for idx, handle in enumerate(self.processors):
        for clk_name, clk, expected in self.clk_types:
            label = f'gpu({idx}): Get Clock Info({clk_name}):'
            try:
                info = amdsmi.amdsmi_get_clk_freq(handle, clk)
                self._print(label, info)
            except amdsmi.AmdSmiLibraryException as err:
                if self._check_ret(label, err, expected):
                    self.raise_exception = err
                # Nothing to set without the clock info.
                continue
            current_index = info['current']
            supported_count = info['num_supported']
            frequencies = info['frequency']
            if supported_count == 0:
                self._print(f'No supported frequencies for clk_type={clk_name}')
                continue
            any_set_failed = False
            for slot in range(supported_count):
                label = f'\tgpu({idx}):'
                try:
                    freq_bitmask = frequencies[slot]
                    label = f'{label} Set clk_type({clk_name}): freq_bitmask({freq_bitmask}):'
                    # NOTE(review): the setter receives the clock *name*
                    # while the getter above receives the clock value;
                    # confirm which one the binding expects.
                    amdsmi.amdsmi_set_clk_freq(handle, clk_name, freq_bitmask)
                    self._print(label, '')
                except amdsmi.AmdSmiLibraryException as err:
                    any_set_failed = True
                    if self._check_ret(label, err, expected):
                        self.raise_exception = err
            if not any_set_failed:
                # This final call is not guarded: an exception here
                # propagates directly.
                amdsmi.amdsmi_set_clk_freq(handle, clk_name, frequencies[current_index])
    if self.raise_exception:
        raise self.raise_exception
def test_cpu_core_boostlimit(self):
    # For every handle, read the boost limit with
    # amdsmi_get_cpu_core_boostlimit() and write the same value back with
    # amdsmi_set_cpu_core_boostlimit().
    # Announce the test like every other test in this file does (the call
    # was missing here).
    self._print_func_name('')
    for i, gpu in enumerate(self.processors):
        msg = f'gpu({i}):'
        msg1 = f'{msg} amdsmi_get_cpu_core_boostlimit():'
        try:
            boost_limit = amdsmi.amdsmi_get_cpu_core_boostlimit(gpu)
            msg1 = f'{msg1} boost_limit={boost_limit}'
            self._print(msg1, '')
        except amdsmi.AmdSmiLibraryException as e:
            if self._check_ret(msg1, e, self.PASS):
                self.raise_exception = e
            # No value was read, so there is nothing to write back.
            continue
        msg1 = f'{msg} amdsmi_set_cpu_core_boostlimit():'
        try:
            amdsmi.amdsmi_set_cpu_core_boostlimit(gpu, boost_limit)
            self._print(msg1, '')
        except amdsmi.AmdSmiLibraryException as e:
            if self._check_ret(msg1, e, self.PASS):
                self.raise_exception = e
    if self.raise_exception:
        raise self.raise_exception
    return
def test_set_cpu_df_pstate_range(self):
    # Call amdsmi_set_cpu_df_pstate_range() on every handle with fixed
    # placeholder bounds; skipped while TODO_SKIP_NOT_COMPLETE is set.
    self._print_func_name('')
    if self.TODO_SKIP_NOT_COMPLETE:
        self.skipTest("Skipping test_set_cpu_df_pstate_range as it is not complete.")
    # TODO max_pstate = 0, min_pstate = 0
    max_pstate, min_pstate = 0, 0
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}): max_pstate({max_pstate}) min_pstate({min_pstate}):'
        try:
            amdsmi.amdsmi_set_cpu_df_pstate_range(handle, max_pstate, min_pstate)
            self._print(label, '')
        except amdsmi.AmdSmiLibraryException as err:
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    if self.raise_exception:
        raise self.raise_exception
def test_set_cpu_gmi3_link_width_range(self):
    # Call amdsmi_set_cpu_gmi3_link_width_range() on every handle with fixed
    # placeholder widths; skipped while TODO_SKIP_NOT_COMPLETE is set.
    self._print_func_name('')
    if self.TODO_SKIP_NOT_COMPLETE:
        self.skipTest("Skipping test_set_cpu_gmi3_link_width_range as it is not complete.")
    # TODO min_link_width = 0, max_link_width = 0
    min_link_width, max_link_width = 0, 0
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}): min_link_width({min_link_width}) max_link_width({max_link_width}):'
        try:
            amdsmi.amdsmi_set_cpu_gmi3_link_width_range(handle, min_link_width, max_link_width)
            self._print(label, '')
        except amdsmi.AmdSmiLibraryException as err:
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    if self.raise_exception:
        raise self.raise_exception
def test_set_cpu_pwr_efficiency_mode(self):
    # Call amdsmi_set_cpu_pwr_efficiency_mode() on every handle with the
    # mode values 0, 1 and 2, in that order.
    self._print_func_name('')
    candidate_modes = [0, 1, 2]
    for idx, handle in enumerate(self.processors):
        for mode in candidate_modes:
            label = f'gpu({idx}): mode({mode}):'
            try:
                amdsmi.amdsmi_set_cpu_pwr_efficiency_mode(handle, mode)
                self._print(label, '')
            except amdsmi.AmdSmiLibraryException as err:
                if self._check_ret(label, err, self.PASS):
                    self.raise_exception = err
    if self.raise_exception:
        raise self.raise_exception
def test_cpu_socket_boostlimit(self):
    # Call amdsmi_set_cpu_socket_boostlimit() on every handle with a fixed
    # placeholder limit; skipped while TODO_SKIP_NOT_COMPLETE is set.
    self._print_func_name('')
    if self.TODO_SKIP_NOT_COMPLETE:
        self.skipTest("Skipping test_cpu_socket_boostlimit as it is not complete.")
    # TODO boost_limit = 0
    boost_limit = 0
    for idx, handle in enumerate(self.processors):
        prefix = f'gpu({idx}):'
        label = f'{prefix} boost_limit({boost_limit}):'
        try:
            amdsmi.amdsmi_set_cpu_socket_boostlimit(handle, boost_limit)
            self._print(label, '')
        except amdsmi.AmdSmiLibraryException as err:
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    if self.raise_exception:
        raise self.raise_exception
def test_set_cpu_socket_lclk_dpm_level(self):
    # Call amdsmi_set_cpu_socket_lclk_dpm_level() on every handle with fixed
    # placeholder arguments; skipped while TODO_SKIP_NOT_COMPLETE is set.
    self._print_func_name('')
    if self.TODO_SKIP_NOT_COMPLETE:
        self.skipTest("Skipping test_set_cpu_socket_lclk_dpm_level as it is not complete.")
    # TODO nbio_id = 0, min_val = 0, max_val = 0
    nbio_id, min_val, max_val = 0, 0, 0
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}): nbio_id({nbio_id}) min_val({min_val}) max_val({max_val}):'
        try:
            amdsmi.amdsmi_set_cpu_socket_lclk_dpm_level(handle, nbio_id, min_val, max_val)
            self._print(label, '')
        except amdsmi.AmdSmiLibraryException as err:
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    if self.raise_exception:
        raise self.raise_exception
def test_cpu_socket_power_cap(self):
    # For every handle, read the power cap with
    # amdsmi_get_cpu_socket_power_cap() and write the same value back with
    # amdsmi_set_cpu_socket_power_cap().
    self._print_func_name('')
    for idx, handle in enumerate(self.processors):
        prefix = f'gpu({idx}):'
        label = f'{prefix} amdsmi_get_cpu_socket_power_cap():'
        try:
            power_cap = amdsmi.amdsmi_get_cpu_socket_power_cap(handle)
            self._print(label, '')
        except amdsmi.AmdSmiLibraryException as err:
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
            # No value was read, so the write-back step is skipped.
            continue
        label = f'{prefix} power_cap={power_cap}'
        try:
            amdsmi.amdsmi_set_cpu_socket_power_cap(handle, power_cap)
            self._print(label, '')
        except amdsmi.AmdSmiLibraryException as err:
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    if self.raise_exception:
        raise self.raise_exception
def test_set_cpu_xgmi_width(self):
    # Call amdsmi_set_cpu_xgmi_width() on every handle with fixed placeholder
    # widths; skipped while TODO_SKIP_NOT_COMPLETE is set.
    self._print_func_name('')
    if self.TODO_SKIP_NOT_COMPLETE:
        self.skipTest("Skipping test_set_cpu_xgmi_width as it is not complete.")
    # TODO min_width = 0, max_width = 0
    min_width = 0
    max_width = 0
    for i, gpu in enumerate(self.processors):
        # Label parentheses are balanced like the sibling tests' labels
        # (previously "min_width(0 max_width(0): )").
        msg = f'gpu({i}): min_width({min_width}) max_width({max_width}):'
        try:
            amdsmi.amdsmi_set_cpu_xgmi_width(gpu, min_width, max_width)
            self._print(msg, '')
        except amdsmi.AmdSmiLibraryException as e:
            if self._check_ret(msg, e, self.PASS):
                self.raise_exception = e
    if self.raise_exception:
        raise self.raise_exception
    return
def test_set_gpu_accelerator_partition_profile(self):
    # Call amdsmi_set_gpu_accelerator_partition_profile() on every handle
    # with a fixed placeholder index; skipped while TODO_SKIP_NOT_COMPLETE
    # is set.
    self._print_func_name('')
    if self.TODO_SKIP_NOT_COMPLETE:
        self.skipTest("Skipping test_set_gpu_accelerator_partition_profile as it is not complete.")
    # TODO profile_index = 0
    profile_index = 0
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}): profile_index({profile_index}):'
        try:
            amdsmi.amdsmi_set_gpu_accelerator_partition_profile(handle, profile_index)
            self._print(label, '')
        except amdsmi.AmdSmiLibraryException as err:
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    if self.raise_exception:
        raise self.raise_exception
def test_set_gpu_clk_limit(self):
    # Call amdsmi_set_gpu_clk_limit() for every (handle, clock type, limit
    # type) combination with a fixed placeholder value; skipped while
    # TODO_SKIP_NOT_COMPLETE is set.
    self._print_func_name('')
    if self.TODO_SKIP_NOT_COMPLETE:
        self.skipTest("Skipping test_set_gpu_clk_limit as it is not complete.")
    # TODO Find better way to set value
    value = 0
    for i, gpu in enumerate(self.processors):
        for clk_type_name, clk_type, clk_cond in self.clk_types:
            for clk_limit_type_name, clk_limit_type, clk_limit_cond in self.clk_limit_types:
                msg = f'gpu({i}): value({value}) clock_type=({clk_type_name}) clock_limit_type({clk_limit_type_name}):'
                try:
                    # NOTE(review): the *names* are passed here, not the
                    # clk_type / clk_limit_type values; confirm that the
                    # binding expects strings for both arguments.
                    amdsmi.amdsmi_set_gpu_clk_limit(gpu, clk_type_name, clk_limit_type_name, value)
                    self._print(msg, '')
                except amdsmi.AmdSmiLibraryException as e:
                    # Pick the first non-PASS expectation: the clock type's,
                    # then the limit type's, otherwise PASS. The limit-type
                    # branch now looks at clk_limit_cond (the expectation)
                    # rather than clk_limit_type (the enum value).
                    if not clk_cond == self.PASS:
                        expected = clk_cond
                    elif not clk_limit_cond == self.PASS:
                        expected = clk_limit_cond
                    else:
                        expected = self.PASS
                    # Record the exception only when _check_ret flags it,
                    # matching the other tests in this file.
                    if self._check_ret(msg, e, expected):
                        self.raise_exception = e
    if self.raise_exception:
        raise self.raise_exception
    return
def test_set_gpu_clk_range(self):
    # Call amdsmi_set_gpu_clk_range() with the fixed range 100..200 for
    # every handle and every entry of self.clk_types.
    self._print_func_name('')
    # TODO Find better way to set min_clk_value, max_clk_value
    min_clk_value, max_clk_value = 100, 200
    for idx, handle in enumerate(self.processors):
        for clk_name, clk, expected in self.clk_types:
            label = f'gpu({idx}): min_clk_value({min_clk_value}) max_clk_value({max_clk_value}) clk_type({clk_name}):'
            try:
                amdsmi.amdsmi_set_gpu_clk_range(handle, min_clk_value, max_clk_value, clk)
                self._print(label, '')
            except amdsmi.AmdSmiLibraryException as err:
                if self._check_ret(label, err, expected):
                    self.raise_exception = err
    if self.raise_exception:
        raise self.raise_exception
def test_set_gpu_compute_partition(self):
    # For every handle: read the current compute partition name, try to set
    # each entry of self.compute_partition_types, then set the partition
    # whose name matched the one read at the start (falling back to the
    # first entry's type when no name matched).
    # Skipped while TODO_SKIP_FAIL is set.
    self._print_func_name('')
    if self.TODO_SKIP_FAIL:
        self.skipTest("Skipping test_set_gpu_compute_partition as it fails on MI300.")
    for idx, handle in enumerate(self.processors):
        restore_type = self.compute_partition_types[0][1]
        label = f'gpu({idx}): amdsmi_get_gpu_compute_partition()'
        try:
            original_name = amdsmi.amdsmi_get_gpu_compute_partition(handle)
        except amdsmi.AmdSmiLibraryException as err:
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
            # Without the current partition there is nothing to restore to.
            continue
        for part_name, part_type, expected in self.compute_partition_types:
            if original_name == part_name:
                restore_type = part_type
            label = f'gpu({idx}): compute_partition_type({part_name}):'
            try:
                amdsmi.amdsmi_set_gpu_compute_partition(handle, part_type)
                self._print(label, '')
            except amdsmi.AmdSmiLibraryException as err:
                if self._check_ret(label, err, expected):
                    self.raise_exception = err
        label = f'gpu({idx}): amdsmi_set_gpu_compute_partition({original_name})'
        try:
            amdsmi.amdsmi_set_gpu_compute_partition(handle, restore_type)
            self._print(label, '')
        except amdsmi.AmdSmiLibraryException as err:
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    if self.raise_exception:
        raise self.raise_exception
def test_gpu_fan_speed(self):
    # For every handle (sensor index 0): read the current and maximum fan
    # speed, set a speed different from the current one, then set the
    # original speed again.
    self._print_func_name('')
    for idx, handle in enumerate(self.processors):
        prefix = f'gpu({idx}):'
        label = f'{prefix} amdsmi_get_gpu_fan_speed()'
        try:
            # Determine current fan speed
            original_speed = amdsmi.amdsmi_get_gpu_fan_speed(handle, 0)
            label = f'{label} fan_speed={original_speed}'
            self._print(label, '')
        except amdsmi.AmdSmiLibraryException as err:
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
            continue
        try:
            # Determine max fan speed
            label = f'{prefix} amdsmi_get_gpu_fan_speed_max()'
            max_speed = amdsmi.amdsmi_get_gpu_fan_speed_max(handle, 0)
            label = f'{label} fan_speed_max={max_speed}'
            # Already at max -> use half of max; otherwise use max.
            target_speed = int(max_speed/2) if original_speed == max_speed else max_speed
        except amdsmi.AmdSmiLibraryException as err:
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
            continue
        try:
            # Set fan speed
            label = f'{prefix} fan_speed({target_speed}):'
            amdsmi.amdsmi_set_gpu_fan_speed(handle, 0, target_speed)
            self._print(label, '')
        except amdsmi.AmdSmiLibraryException as err:
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
        try:
            # Set to original fan speed
            label = f'{prefix} fan_speed({original_speed}):'
            amdsmi.amdsmi_set_gpu_fan_speed(handle, 0, original_speed)
            self._print(label, '')
        except amdsmi.AmdSmiLibraryException as err:
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    if self.raise_exception:
        raise self.raise_exception
def test_set_gpu_memory_partition(self):
    # Call amdsmi_set_gpu_memory_partition() for every handle and every
    # entry of self.memory_partition_types; skipped while TODO_SKIP_FAIL is
    # set.
    self._print_func_name('')
    if self.TODO_SKIP_FAIL:
        self.skipTest("Skipping test_set_gpu_memory_partition as it fails (MI350X, AMDSMI_STATUS_UNEXPECTED_DATA).")
    for idx, handle in enumerate(self.processors):
        for part_name, part_type, expected in self.memory_partition_types:
            label = f'gpu({idx}): memory_partition_type({part_name}):'
            try:
                amdsmi.amdsmi_set_gpu_memory_partition(handle, part_type)
                self._print(label, '')
            except amdsmi.AmdSmiLibraryException as err:
                if self._check_ret(label, err, expected):
                    self.raise_exception = err
    if self.raise_exception:
        raise self.raise_exception
def test_set_gpu_memory_partition_mode(self):
    # Call amdsmi_set_gpu_memory_partition_mode() for every handle and every
    # entry of self.memory_partition_types; skipped while TODO_SKIP_FAIL is
    # set.
    self._print_func_name('')
    if self.TODO_SKIP_FAIL:
        self.skipTest("Skipping test_set_gpu_memory_partition_mode as it fails (MI350X, AMDSMI_STATUS_UNEXPECTED_DATA).")
    for idx, handle in enumerate(self.processors):
        for part_name, part_type, expected in self.memory_partition_types:
            label = f'gpu({idx}): memory_partition_type({part_name}):'
            try:
                amdsmi.amdsmi_set_gpu_memory_partition_mode(handle, part_type)
                self._print(label, '')
            except amdsmi.AmdSmiLibraryException as err:
                if self._check_ret(label, err, expected):
                    self.raise_exception = err
    if self.raise_exception:
        raise self.raise_exception
def test_set_gpu_od_clk_info(self):
    # Call amdsmi_set_gpu_od_clk_info() for every (handle, frequency index,
    # clock type) combination with a fixed value; skipped while
    # TODO_SKIP_NOT_COMPLETE is set.
    self._print_func_name('')
    if self.TODO_SKIP_NOT_COMPLETE:
        self.skipTest("Skipping test_set_gpu_od_clk_info as it is not complete.")
    # TODO Find better way to set value
    value = 200
    for i, gpu in enumerate(self.processors):
        for freq_ind_name, freq_ind, freq_ind_cond in self.freq_inds:
            for clk_type_name, clk_type, clk_cond in self.clk_types:
                msg = f'gpu({i}): freq_ind({freq_ind_name}) value({value}) clk_type({clk_type_name}):'
                try:
                    amdsmi.amdsmi_set_gpu_od_clk_info(gpu, freq_ind, value, clk_type)
                    self._print(msg, '')
                except amdsmi.AmdSmiLibraryException as e:
                    # Pick the first non-PASS expectation: the frequency
                    # index's, then the clock type's, otherwise PASS.
                    if not freq_ind_cond == self.PASS:
                        expected = freq_ind_cond
                    elif not clk_cond == self.PASS:
                        expected = clk_cond
                    else:
                        expected = self.PASS
                    # Record the exception only when _check_ret flags it,
                    # matching the other tests in this file (it used to be
                    # recorded unconditionally).
                    if self._check_ret(msg, e, expected):
                        self.raise_exception = e
    if self.raise_exception:
        raise self.raise_exception
    return
def test_set_gpu_od_volt_info(self):
    # Call amdsmi_set_gpu_od_volt_info() on every handle with fixed
    # placeholder arguments; skipped while TODO_SKIP_NOT_COMPLETE is set.
    self._print_func_name('')
    if self.TODO_SKIP_NOT_COMPLETE:
        self.skipTest("Skipping test_set_gpu_od_volt_info as it is not complete.")
    # TODO vpoint = 0 clk_value = 0 volt_value = 0
    vpoint, clk_value, volt_value = 0, 0, 0
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}): vpoint({vpoint}) clk_value({clk_value}) volt_value({volt_value}):'
        try:
            amdsmi.amdsmi_set_gpu_od_volt_info(handle, vpoint, clk_value, volt_value)
            self._print(label, '')
        except amdsmi.AmdSmiLibraryException as err:
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    if self.raise_exception:
        raise self.raise_exception
def test_set_gpu_overdrive_level(self):
    # For every handle: read the overdrive level, set a different one
    # (1, or 2 when the current level is already 1), then set the original
    # level again. All three calls share one try block, so the first
    # library exception ends the sequence for that handle.
    self._print_func_name('')
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}):'
        try:
            # Find current overdrive value
            original_level = amdsmi.amdsmi_get_gpu_overdrive_level(handle)
            new_level = 1 if original_level != 1 else 2
            # Set overdrive value
            label = f'gpu({idx}): overdrive_value({new_level}):'
            amdsmi.amdsmi_set_gpu_overdrive_level(handle, new_level)
            self._print(label, '')
            # Set back to original overdrive value
            amdsmi.amdsmi_set_gpu_overdrive_level(handle, original_level)
        except amdsmi.AmdSmiLibraryException as err:
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    if self.raise_exception:
        raise self.raise_exception
def test_set_gpu_pci_bandwidth(self):
    # For every handle: read ['transfer_rate']['current'] from
    # amdsmi_get_gpu_pci_bandwidth(), set a one-bit mask for the index just
    # below it (or the same index when it is 0), then set the one-bit mask
    # for the original index. Skipped while TODO_SKIP_FAIL is set.
    self._print_func_name('')
    if self.TODO_SKIP_FAIL:
        self.skipTest("Skipping test_set_gpu_pci_bandwidth as it fails (MI350X, AMDSMI_STATUS_UNEXPECTED_DATA).")
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}):'
        try:
            # Get current PCI bandwidth info
            info = amdsmi.amdsmi_get_gpu_pci_bandwidth(handle)
            current_index = info['transfer_rate']['current']
            shift = current_index - 1 if current_index > 0 else current_index
            bitmask = 1 << shift
            # Set PCI bandwidth
            label = f'gpu({idx}): bitmask({bitmask}):'
            amdsmi.amdsmi_set_gpu_pci_bandwidth(handle, bitmask)
            self._print(label, '')
            # Set back to original PCI bandwidth
            bitmask = 1 << current_index
            amdsmi.amdsmi_set_gpu_pci_bandwidth(handle, bitmask)
        except amdsmi.AmdSmiLibraryException as err:
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    if self.raise_exception:
        raise self.raise_exception
def test_set_gpu_perf_determinism_mode(self):
    # Call amdsmi_set_gpu_perf_determinism_mode() on every handle with a
    # fixed placeholder clock value; skipped while TODO_SKIP_NOT_COMPLETE
    # is set.
    self._print_func_name('')
    if self.TODO_SKIP_NOT_COMPLETE:
        self.skipTest("Skipping test_set_gpu_perf_determinism_mode as it is not complete.")
    # TODO clk_value = 0
    clk_value = 0
    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}): clk_value({clk_value}):'
        try:
            amdsmi.amdsmi_set_gpu_perf_determinism_mode(handle, clk_value)
            self._print(label, '')
        except amdsmi.AmdSmiLibraryException as err:
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
    if self.raise_exception:
        raise self.raise_exception
def test_set_gpu_perf_level(self):
    '''Apply every entry of ``self.dev_perf_levels`` to each processor, then restore.

    Each entry of ``self.dev_perf_levels`` is unpacked as
    ``(name, level, expected_condition)``. For every processor the current
    level name is read with ``amdsmi_get_gpu_perf_level`` and only the text
    after its last underscore is compared against the table names. Each
    table level is then passed to ``amdsmi_set_gpu_perf_level``. Finally the
    level whose name matched the original is written back; if nothing
    matched, the level of the first table entry is used.

    Skipped when ``self.TODO_SKIP_NOT_COMPLETE`` is truthy. A processor
    whose current level cannot be read is printed and skipped. A failure
    while restoring is only printed.

    Raises:
        Whatever is stored in ``self.raise_exception`` once all processors
        have been visited. It is set here when ``self._check_ret`` returns
        a truthy value for a failed set call.
    '''
    self._print_func_name('')
    if self.TODO_SKIP_NOT_COMPLETE:
        self.skipTest("Skipping test_set_gpu_perf_level as it is not complete.")
    # Restore target used when the current level name is not in the table.
    fallback_level = self.dev_perf_levels[0][1]
    for i, gpu in enumerate(self.processors):
        msg = f'gpu({i}):'
        try:
            dev_perf_level_name_current = amdsmi.amdsmi_get_gpu_perf_level(gpu)
        except amdsmi.AmdSmiLibraryException as e:
            self._print(msg, e)
            continue
        # Table names are compared against the last '_'-separated token.
        dev_perf_level_name_current = dev_perf_level_name_current.split('_')[-1]
        # Start from the fallback for every GPU so that a level resolved for
        # an earlier GPU is never used to "restore" this one.
        dev_perf_level_current = fallback_level
        for dev_perf_level_name, dev_perf_level, dev_perf_level_cond in self.dev_perf_levels:
            if dev_perf_level_name_current == dev_perf_level_name:
                dev_perf_level_current = dev_perf_level
            msg = f'gpu({i}): dev_perf_level({dev_perf_level_name}):'
            try:
                amdsmi.amdsmi_set_gpu_perf_level(gpu, dev_perf_level)
                self._print(msg, '')
            except amdsmi.AmdSmiLibraryException as e:
                if self._check_ret(msg, e, dev_perf_level_cond):
                    self.raise_exception = e
        # Best-effort restore: a failure is printed but does not fail the test.
        try:
            amdsmi.amdsmi_set_gpu_perf_level(gpu, dev_perf_level_current)
        except amdsmi.AmdSmiLibraryException as e:
            self._print(msg, e)
    if self.raise_exception:
        raise self.raise_exception
    return
def test_set_gpu_power_profile(self):
    '''Apply every entry of ``self.power_profile_preset_masks`` to each processor.

    Each entry is unpacked as ``(name, mask, expected_condition)`` and the
    mask is passed to ``amdsmi_set_gpu_power_profile`` with 0 as the second
    argument. A library exception is recorded in ``self.raise_exception``
    when ``self._check_ret`` returns a truthy value for it; whatever is
    recorded is raised after all processors have been visited.
    '''
    self._print_func_name('')

    for idx, handle in enumerate(self.processors):
        for preset in self.power_profile_preset_masks:
            preset_name, preset_mask, preset_cond = preset
            label = f'gpu({idx}): power_profile_preset_mask({preset_name}):'
            try:
                amdsmi.amdsmi_set_gpu_power_profile(handle, 0, preset_mask)
                self._print(label, '')
            except amdsmi.AmdSmiLibraryException as err:
                if self._check_ret(label, err, preset_cond):
                    self.raise_exception = err

    if self.raise_exception:
        raise self.raise_exception
def test_set_gpu_process_isolation(self):
    '''Call amdsmi_set_gpu_process_isolation with 1 and then 0 on each processor.

    A library exception is recorded in ``self.raise_exception`` when
    ``self._check_ret`` returns a truthy value for it; whatever is recorded
    is raised after all processors have been visited.
    '''
    self._print_func_name('')

    for idx, handle in enumerate(self.processors):
        # 1 is written first and 0 second, in that order.
        for state in (1, 0):
            label = f'gpu({idx}): pisolate({state})'
            try:
                amdsmi.amdsmi_set_gpu_process_isolation(handle, state)
                self._print(label)
            except amdsmi.AmdSmiLibraryException as err:
                if self._check_ret(label, err, self.PASS):
                    self.raise_exception = err

    if self.raise_exception:
        raise self.raise_exception
def test_power_cap(self):
    '''Read, change and restore the power cap of each processor.

    ``amdsmi_get_power_cap_info`` is read first; a processor for which that
    fails is skipped because the later steps need its result. The cap is
    then set to the integer midpoint of ``min_power_cap`` and
    ``max_power_cap``, and afterwards set back to the ``power_cap`` value
    that was read. Both set calls pass 0 as the second argument.

    A library exception is recorded in ``self.raise_exception`` when
    ``self._check_ret`` returns a truthy value for it; whatever is recorded
    is raised after all processors have been visited.
    '''
    self._print_func_name('')

    # The two caps to apply, in order: the midpoint, then the original value.
    cap_choosers = (
        lambda info: int((info['max_power_cap'] + info['min_power_cap']) / 2),
        lambda info: info['power_cap'],
    )

    for idx, handle in enumerate(self.processors):
        label = f'### test amdsmi_get_power_cap_info(gpu={idx})'
        try:
            cap_info = amdsmi.amdsmi_get_power_cap_info(handle)
            self._print(label, cap_info)
        except amdsmi.AmdSmiLibraryException as err:
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err
            # Nothing can be set without the info, so move to the next one.
            continue

        for choose_cap in cap_choosers:
            try:
                cap = choose_cap(cap_info)
                label = f'### test amdsmi_set_power_cap(gpu={idx}, 0, cap={cap})'
                amdsmi.amdsmi_set_power_cap(handle, 0, cap)
                self._print(label, '')
            except amdsmi.AmdSmiLibraryException as err:
                if self._check_ret(label, err, self.PASS):
                    self.raise_exception = err

    if self.raise_exception:
        raise self.raise_exception
def test_soc_pstate(self):
    '''Set a SOC pstate policy on each processor and then restore the original.

    For every handle in ``self.processors`` the policy information is read
    with ``amdsmi_get_soc_pstate``. The keys used are ``num_supported``,
    ``current_id`` and ``policies[n]['policy_id']``; ``current_id`` is used
    as an index into ``policies``. When two or more policies are reported,
    an entry other than the current one is chosen (index 1 if the current
    index is 0, otherwise index 0); with fewer than two, index 0 is used.
    The chosen policy is applied with ``amdsmi_set_soc_pstate`` and the
    original policy id is then written back.

    A processor is skipped, with a printed note, when any of the values read
    is not an ``int``. When the set call fails the restore step is skipped
    for that processor.

    Raises:
        Whatever is stored in ``self.raise_exception`` once all processors
        have been visited. It is set here when ``self._check_ret`` returns
        a truthy value for an ``amdsmi.AmdSmiLibraryException``.
    '''
    self._print_func_name('')
    for i, gpu in enumerate(self.processors):
        msg = f'gpu({i}):'
        # Get current policy info
        msg1 = f'{msg} amdsmi_get_soc_pstate'
        try:
            policy_info = amdsmi.amdsmi_get_soc_pstate(gpu)
            self._print(msg1, '')
            num_supported = policy_info['num_supported']
            if not isinstance(num_supported, int):
                self._print(f'Cannot determine num_supported={num_supported}', '')
                continue
            policy_id_current = policy_info['current_id']
            if not isinstance(policy_id_current, int):
                self._print(f'Cannot determine policy_id_current={policy_id_current}', '')
                continue
            policy_id_orig = policy_info['policies'][policy_id_current]['policy_id']
            if not isinstance(policy_id_orig, int):
                self._print(f'Cannot determine orig policy_id={policy_id_orig}', '')
                continue
            # Choose an entry that differs from the current one whenever at
            # least two are available, so the set call really changes state.
            if num_supported >= 2 and policy_id_current == 0:
                index = 1
            else:
                index = 0
            policy_id = policy_info['policies'][index]['policy_id']
            if not isinstance(policy_id, int):
                self._print(f'Cannot determine policy_id={policy_id}', '')
                continue
        except amdsmi.AmdSmiLibraryException as e:
            if self._check_ret(msg1, e, self.PASS):
                self.raise_exception = e
            continue
        # Set SOC Pstate policy
        msg1 = f'{msg} policy_id({policy_id}):'
        try:
            amdsmi.amdsmi_set_soc_pstate(gpu, policy_id)
            self._print(msg1, '')
        except amdsmi.AmdSmiLibraryException as e:
            if self._check_ret(msg1, e, self.PASS):
                self.raise_exception = e
            # The policy was not changed, so there is nothing to restore.
            continue
        # Set back to original policy
        msg1 = f'{msg} policy_id({policy_id_orig}):'
        try:
            amdsmi.amdsmi_set_soc_pstate(gpu, policy_id_orig)
        except amdsmi.AmdSmiLibraryException as e:
            if self._check_ret(msg1, e, self.PASS):
                self.raise_exception = e
    if self.raise_exception:
        raise self.raise_exception
    return
def test_xgmi_plpd(self):
    '''Set an XGMI PLPD policy on each processor and then restore the original.

    For every handle in ``self.processors`` the policy information is read
    with ``amdsmi_get_xgmi_plpd``. The keys used are ``num_supported``,
    ``current_id`` and ``policies[n]['policy_id']``; ``current_id`` is used
    as an index into ``policies``. When two or more policies are reported,
    an entry other than the current one is chosen (index 1 if the current
    index is 0, otherwise index 0); with fewer than two, index 0 is used.
    The chosen policy is applied with ``amdsmi_set_xgmi_plpd`` and the
    original policy id is then written back, even if the set call failed.

    The test is skipped when ``self.TODO_SKIP_FAIL`` is truthy. A processor
    is skipped, with a printed note, when any of the values read is not an
    ``int``.

    Raises:
        Whatever is stored in ``self.raise_exception`` once all processors
        have been visited. It is set here when ``self._check_ret`` returns
        a truthy value for an ``amdsmi.AmdSmiLibraryException``.
    '''
    self._print_func_name('')
    if self.TODO_SKIP_FAIL:
        self.skipTest("Skipping test_xgmi_plpd as it fails on MI300.")
    for i, gpu in enumerate(self.processors):
        msg = f'gpu({i}):'
        # Get current policy info
        msg1 = f'{msg} amdsmi_get_xgmi_plpd()'
        try:
            policy_info = amdsmi.amdsmi_get_xgmi_plpd(gpu)
            self._print(msg1, '')
            num_supported = policy_info['num_supported']
            if not isinstance(num_supported, int):
                self._print(f'Cannot determine num_supported={num_supported}', '')
                continue
            policy_id_current = policy_info['current_id']
            if not isinstance(policy_id_current, int):
                self._print(f'Cannot determine policy_id_current={policy_id_current}', '')
                continue
            policy_id_orig = policy_info['policies'][policy_id_current]['policy_id']
            if not isinstance(policy_id_orig, int):
                self._print(f'Cannot determine orig policy_id={policy_id_orig}', '')
                continue
            # Choose an entry that differs from the current one whenever at
            # least two are available, so the set call really changes state.
            if num_supported >= 2 and policy_id_current == 0:
                index = 1
            else:
                index = 0
            policy_id = policy_info['policies'][index]['policy_id']
            if not isinstance(policy_id, int):
                self._print(f'Cannot determine policy_id={policy_id}', '')
                continue
        except amdsmi.AmdSmiLibraryException as e:
            if self._check_ret(msg1, e, self.PASS):
                self.raise_exception = e
            continue
        # Set policy
        msg1 = f'{msg} policy_id({policy_id}):'
        try:
            amdsmi.amdsmi_set_xgmi_plpd(gpu, policy_id)
            self._print(msg1, '')
        except amdsmi.AmdSmiLibraryException as e:
            if self._check_ret(msg1, e, self.PASS):
                self.raise_exception = e
        # Set back to original policy
        try:
            msg1 = f'{msg} policy_id({policy_id_orig}):'
            amdsmi.amdsmi_set_xgmi_plpd(gpu, policy_id_orig)
        except amdsmi.AmdSmiLibraryException as e:
            if self._check_ret(msg1, e, self.PASS):
                self.raise_exception = e
    if self.raise_exception:
        raise self.raise_exception
    return
def test_status_code_to_string(self):
    '''Pass every code in the module-level ``error_map`` to amdsmi_status_code_to_string.

    Each key of ``error_map`` is converted with ``int`` and wrapped in
    ``ctypes.c_uint32`` before the call. Skipped when
    ``self.TODO_SKIP_FAIL`` is truthy. A library exception is recorded in
    ``self.raise_exception`` when ``self._check_ret`` returns a truthy value
    for it; whatever is recorded is raised after all codes have been tried.
    '''
    self._print_func_name('')
    if self.TODO_SKIP_FAIL:
        self.skipTest("Skipping test_status_code_to_string as it fails (Unhashable type).")

    for code_text, status_name in error_map.items():
        label = f'{status_name}({code_text}):'
        try:
            status_arg = ctypes.c_uint32(int(code_text))
            text = amdsmi.amdsmi_status_code_to_string(status_arg)
            self._print(f'{label} {text}')
        except amdsmi.AmdSmiLibraryException as err:
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err

    if self.raise_exception:
        raise self.raise_exception
def test_topo_get_link_type(self):
    '''Query amdsmi_topo_get_link_type for every ordered pair of processors.

    Pairs are formed from ``self.processors`` with itself, so each handle is
    also paired with itself. A library exception is recorded in
    ``self.raise_exception`` when ``self._check_ret`` returns a truthy value
    for it; whatever is recorded is raised after all pairs have been tried.
    '''
    self._print_func_name('')

    # Flattened (src index, dst index, src handle, dst handle) tuples, in the
    # same order a nested loop over self.processors would produce.
    pairs = [
        (src_idx, dst_idx, src, dst)
        for src_idx, src in enumerate(self.processors)
        for dst_idx, dst in enumerate(self.processors)
    ]
    for src_idx, dst_idx, src, dst in pairs:
        label = f'gpu({src_idx},{dst_idx}):'
        try:
            link_type = amdsmi.amdsmi_topo_get_link_type(src, dst)
            self._print(label, link_type)
        except amdsmi.AmdSmiLibraryException as err:
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err

    if self.raise_exception:
        raise self.raise_exception
def test_topo_get_link_weight(self):
    '''Query amdsmi_topo_get_link_weight for every ordered pair of processors.

    Pairs are formed from ``self.processors`` with itself, so each handle is
    also paired with itself. A library exception is recorded in
    ``self.raise_exception`` when ``self._check_ret`` returns a truthy value
    for it; whatever is recorded is raised after all pairs have been tried.
    '''
    self._print_func_name('')

    # Flattened (src index, dst index, src handle, dst handle) tuples, in the
    # same order a nested loop over self.processors would produce.
    pairs = [
        (src_idx, dst_idx, src, dst)
        for src_idx, src in enumerate(self.processors)
        for dst_idx, dst in enumerate(self.processors)
    ]
    for src_idx, dst_idx, src, dst in pairs:
        label = f'gpu({src_idx},{dst_idx}):'
        try:
            weight = amdsmi.amdsmi_topo_get_link_weight(src, dst)
            self._print(label, weight)
        except amdsmi.AmdSmiLibraryException as err:
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err

    if self.raise_exception:
        raise self.raise_exception
def test_topo_get_numa_node_number(self):
    '''Query amdsmi_topo_get_numa_node_number for each processor and print the result.

    A library exception is recorded in ``self.raise_exception`` when
    ``self._check_ret`` returns a truthy value for it; whatever is recorded
    is raised after all processors have been visited.
    '''
    self._print_func_name('')

    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}):'
        try:
            numa_node = amdsmi.amdsmi_topo_get_numa_node_number(handle)
            self._print(label, numa_node)
        except amdsmi.AmdSmiLibraryException as err:
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err

    if self.raise_exception:
        raise self.raise_exception
def test_topo_get_p2p_status(self):
    '''Query amdsmi_topo_get_p2p_status for every ordered pair of processors.

    Skipped when ``self.TODO_SKIP_FAIL`` is truthy. Pairs are formed from
    ``self.processors`` with itself, so each handle is also paired with
    itself. A library exception is recorded in ``self.raise_exception`` when
    ``self._check_ret`` returns a truthy value for it; whatever is recorded
    is raised after all pairs have been tried.
    '''
    self._print_func_name('')
    if self.TODO_SKIP_FAIL:
        self.skipTest("Skipping test_topo_get_p2p_status as it fails (Inval parameters).")

    for src_idx, src in enumerate(self.processors):
        for dst_idx, dst in enumerate(self.processors):
            label = f'gpu({src_idx},{dst_idx}):'
            try:
                status = amdsmi.amdsmi_topo_get_p2p_status(src, dst)
                self._print(label, status)
            except amdsmi.AmdSmiLibraryException as err:
                should_fail = self._check_ret(label, err, self.PASS)
                if should_fail:
                    self.raise_exception = err

    if self.raise_exception:
        raise self.raise_exception
def test_get_gpu_busy_percent(self):
    '''Query amdsmi_get_gpu_busy_percent for each processor and print the result.

    A library exception is recorded in ``self.raise_exception`` when
    ``self._check_ret`` returns a truthy value for it; whatever is recorded
    is raised after all processors have been visited.
    '''
    self._print_func_name('')

    for idx, handle in enumerate(self.processors):
        label = f'gpu({idx}):'
        try:
            busy = amdsmi.amdsmi_get_gpu_busy_percent(handle)
            self._print(label, busy)
        except amdsmi.AmdSmiLibraryException as err:
            if self._check_ret(label, err, self.PASS):
                self.raise_exception = err

    if self.raise_exception:
        raise self.raise_exception
# Run the unittest command-line runner only when this file is executed
# directly, not when it is imported.
if '__main__' == __name__:
    unittest.main()