From ced7d12395e78ba151e57e903f136e180dcdce9a Mon Sep 17 00:00:00 2001 From: "Narlo, Joseph" Date: Mon, 27 Oct 2025 14:43:31 -0500 Subject: [PATCH] [SWDEV-553416] Fix amdsmi_get_gpu_reg_table_info and amdsmi_get_gpu_pm_metrics_info(#787) Signed-off-by: Narlo, Joseph --- py-interface/amdsmi_interface.py | 95 ++++++++++++++++++----- tests/python_unittest/integration_test.py | 18 ++--- tests/python_unittest/unit_tests.py | 71 +++++++++-------- 3 files changed, 122 insertions(+), 62 deletions(-) diff --git a/py-interface/amdsmi_interface.py b/py-interface/amdsmi_interface.py index 08dd540da2..348a341fc8 100644 --- a/py-interface/amdsmi_interface.py +++ b/py-interface/amdsmi_interface.py @@ -281,7 +281,7 @@ class AmdSmiTemperatureType(IntEnum): GPUBOARD_NODE_OAM_X_04_HBM_D_VR = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_OAM_X_04_HBM_D_VR # OAM X 0.4V HBM D voltage regulator temperature GPUBOARD_NODE_LAST = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_LAST - # GPU Board VR (Voltage Regulator) temperature + # GPU Board VR (Voltage Regulator) temperature GPUBOARD_VDDCR_VDD0 = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_VDD0 # VDDCR VDD0 voltage regulator temperature GPUBOARD_VDDCR_VDD1 = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_VDD1 # VDDCR VDD1 voltage regulator temperature GPUBOARD_VDDCR_VDD2 = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_VDD2 # VDDCR VDD2 voltage regulator temperature @@ -297,7 +297,7 @@ class AmdSmiTemperatureType(IntEnum): GPUBOARD_VDDIO_11_E32 = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDIO_11_E32 # VDDIO 1.1V E32 voltage regulator temperature GPUBOARD_VR_LAST = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VR_LAST - # Baseboard System temperature + # Baseboard System temperature BASEBOARD_UBB_FPGA = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_FPGA # UBB FPGA temperature BASEBOARD_UBB_FRONT = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_FRONT # UBB front temperature BASEBOARD_UBB_BACK = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_BACK # UBB back temperature @@ -2056,19 +2056,20 @@ def amdsmi_get_cpu_affinity_with_scope( socket_count = amdsmi_get_cpu_socket_count() sock_info = amdsmi_get_cpu_cores_per_socket(socket_count) core_count = sock_info['cores_per_socket'] - + size = ctypes.c_uint32(0) size = (socket_count * core_count)/ (ctypes.sizeof(ctypes.c_uint64) * 8) size = int(math.ceil(size)) size = ctypes.c_uint32(size) cpu_set = (ctypes.c_uint64 * size.value)() - + _check_res( amdsmi_wrapper.amdsmi_get_cpu_affinity_with_scope( processor_handle, size, cpu_set, scope) ) return cpu_set + def amdsmi_get_gpu_asic_info( processor_handle: processor_handle_t, ) -> Dict[str, Any]: @@ -2176,6 +2177,67 @@ def amdsmi_get_power_cap_info( "max_power_cap": power_cap_info.max_power_cap} +def _get_name_value(num, data) -> List[Dict[str, int]]: + """ + Extracts a list of name-value pairs from a ctypes array buffer. + + This function works around a ctypes array issue where direct field access + to the `amdsmi_name_value_t` structure is unreliable. Instead, it uses + memory operations to extract the 'name' (a 64-byte char array) and 'value' + (a uint64) from each structure in the array. + + Parameters: + num (ctypes.c_uint32): Number of elements in the array. + data (ctypes.c_void_p): Pointer to the start of the array buffer containing + `amdsmi_name_value_t` structures. + + Returns: + List[Dict[str, int]]: A list of dictionaries, each with keys 'name' (str) + and 'value' (int) extracted from the buffer. + + Workaround: + Direct access to the fields of the ctypes array is broken, so the function + uses memory alignment and pointer arithmetic to extract the fields manually. + """ + + # Work around ctypes array issue by using memory access + # Use 4 byte alignment for amdsmi_name_value_t.name char array, 64=256/4 + # Use 8 bytes for amdsmi_name_value_t.value uint64 + aligned_name_size = int(AMDSMI_MAX_STRING_LENGTH / 4) + value_size_bytes = 8 + struct_alignment = aligned_name_size + value_size_bytes + + # Access name,value field using memory operations since direct access is broken + struct_ptr = ctypes.cast(data, ctypes.POINTER(ctypes.c_char * struct_alignment)) + + results = [] + for i in range(num.value): + # Offset into structure array + current_struct = struct_ptr[i] + + # Cast address for name member with max chars to read + name_ptr = ctypes.cast(ctypes.addressof(current_struct), ctypes.POINTER(ctypes.c_char * AMDSMI_MAX_STRING_LENGTH)) + # Data buffer in bytes + name_bytes = ctypes.string_at(name_ptr.contents) + # Get string + name_str = name_bytes.rstrip(b'\x00').decode('utf-8', errors='replace') + + # Address for value member + addr_value = ctypes.addressof(current_struct) + struct_alignment + # Cast data buffer to a uint64 + int64_ptr = ctypes.cast(addr_value, ctypes.POINTER(ctypes.c_uint64)) + # Get value + value = int64_ptr.contents.value + + item = { + 'name': name_str, + 'value': value + } + results.append(item) + + return results + + def amdsmi_get_gpu_pm_metrics_info( processor_handle: processor_handle_t, ) -> List[Dict[str, Any]]: @@ -2185,7 +2247,7 @@ def amdsmi_get_gpu_pm_metrics_info( ) pm_metrics = POINTER(amdsmi_wrapper.amdsmi_name_value_t)() - num_mets = ctypes.c_uint32() + num_mets = ctypes.c_uint32(0) _check_res( amdsmi_wrapper.amdsmi_get_gpu_pm_metrics_info( @@ -2193,16 +2255,11 @@ def amdsmi_get_gpu_pm_metrics_info( ) ) - results = [] - for i in range(num_mets.value): - item = { - 'name': pm_metrics[i].name.decode('utf-8'), - 'value': pm_metrics[i].value - } - results.append(item) + results = _get_name_value(num_mets, pm_metrics) # Free the allocated memory amdsmi_wrapper.amdsmi_free_name_value_pairs(pm_metrics) + return results @@ -2219,18 +2276,15 @@ def amdsmi_get_gpu_reg_table_info( _check_res( amdsmi_wrapper.amdsmi_get_gpu_reg_table_info( - processor_handle, reg_type, reg_metrics, ctypes.byref(num_regs) + processor_handle, reg_type, ctypes.byref(reg_metrics), ctypes.byref(num_regs) ) ) - results = [] - for i in range(num_regs.value): - item = { - 'name': reg_metrics[i].name, - 'value': reg_metrics[i].value - } - results.append(item) + results = _get_name_value(num_regs, reg_metrics) + + # Free the allocated memory amdsmi_wrapper.amdsmi_free_name_value_pairs(reg_metrics) + return results @@ -5692,3 +5746,4 @@ def amdsmi_get_gpu_busy_percent(processor_handle: processor_handle_t): gpu_busy_percent = ctypes.c_uint32(0) _check_res(amdsmi_wrapper.amdsmi_get_gpu_busy_percent(processor_handle, ctypes.byref(gpu_busy_percent))) return gpu_busy_percent.value + diff --git a/tests/python_unittest/integration_test.py b/tests/python_unittest/integration_test.py index 6107906f33..578a92417a 100755 --- a/tests/python_unittest/integration_test.py +++ b/tests/python_unittest/integration_test.py @@ -27,24 +27,22 @@ import threading import unittest -# Default path for AMDSMI_CLI_PATH is "/opt/rocm/libexec/amdsmi_cli/" -amdsmi_cli_path = os.environ.get("AMDSMI_CLI_PATH", "/opt/rocm/libexec/amdsmi_cli/") -if not os.path.exists(amdsmi_cli_path): - raise FileNotFoundError(f"AMDSMI_CLI_PATH '{amdsmi_cli_path}' does not exist. Please set the correct path in your environment.") -sys.path.append(amdsmi_cli_path) +amdsmi_path = os.environ.get("AMDSMI_PATH", "/opt/rocm/share/amd_smi") +if not os.path.exists(amdsmi_path): + raise FileNotFoundError(f"AMDSMI_PATH '{amdsmi_path}' does not exist. Please set the correct path in your environment.") +sys.path.append(amdsmi_path) + try: - import amdsmi, amdsmi.amdsmi_wrapper -except ImportError: - raise ImportError(f"Could not import the 'amdsmi' module from '{amdsmi_cli_path}'") + import amdsmi +except ImportError as exc: + raise ImportError(f'Could not import {amdsmi_path}') from exc class TestAmdSmiInit(unittest.TestCase): - def test_init(self): amdsmi.amdsmi_init() amdsmi.amdsmi_shut_down() class TestAmdSmiPythonInterface(unittest.TestCase): - def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) diff --git a/tests/python_unittest/unit_tests.py b/tests/python_unittest/unit_tests.py index 08532ff985..2f3678e819 100755 --- a/tests/python_unittest/unit_tests.py +++ b/tests/python_unittest/unit_tests.py @@ -1,36 +1,41 @@ #!/usr/bin/env python3 # -# Copyright (c) Advanced Micro Devices, Inc. All rights reserved. +# Copyright (C) Advanced Micro Devices. All rights reserved. # -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: # -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -# THE SOFTWARE. +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. import ctypes import inspect import json +import os import sys import unittest -sys.path.append("/opt/rocm/libexec/amdsmi_cli/") + + +amdsmi_path = os.environ.get("AMDSMI_PATH", "/opt/rocm/share/amd_smi") +if not os.path.exists(amdsmi_path): + raise FileNotFoundError(f"AMDSMI_PATH '{amdsmi_path}' does not exist. Please set the correct path in your environment.") +sys.path.append(amdsmi_path) try: import amdsmi except ImportError as exc: - raise ImportError("Could not import /opt/rocm/libexec/amdsmi_cli/amdsmi_cli.py") from exc + raise ImportError(f'Could not import {amdsmi_path}') from exc not_supported_error_codes = \ [ @@ -220,26 +225,26 @@ class TestAmdSmiPythonBDF(unittest.TestCase): # expect retry error to raise SmiRetryException with self.assertRaises(amdsmi.AmdSmiRetryException) as retry_test: amdsmi.amdsmi_interface._check_res( - (lambda: amdsmi.amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_RETRY)()) + (lambda: amdsmi.amdsmi_wrapper.AMDSMI_STATUS_RETRY)()) # except retry error to have AMDSMI_STATUS_RETRY error code self.assertEqual(retry_test.exception.get_error_code(), - amdsmi.amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_RETRY) + amdsmi.amdsmi_wrapper.AMDSMI_STATUS_RETRY) # expect timeout error to raise SmiTimeoutException with self.assertRaises(amdsmi.AmdSmiTimeoutException) as timeout_test: amdsmi.amdsmi_interface._check_res( - (lambda: amdsmi.amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_TIMEOUT)()) + (lambda: amdsmi.amdsmi_wrapper.AMDSMI_STATUS_TIMEOUT)()) # except timeout error to have AMDSMI_STATUS_RETRY error code self.assertEqual(timeout_test.exception.get_error_code(), - amdsmi.amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_TIMEOUT) + amdsmi.amdsmi_wrapper.AMDSMI_STATUS_TIMEOUT) # expect invalid args error to raise AmdSmiLibraryException with self.assertRaises(amdsmi.AmdSmiLibraryException) as inval_test: amdsmi.amdsmi_interface._check_res( - (lambda: amdsmi.amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_INVAL)()) + (lambda: amdsmi.amdsmi_wrapper.AMDSMI_STATUS_INVAL)()) # expect invalid args error to have AMDSMI_STATUS_INVAL error code self.assertEqual(inval_test.exception.get_error_code(), - amdsmi.amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_INVAL) + amdsmi.amdsmi_wrapper.AMDSMI_STATUS_INVAL) class TestAmdSmiPython(unittest.TestCase): @@ -257,6 +262,10 @@ class TestAmdSmiPython(unittest.TestCase): msg = f'asic info(gpu={i})' ret = amdsmi.amdsmi_get_gpu_asic_info(gpu) self._print(msg, ret) + except amdsmi.AmdSmiLibraryException as e: + raise e + for i, gpu in enumerate(self.processors): + try: # Print board info msg = f'board info(gpu={i})' ret = amdsmi.amdsmi_get_gpu_board_info(gpu) @@ -347,9 +356,9 @@ class TestAmdSmiPython(unittest.TestCase): io_bw_encodings = \ [ - ('AGG_BW0', amdsmi.amdsmi_interface.amdsmi_wrapper.AGG_BW0, PASS), - ('RD_BW0', amdsmi.amdsmi_interface.amdsmi_wrapper.RD_BW0, PASS), - ('WR_BW0', amdsmi.amdsmi_interface.amdsmi_wrapper.WR_BW0, PASS) + ('AGG_BW0', amdsmi.amdsmi_wrapper.AGG_BW0, PASS), + ('RD_BW0', amdsmi.amdsmi_wrapper.RD_BW0, PASS), + ('WR_BW0', amdsmi.amdsmi_wrapper.WR_BW0, PASS) ] event_groups = \ @@ -567,10 +576,10 @@ class TestAmdSmiPython(unittest.TestCase): print(msg, end='') else: print(msg) - if isinstance(data, str) or isinstance(data, int): - print(data) - else: + if isinstance(data, dict) or isinstance(data, list): print(json.dumps(data, sort_keys=False, indent=4), flush=True) + else: + print(data) return def _print_func_name(self, msg): @@ -1794,8 +1803,6 @@ class TestAmdSmiPython(unittest.TestCase): def test_get_gpu_reg_table_info(self): self._print_func_name('') - if self.TODO_SKIP_FAIL: - self.skipTest("Skipping test_get_gpu_reg_table_info as it fails on MI300.") for i, gpu in enumerate(self.processors): for reg_type_name, reg_type, reg_type_cond in self.reg_types: msg = f'gpu({i}): reg_type({reg_type_name}):' @@ -2179,7 +2186,7 @@ class TestAmdSmiPython(unittest.TestCase): ret = amdsmi.amdsmi_get_processor_handle_from_bdf(bdf) if gpu.value != ret.value: msg += f'{msg}Expected: {gpu.value}, Received: {ret.value}' - self.raise_exception = amdsmi.AmdSmiLibraryException(amdsmi.amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_INVAL) + self.raise_exception = amdsmi.AmdSmiLibraryException(amdsmi.amdsmi_wrapper.AMDSMI_STATUS_INVAL) else: self._print(msg) except amdsmi.AmdSmiLibraryException as e: