[SWDEV-553416] Fix amdsmi_get_gpu_reg_table_info and amdsmi_get_gpu_pm_metrics_info (#787)

Signed-off-by: Narlo, Joseph <Joseph.Narlo@amd.com>

[ROCm/amdsmi commit: ced7d12395]
Этот коммит содержится в:
Narlo, Joseph
2025-10-27 14:43:31 -05:00
коммит произвёл GitHub
родитель be53750aa3
Коммит 54317f3fe8
3 изменённых файлов: 122 добавлений и 62 удалений
+75 -20
Просмотреть файл
@@ -281,7 +281,7 @@ class AmdSmiTemperatureType(IntEnum):
GPUBOARD_NODE_OAM_X_04_HBM_D_VR = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_OAM_X_04_HBM_D_VR # OAM X 0.4V HBM D voltage regulator temperature
GPUBOARD_NODE_LAST = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_LAST
# GPU Board VR (Voltage Regulator) temperature
# GPU Board VR (Voltage Regulator) temperature
GPUBOARD_VDDCR_VDD0 = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_VDD0 # VDDCR VDD0 voltage regulator temperature
GPUBOARD_VDDCR_VDD1 = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_VDD1 # VDDCR VDD1 voltage regulator temperature
GPUBOARD_VDDCR_VDD2 = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_VDD2 # VDDCR VDD2 voltage regulator temperature
@@ -297,7 +297,7 @@ class AmdSmiTemperatureType(IntEnum):
GPUBOARD_VDDIO_11_E32 = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDIO_11_E32 # VDDIO 1.1V E32 voltage regulator temperature
GPUBOARD_VR_LAST = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VR_LAST
# Baseboard System temperature
# Baseboard System temperature
BASEBOARD_UBB_FPGA = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_FPGA # UBB FPGA temperature
BASEBOARD_UBB_FRONT = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_FRONT # UBB front temperature
BASEBOARD_UBB_BACK = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_BACK # UBB back temperature
@@ -2056,19 +2056,20 @@ def amdsmi_get_cpu_affinity_with_scope(
socket_count = amdsmi_get_cpu_socket_count()
sock_info = amdsmi_get_cpu_cores_per_socket(socket_count)
core_count = sock_info['cores_per_socket']
size = ctypes.c_uint32(0)
size = (socket_count * core_count)/ (ctypes.sizeof(ctypes.c_uint64) * 8)
size = int(math.ceil(size))
size = ctypes.c_uint32(size)
cpu_set = (ctypes.c_uint64 * size.value)()
_check_res(
amdsmi_wrapper.amdsmi_get_cpu_affinity_with_scope(
processor_handle, size, cpu_set, scope)
)
return cpu_set
def amdsmi_get_gpu_asic_info(
processor_handle: processor_handle_t,
) -> Dict[str, Any]:
@@ -2176,6 +2177,67 @@ def amdsmi_get_power_cap_info(
"max_power_cap": power_cap_info.max_power_cap}
def _get_name_value(num, data) -> List[Dict[str, int]]:
    """
    Extract a list of name-value pairs from a native record array.

    Workaround:
        Direct ctypes field access on an array of `amdsmi_name_value_t` is
        unreliable, so each record is copied out as raw bytes and decoded
        manually: a fixed-size, NUL-padded name field immediately followed
        by a uint64 value.

    Parameters:
        num (ctypes.c_uint32): Number of records in the array.
        data: ctypes pointer (or buffer) to the first record of the array.

    Returns:
        A list of dictionaries, one per record, each with the keys
        'name' (str) and 'value' (int). The list is empty when `num` is
        zero or `data` is a NULL pointer.
    """
    # Record layout used by this workaround:
    #   name  - AMDSMI_MAX_STRING_LENGTH / 4 bytes (64 = 256 / 4)
    #   value - uint64 (8 bytes), stored directly after the name
    name_size = AMDSMI_MAX_STRING_LENGTH // 4
    value_size = ctypes.sizeof(ctypes.c_uint64)
    record_size = name_size + value_size

    count = num.value
    # Never dereference a NULL pointer, whatever count was reported.
    if count == 0 or not data:
        return []

    # View the buffer as an array of fixed-size raw records.
    records = ctypes.cast(data, ctypes.POINTER(ctypes.c_char * record_size))

    results = []
    for i in range(count):
        # Copy exactly one record so that no read can leave its bounds.
        raw = records[i].raw

        # The name ends at the first NUL, or fills the whole field if the
        # producer did not terminate it.
        name_bytes = raw[:name_size].split(b'\x00', 1)[0]

        # The value lives at offset name_size inside the record, in native
        # byte order.
        value = ctypes.c_uint64.from_buffer_copy(raw, name_size).value

        results.append({
            'name': name_bytes.decode('utf-8', errors='replace'),
            'value': value
        })
    return results
def amdsmi_get_gpu_pm_metrics_info(
processor_handle: processor_handle_t,
) -> List[Dict[str, Any]]:
@@ -2185,7 +2247,7 @@ def amdsmi_get_gpu_pm_metrics_info(
)
pm_metrics = POINTER(amdsmi_wrapper.amdsmi_name_value_t)()
num_mets = ctypes.c_uint32()
num_mets = ctypes.c_uint32(0)
_check_res(
amdsmi_wrapper.amdsmi_get_gpu_pm_metrics_info(
@@ -2193,16 +2255,11 @@ def amdsmi_get_gpu_pm_metrics_info(
)
)
results = []
for i in range(num_mets.value):
item = {
'name': pm_metrics[i].name.decode('utf-8'),
'value': pm_metrics[i].value
}
results.append(item)
results = _get_name_value(num_mets, pm_metrics)
# Free the allocated memory
amdsmi_wrapper.amdsmi_free_name_value_pairs(pm_metrics)
return results
@@ -2219,18 +2276,15 @@ def amdsmi_get_gpu_reg_table_info(
_check_res(
amdsmi_wrapper.amdsmi_get_gpu_reg_table_info(
processor_handle, reg_type, reg_metrics, ctypes.byref(num_regs)
processor_handle, reg_type, ctypes.byref(reg_metrics), ctypes.byref(num_regs)
)
)
results = []
for i in range(num_regs.value):
item = {
'name': reg_metrics[i].name,
'value': reg_metrics[i].value
}
results.append(item)
results = _get_name_value(num_regs, reg_metrics)
# Free the allocated memory
amdsmi_wrapper.amdsmi_free_name_value_pairs(reg_metrics)
return results
@@ -5692,3 +5746,4 @@ def amdsmi_get_gpu_busy_percent(processor_handle: processor_handle_t):
gpu_busy_percent = ctypes.c_uint32(0)
_check_res(amdsmi_wrapper.amdsmi_get_gpu_busy_percent(processor_handle, ctypes.byref(gpu_busy_percent)))
return gpu_busy_percent.value
+8 -10
Просмотреть файл
@@ -27,24 +27,22 @@ import threading
import unittest
# Default path for AMDSMI_CLI_PATH is "/opt/rocm/libexec/amdsmi_cli/"
amdsmi_cli_path = os.environ.get("AMDSMI_CLI_PATH", "/opt/rocm/libexec/amdsmi_cli/")
if not os.path.exists(amdsmi_cli_path):
raise FileNotFoundError(f"AMDSMI_CLI_PATH '{amdsmi_cli_path}' does not exist. Please set the correct path in your environment.")
sys.path.append(amdsmi_cli_path)
amdsmi_path = os.environ.get("AMDSMI_PATH", "/opt/rocm/share/amd_smi")
if not os.path.exists(amdsmi_path):
raise FileNotFoundError(f"AMDSMI_PATH '{amdsmi_path}' does not exist. Please set the correct path in your environment.")
sys.path.append(amdsmi_path)
try:
import amdsmi, amdsmi.amdsmi_wrapper
except ImportError:
raise ImportError(f"Could not import the 'amdsmi' module from '{amdsmi_cli_path}'")
import amdsmi
except ImportError as exc:
raise ImportError(f'Could not import {amdsmi_path}') from exc
class TestAmdSmiInit(unittest.TestCase):
    """Smoke test for bringing the AMD SMI library up and down."""

    def test_init(self):
        """Call amdsmi_init() and then amdsmi_shut_down(); either may raise."""
        amdsmi.amdsmi_init()
        amdsmi.amdsmi_shut_down()
class TestAmdSmiPythonInterface(unittest.TestCase):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
+39 -32
Просмотреть файл
@@ -1,36 +1,41 @@
#!/usr/bin/env python3
#
# Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
# Copyright (C) Advanced Micro Devices. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# Permission is hereby granted, free of charge, to any person obtaining a copy of
# this software and associated documentation files (the "Software"), to deal in
# the Software without restriction, including without limitation the rights to
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
# the Software, and to permit persons to whom the Software is furnished to do so,
# subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
import ctypes
import inspect
import json
import os
import sys
import unittest
sys.path.append("/opt/rocm/libexec/amdsmi_cli/")
amdsmi_path = os.environ.get("AMDSMI_PATH", "/opt/rocm/share/amd_smi")
if not os.path.exists(amdsmi_path):
raise FileNotFoundError(f"AMDSMI_PATH '{amdsmi_path}' does not exist. Please set the correct path in your environment.")
sys.path.append(amdsmi_path)
try:
import amdsmi
except ImportError as exc:
raise ImportError("Could not import /opt/rocm/libexec/amdsmi_cli/amdsmi_cli.py") from exc
raise ImportError(f'Could not import {amdsmi_path}') from exc
not_supported_error_codes = \
[
@@ -220,26 +225,26 @@ class TestAmdSmiPythonBDF(unittest.TestCase):
# expect retry error to raise SmiRetryException
with self.assertRaises(amdsmi.AmdSmiRetryException) as retry_test:
amdsmi.amdsmi_interface._check_res(
(lambda: amdsmi.amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_RETRY)())
(lambda: amdsmi.amdsmi_wrapper.AMDSMI_STATUS_RETRY)())
# expect retry error to have AMDSMI_STATUS_RETRY error code
self.assertEqual(retry_test.exception.get_error_code(),
amdsmi.amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_RETRY)
amdsmi.amdsmi_wrapper.AMDSMI_STATUS_RETRY)
# expect timeout error to raise SmiTimeoutException
with self.assertRaises(amdsmi.AmdSmiTimeoutException) as timeout_test:
amdsmi.amdsmi_interface._check_res(
(lambda: amdsmi.amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_TIMEOUT)())
(lambda: amdsmi.amdsmi_wrapper.AMDSMI_STATUS_TIMEOUT)())
# expect timeout error to have AMDSMI_STATUS_TIMEOUT error code
self.assertEqual(timeout_test.exception.get_error_code(),
amdsmi.amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_TIMEOUT)
amdsmi.amdsmi_wrapper.AMDSMI_STATUS_TIMEOUT)
# expect invalid args error to raise AmdSmiLibraryException
with self.assertRaises(amdsmi.AmdSmiLibraryException) as inval_test:
amdsmi.amdsmi_interface._check_res(
(lambda: amdsmi.amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_INVAL)())
(lambda: amdsmi.amdsmi_wrapper.AMDSMI_STATUS_INVAL)())
# expect invalid args error to have AMDSMI_STATUS_INVAL error code
self.assertEqual(inval_test.exception.get_error_code(),
amdsmi.amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_INVAL)
amdsmi.amdsmi_wrapper.AMDSMI_STATUS_INVAL)
class TestAmdSmiPython(unittest.TestCase):
@@ -257,6 +262,10 @@ class TestAmdSmiPython(unittest.TestCase):
msg = f'asic info(gpu={i})'
ret = amdsmi.amdsmi_get_gpu_asic_info(gpu)
self._print(msg, ret)
except amdsmi.AmdSmiLibraryException as e:
raise e
for i, gpu in enumerate(self.processors):
try:
# Print board info
msg = f'board info(gpu={i})'
ret = amdsmi.amdsmi_get_gpu_board_info(gpu)
@@ -347,9 +356,9 @@ class TestAmdSmiPython(unittest.TestCase):
io_bw_encodings = \
[
('AGG_BW0', amdsmi.amdsmi_interface.amdsmi_wrapper.AGG_BW0, PASS),
('RD_BW0', amdsmi.amdsmi_interface.amdsmi_wrapper.RD_BW0, PASS),
('WR_BW0', amdsmi.amdsmi_interface.amdsmi_wrapper.WR_BW0, PASS)
('AGG_BW0', amdsmi.amdsmi_wrapper.AGG_BW0, PASS),
('RD_BW0', amdsmi.amdsmi_wrapper.RD_BW0, PASS),
('WR_BW0', amdsmi.amdsmi_wrapper.WR_BW0, PASS)
]
event_groups = \
@@ -567,10 +576,10 @@ class TestAmdSmiPython(unittest.TestCase):
print(msg, end='')
else:
print(msg)
if isinstance(data, str) or isinstance(data, int):
print(data)
else:
if isinstance(data, dict) or isinstance(data, list):
print(json.dumps(data, sort_keys=False, indent=4), flush=True)
else:
print(data)
return
def _print_func_name(self, msg):
@@ -1794,8 +1803,6 @@ class TestAmdSmiPython(unittest.TestCase):
def test_get_gpu_reg_table_info(self):
self._print_func_name('')
if self.TODO_SKIP_FAIL:
self.skipTest("Skipping test_get_gpu_reg_table_info as it fails on MI300.")
for i, gpu in enumerate(self.processors):
for reg_type_name, reg_type, reg_type_cond in self.reg_types:
msg = f'gpu({i}): reg_type({reg_type_name}):'
@@ -2179,7 +2186,7 @@ class TestAmdSmiPython(unittest.TestCase):
ret = amdsmi.amdsmi_get_processor_handle_from_bdf(bdf)
if gpu.value != ret.value:
msg += f'{msg}Expected: {gpu.value}, Received: {ret.value}'
self.raise_exception = amdsmi.AmdSmiLibraryException(amdsmi.amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_INVAL)
self.raise_exception = amdsmi.AmdSmiLibraryException(amdsmi.amdsmi_wrapper.AMDSMI_STATUS_INVAL)
else:
self._print(msg)
except amdsmi.AmdSmiLibraryException as e: