[SWDEV-553416] Fix amdsmi_get_gpu_reg_table_info and amdsmi_get_gpu_pm_metrics_info(#787)
Signed-off-by: Narlo, Joseph <Joseph.Narlo@amd.com>
This commit is contained in:
@@ -281,7 +281,7 @@ class AmdSmiTemperatureType(IntEnum):
|
||||
GPUBOARD_NODE_OAM_X_04_HBM_D_VR = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_OAM_X_04_HBM_D_VR # OAM X 0.4V HBM D voltage regulator temperature
|
||||
GPUBOARD_NODE_LAST = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_LAST
|
||||
|
||||
# GPU Board VR (Voltage Regulator) temperature
|
||||
# GPU Board VR (Voltage Regulator) temperature
|
||||
GPUBOARD_VDDCR_VDD0 = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_VDD0 # VDDCR VDD0 voltage regulator temperature
|
||||
GPUBOARD_VDDCR_VDD1 = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_VDD1 # VDDCR VDD1 voltage regulator temperature
|
||||
GPUBOARD_VDDCR_VDD2 = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_VDD2 # VDDCR VDD2 voltage regulator temperature
|
||||
@@ -297,7 +297,7 @@ class AmdSmiTemperatureType(IntEnum):
|
||||
GPUBOARD_VDDIO_11_E32 = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDIO_11_E32 # VDDIO 1.1V E32 voltage regulator temperature
|
||||
GPUBOARD_VR_LAST = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VR_LAST
|
||||
|
||||
# Baseboard System temperature
|
||||
# Baseboard System temperature
|
||||
BASEBOARD_UBB_FPGA = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_FPGA # UBB FPGA temperature
|
||||
BASEBOARD_UBB_FRONT = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_FRONT # UBB front temperature
|
||||
BASEBOARD_UBB_BACK = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_BACK # UBB back temperature
|
||||
@@ -2056,19 +2056,20 @@ def amdsmi_get_cpu_affinity_with_scope(
|
||||
socket_count = amdsmi_get_cpu_socket_count()
|
||||
sock_info = amdsmi_get_cpu_cores_per_socket(socket_count)
|
||||
core_count = sock_info['cores_per_socket']
|
||||
|
||||
|
||||
size = ctypes.c_uint32(0)
|
||||
size = (socket_count * core_count)/ (ctypes.sizeof(ctypes.c_uint64) * 8)
|
||||
size = int(math.ceil(size))
|
||||
size = ctypes.c_uint32(size)
|
||||
cpu_set = (ctypes.c_uint64 * size.value)()
|
||||
|
||||
|
||||
_check_res(
|
||||
amdsmi_wrapper.amdsmi_get_cpu_affinity_with_scope(
|
||||
processor_handle, size, cpu_set, scope)
|
||||
)
|
||||
return cpu_set
|
||||
|
||||
|
||||
def amdsmi_get_gpu_asic_info(
|
||||
processor_handle: processor_handle_t,
|
||||
) -> Dict[str, Any]:
|
||||
@@ -2176,6 +2177,67 @@ def amdsmi_get_power_cap_info(
|
||||
"max_power_cap": power_cap_info.max_power_cap}
|
||||
|
||||
|
||||
def _get_name_value(num, data) -> List[Dict[str, int]]:
|
||||
"""
|
||||
Extracts a list of name-value pairs from a ctypes array buffer.
|
||||
|
||||
This function works around a ctypes array issue where direct field access
|
||||
to the `amdsmi_name_value_t` structure is unreliable. Instead, it uses
|
||||
memory operations to extract the 'name' (a 64-byte char array) and 'value'
|
||||
(a uint64) from each structure in the array.
|
||||
|
||||
Parameters:
|
||||
num (ctypes.c_uint32): Number of elements in the array.
|
||||
data (ctypes.c_void_p): Pointer to the start of the array buffer containing
|
||||
`amdsmi_name_value_t` structures.
|
||||
|
||||
Returns:
|
||||
List[Dict[str, int]]: A list of dictionaries, each with keys 'name' (str)
|
||||
and 'value' (int) extracted from the buffer.
|
||||
|
||||
Workaround:
|
||||
Direct access to the fields of the ctypes array is broken, so the function
|
||||
uses memory alignment and pointer arithmetic to extract the fields manually.
|
||||
"""
|
||||
|
||||
# Work around ctypes array issue by using memory access
|
||||
# Use 4 byte alignment for amdsmi_name_value_t.name char array, 64=256/4
|
||||
# Use 8 bytes for amdsmi_name_value_t.value uint64
|
||||
aligned_name_size = int(AMDSMI_MAX_STRING_LENGTH / 4)
|
||||
value_size_bytes = 8
|
||||
struct_alignment = aligned_name_size + value_size_bytes
|
||||
|
||||
# Access name,value field using memory operations since direct access is broken
|
||||
struct_ptr = ctypes.cast(data, ctypes.POINTER(ctypes.c_char * struct_alignment))
|
||||
|
||||
results = []
|
||||
for i in range(num.value):
|
||||
# Offset into structure array
|
||||
current_struct = struct_ptr[i]
|
||||
|
||||
# Cast address for name member with max chars to read
|
||||
name_ptr = ctypes.cast(ctypes.addressof(current_struct), ctypes.POINTER(ctypes.c_char * AMDSMI_MAX_STRING_LENGTH))
|
||||
# Data buffer in bytes
|
||||
name_bytes = ctypes.string_at(name_ptr.contents)
|
||||
# Get string
|
||||
name_str = name_bytes.rstrip(b'\x00').decode('utf-8', errors='replace')
|
||||
|
||||
# Address for value member
|
||||
addr_value = ctypes.addressof(current_struct) + struct_alignment
|
||||
# Cast data buffer to a uint64
|
||||
int64_ptr = ctypes.cast(addr_value, ctypes.POINTER(ctypes.c_uint64))
|
||||
# Get value
|
||||
value = int64_ptr.contents.value
|
||||
|
||||
item = {
|
||||
'name': name_str,
|
||||
'value': value
|
||||
}
|
||||
results.append(item)
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def amdsmi_get_gpu_pm_metrics_info(
|
||||
processor_handle: processor_handle_t,
|
||||
) -> List[Dict[str, Any]]:
|
||||
@@ -2185,7 +2247,7 @@ def amdsmi_get_gpu_pm_metrics_info(
|
||||
)
|
||||
|
||||
pm_metrics = POINTER(amdsmi_wrapper.amdsmi_name_value_t)()
|
||||
num_mets = ctypes.c_uint32()
|
||||
num_mets = ctypes.c_uint32(0)
|
||||
|
||||
_check_res(
|
||||
amdsmi_wrapper.amdsmi_get_gpu_pm_metrics_info(
|
||||
@@ -2193,16 +2255,11 @@ def amdsmi_get_gpu_pm_metrics_info(
|
||||
)
|
||||
)
|
||||
|
||||
results = []
|
||||
for i in range(num_mets.value):
|
||||
item = {
|
||||
'name': pm_metrics[i].name.decode('utf-8'),
|
||||
'value': pm_metrics[i].value
|
||||
}
|
||||
results.append(item)
|
||||
results = _get_name_value(num_mets, pm_metrics)
|
||||
|
||||
# Free the allocated memory
|
||||
amdsmi_wrapper.amdsmi_free_name_value_pairs(pm_metrics)
|
||||
|
||||
return results
|
||||
|
||||
|
||||
@@ -2219,18 +2276,15 @@ def amdsmi_get_gpu_reg_table_info(
|
||||
|
||||
_check_res(
|
||||
amdsmi_wrapper.amdsmi_get_gpu_reg_table_info(
|
||||
processor_handle, reg_type, reg_metrics, ctypes.byref(num_regs)
|
||||
processor_handle, reg_type, ctypes.byref(reg_metrics), ctypes.byref(num_regs)
|
||||
)
|
||||
)
|
||||
|
||||
results = []
|
||||
for i in range(num_regs.value):
|
||||
item = {
|
||||
'name': reg_metrics[i].name,
|
||||
'value': reg_metrics[i].value
|
||||
}
|
||||
results.append(item)
|
||||
results = _get_name_value(num_regs, reg_metrics)
|
||||
|
||||
# Free the allocated memory
|
||||
amdsmi_wrapper.amdsmi_free_name_value_pairs(reg_metrics)
|
||||
|
||||
return results
|
||||
|
||||
|
||||
@@ -5692,3 +5746,4 @@ def amdsmi_get_gpu_busy_percent(processor_handle: processor_handle_t):
|
||||
gpu_busy_percent = ctypes.c_uint32(0)
|
||||
_check_res(amdsmi_wrapper.amdsmi_get_gpu_busy_percent(processor_handle, ctypes.byref(gpu_busy_percent)))
|
||||
return gpu_busy_percent.value
|
||||
|
||||
|
||||
@@ -27,24 +27,22 @@ import threading
|
||||
import unittest
|
||||
|
||||
|
||||
# Default path for AMDSMI_CLI_PATH is "/opt/rocm/libexec/amdsmi_cli/"
|
||||
amdsmi_cli_path = os.environ.get("AMDSMI_CLI_PATH", "/opt/rocm/libexec/amdsmi_cli/")
|
||||
if not os.path.exists(amdsmi_cli_path):
|
||||
raise FileNotFoundError(f"AMDSMI_CLI_PATH '{amdsmi_cli_path}' does not exist. Please set the correct path in your environment.")
|
||||
sys.path.append(amdsmi_cli_path)
|
||||
amdsmi_path = os.environ.get("AMDSMI_PATH", "/opt/rocm/share/amd_smi")
|
||||
if not os.path.exists(amdsmi_path):
|
||||
raise FileNotFoundError(f"AMDSMI_PATH '{amdsmi_path}' does not exist. Please set the correct path in your environment.")
|
||||
sys.path.append(amdsmi_path)
|
||||
|
||||
try:
|
||||
import amdsmi, amdsmi.amdsmi_wrapper
|
||||
except ImportError:
|
||||
raise ImportError(f"Could not import the 'amdsmi' module from '{amdsmi_cli_path}'")
|
||||
import amdsmi
|
||||
except ImportError as exc:
|
||||
raise ImportError(f'Could not import {amdsmi_path}') from exc
|
||||
|
||||
class TestAmdSmiInit(unittest.TestCase):
|
||||
|
||||
def test_init(self):
|
||||
amdsmi.amdsmi_init()
|
||||
amdsmi.amdsmi_shut_down()
|
||||
|
||||
class TestAmdSmiPythonInterface(unittest.TestCase):
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
|
||||
@@ -1,36 +1,41 @@
|
||||
#!/usr/bin/env python3
|
||||
#
|
||||
# Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
|
||||
# Copyright (C) Advanced Micro Devices. All rights reserved.
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||
# this software and associated documentation files (the "Software"), to deal in
|
||||
# the Software without restriction, including without limitation the rights to
|
||||
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
|
||||
# the Software, and to permit persons to whom the Software is furnished to do so,
|
||||
# subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in
|
||||
# all copies or substantial portions of the Software.
|
||||
# The above copyright notice and this permission notice shall be included in all
|
||||
# copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
# THE SOFTWARE.
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
|
||||
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
||||
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
|
||||
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
|
||||
import ctypes
|
||||
import inspect
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import unittest
|
||||
sys.path.append("/opt/rocm/libexec/amdsmi_cli/")
|
||||
|
||||
|
||||
amdsmi_path = os.environ.get("AMDSMI_PATH", "/opt/rocm/share/amd_smi")
|
||||
if not os.path.exists(amdsmi_path):
|
||||
raise FileNotFoundError(f"AMDSMI_PATH '{amdsmi_path}' does not exist. Please set the correct path in your environment.")
|
||||
sys.path.append(amdsmi_path)
|
||||
|
||||
try:
|
||||
import amdsmi
|
||||
except ImportError as exc:
|
||||
raise ImportError("Could not import /opt/rocm/libexec/amdsmi_cli/amdsmi_cli.py") from exc
|
||||
raise ImportError(f'Could not import {amdsmi_path}') from exc
|
||||
|
||||
not_supported_error_codes = \
|
||||
[
|
||||
@@ -220,26 +225,26 @@ class TestAmdSmiPythonBDF(unittest.TestCase):
|
||||
# expect retry error to raise SmiRetryException
|
||||
with self.assertRaises(amdsmi.AmdSmiRetryException) as retry_test:
|
||||
amdsmi.amdsmi_interface._check_res(
|
||||
(lambda: amdsmi.amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_RETRY)())
|
||||
(lambda: amdsmi.amdsmi_wrapper.AMDSMI_STATUS_RETRY)())
|
||||
# except retry error to have AMDSMI_STATUS_RETRY error code
|
||||
self.assertEqual(retry_test.exception.get_error_code(),
|
||||
amdsmi.amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_RETRY)
|
||||
amdsmi.amdsmi_wrapper.AMDSMI_STATUS_RETRY)
|
||||
|
||||
# expect timeout error to raise SmiTimeoutException
|
||||
with self.assertRaises(amdsmi.AmdSmiTimeoutException) as timeout_test:
|
||||
amdsmi.amdsmi_interface._check_res(
|
||||
(lambda: amdsmi.amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_TIMEOUT)())
|
||||
(lambda: amdsmi.amdsmi_wrapper.AMDSMI_STATUS_TIMEOUT)())
|
||||
# except timeout error to have AMDSMI_STATUS_RETRY error code
|
||||
self.assertEqual(timeout_test.exception.get_error_code(),
|
||||
amdsmi.amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_TIMEOUT)
|
||||
amdsmi.amdsmi_wrapper.AMDSMI_STATUS_TIMEOUT)
|
||||
|
||||
# expect invalid args error to raise AmdSmiLibraryException
|
||||
with self.assertRaises(amdsmi.AmdSmiLibraryException) as inval_test:
|
||||
amdsmi.amdsmi_interface._check_res(
|
||||
(lambda: amdsmi.amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_INVAL)())
|
||||
(lambda: amdsmi.amdsmi_wrapper.AMDSMI_STATUS_INVAL)())
|
||||
# expect invalid args error to have AMDSMI_STATUS_INVAL error code
|
||||
self.assertEqual(inval_test.exception.get_error_code(),
|
||||
amdsmi.amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_INVAL)
|
||||
amdsmi.amdsmi_wrapper.AMDSMI_STATUS_INVAL)
|
||||
|
||||
class TestAmdSmiPython(unittest.TestCase):
|
||||
|
||||
@@ -257,6 +262,10 @@ class TestAmdSmiPython(unittest.TestCase):
|
||||
msg = f'asic info(gpu={i})'
|
||||
ret = amdsmi.amdsmi_get_gpu_asic_info(gpu)
|
||||
self._print(msg, ret)
|
||||
except amdsmi.AmdSmiLibraryException as e:
|
||||
raise e
|
||||
for i, gpu in enumerate(self.processors):
|
||||
try:
|
||||
# Print board info
|
||||
msg = f'board info(gpu={i})'
|
||||
ret = amdsmi.amdsmi_get_gpu_board_info(gpu)
|
||||
@@ -347,9 +356,9 @@ class TestAmdSmiPython(unittest.TestCase):
|
||||
|
||||
io_bw_encodings = \
|
||||
[
|
||||
('AGG_BW0', amdsmi.amdsmi_interface.amdsmi_wrapper.AGG_BW0, PASS),
|
||||
('RD_BW0', amdsmi.amdsmi_interface.amdsmi_wrapper.RD_BW0, PASS),
|
||||
('WR_BW0', amdsmi.amdsmi_interface.amdsmi_wrapper.WR_BW0, PASS)
|
||||
('AGG_BW0', amdsmi.amdsmi_wrapper.AGG_BW0, PASS),
|
||||
('RD_BW0', amdsmi.amdsmi_wrapper.RD_BW0, PASS),
|
||||
('WR_BW0', amdsmi.amdsmi_wrapper.WR_BW0, PASS)
|
||||
]
|
||||
|
||||
event_groups = \
|
||||
@@ -567,10 +576,10 @@ class TestAmdSmiPython(unittest.TestCase):
|
||||
print(msg, end='')
|
||||
else:
|
||||
print(msg)
|
||||
if isinstance(data, str) or isinstance(data, int):
|
||||
print(data)
|
||||
else:
|
||||
if isinstance(data, dict) or isinstance(data, list):
|
||||
print(json.dumps(data, sort_keys=False, indent=4), flush=True)
|
||||
else:
|
||||
print(data)
|
||||
return
|
||||
|
||||
def _print_func_name(self, msg):
|
||||
@@ -1794,8 +1803,6 @@ class TestAmdSmiPython(unittest.TestCase):
|
||||
|
||||
def test_get_gpu_reg_table_info(self):
|
||||
self._print_func_name('')
|
||||
if self.TODO_SKIP_FAIL:
|
||||
self.skipTest("Skipping test_get_gpu_reg_table_info as it fails on MI300.")
|
||||
for i, gpu in enumerate(self.processors):
|
||||
for reg_type_name, reg_type, reg_type_cond in self.reg_types:
|
||||
msg = f'gpu({i}): reg_type({reg_type_name}):'
|
||||
@@ -2179,7 +2186,7 @@ class TestAmdSmiPython(unittest.TestCase):
|
||||
ret = amdsmi.amdsmi_get_processor_handle_from_bdf(bdf)
|
||||
if gpu.value != ret.value:
|
||||
msg += f'{msg}Expected: {gpu.value}, Received: {ret.value}'
|
||||
self.raise_exception = amdsmi.AmdSmiLibraryException(amdsmi.amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_INVAL)
|
||||
self.raise_exception = amdsmi.AmdSmiLibraryException(amdsmi.amdsmi_wrapper.AMDSMI_STATUS_INVAL)
|
||||
else:
|
||||
self._print(msg)
|
||||
except amdsmi.AmdSmiLibraryException as e:
|
||||
|
||||
Reference in New Issue
Block a user