Query UBB/OAM temperature API (#581)

Add support for querying UBB/OAM temperatures.
* Updated Python API with new temperature metrics enum

---------

Co-authored-by: Bill Liu <shuzhliu@amd.com>
Co-authored-by: gabrpham_amdeng <Gabriel.Pham@amd.com>
This commit is contained in:
Liu, Shuzhou (Bill)
2025-08-05 21:37:45 -04:00
committed by GitHub
parent 753a5ea326
commit abd3c02a3c
12 changed files with 964 additions and 25 deletions
+5 -2
View File
@@ -187,7 +187,8 @@ set(CMN_SRC_LIST
"${ROCM_SRC_DIR}/rocm_smi.cc"
"${ROCM_SRC_DIR}/rocm_smi_logger.cc"
"${SHR_MUTEX_DIR}/shared_mutex.cc"
"${ROCM_SRC_DIR}/rocm_smi_binary_parser.cc")
"${ROCM_SRC_DIR}/rocm_smi_binary_parser.cc"
"${ROCM_SRC_DIR}/rocm_smi_board_temp.cc")
if(ENABLE_ESMI_LIB)
list(APPEND CMN_SRC_LIST ${ESMI_SRC_DIR}/e_smi.c)
@@ -211,7 +212,9 @@ set(CMN_INC_LIST
"${ROCM_INC_DIR}/rocm_smi.h"
"${ROCM_INC_DIR}/rocm_smi_logger.h"
"${SHR_MUTEX_DIR}/shared_mutex.h"
"${ROCM_INC_DIR}/rocm_smi_binary_parser.h")
"${ROCM_INC_DIR}/rocm_smi_binary_parser.h"
"${ROCM_INC_DIR}/rocm_smi_board_temp.h"
)
add_subdirectory("rocm_smi")
add_subdirectory("src")
+60 -1
View File
@@ -464,7 +464,66 @@ typedef enum {
AMDSMI_TEMPERATURE_TYPE_HBM_2, //!< High Bandwidth 2 temperature per stack
AMDSMI_TEMPERATURE_TYPE_HBM_3, //!< High Bandwidth 3 temperature per stack
AMDSMI_TEMPERATURE_TYPE_PLX, //!< PCIe switch temperature
AMDSMI_TEMPERATURE_TYPE__MAX = AMDSMI_TEMPERATURE_TYPE_PLX
// GPU Board Node temperature
AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_FIRST = 100,
AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_RETIMER_X
= AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_FIRST, //!< Retimer X temperature
AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_OAM_X_IBC, //!< OAM X IBC temperature
AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_OAM_X_IBC_2, //!< OAM X IBC 2 temperature
AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_OAM_X_VDD18_VR, //!< OAM X VDD 1.8V voltage regulator temperature
AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_OAM_X_04_HBM_B_VR, //!< OAM X 0.4V HBM B voltage regulator temperature
AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_OAM_X_04_HBM_D_VR, //!< OAM X 0.4V HBM D voltage regulator temperature
AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_LAST = 149,
// GPU Board VR (Voltage Regulator) temperature
AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VR_FIRST = 150,
AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_VDD0
= AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VR_FIRST, //!< VDDCR VDD0 voltage regulator temperature
AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_VDD1, //!< VDDCR VDD1 voltage regulator temperature
AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_VDD2, //!< VDDCR VDD2 voltage regulator temperature
AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_VDD3, //!< VDDCR VDD3 voltage regulator temperature
AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_SOC_A, //!< VDDCR SOC A voltage regulator temperature
AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_SOC_C, //!< VDDCR SOC C voltage regulator temperature
AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_SOCIO_A, //!< VDDCR SOCIO A voltage regulator temperature
AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_SOCIO_C, //!< VDDCR SOCIO C voltage regulator temperature
AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDD_085_HBM, //!< VDD 0.85V HBM voltage regulator temperature
AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_11_HBM_B, //!< VDDCR 1.1V HBM B voltage regulator temperature
AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_11_HBM_D, //!< VDDCR 1.1V HBM D voltage regulator temperature
AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDD_USR, //!< VDD USR voltage regulator temperature
AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDIO_11_E32, //!< VDDIO 1.1V E32 voltage regulator temperature
AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VR_LAST = 199,
// Baseboard System temperature
AMDSMI_TEMPERATURE_TYPE_BASEBOARD_FIRST = 200,
AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_FPGA
= AMDSMI_TEMPERATURE_TYPE_BASEBOARD_FIRST, //!< UBB FPGA temperature
AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_FRONT, //!< UBB front temperature
AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_BACK, //!< UBB back temperature
AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_OAM7, //!< UBB OAM7 temperature
AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_IBC, //!< UBB IBC temperature
AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_UFPGA, //!< UBB UFPGA temperature
AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_OAM1, //!< UBB OAM1 temperature
AMDSMI_TEMPERATURE_TYPE_BASEBOARD_OAM_0_1_HSC, //!< OAM 0-1 HSC temperature
AMDSMI_TEMPERATURE_TYPE_BASEBOARD_OAM_2_3_HSC, //!< OAM 2-3 HSC temperature
AMDSMI_TEMPERATURE_TYPE_BASEBOARD_OAM_4_5_HSC, //!< OAM 4-5 HSC temperature
AMDSMI_TEMPERATURE_TYPE_BASEBOARD_OAM_6_7_HSC, //!< OAM 6-7 HSC temperature
AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_FPGA_0V72_VR, //!< UBB FPGA 0.72V voltage regulator temperature
AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_FPGA_3V3_VR, //!< UBB FPGA 3.3V voltage regulator temperature
AMDSMI_TEMPERATURE_TYPE_BASEBOARD_RETIMER_0_1_2_3_1V2_VR, //!< Retimer 0-1-2-3 1.2V voltage regulator temperature
AMDSMI_TEMPERATURE_TYPE_BASEBOARD_RETIMER_4_5_6_7_1V2_VR, //!< Retimer 4-5-6-7 1.2V voltage regulator temperature
AMDSMI_TEMPERATURE_TYPE_BASEBOARD_RETIMER_0_1_0V9_VR, //!< Retimer 0-1 0.9V voltage regulator temperature
AMDSMI_TEMPERATURE_TYPE_BASEBOARD_RETIMER_4_5_0V9_VR, //!< Retimer 4-5 0.9V voltage regulator temperature
AMDSMI_TEMPERATURE_TYPE_BASEBOARD_RETIMER_2_3_0V9_VR, //!< Retimer 2-3 0.9V voltage regulator temperature
AMDSMI_TEMPERATURE_TYPE_BASEBOARD_RETIMER_6_7_0V9_VR, //!< Retimer 6-7 0.9V voltage regulator temperature
AMDSMI_TEMPERATURE_TYPE_BASEBOARD_OAM_0_1_2_3_3V3_VR, //!< OAM 0-1-2-3 3.3V voltage regulator temperature
AMDSMI_TEMPERATURE_TYPE_BASEBOARD_OAM_4_5_6_7_3V3_VR, //!< OAM 4-5-6-7 3.3V voltage regulator temperature
AMDSMI_TEMPERATURE_TYPE_BASEBOARD_IBC_HSC, //!< IBC HSC temperature
AMDSMI_TEMPERATURE_TYPE_BASEBOARD_IBC, //!< IBC temperature
AMDSMI_TEMPERATURE_TYPE_BASEBOARD_LAST = 249,
AMDSMI_TEMPERATURE_TYPE__MAX = AMDSMI_TEMPERATURE_TYPE_BASEBOARD_LAST, //!< Maximum per GPU temperature type
} amdsmi_temperature_type_t;
/**
+55
View File
@@ -222,6 +222,61 @@ class AmdSmiTemperatureType(IntEnum):
HBM_3 = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_HBM_3
PLX = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_PLX
# GPU Board Node temperature
GPUBOARD_NODE_FIRST = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_FIRST
GPUBOARD_NODE_RETIMER_X = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_RETIMER_X # Retimer X temperature
GPUBOARD_NODE_OAM_X_IBC = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_OAM_X_IBC # OAM X IBC temperature
GPUBOARD_NODE_OAM_X_IBC_2 = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_OAM_X_IBC_2 # OAM X IBC 2 temperature
GPUBOARD_NODE_OAM_X_VDD18_VR = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_OAM_X_VDD18_VR # OAM X VDD 1.8V voltage regulator temperature
GPUBOARD_NODE_OAM_X_04_HBM_B_VR = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_OAM_X_04_HBM_B_VR # OAM X 0.4V HBM B voltage regulator temperature
GPUBOARD_NODE_OAM_X_04_HBM_D_VR = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_OAM_X_04_HBM_D_VR # OAM X 0.4V HBM D voltage regulator temperature
GPUBOARD_NODE_LAST = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_LAST
# GPU Board VR (Voltage Regulator) temperature
GPUBOARD_VR_FIRST = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VR_FIRST
GPUBOARD_VDDCR_VDD0 = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_VDD0 # VDDCR VDD0 voltage regulator temperature
GPUBOARD_VDDCR_VDD1 = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_VDD1 # VDDCR VDD1 voltage regulator temperature
GPUBOARD_VDDCR_VDD2 = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_VDD2 # VDDCR VDD2 voltage regulator temperature
GPUBOARD_VDDCR_VDD3 = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_VDD3 # VDDCR VDD3 voltage regulator temperature
GPUBOARD_VDDCR_SOC_A = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_SOC_A # VDDCR SOC A voltage regulator temperature
GPUBOARD_VDDCR_SOC_C = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_SOC_C # VDDCR SOC C voltage regulator temperature
GPUBOARD_VDDCR_SOCIO_A = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_SOCIO_A # VDDCR SOCIO A voltage regulator temperature
GPUBOARD_VDDCR_SOCIO_C = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_SOCIO_C # VDDCR SOCIO C voltage regulator temperature
GPUBOARD_VDD_085_HBM = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDD_085_HBM # VDD 0.85V HBM voltage regulator temperature
GPUBOARD_VDDCR_11_HBM_B = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_11_HBM_B # VDDCR 1.1V HBM B voltage regulator temperature
GPUBOARD_VDDCR_11_HBM_D = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_11_HBM_D # VDDCR 1.1V HBM D voltage regulator temperature
GPUBOARD_VDD_USR = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDD_USR # VDD USR voltage regulator temperature
GPUBOARD_VDDIO_11_E32 = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDIO_11_E32 # VDDIO 1.1V E32 voltage regulator temperature
GPUBOARD_VR_LAST = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VR_LAST
# Baseboard System temperature
BASEBOARD_FIRST = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_FIRST
BASEBOARD_UBB_FPGA = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_FPGA # UBB FPGA temperature
BASEBOARD_UBB_FRONT = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_FRONT # UBB front temperature
BASEBOARD_UBB_BACK = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_BACK # UBB back temperature
BASEBOARD_UBB_OAM7 = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_OAM7 # UBB OAM7 temperature
BASEBOARD_UBB_IBC = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_IBC # UBB IBC temperature
BASEBOARD_UBB_UFPGA = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_UFPGA # UBB UFPGA temperature
BASEBOARD_UBB_OAM1 = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_OAM1 # UBB OAM1 temperature
BASEBOARD_OAM_0_1_HSC = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_OAM_0_1_HSC # OAM 0-1 HSC temperature
BASEBOARD_OAM_2_3_HSC = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_OAM_2_3_HSC # OAM 2-3 HSC temperature
BASEBOARD_OAM_4_5_HSC = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_OAM_4_5_HSC # OAM 4-5 HSC temperature
BASEBOARD_OAM_6_7_HSC = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_OAM_6_7_HSC # OAM 6-7 HSC temperature
BASEBOARD_UBB_FPGA_0V72_VR = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_FPGA_0V72_VR # UBB FPGA 0.72V voltage regulator temperature
BASEBOARD_UBB_FPGA_3V3_VR = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_FPGA_3V3_VR # UBB FPGA 3.3V voltage regulator temperature
BASEBOARD_RETIMER_0_1_2_3_1V2_VR = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_RETIMER_0_1_2_3_1V2_VR # Retimer 0-1-2-3 1.2V voltage regulator temperature
BASEBOARD_RETIMER_4_5_6_7_1V2_VR = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_RETIMER_4_5_6_7_1V2_VR # Retimer 4-5-6-7 1.2V voltage regulator temperature
BASEBOARD_RETIMER_0_1_0V9_VR = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_RETIMER_0_1_0V9_VR # Retimer 0-1 0.9V voltage regulator temperature
BASEBOARD_RETIMER_4_5_0V9_VR= amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_RETIMER_4_5_0V9_VR # Retimer 4-5 0.9V voltage regulator temperature
BASEBOARD_RETIMER_2_3_0V9_VR = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_RETIMER_2_3_0V9_VR # Retimer 2-3 0.9V voltage regulator temperature
BASEBOARD_RETIMER_6_7_0V9_VR = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_RETIMER_6_7_0V9_VR # Retimer 6-7 0.9V voltage regulator temperature
BASEBOARD_OAM_0_1_2_3_3V3_VR = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_OAM_0_1_2_3_3V3_VR # OAM 0-1-2-3 3.3V voltage regulator temperature
BASEBOARD_OAM_4_5_6_7_3V3_VR = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_OAM_4_5_6_7_3V3_VR # OAM 4-5-6-7 3.3V voltage regulator temperature
BASEBOARD_IBC_HSC = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_IBC_HSC # IBC HSC temperature
BASEBOARD_IBC = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_IBC # IBC temperature
BASEBOARD_LAST = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_LAST
BASEBOARD__MAX = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE__MAX # Maximum per GPU temperature type
class AmdSmiDevPerfLevel(IntEnum):
AUTO = amdsmi_wrapper.AMDSMI_DEV_PERF_LEVEL_AUTO
+161 -17
View File
@@ -493,7 +493,55 @@ amdsmi_temperature_type_t__enumvalues = {
5: 'AMDSMI_TEMPERATURE_TYPE_HBM_2',
6: 'AMDSMI_TEMPERATURE_TYPE_HBM_3',
7: 'AMDSMI_TEMPERATURE_TYPE_PLX',
7: 'AMDSMI_TEMPERATURE_TYPE__MAX',
100: 'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_FIRST',
100: 'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_RETIMER_X',
101: 'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_OAM_X_IBC',
102: 'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_OAM_X_IBC_2',
103: 'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_OAM_X_VDD18_VR',
104: 'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_OAM_X_04_HBM_B_VR',
105: 'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_OAM_X_04_HBM_D_VR',
149: 'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_LAST',
150: 'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VR_FIRST',
150: 'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_VDD0',
151: 'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_VDD1',
152: 'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_VDD2',
153: 'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_VDD3',
154: 'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_SOC_A',
155: 'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_SOC_C',
156: 'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_SOCIO_A',
157: 'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_SOCIO_C',
158: 'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDD_085_HBM',
159: 'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_11_HBM_B',
160: 'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_11_HBM_D',
161: 'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDD_USR',
162: 'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDIO_11_E32',
199: 'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VR_LAST',
200: 'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_FIRST',
200: 'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_FPGA',
201: 'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_FRONT',
202: 'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_BACK',
203: 'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_OAM7',
204: 'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_IBC',
205: 'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_UFPGA',
206: 'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_OAM1',
207: 'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_OAM_0_1_HSC',
208: 'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_OAM_2_3_HSC',
209: 'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_OAM_4_5_HSC',
210: 'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_OAM_6_7_HSC',
211: 'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_FPGA_0V72_VR',
212: 'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_FPGA_3V3_VR',
213: 'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_RETIMER_0_1_2_3_1V2_VR',
214: 'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_RETIMER_4_5_6_7_1V2_VR',
215: 'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_RETIMER_0_1_0V9_VR',
216: 'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_RETIMER_4_5_0V9_VR',
217: 'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_RETIMER_2_3_0V9_VR',
218: 'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_RETIMER_6_7_0V9_VR',
219: 'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_OAM_0_1_2_3_3V3_VR',
220: 'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_OAM_4_5_6_7_3V3_VR',
221: 'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_IBC_HSC',
222: 'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_IBC',
249: 'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_LAST',
249: 'AMDSMI_TEMPERATURE_TYPE__MAX',
}
AMDSMI_TEMPERATURE_TYPE_EDGE = 0
AMDSMI_TEMPERATURE_TYPE_FIRST = 0
@@ -505,7 +553,55 @@ AMDSMI_TEMPERATURE_TYPE_HBM_1 = 4
AMDSMI_TEMPERATURE_TYPE_HBM_2 = 5
AMDSMI_TEMPERATURE_TYPE_HBM_3 = 6
AMDSMI_TEMPERATURE_TYPE_PLX = 7
AMDSMI_TEMPERATURE_TYPE__MAX = 7
AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_FIRST = 100
AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_RETIMER_X = 100
AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_OAM_X_IBC = 101
AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_OAM_X_IBC_2 = 102
AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_OAM_X_VDD18_VR = 103
AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_OAM_X_04_HBM_B_VR = 104
AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_OAM_X_04_HBM_D_VR = 105
AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_LAST = 149
AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VR_FIRST = 150
AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_VDD0 = 150
AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_VDD1 = 151
AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_VDD2 = 152
AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_VDD3 = 153
AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_SOC_A = 154
AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_SOC_C = 155
AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_SOCIO_A = 156
AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_SOCIO_C = 157
AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDD_085_HBM = 158
AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_11_HBM_B = 159
AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_11_HBM_D = 160
AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDD_USR = 161
AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDIO_11_E32 = 162
AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VR_LAST = 199
AMDSMI_TEMPERATURE_TYPE_BASEBOARD_FIRST = 200
AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_FPGA = 200
AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_FRONT = 201
AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_BACK = 202
AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_OAM7 = 203
AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_IBC = 204
AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_UFPGA = 205
AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_OAM1 = 206
AMDSMI_TEMPERATURE_TYPE_BASEBOARD_OAM_0_1_HSC = 207
AMDSMI_TEMPERATURE_TYPE_BASEBOARD_OAM_2_3_HSC = 208
AMDSMI_TEMPERATURE_TYPE_BASEBOARD_OAM_4_5_HSC = 209
AMDSMI_TEMPERATURE_TYPE_BASEBOARD_OAM_6_7_HSC = 210
AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_FPGA_0V72_VR = 211
AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_FPGA_3V3_VR = 212
AMDSMI_TEMPERATURE_TYPE_BASEBOARD_RETIMER_0_1_2_3_1V2_VR = 213
AMDSMI_TEMPERATURE_TYPE_BASEBOARD_RETIMER_4_5_6_7_1V2_VR = 214
AMDSMI_TEMPERATURE_TYPE_BASEBOARD_RETIMER_0_1_0V9_VR = 215
AMDSMI_TEMPERATURE_TYPE_BASEBOARD_RETIMER_4_5_0V9_VR = 216
AMDSMI_TEMPERATURE_TYPE_BASEBOARD_RETIMER_2_3_0V9_VR = 217
AMDSMI_TEMPERATURE_TYPE_BASEBOARD_RETIMER_6_7_0V9_VR = 218
AMDSMI_TEMPERATURE_TYPE_BASEBOARD_OAM_0_1_2_3_3V3_VR = 219
AMDSMI_TEMPERATURE_TYPE_BASEBOARD_OAM_4_5_6_7_3V3_VR = 220
AMDSMI_TEMPERATURE_TYPE_BASEBOARD_IBC_HSC = 221
AMDSMI_TEMPERATURE_TYPE_BASEBOARD_IBC = 222
AMDSMI_TEMPERATURE_TYPE_BASEBOARD_LAST = 249
AMDSMI_TEMPERATURE_TYPE__MAX = 249
amdsmi_temperature_type_t = ctypes.c_uint32 # enum
# values for enumeration 'amdsmi_fw_block_t'
@@ -866,21 +962,6 @@ amdsmi_card_form_factor_t = ctypes.c_uint32 # enum
class struct_amdsmi_pcie_info_t(Structure):
pass
class struct_pcie_static_(Structure):
pass
struct_pcie_static_._pack_ = 1 # source:False
struct_pcie_static_._fields_ = [
('max_pcie_width', ctypes.c_uint16),
('PADDING_0', ctypes.c_ubyte * 2),
('max_pcie_speed', ctypes.c_uint32),
('pcie_interface_version', ctypes.c_uint32),
('slot_type', amdsmi_card_form_factor_t),
('max_pcie_interface_version', ctypes.c_uint32),
('PADDING_1', ctypes.c_ubyte * 4),
('reserved', ctypes.c_uint64 * 9),
]
class struct_pcie_metric_(Structure):
pass
@@ -901,6 +982,21 @@ struct_pcie_metric_._fields_ = [
('reserved', ctypes.c_uint64 * 12),
]
class struct_pcie_static_(Structure):
pass
struct_pcie_static_._pack_ = 1 # source:False
struct_pcie_static_._fields_ = [
('max_pcie_width', ctypes.c_uint16),
('PADDING_0', ctypes.c_ubyte * 2),
('max_pcie_speed', ctypes.c_uint32),
('pcie_interface_version', ctypes.c_uint32),
('slot_type', amdsmi_card_form_factor_t),
('max_pcie_interface_version', ctypes.c_uint32),
('PADDING_1', ctypes.c_ubyte * 4),
('reserved', ctypes.c_uint64 * 9),
]
struct_amdsmi_pcie_info_t._pack_ = 1 # source:False
struct_amdsmi_pcie_info_t._fields_ = [
('pcie_static', struct_pcie_static_),
@@ -3146,7 +3242,55 @@ __all__ = \
'AMDSMI_STATUS_SETTING_UNAVAILABLE', 'AMDSMI_STATUS_SUCCESS',
'AMDSMI_STATUS_TIMEOUT', 'AMDSMI_STATUS_UNEXPECTED_DATA',
'AMDSMI_STATUS_UNEXPECTED_SIZE', 'AMDSMI_STATUS_UNKNOWN_ERROR',
'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_FIRST',
'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_IBC',
'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_IBC_HSC',
'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_LAST',
'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_OAM_0_1_2_3_3V3_VR',
'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_OAM_0_1_HSC',
'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_OAM_2_3_HSC',
'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_OAM_4_5_6_7_3V3_VR',
'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_OAM_4_5_HSC',
'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_OAM_6_7_HSC',
'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_RETIMER_0_1_0V9_VR',
'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_RETIMER_0_1_2_3_1V2_VR',
'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_RETIMER_2_3_0V9_VR',
'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_RETIMER_4_5_0V9_VR',
'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_RETIMER_4_5_6_7_1V2_VR',
'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_RETIMER_6_7_0V9_VR',
'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_BACK',
'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_FPGA',
'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_FPGA_0V72_VR',
'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_FPGA_3V3_VR',
'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_FRONT',
'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_IBC',
'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_OAM1',
'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_OAM7',
'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_UFPGA',
'AMDSMI_TEMPERATURE_TYPE_EDGE', 'AMDSMI_TEMPERATURE_TYPE_FIRST',
'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_FIRST',
'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_LAST',
'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_OAM_X_04_HBM_B_VR',
'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_OAM_X_04_HBM_D_VR',
'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_OAM_X_IBC',
'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_OAM_X_IBC_2',
'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_OAM_X_VDD18_VR',
'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_RETIMER_X',
'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_11_HBM_B',
'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_11_HBM_D',
'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_SOCIO_A',
'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_SOCIO_C',
'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_SOC_A',
'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_SOC_C',
'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_VDD0',
'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_VDD1',
'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_VDD2',
'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_VDD3',
'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDIO_11_E32',
'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDD_085_HBM',
'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDD_USR',
'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VR_FIRST',
'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VR_LAST',
'AMDSMI_TEMPERATURE_TYPE_HBM_0', 'AMDSMI_TEMPERATURE_TYPE_HBM_1',
'AMDSMI_TEMPERATURE_TYPE_HBM_2', 'AMDSMI_TEMPERATURE_TYPE_HBM_3',
'AMDSMI_TEMPERATURE_TYPE_HOTSPOT',
+58 -1
View File
@@ -532,7 +532,64 @@ typedef enum {
RSMI_TEMP_TYPE_HBM_1, //!< HBM temperature instance 1
RSMI_TEMP_TYPE_HBM_2, //!< HBM temperature instance 2
RSMI_TEMP_TYPE_HBM_3, //!< HBM temperature instance 3
RSMI_TEMP_TYPE_LAST = RSMI_TEMP_TYPE_HBM_3,
RSMI_TEMP_TYPE_PLX, //!< PLX temperature
// GPU Board Node temperature
RSMI_TEMP_TYPE_GPUBOARD_NODE_FIRST = 100,
RSMI_TEMP_TYPE_GPUBOARD_NODE_RETIMER_X = RSMI_TEMP_TYPE_GPUBOARD_NODE_FIRST, //!< Retimer X temperature
RSMI_TEMP_TYPE_GPUBOARD_NODE_OAM_X_IBC, //!< OAM X IBC temperature
RSMI_TEMP_TYPE_GPUBOARD_NODE_OAM_X_IBC_2, //!< OAM X IBC 2 temperature
RSMI_TEMP_TYPE_GPUBOARD_NODE_OAM_X_VDD18_VR, //!< OAM X VDD 1.8V voltage regulator temperature
RSMI_TEMP_TYPE_GPUBOARD_NODE_OAM_X_04_HBM_B_VR, //!< OAM X 0.4V HBM B voltage regulator temperature
RSMI_TEMP_TYPE_GPUBOARD_NODE_OAM_X_04_HBM_D_VR, //!< OAM X 0.4V HBM D voltage regulator temperature
// GPU Board VR (Voltage Regulator) temperature
RSMI_TEMP_TYPE_GPUBOARD_VR_FIRST = 150,
RSMI_TEMP_TYPE_GPUBOARD_VDDCR_VDD0 = RSMI_TEMP_TYPE_GPUBOARD_VR_FIRST, //!< VDDCR VDD0 voltage regulator temperature
RSMI_TEMP_TYPE_GPUBOARD_VDDCR_VDD1, //!< VDDCR VDD1 voltage regulator temperature
RSMI_TEMP_TYPE_GPUBOARD_VDDCR_VDD2, //!< VDDCR VDD2 voltage regulator temperature
RSMI_TEMP_TYPE_GPUBOARD_VDDCR_VDD3, //!< VDDCR VDD3 voltage regulator temperature
RSMI_TEMP_TYPE_GPUBOARD_VDDCR_SOC_A, //!< VDDCR SOC A voltage regulator temperature
RSMI_TEMP_TYPE_GPUBOARD_VDDCR_SOC_C, //!< VDDCR SOC C voltage regulator temperature
RSMI_TEMP_TYPE_GPUBOARD_VDDCR_SOCIO_A, //!< VDDCR SOCIO A voltage regulator temperature
RSMI_TEMP_TYPE_GPUBOARD_VDDCR_SOCIO_C, //!< VDDCR SOCIO C voltage regulator temperature
RSMI_TEMP_TYPE_GPUBOARD_VDD_085_HBM, //!< VDD 0.85V HBM voltage regulator temperature
RSMI_TEMP_TYPE_GPUBOARD_VDDCR_11_HBM_B, //!< VDDCR 1.1V HBM B voltage regulator temperature
RSMI_TEMP_TYPE_GPUBOARD_VDDCR_11_HBM_D, //!< VDDCR 1.1V HBM D voltage regulator temperature
RSMI_TEMP_TYPE_GPUBOARD_VDD_USR, //!< VDD USR voltage regulator temperature
RSMI_TEMP_TYPE_GPUBOARD_VDDIO_11_E32, //!< VDDIO 1.1V E32 voltage regulator temperature
RSMI_TEMP_TYPE_GPUBOARD_LAST = 199,
// Baseboard System temperature
RSMI_TEMP_TYPE_BASEBOARD_FIRST = 200,
RSMI_TEMP_TYPE_BASEBOARD_UBB_FPGA = RSMI_TEMP_TYPE_BASEBOARD_FIRST, //!< UBB FPGA temperature
RSMI_TEMP_TYPE_BASEBOARD_UBB_FRONT, //!< UBB front temperature
RSMI_TEMP_TYPE_BASEBOARD_UBB_BACK, //!< UBB back temperature
RSMI_TEMP_TYPE_BASEBOARD_UBB_OAM7, //!< UBB OAM7 temperature
RSMI_TEMP_TYPE_BASEBOARD_UBB_IBC, //!< UBB IBC temperature
RSMI_TEMP_TYPE_BASEBOARD_UBB_UFPGA, //!< UBB UFPGA temperature
RSMI_TEMP_TYPE_BASEBOARD_UBB_OAM1, //!< UBB OAM1 temperature
RSMI_TEMP_TYPE_BASEBOARD_OAM_0_1_HSC, //!< OAM 0-1 HSC temperature
RSMI_TEMP_TYPE_BASEBOARD_OAM_2_3_HSC, //!< OAM 2-3 HSC temperature
RSMI_TEMP_TYPE_BASEBOARD_OAM_4_5_HSC, //!< OAM 4-5 HSC temperature
RSMI_TEMP_TYPE_BASEBOARD_OAM_6_7_HSC, //!< OAM 6-7 HSC temperature
RSMI_TEMP_TYPE_BASEBOARD_UBB_FPGA_0V72_VR, //!< UBB FPGA 0.72V voltage regulator temperature
RSMI_TEMP_TYPE_BASEBOARD_UBB_FPGA_3V3_VR, //!< UBB FPGA 3.3V voltage regulator temperature
RSMI_TEMP_TYPE_BASEBOARD_RETIMER_0_1_2_3_1V2_VR, //!< Retimer 0-1-2-3 1.2V voltage regulator temperature
RSMI_TEMP_TYPE_BASEBOARD_RETIMER_4_5_6_7_1V2_VR, //!< Retimer 4-5-6-7 1.2V voltage regulator temperature
RSMI_TEMP_TYPE_BASEBOARD_RETIMER_0_1_0V9_VR, //!< Retimer 0-1 0.9V voltage regulator temperature
RSMI_TEMP_TYPE_BASEBOARD_RETIMER_4_5_0V9_VR, //!< Retimer 4-5 0.9V voltage regulator temperature
RSMI_TEMP_TYPE_BASEBOARD_RETIMER_2_3_0V9_VR, //!< Retimer 2-3 0.9V voltage regulator temperature
RSMI_TEMP_TYPE_BASEBOARD_RETIMER_6_7_0V9_VR, //!< Retimer 6-7 0.9V voltage regulator temperature
RSMI_TEMP_TYPE_BASEBOARD_OAM_0_1_2_3_3V3_VR, //!< OAM 0-1-2-3 3.3V voltage regulator temperature
RSMI_TEMP_TYPE_BASEBOARD_OAM_4_5_6_7_3V3_VR, //!< OAM 4-5-6-7 3.3V voltage regulator temperature
RSMI_TEMP_TYPE_BASEBOARD_IBC_HSC, //!< IBC HSC temperature
RSMI_TEMP_TYPE_BASEBOARD_IBC, //!< IBC temperature
RSMI_TEMP_TYPE_BASEBOARD_LAST = 249,
RSMI_TEMP_TYPE_LAST = RSMI_TEMP_TYPE_BASEBOARD_LAST, //!< Last of per GPU temperature types
RSMI_TEMP_TYPE_INVALID = 0xFFFFFFFF //!< Invalid type
} rsmi_temperature_type_t;
@@ -0,0 +1,117 @@
/*
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef ROCM_SMI_INCLUDE_ROCM_SMI_ROCM_SMI_BOARD_TEMP_H_
#define ROCM_SMI_INCLUDE_ROCM_SMI_ROCM_SMI_BOARD_TEMP_H_
#include "rocm_smi/rocm_smi.h"
// Headers from the driver
namespace amd::smi {
/**
 * Identifiers for the GPU board voltage-regulator (VR) temperature entries.
 *
 * The enumerators are numbered consecutively from 0 and the trailing
 * AMDGPU_VR_MAX_TEMP_ENTRIES sentinel is used as the length of the
 * vr_temp[] array in amdgpu_gpuboard_temp_metrics_v1_0.
 *
 * NOTE(review): presumably mirrors the enum of the same name in the amdgpu
 * driver headers -- keep the numeric values in sync; confirm against the
 * driver.
 */
enum amdgpu_vr_temp {
  AMDGPU_VDDCR_VDD0_TEMP = 0,
  AMDGPU_VDDCR_VDD1_TEMP = 1,
  AMDGPU_VDDCR_VDD2_TEMP = 2,
  AMDGPU_VDDCR_VDD3_TEMP = 3,
  AMDGPU_VDDCR_SOC_A_TEMP = 4,
  AMDGPU_VDDCR_SOC_C_TEMP = 5,
  AMDGPU_VDDCR_SOCIO_A_TEMP = 6,
  AMDGPU_VDDCR_SOCIO_C_TEMP = 7,
  AMDGPU_VDD_085_HBM_TEMP = 8,
  AMDGPU_VDDCR_11_HBM_B_TEMP = 9,
  AMDGPU_VDDCR_11_HBM_D_TEMP = 10,
  AMDGPU_VDD_USR_TEMP = 11,
  AMDGPU_VDDIO_11_E32_TEMP = 12,
  // Sentinel: one past the last named entry; sizes the vr_temp[] array.
  AMDGPU_VR_MAX_TEMP_ENTRIES = 13,
};
/**
 * Identifiers for the baseboard (UBB/system level) temperature entries.
 *
 * The named entries are numbered consecutively from 0 (23 entries, 0..22).
 * AMDGPU_SYSTEM_MAX_TEMP_ENTRIES is deliberately fixed at 32 rather than
 * following the last named entry; it is used as the length of the
 * system_temp[] array in amdgpu_baseboard_temp_metrics_v1_0.
 *
 * NOTE(review): presumably mirrors the enum of the same name in the amdgpu
 * driver headers -- keep the numeric values in sync; confirm against the
 * driver.
 */
enum amdgpu_system_temp {
  AMDGPU_UBB_FPGA_TEMP = 0,
  AMDGPU_UBB_FRONT_TEMP = 1,
  AMDGPU_UBB_BACK_TEMP = 2,
  AMDGPU_UBB_OAM7_TEMP = 3,
  AMDGPU_UBB_IBC_TEMP = 4,
  AMDGPU_UBB_UFPGA_TEMP = 5,
  AMDGPU_UBB_OAM1_TEMP = 6,
  AMDGPU_OAM_0_1_HSC_TEMP = 7,
  AMDGPU_OAM_2_3_HSC_TEMP = 8,
  AMDGPU_OAM_4_5_HSC_TEMP = 9,
  AMDGPU_OAM_6_7_HSC_TEMP = 10,
  AMDGPU_UBB_FPGA_0V72_VR_TEMP = 11,
  AMDGPU_UBB_FPGA_3V3_VR_TEMP = 12,
  AMDGPU_RETIMER_0_1_2_3_1V2_VR_TEMP = 13,
  AMDGPU_RETIMER_4_5_6_7_1V2_VR_TEMP = 14,
  AMDGPU_RETIMER_0_1_0V9_VR_TEMP = 15,
  AMDGPU_RETIMER_4_5_0V9_VR_TEMP = 16,
  AMDGPU_RETIMER_2_3_0V9_VR_TEMP = 17,
  AMDGPU_RETIMER_6_7_0V9_VR_TEMP = 18,
  AMDGPU_OAM_0_1_2_3_3V3_VR_TEMP = 19,
  AMDGPU_OAM_4_5_6_7_3V3_VR_TEMP = 20,
  AMDGPU_IBC_HSC_TEMP = 21,
  AMDGPU_IBC_TEMP = 22,
  // Fixed capacity of system_temp[]; larger than the number of named entries.
  AMDGPU_SYSTEM_MAX_TEMP_ENTRIES = 32,
};
/**
 * Identifiers for the per-node (OAM/retimer) GPU board temperature entries.
 *
 * The named entries are numbered consecutively from 0 (6 entries, 0..5).
 * AMDGPU_NODE_MAX_TEMP_ENTRIES is deliberately fixed at 12 rather than
 * following the last named entry; it is used as the length of the
 * node_temp[] array in amdgpu_gpuboard_temp_metrics_v1_0.
 *
 * NOTE(review): presumably mirrors the enum of the same name in the amdgpu
 * driver headers -- keep the numeric values in sync; confirm against the
 * driver.
 */
enum amdgpu_node_temp {
  AMDGPU_RETIMER_X_TEMP = 0,
  AMDGPU_OAM_X_IBC_TEMP = 1,
  AMDGPU_OAM_X_IBC_2_TEMP = 2,
  AMDGPU_OAM_X_VDD18_VR_TEMP = 3,
  AMDGPU_OAM_X_04_HBM_B_VR_TEMP = 4,
  AMDGPU_OAM_X_04_HBM_D_VR_TEMP = 5,
  // Fixed capacity of node_temp[]; larger than the number of named entries.
  AMDGPU_NODE_MAX_TEMP_ENTRIES = 12,
};
/**
 * GPU board temperature metrics record.
 *
 * Holds the per-node and voltage-regulator temperature entries for a GPU
 * board. This is the type filled in by read_gpuboard_temp_metrics() and
 * decoded by get_gpuboard_temp_value().
 *
 * NOTE(review): the field order and widths presumably match a binary table
 * exported by the amdgpu driver -- do not reorder or resize fields without
 * confirming against the driver header.
 */
struct amdgpu_gpuboard_temp_metrics_v1_0 {
// Common metrics table header (type declared outside this header).
struct metrics_table_header_t common_header;
uint16_t label_version;
uint16_t node_id;
uint64_t accumulation_counter;
/* Encoded temperature in Celsius: bits 24:31 hold the sensor id, bits 0:23 hold the temperature value */
// Per-node entries; capacity fixed by AMDGPU_NODE_MAX_TEMP_ENTRIES.
uint32_t node_temp[AMDGPU_NODE_MAX_TEMP_ENTRIES];
// Voltage-regulator entries; capacity fixed by AMDGPU_VR_MAX_TEMP_ENTRIES.
uint32_t vr_temp[AMDGPU_VR_MAX_TEMP_ENTRIES];
};
/**
 * Baseboard (system level) temperature metrics record.
 *
 * Holds the baseboard temperature entries. This is the type filled in by
 * read_baseboard_temp_metrics() and decoded by get_baseboard_temp_value().
 *
 * NOTE(review): the field order and widths presumably match a binary table
 * exported by the amdgpu driver -- do not reorder or resize fields without
 * confirming against the driver header.
 */
struct amdgpu_baseboard_temp_metrics_v1_0 {
// Common metrics table header (type declared outside this header).
struct metrics_table_header_t common_header;
uint16_t label_version;
uint16_t node_id;
uint64_t accumulation_counter;
/* Encoded temperature in Celsius: bits 24:31 hold the sensor id, bits 0:23 hold the temperature value */
// Baseboard entries; capacity fixed by AMDGPU_SYSTEM_MAX_TEMP_ENTRIES.
uint32_t system_temp[AMDGPU_SYSTEM_MAX_TEMP_ENTRIES];
};
/**
 * Read the GPU board temperature metrics found at `filename` into `metrics`.
 *
 * @param filename  Path of the metrics file to read.
 * @param metrics   Output record to populate.
 * @return RSMI_STATUS_SUCCESS on success; callers treat any other status as
 *         a failure and propagate it.
 */
rsmi_status_t read_gpuboard_temp_metrics(const char* filename, amdgpu_gpuboard_temp_metrics_v1_0& metrics);
/**
 * Read the baseboard temperature metrics found at `filename` into `metrics`.
 *
 * @param filename  Path of the metrics file to read.
 * @param metrics   Output record to populate.
 * @return RSMI_STATUS_SUCCESS on success; callers treat any other status as
 *         a failure and propagate it.
 */
rsmi_status_t read_baseboard_temp_metrics(const char* filename, amdgpu_baseboard_temp_metrics_v1_0& metrics);
/**
 * Extract the temperature for one baseboard sensor from a metrics record.
 *
 * @param metrics           Record previously filled by
 *                          read_baseboard_temp_metrics().
 * @param temperature_type  Sensor to look up; the caller passes values in the
 *                          RSMI_TEMP_TYPE_BASEBOARD_FIRST..
 *                          RSMI_TEMP_TYPE_BASEBOARD_LAST range.
 * @param value             Output location for the temperature.
 *                          NOTE(review): unit/scale of the stored value and
 *                          null-pointer handling are not visible from this
 *                          header -- confirm in the implementation.
 * @return Status code that the caller returns unchanged.
 */
rsmi_status_t get_baseboard_temp_value(const amdgpu_baseboard_temp_metrics_v1_0& metrics,
rsmi_temperature_type_t temperature_type,
int64_t* value);
/**
 * Extract the temperature for one GPU board sensor from a metrics record.
 *
 * @param metrics           Record previously filled by
 *                          read_gpuboard_temp_metrics().
 * @param temperature_type  Sensor to look up; the caller passes values in the
 *                          RSMI_TEMP_TYPE_GPUBOARD_NODE_FIRST..
 *                          RSMI_TEMP_TYPE_GPUBOARD_LAST range.
 * @param value             Output location for the temperature.
 *                          NOTE(review): unit/scale of the stored value and
 *                          null-pointer handling are not visible from this
 *                          header -- confirm in the implementation.
 * @return Status code that the caller returns unchanged.
 */
rsmi_status_t get_gpuboard_temp_value(const amdgpu_gpuboard_temp_metrics_v1_0& metrics,
rsmi_temperature_type_t temperature_type,
int64_t* value);
}
#endif // ROCM_SMI_INCLUDE_ROCM_SMI_ROCM_SMI_BOARD_TEMP_H_
@@ -160,6 +160,8 @@ enum DevInfoTypes {
kDevGpuMetrics,
kDevPmMetrics,
kDevRegMetrics,
kDevBaseBoardTempMetrics,
kDevGpuBoardTempMetrics,
kDevGpuReset,
kDevAvailableComputePartition,
kDevComputePartition,
+58 -2
View File
@@ -59,6 +59,7 @@
#include "rocm_smi/rocm_smi_io_link.h"
#include "rocm_smi/rocm_smi64Config.h"
#include "rocm_smi/rocm_smi_logger.h"
#include "rocm_smi/rocm_smi_board_temp.h"
using amd::smi::monitorTypesToString;
using amd::smi::getRSMIStatusString;
@@ -3293,6 +3294,63 @@ rsmi_dev_temp_metric_get(uint32_t dv_ind, uint32_t sensor_type,
rsmi_status_t ret;
amd::smi::MonitorTypes mon_type = amd::smi::kMonInvalid;
uint16_t val_ui16;
GET_DEV_FROM_INDX
// handle gpu board temp
if (sensor_type >= RSMI_TEMP_TYPE_GPUBOARD_NODE_FIRST &&
sensor_type <= RSMI_TEMP_TYPE_GPUBOARD_LAST ) {
if (metric != RSMI_TEMP_CURRENT) {
LOG_ERROR("GPUBoard temperature only support RSMI_TEMP_CURRENT");
return RSMI_STATUS_NOT_SUPPORTED;
}
std::string file_path = dev->get_sys_file_path_by_type(amd::smi::kDevGpuBoardTempMetrics);
if (file_path == "") {
LOG_ERROR("Failed to get GPU board temperature metrics file path");
return RSMI_STATUS_NOT_SUPPORTED;
}
amd::smi::amdgpu_gpuboard_temp_metrics_v1_0 gpuboard_metric;
ret = read_gpuboard_temp_metrics(file_path.c_str(), gpuboard_metric);
if (ret != RSMI_STATUS_SUCCESS) {
std::string err_msg = "Failed to read GPU board temperature metrics at " + file_path;
LOG_ERROR(err_msg);
return ret;
}
ret = get_gpuboard_temp_value(gpuboard_metric,
static_cast<rsmi_temperature_type_t>(sensor_type), temperature);
return ret;
}
// handle base board temp
if (sensor_type >= RSMI_TEMP_TYPE_BASEBOARD_FIRST &&
sensor_type <= RSMI_TEMP_TYPE_BASEBOARD_LAST ) {
if (metric != RSMI_TEMP_CURRENT) {
LOG_ERROR("Baseboard temperature only supports RSMI_TEMP_CURRENT");
return RSMI_STATUS_NOT_SUPPORTED;
}
std::string file_path = dev->get_sys_file_path_by_type(amd::smi::kDevBaseBoardTempMetrics);
if (file_path.empty()) {
LOG_ERROR("Failed to get baseboard temperature metrics file path");
return RSMI_STATUS_NOT_SUPPORTED;
}
amd::smi::amdgpu_baseboard_temp_metrics_v1_0 baseboard_metric;
ret = read_baseboard_temp_metrics(file_path.c_str(), baseboard_metric);
if (ret != RSMI_STATUS_SUCCESS) {
std::string err_msg = "Failed to read baseboard temperature metrics at " + file_path;
LOG_ERROR(err_msg);
return ret;
}
ret = get_baseboard_temp_value(baseboard_metric,
static_cast<rsmi_temperature_type_t>(sensor_type), temperature);
return ret;
}
static const std::map<rsmi_temperature_metric_t, amd::smi::MonitorTypes>
kMetricTypeMap = {
@@ -3410,8 +3468,6 @@ rsmi_dev_temp_metric_get(uint32_t dv_ind, uint32_t sensor_type,
DEVICE_MUTEX
GET_DEV_FROM_INDX
if (dev->monitor() == nullptr) {
ss << __PRETTY_FUNCTION__
<< " | ======= end ======= "
+388
View File
@@ -0,0 +1,388 @@
/*
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include "rocm_smi/rocm_smi_board_temp.h"
#include "rocm_smi/rocm_smi_utils.h"
#include "rocm_smi/rocm_smi_common.h"
#include "rocm_smi/rocm_smi_logger.h"
#include <cctype>
#include <cerrno>
#include <cstdint>
#include <cstring>
#include <fstream>
#include <iomanip>
#include <limits>
#include <map>
#include <sstream>
#include <string>
using amd::smi::getRSMIStatusString;
namespace amd::smi {
// Static mapping tables for temperature type conversions.
//
// vr_temp_map: key is a slot index into
// amdgpu_gpuboard_temp_metrics_v1_0::vr_temp[] (the AMDGPU_*_TEMP constants),
// value is the rsmi_temperature_type_t reported for that slot.
// get_gpuboard_temp_value() consults it to decide which slot answers a
// requested temperature type.
static const std::map<int, rsmi_temperature_type_t> vr_temp_map = {
{AMDGPU_VDDCR_VDD0_TEMP, RSMI_TEMP_TYPE_GPUBOARD_VDDCR_VDD0},
{AMDGPU_VDDCR_VDD1_TEMP, RSMI_TEMP_TYPE_GPUBOARD_VDDCR_VDD1},
{AMDGPU_VDDCR_VDD2_TEMP, RSMI_TEMP_TYPE_GPUBOARD_VDDCR_VDD2},
{AMDGPU_VDDCR_VDD3_TEMP, RSMI_TEMP_TYPE_GPUBOARD_VDDCR_VDD3},
{AMDGPU_VDDCR_SOC_A_TEMP, RSMI_TEMP_TYPE_GPUBOARD_VDDCR_SOC_A},
{AMDGPU_VDDCR_SOC_C_TEMP, RSMI_TEMP_TYPE_GPUBOARD_VDDCR_SOC_C},
{AMDGPU_VDDCR_SOCIO_A_TEMP, RSMI_TEMP_TYPE_GPUBOARD_VDDCR_SOCIO_A},
{AMDGPU_VDDCR_SOCIO_C_TEMP, RSMI_TEMP_TYPE_GPUBOARD_VDDCR_SOCIO_C},
{AMDGPU_VDD_085_HBM_TEMP, RSMI_TEMP_TYPE_GPUBOARD_VDD_085_HBM},
{AMDGPU_VDDCR_11_HBM_B_TEMP, RSMI_TEMP_TYPE_GPUBOARD_VDDCR_11_HBM_B},
{AMDGPU_VDDCR_11_HBM_D_TEMP, RSMI_TEMP_TYPE_GPUBOARD_VDDCR_11_HBM_D},
{AMDGPU_VDD_USR_TEMP, RSMI_TEMP_TYPE_GPUBOARD_VDD_USR},
{AMDGPU_VDDIO_11_E32_TEMP, RSMI_TEMP_TYPE_GPUBOARD_VDDIO_11_E32}
};
// node_temp_map: key is a slot index into
// amdgpu_gpuboard_temp_metrics_v1_0::node_temp[], value is the
// rsmi_temperature_type_t reported for that slot. get_gpuboard_temp_value()
// searches it after the voltage-regulator table.
static const std::map<int, rsmi_temperature_type_t> node_temp_map = {
{AMDGPU_RETIMER_X_TEMP, RSMI_TEMP_TYPE_GPUBOARD_NODE_RETIMER_X},
{AMDGPU_OAM_X_IBC_TEMP, RSMI_TEMP_TYPE_GPUBOARD_NODE_OAM_X_IBC},
{AMDGPU_OAM_X_IBC_2_TEMP, RSMI_TEMP_TYPE_GPUBOARD_NODE_OAM_X_IBC_2},
{AMDGPU_OAM_X_VDD18_VR_TEMP, RSMI_TEMP_TYPE_GPUBOARD_NODE_OAM_X_VDD18_VR},
{AMDGPU_OAM_X_04_HBM_B_VR_TEMP, RSMI_TEMP_TYPE_GPUBOARD_NODE_OAM_X_04_HBM_B_VR},
{AMDGPU_OAM_X_04_HBM_D_VR_TEMP, RSMI_TEMP_TYPE_GPUBOARD_NODE_OAM_X_04_HBM_D_VR}
};
// system_temp_map: key is a slot index into
// amdgpu_baseboard_temp_metrics_v1_0::system_temp[], value is the
// rsmi_temperature_type_t reported for that slot. Used by
// get_baseboard_temp_value().
static const std::map<int, rsmi_temperature_type_t> system_temp_map = {
{AMDGPU_UBB_FPGA_TEMP, RSMI_TEMP_TYPE_BASEBOARD_UBB_FPGA},
{AMDGPU_UBB_FRONT_TEMP, RSMI_TEMP_TYPE_BASEBOARD_UBB_FRONT},
{AMDGPU_UBB_BACK_TEMP, RSMI_TEMP_TYPE_BASEBOARD_UBB_BACK},
{AMDGPU_UBB_OAM7_TEMP, RSMI_TEMP_TYPE_BASEBOARD_UBB_OAM7},
{AMDGPU_UBB_IBC_TEMP, RSMI_TEMP_TYPE_BASEBOARD_UBB_IBC},
{AMDGPU_UBB_UFPGA_TEMP, RSMI_TEMP_TYPE_BASEBOARD_UBB_UFPGA},
{AMDGPU_UBB_OAM1_TEMP, RSMI_TEMP_TYPE_BASEBOARD_UBB_OAM1},
{AMDGPU_OAM_0_1_HSC_TEMP, RSMI_TEMP_TYPE_BASEBOARD_OAM_0_1_HSC},
{AMDGPU_OAM_2_3_HSC_TEMP, RSMI_TEMP_TYPE_BASEBOARD_OAM_2_3_HSC},
{AMDGPU_OAM_4_5_HSC_TEMP, RSMI_TEMP_TYPE_BASEBOARD_OAM_4_5_HSC},
{AMDGPU_OAM_6_7_HSC_TEMP, RSMI_TEMP_TYPE_BASEBOARD_OAM_6_7_HSC},
{AMDGPU_UBB_FPGA_0V72_VR_TEMP, RSMI_TEMP_TYPE_BASEBOARD_UBB_FPGA_0V72_VR},
{AMDGPU_UBB_FPGA_3V3_VR_TEMP, RSMI_TEMP_TYPE_BASEBOARD_UBB_FPGA_3V3_VR},
{AMDGPU_RETIMER_0_1_2_3_1V2_VR_TEMP, RSMI_TEMP_TYPE_BASEBOARD_RETIMER_0_1_2_3_1V2_VR},
{AMDGPU_RETIMER_4_5_6_7_1V2_VR_TEMP, RSMI_TEMP_TYPE_BASEBOARD_RETIMER_4_5_6_7_1V2_VR},
{AMDGPU_RETIMER_0_1_0V9_VR_TEMP, RSMI_TEMP_TYPE_BASEBOARD_RETIMER_0_1_0V9_VR},
{AMDGPU_RETIMER_4_5_0V9_VR_TEMP, RSMI_TEMP_TYPE_BASEBOARD_RETIMER_4_5_0V9_VR},
{AMDGPU_RETIMER_2_3_0V9_VR_TEMP, RSMI_TEMP_TYPE_BASEBOARD_RETIMER_2_3_0V9_VR},
{AMDGPU_RETIMER_6_7_0V9_VR_TEMP, RSMI_TEMP_TYPE_BASEBOARD_RETIMER_6_7_0V9_VR},
{AMDGPU_OAM_0_1_2_3_3V3_VR_TEMP, RSMI_TEMP_TYPE_BASEBOARD_OAM_0_1_2_3_3V3_VR},
{AMDGPU_OAM_4_5_6_7_3V3_VR_TEMP, RSMI_TEMP_TYPE_BASEBOARD_OAM_4_5_6_7_3V3_VR},
{AMDGPU_IBC_HSC_TEMP, RSMI_TEMP_TYPE_BASEBOARD_IBC_HSC},
{AMDGPU_IBC_TEMP, RSMI_TEMP_TYPE_BASEBOARD_IBC}
};
// Helper function to create a hex dump string of a raw byte buffer.
//
// Output layout (every line is newline-terminated):
//   === <description> (size: <size> bytes) ===
//   <8-digit hex offset>: <16 hex-byte columns> | <ASCII, '.' if unprintable>
//   === End <description> ===
//
// Each hex column is three characters wide ("xx "). Columns past the end of
// the buffer on the last row are padded with three spaces so that the ASCII
// section always starts at the same column.
//
// A null `data` pointer is treated as an empty buffer: only the header and
// footer are produced (the header still reports the `size` that was passed).
static std::string createHexDump(const void* data, size_t size, const std::string& description) {
  constexpr size_t kBytesPerRow = 16;
  const unsigned char* bytes = static_cast<const unsigned char*>(data);
  const size_t dump_size = (bytes != nullptr) ? size : 0;

  std::ostringstream ss;
  // The size is written before std::hex is applied, so it stays decimal.
  ss << "=== " << description << " (size: " << size << " bytes) ===" << '\n';

  for (size_t row = 0; row < dump_size; row += kBytesPerRow) {
    // Offset of the first byte of this row.
    ss << std::hex << std::setfill('0') << std::setw(8) << row << ": ";

    // Hex columns; absent bytes are padded to the same 3-character width.
    for (size_t col = 0; col < kBytesPerRow; ++col) {
      if (row + col < dump_size) {
        ss << std::hex << std::setfill('0') << std::setw(2)
           << static_cast<unsigned>(bytes[row + col]) << " ";
      } else {
        ss << "   ";
      }
    }

    ss << " | ";

    // ASCII rendering of the same bytes.
    for (size_t col = 0; col < kBytesPerRow && row + col < dump_size; ++col) {
      const unsigned char c = bytes[row + col];
      ss << (std::isprint(c) ? static_cast<char>(c) : '.');
    }
    ss << '\n';
  }

  ss << "=== End " << description << " ===" << '\n';
  return ss.str();
}
// Reads the binary GPU board temperature metrics blob at `filename` into
// `metrics`.
//
// The file is opened in binary mode and exactly sizeof(metrics) bytes are
// expected. Once the file is open, `metrics` is zeroed before reading; when
// the file cannot be opened `metrics` is left untouched. The bytes that were
// actually read are also written to the debug log as a hex dump.
//
// Returns:
//   RSMI_STATUS_INVALID_ARGS       `filename` is null
//   ErrnoToRsmiStatus(errno)       the file could not be opened, or the
//                                  stream reported an unrecoverable read error
//   RSMI_STATUS_INSUFFICIENT_SIZE  fewer than sizeof(metrics) bytes were read
//   RSMI_STATUS_SUCCESS            the whole structure was read
rsmi_status_t read_gpuboard_temp_metrics(const char* filename, amdgpu_gpuboard_temp_metrics_v1_0& metrics) {
  if (!filename) {
    std::ostringstream ss;
    ss << __PRETTY_FUNCTION__ << " | ======= start ======= "
       << " | Fail | filename is null | Returning = "
       << getRSMIStatusString(RSMI_STATUS_INVALID_ARGS) << " |";
    LOG_INFO(ss);
    return RSMI_STATUS_INVALID_ARGS;
  }

  std::ostringstream ss;
  ss << __PRETTY_FUNCTION__ << " | ======= start ======= "
     << " | filename: " << filename;
  LOG_INFO(ss);

  std::ifstream file(filename, std::ios::binary);
  if (!file.is_open()) {
    // Snapshot errno immediately: the formatting and logging calls below may
    // overwrite it, which would make the logged and returned status disagree.
    // NOTE(review): iostreams do not promise to set errno on failure; confirm
    // how ErrnoToRsmiStatus() treats a zero/stale errno.
    const int open_errno = errno;
    const rsmi_status_t open_status = ErrnoToRsmiStatus(open_errno);
    std::ostringstream ess;
    ess << __PRETTY_FUNCTION__ << " | ======= end ======= "
        << " | Fail | Could not open file: " << filename
        << " | errno: " << open_errno << " (" << std::strerror(open_errno) << ")"
        << " | Returning = " << getRSMIStatusString(open_status) << " |";
    LOG_INFO(ess);
    return open_status;
  }

  // Clear the metrics structure so a short read never leaves stale fields.
  std::memset(&metrics, 0, sizeof(metrics));

  // Read the entire structure
  file.read(reinterpret_cast<char*>(&metrics), sizeof(metrics));
  if (file.bad()) {
    // Same reasoning as above: capture errno before anything can clobber it.
    const int read_errno = errno;
    const rsmi_status_t read_status = ErrnoToRsmiStatus(read_errno);
    std::ostringstream ess;
    ess << __PRETTY_FUNCTION__ << " | ======= end ======= "
        << " | Fail | File read error | errno: " << read_errno << " (" << std::strerror(read_errno) << ")"
        << " | Returning = " << getRSMIStatusString(read_status) << " |";
    LOG_INFO(ess);
    return read_status;
  }

  // gcount() is the number of bytes the read() above extracted.
  const std::streamsize bytes_read = file.gcount();

  // Always create hex dump for debugging, using the number of bytes actually read
  std::string hexDump = createHexDump(&metrics, static_cast<size_t>(bytes_read),
                                      "GPU Board Temperature Metrics");
  LOG_DEBUG(hexDump);

  if (static_cast<size_t>(bytes_read) != sizeof(metrics)) {
    std::ostringstream ess;
    ess << __PRETTY_FUNCTION__ << " | ======= end ======= "
        << " | Fail | Insufficient data read"
        << " | Expected: " << sizeof(metrics) << " bytes"
        << " | Actual: " << bytes_read << " bytes"
        << " | Returning = " << getRSMIStatusString(RSMI_STATUS_INSUFFICIENT_SIZE) << " |";
    LOG_INFO(ess);
    return RSMI_STATUS_INSUFFICIENT_SIZE;
  }

  std::ostringstream oss;
  oss << __PRETTY_FUNCTION__ << " | ======= end ======= "
      << " | Success | File: " << filename
      << " | Bytes read: " << sizeof(metrics)
      << " | Header format: " << static_cast<unsigned>(metrics.common_header.format_revision)
      << " | Header content: " << static_cast<unsigned>(metrics.common_header.content_revision)
      << " | Node ID: " << metrics.node_id
      << " | Returning = " << getRSMIStatusString(RSMI_STATUS_SUCCESS) << " |";
  LOG_INFO(oss);
  return RSMI_STATUS_SUCCESS;
}
// Reads the binary baseboard temperature metrics blob at `filename` into
// `metrics`.
//
// The file is opened in binary mode and exactly sizeof(metrics) bytes are
// expected. Once the file is open, `metrics` is zeroed before reading; when
// the file cannot be opened `metrics` is left untouched. The bytes that were
// actually read are also written to the debug log as a hex dump.
//
// Returns:
//   RSMI_STATUS_INVALID_ARGS       `filename` is null
//   ErrnoToRsmiStatus(errno)       the file could not be opened, or the
//                                  stream reported an unrecoverable read error
//   RSMI_STATUS_INSUFFICIENT_SIZE  fewer than sizeof(metrics) bytes were read
//   RSMI_STATUS_SUCCESS            the whole structure was read
rsmi_status_t read_baseboard_temp_metrics(const char* filename, amdgpu_baseboard_temp_metrics_v1_0& metrics) {
  if (!filename) {
    std::ostringstream ss;
    ss << __PRETTY_FUNCTION__ << " | ======= start ======= "
       << " | Fail | filename is null | Returning = "
       << getRSMIStatusString(RSMI_STATUS_INVALID_ARGS) << " |";
    LOG_INFO(ss);
    return RSMI_STATUS_INVALID_ARGS;
  }

  std::ostringstream ss;
  ss << __PRETTY_FUNCTION__ << " | ======= start ======= "
     << " | filename: " << filename;
  LOG_INFO(ss);

  std::ifstream file(filename, std::ios::binary);
  if (!file.is_open()) {
    // Snapshot errno immediately: the formatting and logging calls below may
    // overwrite it, which would make the logged and returned status disagree.
    // NOTE(review): iostreams do not promise to set errno on failure; confirm
    // how ErrnoToRsmiStatus() treats a zero/stale errno.
    const int open_errno = errno;
    const rsmi_status_t open_status = ErrnoToRsmiStatus(open_errno);
    std::ostringstream ess;
    ess << __PRETTY_FUNCTION__ << " | ======= end ======= "
        << " | Fail | Could not open file: " << filename
        << " | errno: " << open_errno << " (" << std::strerror(open_errno) << ")"
        << " | Returning = " << getRSMIStatusString(open_status) << " |";
    LOG_INFO(ess);
    return open_status;
  }

  // Clear the metrics structure so a short read never leaves stale fields.
  std::memset(&metrics, 0, sizeof(metrics));

  // Read the entire structure
  file.read(reinterpret_cast<char*>(&metrics), sizeof(metrics));
  if (file.bad()) {
    // Same reasoning as above: capture errno before anything can clobber it.
    const int read_errno = errno;
    const rsmi_status_t read_status = ErrnoToRsmiStatus(read_errno);
    std::ostringstream ess;
    ess << __PRETTY_FUNCTION__ << " | ======= end ======= "
        << " | Fail | File read error | errno: " << read_errno << " (" << std::strerror(read_errno) << ")"
        << " | Returning = " << getRSMIStatusString(read_status) << " |";
    LOG_INFO(ess);
    return read_status;
  }

  // gcount() is the number of bytes the read() above extracted.
  const std::streamsize bytes_read = file.gcount();

  // Always create hex dump for debugging, using the number of bytes actually read
  std::string hexDump = createHexDump(&metrics, static_cast<size_t>(bytes_read),
                                      "Baseboard Temperature Metrics");
  LOG_DEBUG(hexDump);

  if (static_cast<size_t>(bytes_read) != sizeof(metrics)) {
    std::ostringstream ess;
    ess << __PRETTY_FUNCTION__ << " | ======= end ======= "
        << " | Fail | Insufficient data read"
        << " | Expected: " << sizeof(metrics) << " bytes"
        << " | Actual: " << bytes_read << " bytes"
        << " | Returning = " << getRSMIStatusString(RSMI_STATUS_INSUFFICIENT_SIZE) << " |";
    LOG_INFO(ess);
    return RSMI_STATUS_INSUFFICIENT_SIZE;
  }

  std::ostringstream oss;
  oss << __PRETTY_FUNCTION__ << " | ======= end ======= "
      << " | Success | File: " << filename
      << " | Bytes read: " << sizeof(metrics)
      << " | Header format: " << static_cast<unsigned>(metrics.common_header.format_revision)
      << " | Header content: " << static_cast<unsigned>(metrics.common_header.content_revision)
      << " | Node ID: " << metrics.node_id
      << " | Returning = " << getRSMIStatusString(RSMI_STATUS_SUCCESS) << " |";
  LOG_INFO(oss);
  return RSMI_STATUS_SUCCESS;
}
// Decode an encoded temperature word.
//   bits 24-31: sensor id (stored in *sensor_id when the pointer is non-null)
//   bits  0-23: signed (two's-complement) temperature in degrees Celsius
//
// Returns the temperature converted to milli-degrees Celsius.
//
// The result is 64-bit on purpose: the 24-bit field spans roughly +/-8.4
// million, and multiplying that by 1000 does not fit in 32 bits, so a 32-bit
// computation would overflow (undefined behaviour) on out-of-range words.
static int64_t decode_temperature_value(uint32_t encoded, uint8_t* sensor_id = nullptr) {
  constexpr uint32_t kTemperatureMask = 0x00FFFFFFu;
  constexpr uint32_t kTemperatureSignBit = 0x00800000u;
  constexpr int64_t kMilliPerUnit = 1000;

  if (sensor_id != nullptr) {
    *sensor_id = static_cast<uint8_t>((encoded >> 24) & 0xFFu);
  }

  // Extract the 24-bit field, then sign-extend it by subtracting 2^24 when
  // its top bit is set.
  int64_t celsius = static_cast<int64_t>(encoded & kTemperatureMask);
  if ((encoded & kTemperatureSignBit) != 0u) {
    celsius -= (int64_t{1} << 24);
  }

  return celsius * kMilliPerUnit;  // Convert Celsius to milli-Celsius
}
// Looks up a single GPU board temperature in an already-populated metrics
// blob.
//
// The voltage-regulator table (vr_temp / vr_temp_map) is searched first and
// the node table (node_temp / node_temp_map) second. A slot is only used when
// its raw word is not the all-ones "invalid" marker and its index maps to
// `temperature_type`.
//
// On a match the decoded reading (see decode_temperature_value) is stored in
// *value and RSMI_STATUS_SUCCESS is returned. Returns
// RSMI_STATUS_INVALID_ARGS when `value` is null, and
// RSMI_STATUS_NOT_SUPPORTED (with *value left at 0) when no valid slot matches.
rsmi_status_t get_gpuboard_temp_value(const amdgpu_gpuboard_temp_metrics_v1_0& metrics,
                                      rsmi_temperature_type_t temperature_type,
                                      int64_t* value) {
  if (value == nullptr) {
    return RSMI_STATUS_INVALID_ARGS;
  }

  std::ostringstream begin_msg;
  begin_msg << __PRETTY_FUNCTION__ << " | ======= start ======= "
            << " | Node ID: " << metrics.node_id
            << " | Temperature type: " << static_cast<int>(temperature_type);
  LOG_INFO(begin_msg);

  *value = 0;  // Default reported when nothing matches
  const uint32_t kNoReading = std::numeric_limits<uint32_t>::max();

  // Pass 1: voltage-regulator slots.
  for (int vr_idx = 0; vr_idx < AMDGPU_VR_MAX_TEMP_ENTRIES; ++vr_idx) {
    if (metrics.vr_temp[vr_idx] == kNoReading) {
      continue;  // slot carries no valid reading
    }
    const auto vr_entry = vr_temp_map.find(vr_idx);
    if (vr_entry == vr_temp_map.end() || vr_entry->second != temperature_type) {
      continue;  // slot belongs to a different (or unknown) sensor
    }

    *value = decode_temperature_value(metrics.vr_temp[vr_idx]);
    std::ostringstream found_msg;
    found_msg << __PRETTY_FUNCTION__ << " | ======= end ======= "
              << " | Success | VR temp found at index: " << vr_idx
              << " | Raw value: " << *value
              << " | Returning = " << getRSMIStatusString(RSMI_STATUS_SUCCESS) << " |";
    LOG_INFO(found_msg);
    return RSMI_STATUS_SUCCESS;
  }

  // Pass 2: node slots, only reached when the VR table had no match.
  for (int node_idx = 0; node_idx < AMDGPU_NODE_MAX_TEMP_ENTRIES; ++node_idx) {
    if (metrics.node_temp[node_idx] == kNoReading) {
      continue;  // slot carries no valid reading
    }
    const auto node_entry = node_temp_map.find(node_idx);
    if (node_entry == node_temp_map.end() || node_entry->second != temperature_type) {
      continue;  // slot belongs to a different (or unknown) sensor
    }

    *value = decode_temperature_value(metrics.node_temp[node_idx]);
    std::ostringstream found_msg;
    found_msg << __PRETTY_FUNCTION__ << " | ======= end ======= "
              << " | Success | Node temp found at index: " << node_idx
              << " | Raw value: " << *value
              << " | Returning = " << getRSMIStatusString(RSMI_STATUS_SUCCESS) << " |";
    LOG_INFO(found_msg);
    return RSMI_STATUS_SUCCESS;
  }

  // Neither table produced a valid reading for the requested type.
  std::ostringstream miss_msg;
  miss_msg << __PRETTY_FUNCTION__ << " | ======= end ======= "
           << " | Fail | Temperature type not found in GPU board metrics"
           << " | Temperature type: " << static_cast<int>(temperature_type)
           << " | Returning = " << getRSMIStatusString(RSMI_STATUS_NOT_SUPPORTED) << " |";
  LOG_ERROR(miss_msg);
  return RSMI_STATUS_NOT_SUPPORTED;
}
// Looks up a single baseboard temperature in an already-populated metrics
// blob.
//
// Every slot of metrics.system_temp is examined in index order. A slot is
// only used when its raw word is not the all-ones "invalid" marker and its
// index maps (via system_temp_map) to `temperature_type`.
//
// On a match the decoded reading (see decode_temperature_value) is stored in
// *value and RSMI_STATUS_SUCCESS is returned. Returns
// RSMI_STATUS_INVALID_ARGS when `value` is null, and
// RSMI_STATUS_NOT_SUPPORTED (with *value left at 0) when no valid slot matches.
rsmi_status_t get_baseboard_temp_value(const amdgpu_baseboard_temp_metrics_v1_0& metrics,
                                       rsmi_temperature_type_t temperature_type,
                                       int64_t* value) {
  if (value == nullptr) {
    return RSMI_STATUS_INVALID_ARGS;
  }

  std::ostringstream begin_msg;
  begin_msg << __PRETTY_FUNCTION__ << " | ======= start ======= "
            << " | Node ID: " << metrics.node_id
            << " | Temperature type: " << static_cast<int>(temperature_type);
  LOG_INFO(begin_msg);

  *value = 0;  // Default reported when nothing matches
  const uint32_t kNoReading = std::numeric_limits<uint32_t>::max();

  for (int sys_idx = 0; sys_idx < AMDGPU_SYSTEM_MAX_TEMP_ENTRIES; ++sys_idx) {
    if (metrics.system_temp[sys_idx] == kNoReading) {
      continue;  // slot carries no valid reading
    }
    const auto sys_entry = system_temp_map.find(sys_idx);
    if (sys_entry == system_temp_map.end() || sys_entry->second != temperature_type) {
      continue;  // slot belongs to a different (or unknown) sensor
    }

    *value = decode_temperature_value(metrics.system_temp[sys_idx]);
    std::ostringstream found_msg;
    found_msg << __PRETTY_FUNCTION__ << " | ======= end ======= "
              << " | Success | System temp found at index: " << sys_idx
              << " | Raw value: " << *value
              << " | Returning = " << getRSMIStatusString(RSMI_STATUS_SUCCESS) << " |";
    LOG_INFO(found_msg);
    return RSMI_STATUS_SUCCESS;
  }

  // No valid reading for the requested type.
  std::ostringstream miss_msg;
  miss_msg << __PRETTY_FUNCTION__ << " | ======= end ======= "
           << " | Fail | Temperature type not found in baseboard metrics"
           << " | Temperature type: " << static_cast<int>(temperature_type)
           << " | Returning = " << getRSMIStatusString(RSMI_STATUS_NOT_SUPPORTED) << " |";
  LOG_ERROR(miss_msg);
  return RSMI_STATUS_NOT_SUPPORTED;
}
} // end namespace
+7
View File
@@ -118,6 +118,9 @@ static const char *kDevNumaNodeFName = "numa_node";
static const char *kDevGpuMetricsFName = "gpu_metrics";
static const char *kDevPmMetricsFName = "pm_metrics"; // PM log
static const char *kDevRegMetricsFName = "reg_state"; // register table
static const char *kDevBaseBoardTempMetricsFName = "board/baseboard_temp";
static const char *kDevGpuBoardTempMetricsFName = "board/gpuboard_temp";
static const char *kDevAvailableComputePartitionFName =
"available_compute_partition";
static const char *kDevComputePartitionFName = "current_compute_partition";
@@ -325,6 +328,8 @@ static const std::map<DevInfoTypes, const char *> kDevAttribNameMap = {
{kDevProcessIsolation, kDevProcessIsolationFName},
{kDevShaderClean, kDevShaderCleanFName},
{kDevRegMetrics, kDevRegMetricsFName},
{kDevBaseBoardTempMetrics, kDevBaseBoardTempMetricsFName},
{kDevGpuBoardTempMetrics, kDevGpuBoardTempMetricsFName},
{kDevGpuReset, kDevGpuResetFName},
{kDevAvailableComputePartition, kDevAvailableComputePartitionFName},
{kDevComputePartition, kDevComputePartitionFName},
@@ -495,6 +500,8 @@ Device::devInfoTypesStrings = {
{kDevGpuMetrics, "kDevGpuMetrics"},
{kDevPmMetrics, "kDevPmMetrics"},
{kDevRegMetrics, "kDevRegMetrics"},
{kDevBaseBoardTempMetrics, "kDevBaseBoardTempMetrics"},
{kDevGpuBoardTempMetrics, "kDevGpuBoardTempMetrics"},
{kDevGpuReset, "kDevGpuReset"},
{kDevAvailableComputePartition, "kDevAvailableComputePartition"},
{kDevComputePartition, "kDevComputePartition"},
+1 -1
View File
@@ -144,7 +144,7 @@ amdsmi_status_t AMDSmiDrm::init() {
// even if fail, still add to prevent mismatch the index
if (!has_valid_fds) {
drm_bdfs_.push_back(bdf);
drm_free_device(&device);
// No need to free device here since it is not valid
continue;
}
+52 -1
View File
@@ -42,7 +42,55 @@ static const std::map<uint32_t, std::string> kTempSensorNameMap = {
{AMDSMI_TEMPERATURE_TYPE_HBM_1, "HBM_1"},
{AMDSMI_TEMPERATURE_TYPE_HBM_2, "HBM_2"},
{AMDSMI_TEMPERATURE_TYPE_HBM_3, "HBM_3"},
{AMDSMI_TEMPERATURE_TYPE_PLX, "PLX"}
{AMDSMI_TEMPERATURE_TYPE_PLX, "PLX"},
// GPU Board Node Temperature Types (100-149)
{AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_RETIMER_X, "GPU Board Node Retimer X"},
{AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_OAM_X_IBC, "GPU Board Node OAM X IBC"},
{AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_OAM_X_IBC_2, "GPU Board Node OAM X IBC 2"},
{AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_OAM_X_VDD18_VR, "GPU Board Node OAM X VDD18 VR"},
{AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_OAM_X_04_HBM_B_VR, "GPU Board Node OAM X 04 HBM B VR"},
{AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_OAM_X_04_HBM_D_VR, "GPU Board Node OAM X 04 HBM D VR"},
// GPU Board VR Temperature Types (150-199)
{AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_VDD0, "GPU Board VDDCR VDD0"},
{AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_VDD1, "GPU Board VDDCR VDD1"},
{AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_VDD2, "GPU Board VDDCR VDD2"},
{AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_VDD3, "GPU Board VDDCR VDD3"},
{AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_SOC_A, "GPU Board VDDCR SOC A"},
{AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_SOC_C, "GPU Board VDDCR SOC C"},
{AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_SOCIO_A, "GPU Board VDDCR SOCIO A"},
{AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_SOCIO_C, "GPU Board VDDCR SOCIO C"},
{AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDD_085_HBM, "GPU Board VDD 085 HBM"},
{AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_11_HBM_B, "GPU Board VDDCR 11 HBM B"},
{AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_11_HBM_D, "GPU Board VDDCR 11 HBM D"},
{AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDD_USR, "GPU Board VDD USR"},
{AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDIO_11_E32, "GPU Board VDDIO 11 E32"},
// Baseboard System Temperature Types (200+)
{AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_FPGA, "Baseboard UBB FPGA"},
{AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_FRONT, "Baseboard UBB Front"},
{AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_BACK, "Baseboard UBB Back"},
{AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_OAM7, "Baseboard UBB OAM7"},
{AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_IBC, "Baseboard UBB IBC"},
{AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_UFPGA, "Baseboard UBB UFPGA"},
{AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_OAM1, "Baseboard UBB OAM1"},
{AMDSMI_TEMPERATURE_TYPE_BASEBOARD_OAM_0_1_HSC, "Baseboard OAM 0-1 HSC"},
{AMDSMI_TEMPERATURE_TYPE_BASEBOARD_OAM_2_3_HSC, "Baseboard OAM 2-3 HSC"},
{AMDSMI_TEMPERATURE_TYPE_BASEBOARD_OAM_4_5_HSC, "Baseboard OAM 4-5 HSC"},
{AMDSMI_TEMPERATURE_TYPE_BASEBOARD_OAM_6_7_HSC, "Baseboard OAM 6-7 HSC"},
{AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_FPGA_0V72_VR, "Baseboard UBB FPGA 0V72 VR"},
{AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_FPGA_3V3_VR, "Baseboard UBB FPGA 3V3 VR"},
{AMDSMI_TEMPERATURE_TYPE_BASEBOARD_RETIMER_0_1_2_3_1V2_VR, "Baseboard Retimer 0-1-2-3 1V2 VR"},
{AMDSMI_TEMPERATURE_TYPE_BASEBOARD_RETIMER_4_5_6_7_1V2_VR, "Baseboard Retimer 4-5-6-7 1V2 VR"},
{AMDSMI_TEMPERATURE_TYPE_BASEBOARD_RETIMER_0_1_0V9_VR, "Baseboard Retimer 0-1 0V9 VR"},
{AMDSMI_TEMPERATURE_TYPE_BASEBOARD_RETIMER_4_5_0V9_VR, "Baseboard Retimer 4-5 0V9 VR"},
{AMDSMI_TEMPERATURE_TYPE_BASEBOARD_RETIMER_2_3_0V9_VR, "Baseboard Retimer 2-3 0V9 VR"},
{AMDSMI_TEMPERATURE_TYPE_BASEBOARD_RETIMER_6_7_0V9_VR, "Baseboard Retimer 6-7 0V9 VR"},
{AMDSMI_TEMPERATURE_TYPE_BASEBOARD_OAM_0_1_2_3_3V3_VR, "Baseboard OAM 0-1-2-3 3V3 VR"},
{AMDSMI_TEMPERATURE_TYPE_BASEBOARD_OAM_4_5_6_7_3V3_VR, "Baseboard OAM 4-5-6-7 3V3 VR"},
{AMDSMI_TEMPERATURE_TYPE_BASEBOARD_IBC_HSC, "Baseboard IBC HSC"},
{AMDSMI_TEMPERATURE_TYPE_BASEBOARD_IBC, "Baseboard IBC"}
};
TestTempRead::TestTempRead() : TestBase() {
set_title("AMDSMI Temp Read Test");
@@ -118,6 +166,9 @@ void TestTempRead::Run(void) {
}
};
for (type = AMDSMI_TEMPERATURE_TYPE_FIRST; type <= AMDSMI_TEMPERATURE_TYPE__MAX; ++type) {
if (kTempSensorNameMap.find(type) == kTempSensorNameMap.end()) {
continue;
}
IF_VERB(STANDARD) {
std::cout << "\t** **********" << kTempSensorNameMap.at(type) <<
" Temperatures **********" << std::endl;