From abd3c02a3cf9e15036dee69885ca442166844fd2 Mon Sep 17 00:00:00 2001 From: "Liu, Shuzhou (Bill)" Date: Tue, 5 Aug 2025 21:37:45 -0400 Subject: [PATCH] Query UBB/OAM temperature API (#581) Add support to Query UBB/OAM temperature. * Updated Python API with new temperature metrics enum --------- Co-authored-by: Bill Liu Co-authored-by: gabrpham_amdeng --- CMakeLists.txt | 7 +- include/amd_smi/amdsmi.h | 61 ++- py-interface/amdsmi_interface.py | 55 +++ py-interface/amdsmi_wrapper.py | 178 +++++++- rocm_smi/include/rocm_smi/rocm_smi.h | 59 ++- .../include/rocm_smi/rocm_smi_board_temp.h | 117 ++++++ rocm_smi/include/rocm_smi/rocm_smi_device.h | 2 + rocm_smi/src/rocm_smi.cc | 60 ++- rocm_smi/src/rocm_smi_board_temp.cc | 388 ++++++++++++++++++ rocm_smi/src/rocm_smi_device.cc | 7 + src/amd_smi/amd_smi_drm.cc | 2 +- tests/amd_smi_test/functional/temp_read.cc | 53 ++- 12 files changed, 964 insertions(+), 25 deletions(-) create mode 100644 rocm_smi/include/rocm_smi/rocm_smi_board_temp.h create mode 100644 rocm_smi/src/rocm_smi_board_temp.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index 82c3b5b26f..b12b56f74a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -187,7 +187,8 @@ set(CMN_SRC_LIST "${ROCM_SRC_DIR}/rocm_smi.cc" "${ROCM_SRC_DIR}/rocm_smi_logger.cc" "${SHR_MUTEX_DIR}/shared_mutex.cc" - "${ROCM_SRC_DIR}/rocm_smi_binary_parser.cc") + "${ROCM_SRC_DIR}/rocm_smi_binary_parser.cc" + "${ROCM_SRC_DIR}/rocm_smi_board_temp.cc") if(ENABLE_ESMI_LIB) list(APPEND CMN_SRC_LIST ${ESMI_SRC_DIR}/e_smi.c) @@ -211,7 +212,9 @@ set(CMN_INC_LIST "${ROCM_INC_DIR}/rocm_smi.h" "${ROCM_INC_DIR}/rocm_smi_logger.h" "${SHR_MUTEX_DIR}/shared_mutex.h" - "${ROCM_INC_DIR}/rocm_smi_binary_parser.h") + "${ROCM_INC_DIR}/rocm_smi_binary_parser.h" + "${ROCM_INC_DIR}/rocm_smi_board_temp.h" + ) add_subdirectory("rocm_smi") add_subdirectory("src") diff --git a/include/amd_smi/amdsmi.h b/include/amd_smi/amdsmi.h index 6a398b37ed..77075ac9a4 100644 --- a/include/amd_smi/amdsmi.h +++ 
b/include/amd_smi/amdsmi.h @@ -464,7 +464,66 @@ typedef enum { AMDSMI_TEMPERATURE_TYPE_HBM_2, //!< High Bandwidth 2 temperature per stack AMDSMI_TEMPERATURE_TYPE_HBM_3, //!< High Bandwidth 3 temperature per stack AMDSMI_TEMPERATURE_TYPE_PLX, //!< PCIe switch temperature - AMDSMI_TEMPERATURE_TYPE__MAX = AMDSMI_TEMPERATURE_TYPE_PLX + + // GPU Board Node temperature + AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_FIRST = 100, + AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_RETIMER_X + = AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_FIRST, //!< Retimer X temperature + AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_OAM_X_IBC, //!< OAM X IBC temperature + AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_OAM_X_IBC_2, //!< OAM X IBC 2 temperature + AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_OAM_X_VDD18_VR, //!< OAM X VDD 1.8V voltage regulator temperature + AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_OAM_X_04_HBM_B_VR, //!< OAM X 0.4V HBM B voltage regulator temperature + AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_OAM_X_04_HBM_D_VR, //!< OAM X 0.4V HBM D voltage regulator temperature + AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_LAST = 149, + + // GPU Board VR (Voltage Regulator) temperature + AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VR_FIRST = 150, + AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_VDD0 + = AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VR_FIRST, //!< VDDCR VDD0 voltage regulator temperature + AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_VDD1, //!< VDDCR VDD1 voltage regulator temperature + AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_VDD2, //!< VDDCR VDD2 voltage regulator temperature + AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_VDD3, //!< VDDCR VDD3 voltage regulator temperature + AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_SOC_A, //!< VDDCR SOC A voltage regulator temperature + AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_SOC_C, //!< VDDCR SOC C voltage regulator temperature + AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_SOCIO_A, //!< VDDCR SOCIO A voltage regulator temperature + AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_SOCIO_C, //!< VDDCR SOCIO C voltage regulator temperature + 
AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDD_085_HBM, //!< VDD 0.85V HBM voltage regulator temperature + AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_11_HBM_B, //!< VDDCR 1.1V HBM B voltage regulator temperature + AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_11_HBM_D, //!< VDDCR 1.1V HBM D voltage regulator temperature + AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDD_USR, //!< VDD USR voltage regulator temperature + AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDIO_11_E32, //!< VDDIO 1.1V E32 voltage regulator temperature + AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VR_LAST = 199, + + // Baseboard System temperature + AMDSMI_TEMPERATURE_TYPE_BASEBOARD_FIRST = 200, + AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_FPGA + = AMDSMI_TEMPERATURE_TYPE_BASEBOARD_FIRST, //!< UBB FPGA temperature + AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_FRONT, //!< UBB front temperature + AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_BACK, //!< UBB back temperature + AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_OAM7, //!< UBB OAM7 temperature + AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_IBC, //!< UBB IBC temperature + AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_UFPGA, //!< UBB UFPGA temperature + AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_OAM1, //!< UBB OAM1 temperature + AMDSMI_TEMPERATURE_TYPE_BASEBOARD_OAM_0_1_HSC, //!< OAM 0-1 HSC temperature + AMDSMI_TEMPERATURE_TYPE_BASEBOARD_OAM_2_3_HSC, //!< OAM 2-3 HSC temperature + AMDSMI_TEMPERATURE_TYPE_BASEBOARD_OAM_4_5_HSC, //!< OAM 4-5 HSC temperature + AMDSMI_TEMPERATURE_TYPE_BASEBOARD_OAM_6_7_HSC, //!< OAM 6-7 HSC temperature + AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_FPGA_0V72_VR, //!< UBB FPGA 0.72V voltage regulator temperature + AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_FPGA_3V3_VR, //!< UBB FPGA 3.3V voltage regulator temperature + AMDSMI_TEMPERATURE_TYPE_BASEBOARD_RETIMER_0_1_2_3_1V2_VR, //!< Retimer 0-1-2-3 1.2V voltage regulator temperature + AMDSMI_TEMPERATURE_TYPE_BASEBOARD_RETIMER_4_5_6_7_1V2_VR, //!< Retimer 4-5-6-7 1.2V voltage regulator temperature + AMDSMI_TEMPERATURE_TYPE_BASEBOARD_RETIMER_0_1_0V9_VR, //!< Retimer 0-1 0.9V 
voltage regulator temperature + AMDSMI_TEMPERATURE_TYPE_BASEBOARD_RETIMER_4_5_0V9_VR, //!< Retimer 4-5 0.9V voltage regulator temperature + AMDSMI_TEMPERATURE_TYPE_BASEBOARD_RETIMER_2_3_0V9_VR, //!< Retimer 2-3 0.9V voltage regulator temperature + AMDSMI_TEMPERATURE_TYPE_BASEBOARD_RETIMER_6_7_0V9_VR, //!< Retimer 6-7 0.9V voltage regulator temperature + AMDSMI_TEMPERATURE_TYPE_BASEBOARD_OAM_0_1_2_3_3V3_VR, //!< OAM 0-1-2-3 3.3V voltage regulator temperature + AMDSMI_TEMPERATURE_TYPE_BASEBOARD_OAM_4_5_6_7_3V3_VR, //!< OAM 4-5-6-7 3.3V voltage regulator temperature + AMDSMI_TEMPERATURE_TYPE_BASEBOARD_IBC_HSC, //!< IBC HSC temperature + AMDSMI_TEMPERATURE_TYPE_BASEBOARD_IBC, //!< IBC temperature + AMDSMI_TEMPERATURE_TYPE_BASEBOARD_LAST = 249, + AMDSMI_TEMPERATURE_TYPE__MAX = AMDSMI_TEMPERATURE_TYPE_BASEBOARD_LAST, //!< Maximum per GPU temperature type + + } amdsmi_temperature_type_t; /** diff --git a/py-interface/amdsmi_interface.py b/py-interface/amdsmi_interface.py index 33590d3078..162c3aa520 100644 --- a/py-interface/amdsmi_interface.py +++ b/py-interface/amdsmi_interface.py @@ -222,6 +222,61 @@ class AmdSmiTemperatureType(IntEnum): HBM_3 = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_HBM_3 PLX = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_PLX + # GPU Board Node temperature + GPUBOARD_NODE_FIRST = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_FIRST + GPUBOARD_NODE_RETIMER_X = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_RETIMER_X # Retimer X temperature + GPUBOARD_NODE_OAM_X_IBC = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_OAM_X_IBC # OAM X IBC temperature + GPUBOARD_NODE_OAM_X_IBC_2 = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_OAM_X_IBC_2 # OAM X IBC 2 temperature + GPUBOARD_NODE_OAM_X_VDD18_VR = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_OAM_X_VDD18_VR # OAM X VDD 1.8V voltage regulator temperature + GPUBOARD_NODE_OAM_X_04_HBM_B_VR = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_OAM_X_04_HBM_B_VR # OAM X 0.4V HBM B 
voltage regulator temperature + GPUBOARD_NODE_OAM_X_04_HBM_D_VR = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_OAM_X_04_HBM_D_VR # OAM X 0.4V HBM D voltage regulator temperature + GPUBOARD_NODE_LAST = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_LAST + + # GPU Board VR (Voltage Regulator) temperature + GPUBOARD_VR_FIRST = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VR_FIRST + GPUBOARD_VDDCR_VDD0 = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_VDD0 # VDDCR VDD0 voltage regulator temperature + GPUBOARD_VDDCR_VDD1 = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_VDD1 # VDDCR VDD1 voltage regulator temperature + GPUBOARD_VDDCR_VDD2 = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_VDD2 # VDDCR VDD2 voltage regulator temperature + GPUBOARD_VDDCR_VDD3 = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_VDD3 # VDDCR VDD3 voltage regulator temperature + GPUBOARD_VDDCR_SOC_A = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_SOC_A # VDDCR SOC A voltage regulator temperature + GPUBOARD_VDDCR_SOC_C = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_SOC_C # VDDCR SOC C voltage regulator temperature + GPUBOARD_VDDCR_SOCIO_A = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_SOCIO_A # VDDCR SOCIO A voltage regulator temperature + GPUBOARD_VDDCR_SOCIO_C = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_SOCIO_C # VDDCR SOCIO C voltage regulator temperature + GPUBOARD_VDD_085_HBM = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDD_085_HBM # VDD 0.85V HBM voltage regulator temperature + GPUBOARD_VDDCR_11_HBM_B = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_11_HBM_B # VDDCR 1.1V HBM B voltage regulator temperature + GPUBOARD_VDDCR_11_HBM_D = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_11_HBM_D # VDDCR 1.1V HBM D voltage regulator temperature + GPUBOARD_VDD_USR = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDD_USR # VDD USR voltage regulator temperature + GPUBOARD_VDDIO_11_E32 = 
amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDIO_11_E32 # VDDIO 1.1V E32 voltage regulator temperature + GPUBOARD_VR_LAST = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VR_LAST + + # Baseboard System temperature + BASEBOARD_FIRST = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_FIRST + BASEBOARD_UBB_FPGA = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_FPGA # UBB FPGA temperature + BASEBOARD_UBB_FRONT = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_FRONT # UBB front temperature + BASEBOARD_UBB_BACK = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_BACK # UBB back temperature + BASEBOARD_UBB_OAM7 = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_OAM7 # UBB OAM7 temperature + BASEBOARD_UBB_IBC = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_IBC # UBB IBC temperature + BASEBOARD_UBB_UFPGA = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_UFPGA # UBB UFPGA temperature + BASEBOARD_UBB_OAM1 = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_OAM1 # UBB OAM1 temperature + BASEBOARD_OAM_0_1_HSC = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_OAM_0_1_HSC # OAM 0-1 HSC temperature + BASEBOARD_OAM_2_3_HSC = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_OAM_2_3_HSC # OAM 2-3 HSC temperature + BASEBOARD_OAM_4_5_HSC = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_OAM_4_5_HSC # OAM 4-5 HSC temperature + BASEBOARD_OAM_6_7_HSC = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_OAM_6_7_HSC # OAM 6-7 HSC temperature + BASEBOARD_UBB_FPGA_0V72_VR = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_FPGA_0V72_VR # UBB FPGA 0.72V voltage regulator temperature + BASEBOARD_UBB_FPGA_3V3_VR = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_FPGA_3V3_VR # UBB FPGA 3.3V voltage regulator temperature + BASEBOARD_RETIMER_0_1_2_3_1V2_VR = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_RETIMER_0_1_2_3_1V2_VR # Retimer 0-1-2-3 1.2V voltage regulator temperature + BASEBOARD_RETIMER_4_5_6_7_1V2_VR = 
amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_RETIMER_4_5_6_7_1V2_VR # Retimer 4-5-6-7 1.2V voltage regulator temperature + BASEBOARD_RETIMER_0_1_0V9_VR = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_RETIMER_0_1_0V9_VR # Retimer 0-1 0.9V voltage regulator temperature + BASEBOARD_RETIMER_4_5_0V9_VR= amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_RETIMER_4_5_0V9_VR # Retimer 4-5 0.9V voltage regulator temperature + BASEBOARD_RETIMER_2_3_0V9_VR = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_RETIMER_2_3_0V9_VR # Retimer 2-3 0.9V voltage regulator temperature + BASEBOARD_RETIMER_6_7_0V9_VR = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_RETIMER_6_7_0V9_VR # Retimer 6-7 0.9V voltage regulator temperature + BASEBOARD_OAM_0_1_2_3_3V3_VR = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_OAM_0_1_2_3_3V3_VR # OAM 0-1-2-3 3.3V voltage regulator temperature + BASEBOARD_OAM_4_5_6_7_3V3_VR = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_OAM_4_5_6_7_3V3_VR # OAM 4-5-6-7 3.3V voltage regulator temperature + BASEBOARD_IBC_HSC = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_IBC_HSC # IBC HSC temperature + BASEBOARD_IBC = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_IBC # IBC temperature + BASEBOARD_LAST = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE_BASEBOARD_LAST + BASEBOARD__MAX = amdsmi_wrapper.AMDSMI_TEMPERATURE_TYPE__MAX # Maximum per GPU temperature type + class AmdSmiDevPerfLevel(IntEnum): AUTO = amdsmi_wrapper.AMDSMI_DEV_PERF_LEVEL_AUTO diff --git a/py-interface/amdsmi_wrapper.py b/py-interface/amdsmi_wrapper.py index 151e5872e9..8f78defeca 100644 --- a/py-interface/amdsmi_wrapper.py +++ b/py-interface/amdsmi_wrapper.py @@ -493,7 +493,55 @@ amdsmi_temperature_type_t__enumvalues = { 5: 'AMDSMI_TEMPERATURE_TYPE_HBM_2', 6: 'AMDSMI_TEMPERATURE_TYPE_HBM_3', 7: 'AMDSMI_TEMPERATURE_TYPE_PLX', - 7: 'AMDSMI_TEMPERATURE_TYPE__MAX', + 100: 'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_FIRST', + 100: 'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_RETIMER_X', + 101: 
'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_OAM_X_IBC', + 102: 'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_OAM_X_IBC_2', + 103: 'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_OAM_X_VDD18_VR', + 104: 'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_OAM_X_04_HBM_B_VR', + 105: 'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_OAM_X_04_HBM_D_VR', + 149: 'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_LAST', + 150: 'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VR_FIRST', + 150: 'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_VDD0', + 151: 'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_VDD1', + 152: 'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_VDD2', + 153: 'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_VDD3', + 154: 'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_SOC_A', + 155: 'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_SOC_C', + 156: 'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_SOCIO_A', + 157: 'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_SOCIO_C', + 158: 'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDD_085_HBM', + 159: 'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_11_HBM_B', + 160: 'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_11_HBM_D', + 161: 'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDD_USR', + 162: 'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDIO_11_E32', + 199: 'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VR_LAST', + 200: 'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_FIRST', + 200: 'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_FPGA', + 201: 'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_FRONT', + 202: 'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_BACK', + 203: 'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_OAM7', + 204: 'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_IBC', + 205: 'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_UFPGA', + 206: 'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_OAM1', + 207: 'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_OAM_0_1_HSC', + 208: 'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_OAM_2_3_HSC', + 209: 'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_OAM_4_5_HSC', + 210: 'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_OAM_6_7_HSC', + 211: 'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_FPGA_0V72_VR', + 212: 'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_FPGA_3V3_VR', + 213: 
'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_RETIMER_0_1_2_3_1V2_VR', + 214: 'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_RETIMER_4_5_6_7_1V2_VR', + 215: 'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_RETIMER_0_1_0V9_VR', + 216: 'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_RETIMER_4_5_0V9_VR', + 217: 'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_RETIMER_2_3_0V9_VR', + 218: 'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_RETIMER_6_7_0V9_VR', + 219: 'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_OAM_0_1_2_3_3V3_VR', + 220: 'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_OAM_4_5_6_7_3V3_VR', + 221: 'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_IBC_HSC', + 222: 'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_IBC', + 249: 'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_LAST', + 249: 'AMDSMI_TEMPERATURE_TYPE__MAX', } AMDSMI_TEMPERATURE_TYPE_EDGE = 0 AMDSMI_TEMPERATURE_TYPE_FIRST = 0 @@ -505,7 +553,55 @@ AMDSMI_TEMPERATURE_TYPE_HBM_1 = 4 AMDSMI_TEMPERATURE_TYPE_HBM_2 = 5 AMDSMI_TEMPERATURE_TYPE_HBM_3 = 6 AMDSMI_TEMPERATURE_TYPE_PLX = 7 -AMDSMI_TEMPERATURE_TYPE__MAX = 7 +AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_FIRST = 100 +AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_RETIMER_X = 100 +AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_OAM_X_IBC = 101 +AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_OAM_X_IBC_2 = 102 +AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_OAM_X_VDD18_VR = 103 +AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_OAM_X_04_HBM_B_VR = 104 +AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_OAM_X_04_HBM_D_VR = 105 +AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_LAST = 149 +AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VR_FIRST = 150 +AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_VDD0 = 150 +AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_VDD1 = 151 +AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_VDD2 = 152 +AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_VDD3 = 153 +AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_SOC_A = 154 +AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_SOC_C = 155 +AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_SOCIO_A = 156 +AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_SOCIO_C = 157 +AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDD_085_HBM = 158 +AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_11_HBM_B = 159 
+AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_11_HBM_D = 160 +AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDD_USR = 161 +AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDIO_11_E32 = 162 +AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VR_LAST = 199 +AMDSMI_TEMPERATURE_TYPE_BASEBOARD_FIRST = 200 +AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_FPGA = 200 +AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_FRONT = 201 +AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_BACK = 202 +AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_OAM7 = 203 +AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_IBC = 204 +AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_UFPGA = 205 +AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_OAM1 = 206 +AMDSMI_TEMPERATURE_TYPE_BASEBOARD_OAM_0_1_HSC = 207 +AMDSMI_TEMPERATURE_TYPE_BASEBOARD_OAM_2_3_HSC = 208 +AMDSMI_TEMPERATURE_TYPE_BASEBOARD_OAM_4_5_HSC = 209 +AMDSMI_TEMPERATURE_TYPE_BASEBOARD_OAM_6_7_HSC = 210 +AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_FPGA_0V72_VR = 211 +AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_FPGA_3V3_VR = 212 +AMDSMI_TEMPERATURE_TYPE_BASEBOARD_RETIMER_0_1_2_3_1V2_VR = 213 +AMDSMI_TEMPERATURE_TYPE_BASEBOARD_RETIMER_4_5_6_7_1V2_VR = 214 +AMDSMI_TEMPERATURE_TYPE_BASEBOARD_RETIMER_0_1_0V9_VR = 215 +AMDSMI_TEMPERATURE_TYPE_BASEBOARD_RETIMER_4_5_0V9_VR = 216 +AMDSMI_TEMPERATURE_TYPE_BASEBOARD_RETIMER_2_3_0V9_VR = 217 +AMDSMI_TEMPERATURE_TYPE_BASEBOARD_RETIMER_6_7_0V9_VR = 218 +AMDSMI_TEMPERATURE_TYPE_BASEBOARD_OAM_0_1_2_3_3V3_VR = 219 +AMDSMI_TEMPERATURE_TYPE_BASEBOARD_OAM_4_5_6_7_3V3_VR = 220 +AMDSMI_TEMPERATURE_TYPE_BASEBOARD_IBC_HSC = 221 +AMDSMI_TEMPERATURE_TYPE_BASEBOARD_IBC = 222 +AMDSMI_TEMPERATURE_TYPE_BASEBOARD_LAST = 249 +AMDSMI_TEMPERATURE_TYPE__MAX = 249 amdsmi_temperature_type_t = ctypes.c_uint32 # enum # values for enumeration 'amdsmi_fw_block_t' @@ -866,21 +962,6 @@ amdsmi_card_form_factor_t = ctypes.c_uint32 # enum class struct_amdsmi_pcie_info_t(Structure): pass -class struct_pcie_static_(Structure): - pass - -struct_pcie_static_._pack_ = 1 # source:False -struct_pcie_static_._fields_ = [ - ('max_pcie_width', ctypes.c_uint16), - ('PADDING_0', 
ctypes.c_ubyte * 2), - ('max_pcie_speed', ctypes.c_uint32), - ('pcie_interface_version', ctypes.c_uint32), - ('slot_type', amdsmi_card_form_factor_t), - ('max_pcie_interface_version', ctypes.c_uint32), - ('PADDING_1', ctypes.c_ubyte * 4), - ('reserved', ctypes.c_uint64 * 9), -] - class struct_pcie_metric_(Structure): pass @@ -901,6 +982,21 @@ struct_pcie_metric_._fields_ = [ ('reserved', ctypes.c_uint64 * 12), ] +class struct_pcie_static_(Structure): + pass + +struct_pcie_static_._pack_ = 1 # source:False +struct_pcie_static_._fields_ = [ + ('max_pcie_width', ctypes.c_uint16), + ('PADDING_0', ctypes.c_ubyte * 2), + ('max_pcie_speed', ctypes.c_uint32), + ('pcie_interface_version', ctypes.c_uint32), + ('slot_type', amdsmi_card_form_factor_t), + ('max_pcie_interface_version', ctypes.c_uint32), + ('PADDING_1', ctypes.c_ubyte * 4), + ('reserved', ctypes.c_uint64 * 9), +] + struct_amdsmi_pcie_info_t._pack_ = 1 # source:False struct_amdsmi_pcie_info_t._fields_ = [ ('pcie_static', struct_pcie_static_), @@ -3146,7 +3242,55 @@ __all__ = \ 'AMDSMI_STATUS_SETTING_UNAVAILABLE', 'AMDSMI_STATUS_SUCCESS', 'AMDSMI_STATUS_TIMEOUT', 'AMDSMI_STATUS_UNEXPECTED_DATA', 'AMDSMI_STATUS_UNEXPECTED_SIZE', 'AMDSMI_STATUS_UNKNOWN_ERROR', + 'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_FIRST', + 'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_IBC', + 'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_IBC_HSC', + 'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_LAST', + 'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_OAM_0_1_2_3_3V3_VR', + 'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_OAM_0_1_HSC', + 'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_OAM_2_3_HSC', + 'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_OAM_4_5_6_7_3V3_VR', + 'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_OAM_4_5_HSC', + 'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_OAM_6_7_HSC', + 'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_RETIMER_0_1_0V9_VR', + 'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_RETIMER_0_1_2_3_1V2_VR', + 'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_RETIMER_2_3_0V9_VR', + 'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_RETIMER_4_5_0V9_VR', + 
'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_RETIMER_4_5_6_7_1V2_VR', + 'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_RETIMER_6_7_0V9_VR', + 'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_BACK', + 'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_FPGA', + 'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_FPGA_0V72_VR', + 'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_FPGA_3V3_VR', + 'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_FRONT', + 'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_IBC', + 'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_OAM1', + 'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_OAM7', + 'AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_UFPGA', 'AMDSMI_TEMPERATURE_TYPE_EDGE', 'AMDSMI_TEMPERATURE_TYPE_FIRST', + 'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_FIRST', + 'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_LAST', + 'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_OAM_X_04_HBM_B_VR', + 'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_OAM_X_04_HBM_D_VR', + 'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_OAM_X_IBC', + 'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_OAM_X_IBC_2', + 'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_OAM_X_VDD18_VR', + 'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_RETIMER_X', + 'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_11_HBM_B', + 'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_11_HBM_D', + 'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_SOCIO_A', + 'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_SOCIO_C', + 'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_SOC_A', + 'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_SOC_C', + 'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_VDD0', + 'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_VDD1', + 'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_VDD2', + 'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_VDD3', + 'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDIO_11_E32', + 'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDD_085_HBM', + 'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDD_USR', + 'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VR_FIRST', + 'AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VR_LAST', 'AMDSMI_TEMPERATURE_TYPE_HBM_0', 'AMDSMI_TEMPERATURE_TYPE_HBM_1', 'AMDSMI_TEMPERATURE_TYPE_HBM_2', 'AMDSMI_TEMPERATURE_TYPE_HBM_3', 'AMDSMI_TEMPERATURE_TYPE_HOTSPOT', diff --git 
a/rocm_smi/include/rocm_smi/rocm_smi.h b/rocm_smi/include/rocm_smi/rocm_smi.h index 8abeb1ad71..9fab284ee1 100644 --- a/rocm_smi/include/rocm_smi/rocm_smi.h +++ b/rocm_smi/include/rocm_smi/rocm_smi.h @@ -532,7 +532,64 @@ typedef enum { RSMI_TEMP_TYPE_HBM_1, //!< HBM temperature instance 1 RSMI_TEMP_TYPE_HBM_2, //!< HBM temperature instance 2 RSMI_TEMP_TYPE_HBM_3, //!< HBM temperature instance 3 - RSMI_TEMP_TYPE_LAST = RSMI_TEMP_TYPE_HBM_3, + RSMI_TEMP_TYPE_PLX, //!< PLX temperature + + + // GPU Board Node temperature + RSMI_TEMP_TYPE_GPUBOARD_NODE_FIRST = 100, + RSMI_TEMP_TYPE_GPUBOARD_NODE_RETIMER_X = RSMI_TEMP_TYPE_GPUBOARD_NODE_FIRST, //!< Retimer X temperature + RSMI_TEMP_TYPE_GPUBOARD_NODE_OAM_X_IBC, //!< OAM X IBC temperature + RSMI_TEMP_TYPE_GPUBOARD_NODE_OAM_X_IBC_2, //!< OAM X IBC 2 temperature + RSMI_TEMP_TYPE_GPUBOARD_NODE_OAM_X_VDD18_VR, //!< OAM X VDD 1.8V voltage regulator temperature + RSMI_TEMP_TYPE_GPUBOARD_NODE_OAM_X_04_HBM_B_VR, //!< OAM X 0.4V HBM B voltage regulator temperature + RSMI_TEMP_TYPE_GPUBOARD_NODE_OAM_X_04_HBM_D_VR, //!< OAM X 0.4V HBM D voltage regulator temperature + + // GPU Board VR (Voltage Regulator) temperature + RSMI_TEMP_TYPE_GPUBOARD_VR_FIRST = 150, + RSMI_TEMP_TYPE_GPUBOARD_VDDCR_VDD0 = RSMI_TEMP_TYPE_GPUBOARD_VR_FIRST, //!< VDDCR VDD0 voltage regulator temperature + RSMI_TEMP_TYPE_GPUBOARD_VDDCR_VDD1, //!< VDDCR VDD1 voltage regulator temperature + RSMI_TEMP_TYPE_GPUBOARD_VDDCR_VDD2, //!< VDDCR VDD2 voltage regulator temperature + RSMI_TEMP_TYPE_GPUBOARD_VDDCR_VDD3, //!< VDDCR VDD3 voltage regulator temperature + RSMI_TEMP_TYPE_GPUBOARD_VDDCR_SOC_A, //!< VDDCR SOC A voltage regulator temperature + RSMI_TEMP_TYPE_GPUBOARD_VDDCR_SOC_C, //!< VDDCR SOC C voltage regulator temperature + RSMI_TEMP_TYPE_GPUBOARD_VDDCR_SOCIO_A, //!< VDDCR SOCIO A voltage regulator temperature + RSMI_TEMP_TYPE_GPUBOARD_VDDCR_SOCIO_C, //!< VDDCR SOCIO C voltage regulator temperature + RSMI_TEMP_TYPE_GPUBOARD_VDD_085_HBM, //!< VDD 0.85V HBM voltage 
regulator temperature + RSMI_TEMP_TYPE_GPUBOARD_VDDCR_11_HBM_B, //!< VDDCR 1.1V HBM B voltage regulator temperature + RSMI_TEMP_TYPE_GPUBOARD_VDDCR_11_HBM_D, //!< VDDCR 1.1V HBM D voltage regulator temperature + RSMI_TEMP_TYPE_GPUBOARD_VDD_USR, //!< VDD USR voltage regulator temperature + RSMI_TEMP_TYPE_GPUBOARD_VDDIO_11_E32, //!< VDDIO 1.1V E32 voltage regulator temperature + RSMI_TEMP_TYPE_GPUBOARD_LAST = 199, + + // Baseboard System temperature + RSMI_TEMP_TYPE_BASEBOARD_FIRST = 200, + RSMI_TEMP_TYPE_BASEBOARD_UBB_FPGA = RSMI_TEMP_TYPE_BASEBOARD_FIRST, //!< UBB FPGA temperature + RSMI_TEMP_TYPE_BASEBOARD_UBB_FRONT, //!< UBB front temperature + RSMI_TEMP_TYPE_BASEBOARD_UBB_BACK, //!< UBB back temperature + RSMI_TEMP_TYPE_BASEBOARD_UBB_OAM7, //!< UBB OAM7 temperature + RSMI_TEMP_TYPE_BASEBOARD_UBB_IBC, //!< UBB IBC temperature + RSMI_TEMP_TYPE_BASEBOARD_UBB_UFPGA, //!< UBB UFPGA temperature + RSMI_TEMP_TYPE_BASEBOARD_UBB_OAM1, //!< UBB OAM1 temperature + RSMI_TEMP_TYPE_BASEBOARD_OAM_0_1_HSC, //!< OAM 0-1 HSC temperature + RSMI_TEMP_TYPE_BASEBOARD_OAM_2_3_HSC, //!< OAM 2-3 HSC temperature + RSMI_TEMP_TYPE_BASEBOARD_OAM_4_5_HSC, //!< OAM 4-5 HSC temperature + RSMI_TEMP_TYPE_BASEBOARD_OAM_6_7_HSC, //!< OAM 6-7 HSC temperature + RSMI_TEMP_TYPE_BASEBOARD_UBB_FPGA_0V72_VR, //!< UBB FPGA 0.72V voltage regulator temperature + RSMI_TEMP_TYPE_BASEBOARD_UBB_FPGA_3V3_VR, //!< UBB FPGA 3.3V voltage regulator temperature + RSMI_TEMP_TYPE_BASEBOARD_RETIMER_0_1_2_3_1V2_VR, //!< Retimer 0-1-2-3 1.2V voltage regulator temperature + RSMI_TEMP_TYPE_BASEBOARD_RETIMER_4_5_6_7_1V2_VR, //!< Retimer 4-5-6-7 1.2V voltage regulator temperature + RSMI_TEMP_TYPE_BASEBOARD_RETIMER_0_1_0V9_VR, //!< Retimer 0-1 0.9V voltage regulator temperature + RSMI_TEMP_TYPE_BASEBOARD_RETIMER_4_5_0V9_VR, //!< Retimer 4-5 0.9V voltage regulator temperature + RSMI_TEMP_TYPE_BASEBOARD_RETIMER_2_3_0V9_VR, //!< Retimer 2-3 0.9V voltage regulator temperature + RSMI_TEMP_TYPE_BASEBOARD_RETIMER_6_7_0V9_VR, //!< 
Retimer 6-7 0.9V voltage regulator temperature + RSMI_TEMP_TYPE_BASEBOARD_OAM_0_1_2_3_3V3_VR, //!< OAM 0-1-2-3 3.3V voltage regulator temperature + RSMI_TEMP_TYPE_BASEBOARD_OAM_4_5_6_7_3V3_VR, //!< OAM 4-5-6-7 3.3V voltage regulator temperature + RSMI_TEMP_TYPE_BASEBOARD_IBC_HSC, //!< IBC HSC temperature + RSMI_TEMP_TYPE_BASEBOARD_IBC, //!< IBC temperature + RSMI_TEMP_TYPE_BASEBOARD_LAST = 249, + + RSMI_TEMP_TYPE_LAST = RSMI_TEMP_TYPE_BASEBOARD_LAST, //!< Last of per GPU temperature types + RSMI_TEMP_TYPE_INVALID = 0xFFFFFFFF //!< Invalid type } rsmi_temperature_type_t; diff --git a/rocm_smi/include/rocm_smi/rocm_smi_board_temp.h b/rocm_smi/include/rocm_smi/rocm_smi_board_temp.h new file mode 100644 index 0000000000..10fa9d9f6b --- /dev/null +++ b/rocm_smi/include/rocm_smi/rocm_smi_board_temp.h @@ -0,0 +1,117 @@ +/* + * Copyright (c) Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#ifndef ROCM_SMI_INCLUDE_ROCM_SMI_ROCM_SMI_BOARD_TEMP_H_ +#define ROCM_SMI_INCLUDE_ROCM_SMI_ROCM_SMI_BOARD_TEMP_H_ + +#include "rocm_smi/rocm_smi.h" + + +// Headers from the driver +namespace amd::smi { +enum amdgpu_vr_temp { + AMDGPU_VDDCR_VDD0_TEMP, + AMDGPU_VDDCR_VDD1_TEMP, + AMDGPU_VDDCR_VDD2_TEMP, + AMDGPU_VDDCR_VDD3_TEMP, + AMDGPU_VDDCR_SOC_A_TEMP, + AMDGPU_VDDCR_SOC_C_TEMP, + AMDGPU_VDDCR_SOCIO_A_TEMP, + AMDGPU_VDDCR_SOCIO_C_TEMP, + AMDGPU_VDD_085_HBM_TEMP, + AMDGPU_VDDCR_11_HBM_B_TEMP, + AMDGPU_VDDCR_11_HBM_D_TEMP, + AMDGPU_VDD_USR_TEMP, + AMDGPU_VDDIO_11_E32_TEMP, + AMDGPU_VR_MAX_TEMP_ENTRIES, +}; + +enum amdgpu_system_temp { + AMDGPU_UBB_FPGA_TEMP, + AMDGPU_UBB_FRONT_TEMP, + AMDGPU_UBB_BACK_TEMP, + AMDGPU_UBB_OAM7_TEMP, + AMDGPU_UBB_IBC_TEMP, + AMDGPU_UBB_UFPGA_TEMP, + AMDGPU_UBB_OAM1_TEMP, + AMDGPU_OAM_0_1_HSC_TEMP, + AMDGPU_OAM_2_3_HSC_TEMP, + AMDGPU_OAM_4_5_HSC_TEMP, + AMDGPU_OAM_6_7_HSC_TEMP, + AMDGPU_UBB_FPGA_0V72_VR_TEMP, + AMDGPU_UBB_FPGA_3V3_VR_TEMP, + AMDGPU_RETIMER_0_1_2_3_1V2_VR_TEMP, + AMDGPU_RETIMER_4_5_6_7_1V2_VR_TEMP, + AMDGPU_RETIMER_0_1_0V9_VR_TEMP, + AMDGPU_RETIMER_4_5_0V9_VR_TEMP, + AMDGPU_RETIMER_2_3_0V9_VR_TEMP, + AMDGPU_RETIMER_6_7_0V9_VR_TEMP, + AMDGPU_OAM_0_1_2_3_3V3_VR_TEMP, + AMDGPU_OAM_4_5_6_7_3V3_VR_TEMP, + AMDGPU_IBC_HSC_TEMP, + AMDGPU_IBC_TEMP, + AMDGPU_SYSTEM_MAX_TEMP_ENTRIES = 32, +}; + +enum amdgpu_node_temp { + AMDGPU_RETIMER_X_TEMP, + AMDGPU_OAM_X_IBC_TEMP, + AMDGPU_OAM_X_IBC_2_TEMP, + AMDGPU_OAM_X_VDD18_VR_TEMP, + AMDGPU_OAM_X_04_HBM_B_VR_TEMP, + AMDGPU_OAM_X_04_HBM_D_VR_TEMP, + AMDGPU_NODE_MAX_TEMP_ENTRIES = 12, +}; + +struct amdgpu_gpuboard_temp_metrics_v1_0 { + struct metrics_table_header_t common_header; + uint16_t label_version; + uint16_t node_id; + uint64_t accumulation_counter; + /* Encoded temperature in Celsius: bits 24:31 are the sensor id, bits 0:23 are the temp value */ + uint32_t node_temp[AMDGPU_NODE_MAX_TEMP_ENTRIES]; + uint32_t vr_temp[AMDGPU_VR_MAX_TEMP_ENTRIES]; +}; + +struct amdgpu_baseboard_temp_metrics_v1_0 { + 
struct metrics_table_header_t common_header; + uint16_t label_version; + uint16_t node_id; + uint64_t accumulation_counter; + /* Encoded temperature in Celcius, 24:31 is sensor id 0:23 is temp value */ + uint32_t system_temp[AMDGPU_SYSTEM_MAX_TEMP_ENTRIES]; +}; + + + +rsmi_status_t read_gpuboard_temp_metrics(const char* filename, amdgpu_gpuboard_temp_metrics_v1_0& metrics); +rsmi_status_t read_baseboard_temp_metrics(const char* filename, amdgpu_baseboard_temp_metrics_v1_0& metrics); + +rsmi_status_t get_baseboard_temp_value(const amdgpu_baseboard_temp_metrics_v1_0& metrics, + rsmi_temperature_type_t temperature_type, + int64_t* value); + +rsmi_status_t get_gpuboard_temp_value(const amdgpu_gpuboard_temp_metrics_v1_0& metrics, + rsmi_temperature_type_t temperature_type, + int64_t* value); +} +#endif // ROCM_SMI_INCLUDE_ROCM_SMI_ROCM_SMI_BOARD_TEMP_H_ diff --git a/rocm_smi/include/rocm_smi/rocm_smi_device.h b/rocm_smi/include/rocm_smi/rocm_smi_device.h index 80dc2d2d4a..7be11d7eca 100644 --- a/rocm_smi/include/rocm_smi/rocm_smi_device.h +++ b/rocm_smi/include/rocm_smi/rocm_smi_device.h @@ -160,6 +160,8 @@ enum DevInfoTypes { kDevGpuMetrics, kDevPmMetrics, kDevRegMetrics, + kDevBaseBoardTempMetrics, + kDevGpuBoardTempMetrics, kDevGpuReset, kDevAvailableComputePartition, kDevComputePartition, diff --git a/rocm_smi/src/rocm_smi.cc b/rocm_smi/src/rocm_smi.cc index afaf4161e3..ef627c15d7 100644 --- a/rocm_smi/src/rocm_smi.cc +++ b/rocm_smi/src/rocm_smi.cc @@ -59,6 +59,7 @@ #include "rocm_smi/rocm_smi_io_link.h" #include "rocm_smi/rocm_smi64Config.h" #include "rocm_smi/rocm_smi_logger.h" +#include "rocm_smi/rocm_smi_board_temp.h" using amd::smi::monitorTypesToString; using amd::smi::getRSMIStatusString; @@ -3293,6 +3294,63 @@ rsmi_dev_temp_metric_get(uint32_t dv_ind, uint32_t sensor_type, rsmi_status_t ret; amd::smi::MonitorTypes mon_type = amd::smi::kMonInvalid; uint16_t val_ui16; + GET_DEV_FROM_INDX + + // handle gpu board temp + if (sensor_type >= 
RSMI_TEMP_TYPE_GPUBOARD_NODE_FIRST && + sensor_type <= RSMI_TEMP_TYPE_GPUBOARD_LAST ) { + if (metric != RSMI_TEMP_CURRENT) { + LOG_ERROR("GPUBoard temperature only support RSMI_TEMP_CURRENT"); + return RSMI_STATUS_NOT_SUPPORTED; + } + + + std::string file_path = dev->get_sys_file_path_by_type(amd::smi::kDevGpuBoardTempMetrics); + if (file_path == "") { + LOG_ERROR("Failed to get GPU board temperature metrics file path"); + return RSMI_STATUS_NOT_SUPPORTED; + } + + amd::smi::amdgpu_gpuboard_temp_metrics_v1_0 gpuboard_metric; + ret = read_gpuboard_temp_metrics(file_path.c_str(), gpuboard_metric); + if (ret != RSMI_STATUS_SUCCESS) { + std::string err_msg = "Failed to read GPU board temperature metrics at " + file_path; + LOG_ERROR(err_msg); + return ret; + } + + ret = get_gpuboard_temp_value(gpuboard_metric, + static_cast(sensor_type), temperature); + return ret; + } + + // handle base board temp + if (sensor_type >= RSMI_TEMP_TYPE_BASEBOARD_FIRST && + sensor_type <= RSMI_TEMP_TYPE_BASEBOARD_LAST ) { + if (metric != RSMI_TEMP_CURRENT) { + LOG_ERROR("Baseboard temperature only supports RSMI_TEMP_CURRENT"); + return RSMI_STATUS_NOT_SUPPORTED; + } + + + std::string file_path = dev->get_sys_file_path_by_type(amd::smi::kDevBaseBoardTempMetrics); + if (file_path.empty()) { + LOG_ERROR("Failed to get baseboard temperature metrics file path"); + return RSMI_STATUS_NOT_SUPPORTED; + } + + amd::smi::amdgpu_baseboard_temp_metrics_v1_0 baseboard_metric; + ret = read_baseboard_temp_metrics(file_path.c_str(), baseboard_metric); + if (ret != RSMI_STATUS_SUCCESS) { + std::string err_msg = "Failed to read baseboard temperature metrics at " + file_path; + LOG_ERROR(err_msg); + return ret; + } + + ret = get_baseboard_temp_value(baseboard_metric, + static_cast(sensor_type), temperature); + return ret; + } static const std::map kMetricTypeMap = { @@ -3410,8 +3468,6 @@ rsmi_dev_temp_metric_get(uint32_t dv_ind, uint32_t sensor_type, DEVICE_MUTEX - GET_DEV_FROM_INDX - if (dev->monitor() == 
nullptr) { ss << __PRETTY_FUNCTION__ << " | ======= end ======= " diff --git a/rocm_smi/src/rocm_smi_board_temp.cc b/rocm_smi/src/rocm_smi_board_temp.cc new file mode 100644 index 0000000000..d1ac6078bf --- /dev/null +++ b/rocm_smi/src/rocm_smi_board_temp.cc @@ -0,0 +1,388 @@ +/* + * Copyright (c) Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */
+
+#include "rocm_smi/rocm_smi_board_temp.h"
+#include "rocm_smi/rocm_smi_utils.h"
+#include "rocm_smi/rocm_smi_common.h"
+#include "rocm_smi/rocm_smi_logger.h"
+
+#include <cctype>
+#include <cerrno>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <fstream>
+#include <iomanip>
+#include <limits>
+#include <map>
+#include <sstream>
+#include <string>
+#include <type_traits>
+
+using amd::smi::getRSMIStatusString;
+
+namespace amd::smi {
+
+// Static mapping tables for temperature type conversions.
+// Key: slot index inside the corresponding metrics array.
+// Value: the public RSMI temperature type reported for that slot.
+static const std::map<int, rsmi_temperature_type_t> vr_temp_map = {
+  {AMDGPU_VDDCR_VDD0_TEMP, RSMI_TEMP_TYPE_GPUBOARD_VDDCR_VDD0},
+  {AMDGPU_VDDCR_VDD1_TEMP, RSMI_TEMP_TYPE_GPUBOARD_VDDCR_VDD1},
+  {AMDGPU_VDDCR_VDD2_TEMP, RSMI_TEMP_TYPE_GPUBOARD_VDDCR_VDD2},
+  {AMDGPU_VDDCR_VDD3_TEMP, RSMI_TEMP_TYPE_GPUBOARD_VDDCR_VDD3},
+  {AMDGPU_VDDCR_SOC_A_TEMP, RSMI_TEMP_TYPE_GPUBOARD_VDDCR_SOC_A},
+  {AMDGPU_VDDCR_SOC_C_TEMP, RSMI_TEMP_TYPE_GPUBOARD_VDDCR_SOC_C},
+  {AMDGPU_VDDCR_SOCIO_A_TEMP, RSMI_TEMP_TYPE_GPUBOARD_VDDCR_SOCIO_A},
+  {AMDGPU_VDDCR_SOCIO_C_TEMP, RSMI_TEMP_TYPE_GPUBOARD_VDDCR_SOCIO_C},
+  {AMDGPU_VDD_085_HBM_TEMP, RSMI_TEMP_TYPE_GPUBOARD_VDD_085_HBM},
+  {AMDGPU_VDDCR_11_HBM_B_TEMP, RSMI_TEMP_TYPE_GPUBOARD_VDDCR_11_HBM_B},
+  {AMDGPU_VDDCR_11_HBM_D_TEMP, RSMI_TEMP_TYPE_GPUBOARD_VDDCR_11_HBM_D},
+  {AMDGPU_VDD_USR_TEMP, RSMI_TEMP_TYPE_GPUBOARD_VDD_USR},
+  {AMDGPU_VDDIO_11_E32_TEMP, RSMI_TEMP_TYPE_GPUBOARD_VDDIO_11_E32}
+};
+
+static const std::map<int, rsmi_temperature_type_t> node_temp_map = {
+  {AMDGPU_RETIMER_X_TEMP, RSMI_TEMP_TYPE_GPUBOARD_NODE_RETIMER_X},
+  {AMDGPU_OAM_X_IBC_TEMP, RSMI_TEMP_TYPE_GPUBOARD_NODE_OAM_X_IBC},
+  {AMDGPU_OAM_X_IBC_2_TEMP, RSMI_TEMP_TYPE_GPUBOARD_NODE_OAM_X_IBC_2},
+  {AMDGPU_OAM_X_VDD18_VR_TEMP, RSMI_TEMP_TYPE_GPUBOARD_NODE_OAM_X_VDD18_VR},
+  {AMDGPU_OAM_X_04_HBM_B_VR_TEMP, RSMI_TEMP_TYPE_GPUBOARD_NODE_OAM_X_04_HBM_B_VR},
+  {AMDGPU_OAM_X_04_HBM_D_VR_TEMP, RSMI_TEMP_TYPE_GPUBOARD_NODE_OAM_X_04_HBM_D_VR}
+};
+
+static const std::map<int, rsmi_temperature_type_t> system_temp_map = {
+  {AMDGPU_UBB_FPGA_TEMP, RSMI_TEMP_TYPE_BASEBOARD_UBB_FPGA},
+  {AMDGPU_UBB_FRONT_TEMP, RSMI_TEMP_TYPE_BASEBOARD_UBB_FRONT},
+  {AMDGPU_UBB_BACK_TEMP, RSMI_TEMP_TYPE_BASEBOARD_UBB_BACK},
+  {AMDGPU_UBB_OAM7_TEMP, RSMI_TEMP_TYPE_BASEBOARD_UBB_OAM7},
+  {AMDGPU_UBB_IBC_TEMP, RSMI_TEMP_TYPE_BASEBOARD_UBB_IBC},
+  {AMDGPU_UBB_UFPGA_TEMP, RSMI_TEMP_TYPE_BASEBOARD_UBB_UFPGA},
+  {AMDGPU_UBB_OAM1_TEMP, RSMI_TEMP_TYPE_BASEBOARD_UBB_OAM1},
+  {AMDGPU_OAM_0_1_HSC_TEMP, RSMI_TEMP_TYPE_BASEBOARD_OAM_0_1_HSC},
+  {AMDGPU_OAM_2_3_HSC_TEMP, RSMI_TEMP_TYPE_BASEBOARD_OAM_2_3_HSC},
+  {AMDGPU_OAM_4_5_HSC_TEMP, RSMI_TEMP_TYPE_BASEBOARD_OAM_4_5_HSC},
+  {AMDGPU_OAM_6_7_HSC_TEMP, RSMI_TEMP_TYPE_BASEBOARD_OAM_6_7_HSC},
+  {AMDGPU_UBB_FPGA_0V72_VR_TEMP, RSMI_TEMP_TYPE_BASEBOARD_UBB_FPGA_0V72_VR},
+  {AMDGPU_UBB_FPGA_3V3_VR_TEMP, RSMI_TEMP_TYPE_BASEBOARD_UBB_FPGA_3V3_VR},
+  {AMDGPU_RETIMER_0_1_2_3_1V2_VR_TEMP, RSMI_TEMP_TYPE_BASEBOARD_RETIMER_0_1_2_3_1V2_VR},
+  {AMDGPU_RETIMER_4_5_6_7_1V2_VR_TEMP, RSMI_TEMP_TYPE_BASEBOARD_RETIMER_4_5_6_7_1V2_VR},
+  {AMDGPU_RETIMER_0_1_0V9_VR_TEMP, RSMI_TEMP_TYPE_BASEBOARD_RETIMER_0_1_0V9_VR},
+  {AMDGPU_RETIMER_4_5_0V9_VR_TEMP, RSMI_TEMP_TYPE_BASEBOARD_RETIMER_4_5_0V9_VR},
+  {AMDGPU_RETIMER_2_3_0V9_VR_TEMP, RSMI_TEMP_TYPE_BASEBOARD_RETIMER_2_3_0V9_VR},
+  {AMDGPU_RETIMER_6_7_0V9_VR_TEMP, RSMI_TEMP_TYPE_BASEBOARD_RETIMER_6_7_0V9_VR},
+  {AMDGPU_OAM_0_1_2_3_3V3_VR_TEMP, RSMI_TEMP_TYPE_BASEBOARD_OAM_0_1_2_3_3V3_VR},
+  {AMDGPU_OAM_4_5_6_7_3V3_VR_TEMP, RSMI_TEMP_TYPE_BASEBOARD_OAM_4_5_6_7_3V3_VR},
+  {AMDGPU_IBC_HSC_TEMP, RSMI_TEMP_TYPE_BASEBOARD_IBC_HSC},
+  {AMDGPU_IBC_TEMP, RSMI_TEMP_TYPE_BASEBOARD_IBC}
+};
+
+namespace {
+
+// Slot content that marks "no reading available".
+constexpr uint32_t kInvalidTempValue = std::numeric_limits<uint32_t>::max();
+
+// One array of encoded readings plus the map that names its slots.
+struct TempTable {
+  const char* label;                                        // used in log text only
+  const std::map<int, rsmi_temperature_type_t>* slot_types;  // slot index -> RSMI type
+  const uint32_t* slots;                                    // encoded readings
+  int slot_count;                                           // number of entries in slots
+};
+
+}  // namespace
+
+// Render `size` bytes at `data` as a classic 16-bytes-per-row hex/ASCII dump,
+// framed by header and footer lines carrying `description`.
+static std::string createHexDump(const void* data, size_t size, const std::string& description) {
+  constexpr size_t kBytesPerRow = 16;
+  const auto* bytes = static_cast<const unsigned char*>(data);
+  std::ostringstream ss;
+
+  // The size is printed before switching the stream to hex so it stays decimal.
+  ss << "=== " << description << " (size: " << size << " bytes) ===" << '\n';
+  ss << std::hex << std::setfill('0');
+
+  for (size_t row = 0; row < size; row += kBytesPerRow) {
+    // Print offset
+    ss << std::setw(8) << row << ": ";
+
+    // Print hex bytes; pad a short last row so the ASCII column stays aligned
+    for (size_t col = 0; col < kBytesPerRow; ++col) {
+      if (row + col < size) {
+        ss << std::setw(2) << static_cast<int>(bytes[row + col]) << " ";
+      } else {
+        ss << "   ";
+      }
+    }
+
+    ss << " | ";
+
+    // Print ASCII representation
+    for (size_t col = 0; col < kBytesPerRow && row + col < size; ++col) {
+      const unsigned char c = bytes[row + col];
+      ss << (std::isprint(c) ? static_cast<char>(c) : '.');
+    }
+
+    ss << '\n';
+  }
+
+  ss << "=== End " << description << " ===" << '\n';
+  return ss.str();
+}
+
+// Map a captured errno to an RSMI status, never reporting success.
+// iostreams do not promise to set errno, so a failed open/read can leave it 0.
+// NOTE(review): assumes ErrnoToRsmiStatus(0) yields RSMI_STATUS_SUCCESS - confirm.
+static rsmi_status_t status_from_errno(int saved_errno) {
+  const rsmi_status_t status = ErrnoToRsmiStatus(saved_errno);
+  return (status == RSMI_STATUS_SUCCESS) ? RSMI_STATUS_FILE_ERROR : status;
+}
+
+// Shared implementation of read_gpuboard_temp_metrics() and
+// read_baseboard_temp_metrics(): zero `metrics`, then fill it with one raw
+// read of sizeof(MetricsT) bytes from `filename`.
+//
+// caller       text to prefix log lines with (the public function's name)
+// description  title of the debug hex dump
+template <typename MetricsT>
+static rsmi_status_t read_temp_metrics(const char* caller, const char* filename,
+                                       const char* description, MetricsT& metrics) {
+  static_assert(std::is_trivially_copyable<MetricsT>::value,
+                "metrics tables are filled with a raw byte read");
+
+  if (!filename) {
+    std::ostringstream ss;
+    ss << caller << " | ======= start ======= "
+       << " | Fail | filename is null | Returning = "
+       << getRSMIStatusString(RSMI_STATUS_INVALID_ARGS) << " |";
+    LOG_INFO(ss);
+    return RSMI_STATUS_INVALID_ARGS;
+  }
+
+  std::ostringstream ss;
+  ss << caller << " | ======= start ======= "
+     << " | filename: " << filename;
+  LOG_INFO(ss);
+
+  // Clear errno first and capture it right after the failing call: the
+  // stream formatting and logging below may overwrite it.
+  errno = 0;
+  std::ifstream file(filename, std::ios::binary);
+  if (!file.is_open()) {
+    const int open_errno = errno;
+    const rsmi_status_t status = status_from_errno(open_errno);
+    std::ostringstream ess;
+    ess << caller << " | ======= end ======= "
+        << " | Fail | Could not open file: " << filename
+        << " | errno: " << open_errno << " (" << std::strerror(open_errno) << ")"
+        << " | Returning = " << getRSMIStatusString(status) << " |";
+    LOG_INFO(ess);
+    return status;
+  }
+
+  // Clear the metrics structure
+  std::memset(&metrics, 0, sizeof(metrics));
+
+  // Read the entire structure
+  errno = 0;
+  file.read(reinterpret_cast<char*>(&metrics), sizeof(metrics));
+  const int read_errno = errno;
+  const size_t bytes_read = static_cast<size_t>(file.gcount());
+
+  if (file.bad()) {
+    const rsmi_status_t status = status_from_errno(read_errno);
+    std::ostringstream ess;
+    ess << caller << " | ======= end ======= "
+        << " | Fail | File read error | errno: " << read_errno
+        << " (" << std::strerror(read_errno) << ")"
+        << " | Returning = " << getRSMIStatusString(status) << " |";
+    LOG_INFO(ess);
+    return status;
+  }
+
+  // Always create hex dump for debugging, using the number of bytes actually read
+  std::string hexDump = createHexDump(&metrics, bytes_read, description);
+  LOG_DEBUG(hexDump);
+
+  if (bytes_read != sizeof(metrics)) {
+    std::ostringstream ess;
+    ess << caller << " | ======= end ======= "
+        << " | Fail | Insufficient data read"
+        << " | Expected: " << sizeof(metrics) << " bytes"
+        << " | Actual: " << bytes_read << " bytes"
+        << " | Returning = " << getRSMIStatusString(RSMI_STATUS_INSUFFICIENT_SIZE) << " |";
+    LOG_INFO(ess);
+    return RSMI_STATUS_INSUFFICIENT_SIZE;
+  }
+
+  std::ostringstream oss;
+  oss << caller << " | ======= end ======= "
+      << " | Success | File: " << filename
+      << " | Bytes read: " << sizeof(metrics)
+      << " | Header format: " << static_cast<int>(metrics.common_header.format_revision)
+      << " | Header content: " << static_cast<int>(metrics.common_header.content_revision)
+      << " | Node ID: " << metrics.node_id
+      << " | Returning = " << getRSMIStatusString(RSMI_STATUS_SUCCESS) << " |";
+  LOG_INFO(oss);
+
+  return RSMI_STATUS_SUCCESS;
+}
+
+// Read a GPU-board temperature table from `filename` into `metrics`.
+// Returns RSMI_STATUS_INVALID_ARGS for a null filename,
+// RSMI_STATUS_INSUFFICIENT_SIZE for a short read, an errno-derived status
+// (RSMI_STATUS_FILE_ERROR when errno is unset) on open/read failure, and
+// RSMI_STATUS_SUCCESS otherwise.
+rsmi_status_t read_gpuboard_temp_metrics(const char* filename, amdgpu_gpuboard_temp_metrics_v1_0& metrics) {
+  return read_temp_metrics(__PRETTY_FUNCTION__, filename,
+                           "GPU Board Temperature Metrics", metrics);
+}
+
+// Read a baseboard temperature table from `filename` into `metrics`.
+// Same return contract as read_gpuboard_temp_metrics().
+rsmi_status_t read_baseboard_temp_metrics(const char* filename, amdgpu_baseboard_temp_metrics_v1_0& metrics) {
+  return read_temp_metrics(__PRETTY_FUNCTION__, filename,
+                           "Baseboard Temperature Metrics", metrics);
+}
+
+
+// Decode encoded temperature value: bits 24-31 = sensor id, bits 0-23 = signed temperature (Celsius).
+// Returns the temperature in milli-Celsius. The result is 64-bit because a
+// 24-bit value times 1000 does not fit in int32_t.
+static int64_t decode_temperature_value(uint32_t encoded, uint8_t* sensor_id = nullptr) {
+  if (sensor_id) {
+    *sensor_id = static_cast<uint8_t>((encoded >> 24) & 0xFF);
+  }
+  // Extract the 24-bit field and sign-extend it (bit 23 is the sign bit)
+  int64_t celsius = static_cast<int64_t>(encoded & 0xFFFFFFu);
+  if (encoded & 0x800000u) {
+    celsius -= (int64_t{1} << 24);
+  }
+
+  return celsius * 1000;  // Convert Celsius to milli-Celsius
+}
+
+// Return the slot in `table` that is mapped to `wanted` and holds a valid
+// reading, or -1 when there is none.
+static int find_valid_slot(const TempTable& table, rsmi_temperature_type_t wanted) {
+  for (const auto& [slot, type] : *table.slot_types) {
+    if (type == wanted && slot >= 0 && slot < table.slot_count &&
+        table.slots[slot] != kInvalidTempValue) {
+      return slot;
+    }
+  }
+  return -1;
+}
+
+// Shared implementation of get_gpuboard_temp_value() and
+// get_baseboard_temp_value(): search `tables` in order for
+// `temperature_type` and store the decoded reading in `*value`.
+//
+// caller        text to prefix log lines with (the public function's name)
+// metrics_name  board name used in the "not found" log line
+static rsmi_status_t lookup_board_temp(const char* caller, uint16_t node_id,
+                                       const char* metrics_name,
+                                       const TempTable* tables, size_t table_count,
+                                       rsmi_temperature_type_t temperature_type,
+                                       int64_t* value) {
+  if (!value) {
+    return RSMI_STATUS_INVALID_ARGS;
+  }
+
+  std::ostringstream ss;
+  ss << caller << " | ======= start ======= "
+     << " | Node ID: " << node_id
+     << " | Temperature type: " << static_cast<int>(temperature_type);
+  LOG_INFO(ss);
+
+  *value = 0;  // Initialize to 0
+
+  for (size_t t = 0; t < table_count; ++t) {
+    const TempTable& table = tables[t];
+    const int slot = find_valid_slot(table, temperature_type);
+    if (slot < 0) {
+      continue;
+    }
+
+    const uint32_t raw = table.slots[slot];
+    *value = decode_temperature_value(raw);
+
+    std::ostringstream oss;
+    oss << caller << " | ======= end ======= "
+        << " | Success | " << table.label << " temp found at index: " << slot
+        << " | Raw value: 0x" << std::hex << raw << std::dec
+        << " | Temperature (mC): " << *value
+        << " | Returning = " << getRSMIStatusString(RSMI_STATUS_SUCCESS) << " |";
+    LOG_INFO(oss);
+    return RSMI_STATUS_SUCCESS;
+  }
+
+  // Temperature type not found in metrics
+  std::ostringstream ess;
+  ess << caller << " | ======= end ======= "
+      << " | Fail | Temperature type not found in " << metrics_name << " metrics"
+      << " | Temperature type: " << static_cast<int>(temperature_type)
+      << " | Returning = " << getRSMIStatusString(RSMI_STATUS_NOT_SUPPORTED) << " |";
+  LOG_ERROR(ess);
+  return RSMI_STATUS_NOT_SUPPORTED;
+}
+
+// Look up one GPU-board sensor in `metrics` and store it, in milli-Celsius,
+// in `*value`. Voltage-regulator slots are checked first, then node slots.
+// Returns RSMI_STATUS_INVALID_ARGS when `value` is null and
+// RSMI_STATUS_NOT_SUPPORTED (with *value == 0) when the type is unknown to
+// this table or its slot holds the invalid marker.
+rsmi_status_t get_gpuboard_temp_value(const amdgpu_gpuboard_temp_metrics_v1_0& metrics,
+                                      rsmi_temperature_type_t temperature_type,
+                                      int64_t* value) {
+  const TempTable tables[] = {
+    {"VR", &vr_temp_map, metrics.vr_temp, AMDGPU_VR_MAX_TEMP_ENTRIES},
+    {"Node", &node_temp_map, metrics.node_temp, AMDGPU_NODE_MAX_TEMP_ENTRIES},
+  };
+  return lookup_board_temp(__PRETTY_FUNCTION__, metrics.node_id, "GPU board",
+                           tables, sizeof(tables) / sizeof(tables[0]),
+                           temperature_type, value);
+}
+
+// Look up one baseboard sensor in `metrics` and store it, in milli-Celsius,
+// in `*value`. Same return contract as get_gpuboard_temp_value().
+rsmi_status_t get_baseboard_temp_value(const amdgpu_baseboard_temp_metrics_v1_0& metrics,
+                                      rsmi_temperature_type_t temperature_type,
+                                      int64_t* value) {
+  const TempTable tables[] = {
+    {"System", &system_temp_map, metrics.system_temp, AMDGPU_SYSTEM_MAX_TEMP_ENTRIES},
+  };
+  return lookup_board_temp(__PRETTY_FUNCTION__, metrics.node_id, "baseboard",
+                           tables, sizeof(tables) / sizeof(tables[0]),
+                           temperature_type, value);
+}
+
+
+}  // namespace amd::smi
\ No newline at end of file diff --git a/rocm_smi/src/rocm_smi_device.cc b/rocm_smi/src/rocm_smi_device.cc index 28c7c0267d..2509c837a2 100755 --- 
a/rocm_smi/src/rocm_smi_device.cc +++ b/rocm_smi/src/rocm_smi_device.cc @@ -118,6 +118,9 @@ static const char *kDevNumaNodeFName = "numa_node"; static const char *kDevGpuMetricsFName = "gpu_metrics"; static const char *kDevPmMetricsFName = "pm_metrics"; // PM log static const char *kDevRegMetricsFName = "reg_state"; // register table +static const char *kDevBaseBoardTempMetricsFName = "board/baseboard_temp"; +static const char *kDevGpuBoardTempMetricsFName = "board/gpuboard_temp"; + static const char *kDevAvailableComputePartitionFName = "available_compute_partition"; static const char *kDevComputePartitionFName = "current_compute_partition"; @@ -325,6 +328,8 @@ static const std::map kDevAttribNameMap = { {kDevProcessIsolation, kDevProcessIsolationFName}, {kDevShaderClean, kDevShaderCleanFName}, {kDevRegMetrics, kDevRegMetricsFName}, + {kDevBaseBoardTempMetrics, kDevBaseBoardTempMetricsFName}, + {kDevGpuBoardTempMetrics, kDevGpuBoardTempMetricsFName}, {kDevGpuReset, kDevGpuResetFName}, {kDevAvailableComputePartition, kDevAvailableComputePartitionFName}, {kDevComputePartition, kDevComputePartitionFName}, @@ -495,6 +500,8 @@ Device::devInfoTypesStrings = { {kDevGpuMetrics, "kDevGpuMetrics"}, {kDevPmMetrics, "kDevPmMetrics"}, {kDevRegMetrics, "kDevRegMetrics"}, + {kDevBaseBoardTempMetrics, "kDevBaseBoardTempMetrics"}, + {kDevGpuBoardTempMetrics, "kDevGpuBoardTempMetrics"}, {kDevGpuReset, "kDevGpuReset"}, {kDevAvailableComputePartition, "kDevAvailableComputePartition"}, {kDevComputePartition, "kDevComputePartition"}, diff --git a/src/amd_smi/amd_smi_drm.cc b/src/amd_smi/amd_smi_drm.cc index 764c2017be..9b9156cf26 100644 --- a/src/amd_smi/amd_smi_drm.cc +++ b/src/amd_smi/amd_smi_drm.cc @@ -144,7 +144,7 @@ amdsmi_status_t AMDSmiDrm::init() { // even if fail, still add to prevent mismatch the index if (!has_valid_fds) { drm_bdfs_.push_back(bdf); - drm_free_device(&device); + // No need to free device here since it is not valid continue; } diff --git 
a/tests/amd_smi_test/functional/temp_read.cc b/tests/amd_smi_test/functional/temp_read.cc index 9a749d2ae9..c8816ff2a1 100644 --- a/tests/amd_smi_test/functional/temp_read.cc +++ b/tests/amd_smi_test/functional/temp_read.cc @@ -42,7 +42,55 @@ static const std::map kTempSensorNameMap = { {AMDSMI_TEMPERATURE_TYPE_HBM_1, "HBM_1"}, {AMDSMI_TEMPERATURE_TYPE_HBM_2, "HBM_2"}, {AMDSMI_TEMPERATURE_TYPE_HBM_3, "HBM_3"}, - {AMDSMI_TEMPERATURE_TYPE_PLX, "PLX"} + {AMDSMI_TEMPERATURE_TYPE_PLX, "PLX"}, + + // GPU Board Node Temperature Types (100-149) + {AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_RETIMER_X, "GPU Board Node Retimer X"}, + {AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_OAM_X_IBC, "GPU Board Node OAM X IBC"}, + {AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_OAM_X_IBC_2, "GPU Board Node OAM X IBC 2"}, + {AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_OAM_X_VDD18_VR, "GPU Board Node OAM X VDD18 VR"}, + {AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_OAM_X_04_HBM_B_VR, "GPU Board Node OAM X 04 HBM B VR"}, + {AMDSMI_TEMPERATURE_TYPE_GPUBOARD_NODE_OAM_X_04_HBM_D_VR, "GPU Board Node OAM X 04 HBM D VR"}, + + // GPU Board VR Temperature Types (150-199) + {AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_VDD0, "GPU Board VDDCR VDD0"}, + {AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_VDD1, "GPU Board VDDCR VDD1"}, + {AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_VDD2, "GPU Board VDDCR VDD2"}, + {AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_VDD3, "GPU Board VDDCR VDD3"}, + {AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_SOC_A, "GPU Board VDDCR SOC A"}, + {AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_SOC_C, "GPU Board VDDCR SOC C"}, + {AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_SOCIO_A, "GPU Board VDDCR SOCIO A"}, + {AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_SOCIO_C, "GPU Board VDDCR SOCIO C"}, + {AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDD_085_HBM, "GPU Board VDD 085 HBM"}, + {AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_11_HBM_B, "GPU Board VDDCR 11 HBM B"}, + {AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDCR_11_HBM_D, "GPU Board VDDCR 11 HBM D"}, + 
{AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDD_USR, "GPU Board VDD USR"}, + {AMDSMI_TEMPERATURE_TYPE_GPUBOARD_VDDIO_11_E32, "GPU Board VDDIO 11 E32"}, + + // Baseboard System Temperature Types (200+) + {AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_FPGA, "Baseboard UBB FPGA"}, + {AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_FRONT, "Baseboard UBB Front"}, + {AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_BACK, "Baseboard UBB Back"}, + {AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_OAM7, "Baseboard UBB OAM7"}, + {AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_IBC, "Baseboard UBB IBC"}, + {AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_UFPGA, "Baseboard UBB UFPGA"}, + {AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_OAM1, "Baseboard UBB OAM1"}, + {AMDSMI_TEMPERATURE_TYPE_BASEBOARD_OAM_0_1_HSC, "Baseboard OAM 0-1 HSC"}, + {AMDSMI_TEMPERATURE_TYPE_BASEBOARD_OAM_2_3_HSC, "Baseboard OAM 2-3 HSC"}, + {AMDSMI_TEMPERATURE_TYPE_BASEBOARD_OAM_4_5_HSC, "Baseboard OAM 4-5 HSC"}, + {AMDSMI_TEMPERATURE_TYPE_BASEBOARD_OAM_6_7_HSC, "Baseboard OAM 6-7 HSC"}, + {AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_FPGA_0V72_VR, "Baseboard UBB FPGA 0V72 VR"}, + {AMDSMI_TEMPERATURE_TYPE_BASEBOARD_UBB_FPGA_3V3_VR, "Baseboard UBB FPGA 3V3 VR"}, + {AMDSMI_TEMPERATURE_TYPE_BASEBOARD_RETIMER_0_1_2_3_1V2_VR, "Baseboard Retimer 0-1-2-3 1V2 VR"}, + {AMDSMI_TEMPERATURE_TYPE_BASEBOARD_RETIMER_4_5_6_7_1V2_VR, "Baseboard Retimer 4-5-6-7 1V2 VR"}, + {AMDSMI_TEMPERATURE_TYPE_BASEBOARD_RETIMER_0_1_0V9_VR, "Baseboard Retimer 0-1 0V9 VR"}, + {AMDSMI_TEMPERATURE_TYPE_BASEBOARD_RETIMER_4_5_0V9_VR, "Baseboard Retimer 4-5 0V9 VR"}, + {AMDSMI_TEMPERATURE_TYPE_BASEBOARD_RETIMER_2_3_0V9_VR, "Baseboard Retimer 2-3 0V9 VR"}, + {AMDSMI_TEMPERATURE_TYPE_BASEBOARD_RETIMER_6_7_0V9_VR, "Baseboard Retimer 6-7 0V9 VR"}, + {AMDSMI_TEMPERATURE_TYPE_BASEBOARD_OAM_0_1_2_3_3V3_VR, "Baseboard OAM 0-1-2-3 3V3 VR"}, + {AMDSMI_TEMPERATURE_TYPE_BASEBOARD_OAM_4_5_6_7_3V3_VR, "Baseboard OAM 4-5-6-7 3V3 VR"}, + {AMDSMI_TEMPERATURE_TYPE_BASEBOARD_IBC_HSC, "Baseboard IBC HSC"}, + 
{AMDSMI_TEMPERATURE_TYPE_BASEBOARD_IBC, "Baseboard IBC"} }; TestTempRead::TestTempRead() : TestBase() { set_title("AMDSMI Temp Read Test"); @@ -118,6 +166,9 @@ void TestTempRead::Run(void) { } }; for (type = AMDSMI_TEMPERATURE_TYPE_FIRST; type <= AMDSMI_TEMPERATURE_TYPE__MAX; ++type) { + if (kTempSensorNameMap.find(type) == kTempSensorNameMap.end()) { + continue; + } IF_VERB(STANDARD) { std::cout << "\t** **********" << kTempSensorNameMap.at(type) << " Temperatures **********" << std::endl;