From da480b4589b198a3d29f87054daa1985ddb2df66 Mon Sep 17 00:00:00 2001 From: "Bill(Shuzhou) Liu" Date: Tue, 30 Mar 2021 10:44:01 -0400 Subject: [PATCH] Add support for the HBM temperature The rsmi_dev_temp_metric_get() can also support the HBM temperatures which are retrieved from gpu_metrics. Change-Id: I96b979296e90cf881523627b41b1a02849676416 --- include/rocm_smi/rocm_smi.h | 7 +++-- src/rocm_smi.cc | 34 +++++++++++++++++++++ tests/rocm_smi_test/functional/temp_read.cc | 4 +++ 3 files changed, 43 insertions(+), 2 deletions(-) diff --git a/include/rocm_smi/rocm_smi.h b/include/rocm_smi/rocm_smi.h index 894ba534f2..24ec69e5d2 100755 --- a/include/rocm_smi/rocm_smi.h +++ b/include/rocm_smi/rocm_smi.h @@ -407,8 +407,11 @@ typedef enum { RSMI_TEMP_TYPE_JUNCTION, //!< Junction/hotspot //!< temperature RSMI_TEMP_TYPE_MEMORY, //!< VRAM temperature - - RSMI_TEMP_TYPE_LAST = RSMI_TEMP_TYPE_MEMORY, + RSMI_TEMP_TYPE_HBM_0, //!< HBM temperature instance 0 + RSMI_TEMP_TYPE_HBM_1, //!< HBM temperature instance 1 + RSMI_TEMP_TYPE_HBM_2, //!< HBM temperature instance 2 + RSMI_TEMP_TYPE_HBM_3, //!< HBM temperature instance 3 + RSMI_TEMP_TYPE_LAST = RSMI_TEMP_TYPE_HBM_3, RSMI_TEMP_TYPE_INVALID = 0xFFFFFFFF //!< Invalid type } rsmi_temperature_type_t; diff --git a/src/rocm_smi.cc b/src/rocm_smi.cc index eebd59bc28..525311a568 100755 --- a/src/rocm_smi.cc +++ b/src/rocm_smi.cc @@ -2024,6 +2024,40 @@ rsmi_dev_temp_metric_get(uint32_t dv_ind, uint32_t sensor_type, mon_type = amd::smi::kMonInvalid; } + // The HBM temperature is retrieved from the gpu_metrics + if (sensor_type == RSMI_TEMP_TYPE_HBM_0 + || sensor_type == RSMI_TEMP_TYPE_HBM_1 + || sensor_type == RSMI_TEMP_TYPE_HBM_2 + || sensor_type == RSMI_TEMP_TYPE_HBM_3) { + if (metric != RSMI_TEMP_CURRENT) { // only support RSMI_TEMP_CURRENT + return RSMI_STATUS_NOT_SUPPORTED; + } + + rsmi_gpu_metrics_t gpu_metrics; + ret = rsmi_dev_gpu_metrics_info_get(dv_ind, &gpu_metrics); + if (ret != RSMI_STATUS_SUCCESS) { + return ret; + } + + if 
(temperature == nullptr) { + return RSMI_STATUS_INVALID_ARGS; + } + + if (sensor_type == RSMI_TEMP_TYPE_HBM_0) { + *temperature = gpu_metrics.temperature_hbm[0]; + } else if (sensor_type == RSMI_TEMP_TYPE_HBM_1) { + *temperature = gpu_metrics.temperature_hbm[1]; + } else if (sensor_type == RSMI_TEMP_TYPE_HBM_2) { + *temperature = gpu_metrics.temperature_hbm[2]; + } else if (sensor_type == RSMI_TEMP_TYPE_HBM_3) { + *temperature = gpu_metrics.temperature_hbm[3]; + } else { + return RSMI_STATUS_NOT_SUPPORTED; + } + + return RSMI_STATUS_SUCCESS; + } // end HBM temperature + DEVICE_MUTEX GET_DEV_FROM_INDX diff --git a/tests/rocm_smi_test/functional/temp_read.cc b/tests/rocm_smi_test/functional/temp_read.cc index 938ddc3777..859d50399a 100755 --- a/tests/rocm_smi_test/functional/temp_read.cc +++ b/tests/rocm_smi_test/functional/temp_read.cc @@ -60,6 +60,10 @@ static const std::map kTempSensorNameMap = { {RSMI_TEMP_TYPE_MEMORY, "Memory"}, {RSMI_TEMP_TYPE_JUNCTION, "Junction"}, {RSMI_TEMP_TYPE_EDGE, "Edge"}, + {RSMI_TEMP_TYPE_HBM_0, "HBM_0"}, + {RSMI_TEMP_TYPE_HBM_1, "HBM_1"}, + {RSMI_TEMP_TYPE_HBM_2, "HBM_2"}, + {RSMI_TEMP_TYPE_HBM_3, "HBM_3"}, }; TestTempRead::TestTempRead() : TestBase() { set_title("RSMI Temp Read Test");