From a95af9b70dbb67488db321f9287ee87ec150bcd0 Mon Sep 17 00:00:00 2001 From: Divya Shikre Date: Tue, 23 Nov 2021 14:25:23 -0500 Subject: [PATCH] Add fix for out of range temperature value for HBM. The driver fills every byte of a metric with 0xFF when that metric is not supported on the ASIC. So if an HBM temperature reads back as all 0xFF bytes (UINT16_MAX), return RSMI_STATUS_NOT_SUPPORTED. Signed-off-by: Divya Shikre Change-Id: Iacb6474486e3732f2aa824ff447c17f8243b65cd [ROCm/rocm_smi_lib commit: f61cb1b41d1ce37e03c94c8115b5ed158ca629eb] --- projects/rocm-smi-lib/src/rocm_smi.cc | 34 ++++++++++++++++----------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/projects/rocm-smi-lib/src/rocm_smi.cc b/projects/rocm-smi-lib/src/rocm_smi.cc index 0cd1526944..52582d6d40 100755 --- a/projects/rocm-smi-lib/src/rocm_smi.cc +++ b/projects/rocm-smi-lib/src/rocm_smi.cc @@ -2081,6 +2081,7 @@ rsmi_dev_temp_metric_get(uint32_t dv_ind, uint32_t sensor_type, rsmi_status_t ret; amd::smi::MonitorTypes mon_type; + uint16_t val_ui16; switch (metric) { case RSMI_TEMP_CURRENT: @@ -2148,21 +2149,26 @@ rsmi_dev_temp_metric_get(uint32_t dv_ind, uint32_t sensor_type, return RSMI_STATUS_INVALID_ARGS; } - if (sensor_type == RSMI_TEMP_TYPE_HBM_0) { - *temperature = gpu_metrics.temperature_hbm[0] * - CENTRIGRADE_TO_MILLI_CENTIGRADE; - } else if (sensor_type == RSMI_TEMP_TYPE_HBM_1) { - *temperature = gpu_metrics.temperature_hbm[1] * - CENTRIGRADE_TO_MILLI_CENTIGRADE; - } else if (sensor_type == RSMI_TEMP_TYPE_HBM_2) { - *temperature = gpu_metrics.temperature_hbm[2] * - CENTRIGRADE_TO_MILLI_CENTIGRADE; - } else if (sensor_type == RSMI_TEMP_TYPE_HBM_3) { - *temperature = gpu_metrics.temperature_hbm[3] * - CENTRIGRADE_TO_MILLI_CENTIGRADE; - } else { - return RSMI_STATUS_NOT_SUPPORTED; + switch (sensor_type) { + case RSMI_TEMP_TYPE_HBM_0: + val_ui16 = gpu_metrics.temperature_hbm[0]; + break; + case RSMI_TEMP_TYPE_HBM_1: + val_ui16 = gpu_metrics.temperature_hbm[1]; + break; + case RSMI_TEMP_TYPE_HBM_2: + val_ui16 = gpu_metrics.temperature_hbm[2]; 
+ break; + case RSMI_TEMP_TYPE_HBM_3: + val_ui16 = gpu_metrics.temperature_hbm[3]; + break; + default: + return RSMI_STATUS_INVALID_ARGS; } + if (val_ui16 == UINT16_MAX) + return RSMI_STATUS_NOT_SUPPORTED; + else + *temperature = val_ui16 * CENTRIGRADE_TO_MILLI_CENTIGRADE; return RSMI_STATUS_SUCCESS; } // end HBM temperature