From 4f3b1147404b20341965bd891e506354d262c968 Mon Sep 17 00:00:00 2001
From: limeng12
Date: Thu, 9 Jan 2025 13:52:56 +0800
Subject: [PATCH] [SWDEV-230863] Improve the functionality of RdcSmiHealth
module.
Memory check: get the threshold of retired page number
EEPROM check: read and verify the checksum
Power/Thermal check: power/thermal throttle status counter
Signed-off-by: Meng Li
Change-Id: Id2c751416eb5bf007e6e1da8dc05966a6ba1324e
[ROCm/rdc commit: 016a1d9d391fcef7ec996dc8feb19f846deea4cb]
---
projects/rdc/common/rdc_field.data | 9 +-
projects/rdc/example/health_example.cc | 14 +-
projects/rdc/include/rdc/rdc.h | 9 +-
.../include/rdc_lib/impl/RdcWatchTableImpl.h | 7 +
projects/rdc/python_binding/rdc_bootstrap.py | 3 +-
.../rdc_libs/bootstrap/src/RdcBootStrap.cc | 2 +
.../rdc_libs/rdc/src/RdcMetricFetcherImpl.cc | 36 ++-
projects/rdc/rdc_libs/rdc/src/RdcSmiLib.cc | 2 +-
.../rdc/rdc_libs/rdc/src/RdcWatchTableImpl.cc | 257 +++++++++++++++---
projects/rdc/rdc_libs/rdc/src/SmiUtils.cc | 3 +
projects/rdc/rdci/src/RdciHealthSubSystem.cc | 37 ++-
11 files changed, 307 insertions(+), 72 deletions(-)
diff --git a/projects/rdc/common/rdc_field.data b/projects/rdc/common/rdc_field.data
index 31c9e4de2d..632f64347f 100644
--- a/projects/rdc/common/rdc_field.data
+++ b/projects/rdc/common/rdc_field.data
@@ -169,8 +169,9 @@ FLD_DESC_ENT(RDC_EVNT_NOTIF_RING_HANG, "GPU ring hang just occured",
// RDC health related fields
FLD_DESC_ENT(RDC_HEALTH_XGMI_ERROR, "XGMI one or more errors detected", "XGMI_ERROR", true)
FLD_DESC_ENT(RDC_HEALTH_PCIE_REPLAY_COUNT, "Total PCIE replay count", "PCIE_REPLAY_COUNT", true)
+FLD_DESC_ENT(RDC_HEALTH_RETIRED_PAGE_NUM, "Retired page number", "RETIRED_PAGE_NUM", true)
FLD_DESC_ENT(RDC_HEALTH_PENDING_PAGE_NUM, "Pending page number", "PENDING_PAGE_NUM", true)
-FLD_DESC_ENT(RDC_HEALTH_RETIRED_PAGE_LIMIT, "Retired page limit", "RETIRED_PAGE_LIMIT", false)
-FLD_DESC_ENT(RDC_HEALTH_UNCORRECTABLE_PAGE_LIMIT, "Uncorrectable page limit", "UNCORRECTABLE_PAGE_LIMIT", false)
-FLD_DESC_ENT(RDC_HEALTH_POWER_THROTTLE_TIME, "Power throttle status counter", "POWER_THROTTLE_TIME", false)
-FLD_DESC_ENT(RDC_HEALTH_THERMAL_THROTTLE_TIME, "Total time(ms) in thermal throttle status", "THERMAL_THROTTLE_TIME", false)
+FLD_DESC_ENT(RDC_HEALTH_RETIRED_PAGE_LIMIT, "Retired page limit", "RETIRED_PAGE_LIMIT", true)
+FLD_DESC_ENT(RDC_HEALTH_EEPROM_CONFIG_VALID, "Verify checksum of EEPROM", "EEPROM_CONFIG_VALID", true)
+FLD_DESC_ENT(RDC_HEALTH_POWER_THROTTLE_TIME, "Power throttle status counter", "POWER_THROTTLE_TIME", true)
+FLD_DESC_ENT(RDC_HEALTH_THERMAL_THROTTLE_TIME, "Total time(ms) in thermal throttle status", "THERMAL_THROTTLE_TIME", true)
diff --git a/projects/rdc/example/health_example.cc b/projects/rdc/example/health_example.cc
index d15dfa35a5..a842d2bebb 100644
--- a/projects/rdc/example/health_example.cc
+++ b/projects/rdc/example/health_example.cc
@@ -45,6 +45,12 @@ rdc_status_t get_watches(rdc_handle_t rdc_handle, rdc_gpu_group_t group_id) {
<< std::setw(50) << std::left << ((components & RDC_HEALTH_WATCH_XGMI) ? on : off).c_str() << "|\n";
std::cout << "|" << std::setw(20) << std::left << " Memory" << "| "
<< std::setw(50) << std::left << ((components & RDC_HEALTH_WATCH_MEM) ? on : off).c_str() << "|\n";
+ std::cout << "|" << std::setw(20) << std::left << " EEPROM" << "| "
+ << std::setw(50) << std::left << ((components & RDC_HEALTH_WATCH_EEPROM) ? on : off).c_str() << "|\n";
+ std::cout << "|" << std::setw(20) << std::left << " Thermal" << "| "
+ << std::setw(50) << std::left << ((components & RDC_HEALTH_WATCH_THERMAL) ? on : off).c_str() << "|\n";
+ std::cout << "|" << std::setw(20) << std::left << " Power" << "| "
+ << std::setw(50) << std::left << ((components & RDC_HEALTH_WATCH_POWER) ? on : off).c_str() << "|\n";
std::cout << "+--------------------+" //"-" width :20
<< "---------------------------------------------------+\n"; //-" width :51
}
@@ -79,8 +85,8 @@ std::string component_string(rdc_health_system_t component) {
case RDC_HEALTH_WATCH_MEM:
return "Memory system: ";
- case RDC_HEALTH_WATCH_INFOROM:
- return "Inforom system: ";
+ case RDC_HEALTH_WATCH_EEPROM:
+ return "EEPROM system: ";
case RDC_HEALTH_WATCH_THERMAL:
return "Thermal system:";
@@ -280,7 +286,9 @@ int main(int, char**) {
// (3) set health watches.
unsigned int components;
- components = RDC_HEALTH_WATCH_PCIE | RDC_HEALTH_WATCH_XGMI | RDC_HEALTH_WATCH_MEM;
+ components = RDC_HEALTH_WATCH_PCIE | RDC_HEALTH_WATCH_XGMI | RDC_HEALTH_WATCH_MEM
+ | RDC_HEALTH_WATCH_EEPROM | RDC_HEALTH_WATCH_THERMAL
+ | RDC_HEALTH_WATCH_POWER;
result = rdc_health_set(rdc_handle, group_id, components);
if (result != RDC_ST_OK) {
std::cout << "Error setting health watches. Return: " << rdc_status_string(result)
diff --git a/projects/rdc/include/rdc/rdc.h b/projects/rdc/include/rdc/rdc.h
index d9d2a6f834..de4a90d60f 100644
--- a/projects/rdc/include/rdc/rdc.h
+++ b/projects/rdc/include/rdc/rdc.h
@@ -80,6 +80,7 @@ typedef enum {
//!< but none was found
RDC_ST_PERM_ERROR, //!< Insufficient permission to complete
//!< operation
+ RDC_ST_CORRUPTED_EEPROM, //!< EEPROM is corrupted
RDC_ST_DISABLED_MODULE, //!< Attempted loading disabled module
RDC_ST_UNKNOWN_ERROR = 0xFFFFFFFF //!< Unknown error
@@ -353,8 +354,8 @@ typedef enum {
RDC_HEALTH_PCIE_REPLAY_COUNT, //!< Total PCIE replay count
RDC_HEALTH_RETIRED_PAGE_NUM, //!< Retired page number
RDC_HEALTH_PENDING_PAGE_NUM, //!< Pending page number
- RDC_HEALTH_RETIRED_PAGE_LIMIT, //!< The threshold of retired page
- RDC_HEALTH_UNCORRECTABLE_PAGE_LIMIT, //!< The threshold of uncorrectable page
+ RDC_HEALTH_RETIRED_PAGE_LIMIT, //!< the threshold of retired page number
+ RDC_HEALTH_EEPROM_CONFIG_VALID, //!< Reads the EEPROM and verifies the checksums
RDC_HEALTH_POWER_THROTTLE_TIME, //!< Power throttle status counter
RDC_HEALTH_THERMAL_THROTTLE_TIME, //!< Total time in thermal throttle status (microseconds)
} rdc_field_t;
@@ -681,7 +682,7 @@ typedef enum {
RDC_HEALTH_WATCH_PCIE = 0x1, //!< PCIe system watches
RDC_HEALTH_WATCH_XGMI = 0x2, //!< XGMI system watches
RDC_HEALTH_WATCH_MEM = 0x4, //!< Memory watches
- RDC_HEALTH_WATCH_INFOROM = 0x8, //!< Inforom watches
+ RDC_HEALTH_WATCH_EEPROM = 0x8, //!< EEPROM watches
RDC_HEALTH_WATCH_THERMAL = 0x10, //!< Temperature watches
RDC_HEALTH_WATCH_POWER = 0x20, //!< Power watches
} rdc_health_system_t;
@@ -716,7 +717,7 @@ typedef enum {
RDC_FR_CLOCKS_THROTTLE_POWER = 1006,
RDC_FR_XGMI_SINGLE_ERROR = 1007,
RDC_FR_XGMI_MULTIPLE_ERROR = 1008,
- RDC_FR_CORRUPT_INFOROM = 1009
+ RDC_FR_CORRUPT_EEPROM = 1009
} rdc_health_error_code_t;
/**
diff --git a/projects/rdc/include/rdc_lib/impl/RdcWatchTableImpl.h b/projects/rdc/include/rdc_lib/impl/RdcWatchTableImpl.h
index f9923cc22b..71d02e2910 100644
--- a/projects/rdc/include/rdc_lib/impl/RdcWatchTableImpl.h
+++ b/projects/rdc/include/rdc_lib/impl/RdcWatchTableImpl.h
@@ -137,6 +137,7 @@ class RdcWatchTableImpl : public RdcWatchTable {
rdc_status_t get_start_end_values(rdc_gpu_group_t group_id,
uint32_t gpu_index,
rdc_field_t field,
+ uint64_t start_timestamp,
rdc_field_value *start_value,
rdc_field_value *end_value);
rdc_status_t pcie_check(rdc_gpu_group_t group_id,
@@ -145,6 +146,12 @@ class RdcWatchTableImpl : public RdcWatchTable {
uint32_t gpu_index, rdc_health_response_t* response);
rdc_status_t memory_check(rdc_gpu_group_t group_id,
uint32_t gpu_index, rdc_health_response_t* response);
+ rdc_status_t eeprom_check(rdc_gpu_group_t group_id,
+ uint32_t gpu_index, rdc_health_response_t* response);
+ rdc_status_t thermal_check(rdc_gpu_group_t group_id,
+ uint32_t gpu_index, rdc_health_response_t* response);
+ rdc_status_t power_check(rdc_gpu_group_t group_id,
+ uint32_t gpu_index, rdc_health_response_t* response);
RdcGroupSettingsPtr group_settings_;
RdcCacheManagerPtr cache_mgr_;
diff --git a/projects/rdc/python_binding/rdc_bootstrap.py b/projects/rdc/python_binding/rdc_bootstrap.py
index 9a21126d3f..d4fa98a6f0 100644
--- a/projects/rdc/python_binding/rdc_bootstrap.py
+++ b/projects/rdc/python_binding/rdc_bootstrap.py
@@ -55,6 +55,7 @@ class rdc_status_t(Enum):
RDC_ST_FILE_ERROR = 12
RDC_ST_NO_DATA = 13
RDC_ST_PERM_ERROR = 14
+ RDC_ST_CORRUPTED_EEPROM = 15
RDC_ST_UNKNOWN_ERROR = 4294967295
class rdc_operation_mode_t(c_int):
@@ -173,7 +174,7 @@ class rdc_field_t(c_int):
RDC_HEALTH_RETIRED_PAGE_NUM = 3002
RDC_HEALTH_PENDING_PAGE_NUM = 3003
RDC_HEALTH_RETIRED_PAGE_LIMIT = 3004
- RDC_HEALTH_UNCORRECTABLE_PAGE_LIMIT = 3005
+ RDC_HEALTH_EEPROM_CONFIG_VALID = 3005
RDC_HEALTH_POWER_THROTTLE_TIME = 3006
RDC_HEALTH_THERMAL_THROTTLE_TIME = 3007
diff --git a/projects/rdc/rdc_libs/bootstrap/src/RdcBootStrap.cc b/projects/rdc/rdc_libs/bootstrap/src/RdcBootStrap.cc
index dac201b758..3520041de9 100644
--- a/projects/rdc/rdc_libs/bootstrap/src/RdcBootStrap.cc
+++ b/projects/rdc/rdc_libs/bootstrap/src/RdcBootStrap.cc
@@ -369,6 +369,8 @@ const char* rdc_status_string(rdc_status_t result) {
return "Data was requested, but none was found";
case RDC_ST_PERM_ERROR:
return "Insufficient permission to complete operation";
+ case RDC_ST_CORRUPTED_EEPROM:
+ return "EEPROM is corrupted";
case RDC_ST_UNKNOWN_ERROR:
return "Unknown error";
default:
diff --git a/projects/rdc/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc b/projects/rdc/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc
index ab3ecfd592..89d9bca79c 100644
--- a/projects/rdc/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc
+++ b/projects/rdc/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc
@@ -886,10 +886,38 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field
break;
}
- case RDC_HEALTH_RETIRED_PAGE_LIMIT:
- case RDC_HEALTH_UNCORRECTABLE_PAGE_LIMIT:
- case RDC_HEALTH_POWER_THROTTLE_TIME: //gpu_metrics 1.6
- case RDC_HEALTH_THERMAL_THROTTLE_TIME: //gpu_metrics 1.6
+ case RDC_HEALTH_RETIRED_PAGE_LIMIT: {
+ uint32_t retired_page_threshold = 0;
+ ret = amdsmi_get_gpu_bad_page_threshold(processor_handle, &retired_page_threshold);
+ value->status = Smi2RdcError(ret);
+ value->type = INTEGER;
+ if (value->status == AMDSMI_STATUS_SUCCESS) {
+ value->value.l_int = static_cast(retired_page_threshold);
+ }
+ break;
+ }
+
+ case RDC_HEALTH_EEPROM_CONFIG_VALID: {
+ ret = amdsmi_gpu_validate_ras_eeprom(processor_handle);
+ value->status = Smi2RdcError(ret);
+ break;
+ }
+
+ case RDC_HEALTH_POWER_THROTTLE_TIME:
+ case RDC_HEALTH_THERMAL_THROTTLE_TIME: {
+ amdsmi_violation_status_t violation_status;
+ ret = amdsmi_get_violation_status(processor_handle, &violation_status);
+ value->status = Smi2RdcError(ret);
+ value->type = INTEGER;
+ if (value->status == AMDSMI_STATUS_SUCCESS) {
+ if (RDC_HEALTH_POWER_THROTTLE_TIME == field_id)
+ value->value.l_int = static_cast(violation_status.acc_ppt_pwr);
+ if (RDC_HEALTH_THERMAL_THROTTLE_TIME == field_id)
+ value->value.l_int = static_cast(violation_status.acc_socket_thrm);
+ }
+ break;
+ }
+
default:
break;
}
diff --git a/projects/rdc/rdc_libs/rdc/src/RdcSmiLib.cc b/projects/rdc/rdc_libs/rdc/src/RdcSmiLib.cc
index 91d9a4931f..d8313a287d 100644
--- a/projects/rdc/rdc_libs/rdc/src/RdcSmiLib.cc
+++ b/projects/rdc/rdc_libs/rdc/src/RdcSmiLib.cc
@@ -181,7 +181,7 @@ rdc_status_t RdcSmiLib::rdc_telemetry_fields_query(uint32_t field_ids[MAX_NUM_FI
RDC_EVNT_XGMI_4_THRPUT, RDC_EVNT_XGMI_5_THRPUT, RDC_FI_OAM_ID,
RDC_FI_GPU_MM_ENC_UTIL, RDC_FI_GPU_MM_DEC_UTIL, RDC_FI_GPU_MEMORY_ACTIVITY,
RDC_HEALTH_XGMI_ERROR, RDC_HEALTH_PCIE_REPLAY_COUNT, RDC_HEALTH_RETIRED_PAGE_NUM,
- RDC_HEALTH_PENDING_PAGE_NUM, RDC_HEALTH_RETIRED_PAGE_LIMIT, RDC_HEALTH_UNCORRECTABLE_PAGE_LIMIT,
+ RDC_HEALTH_PENDING_PAGE_NUM, RDC_HEALTH_RETIRED_PAGE_LIMIT, RDC_HEALTH_EEPROM_CONFIG_VALID,
RDC_HEALTH_POWER_THROTTLE_TIME, RDC_HEALTH_THERMAL_THROTTLE_TIME,
RDC_FI_GPU_MEMORY_MAX_BANDWIDTH, RDC_FI_GPU_MEMORY_CUR_BANDWIDTH,
};
diff --git a/projects/rdc/rdc_libs/rdc/src/RdcWatchTableImpl.cc b/projects/rdc/rdc_libs/rdc/src/RdcWatchTableImpl.cc
index ce7c0874d4..8c91d7ba05 100644
--- a/projects/rdc/rdc_libs/rdc/src/RdcWatchTableImpl.cc
+++ b/projects/rdc/rdc_libs/rdc/src/RdcWatchTableImpl.cc
@@ -35,6 +35,7 @@ THE SOFTWARE.
#include "rdc_lib/RdcLogger.h"
#include "rdc_lib/impl/RdcMetricFetcherImpl.h"
#include "rdc_lib/rdc_common.h"
+#include "rdc_lib/impl/SmiUtils.h"
namespace amd {
namespace rdc {
@@ -392,10 +393,10 @@ rdc_status_t RdcWatchTableImpl::create_health_field_group(unsigned int component
field_ids.push_back(RDC_HEALTH_RETIRED_PAGE_NUM);
field_ids.push_back(RDC_HEALTH_PENDING_PAGE_NUM);
field_ids.push_back(RDC_HEALTH_RETIRED_PAGE_LIMIT);
- field_ids.push_back(RDC_HEALTH_UNCORRECTABLE_PAGE_LIMIT);
}
- if (components & RDC_HEALTH_WATCH_INFOROM) {
+ if (components & RDC_HEALTH_WATCH_EEPROM) {
+ field_ids.push_back(RDC_HEALTH_EEPROM_CONFIG_VALID);
}
if (components & RDC_HEALTH_WATCH_THERMAL) {
@@ -506,24 +507,23 @@ bool RdcWatchTableImpl::add_health_incident(uint32_t gpu_index,
rdc_status_t RdcWatchTableImpl::get_start_end_values(rdc_gpu_group_t group_id,
uint32_t gpu_index,
rdc_field_t field,
+ uint64_t start_timestamp,
rdc_field_value *start_value,
rdc_field_value *end_value) {
- if ((nullptr == start_value) || (nullptr == end_value))
+ if ((nullptr == start_value) && (nullptr == end_value))
return RDC_ST_BAD_PARAMETER;
- uint64_t start_timestamp = 0;
-
- //get the history data last 1 minute
- start_timestamp = static_cast(time(nullptr) - 60) * 1000;
-
- //get the values of the field at the start_timestamp/end_timestampe
- rdc_status_t result = cache_mgr_->rdc_health_get_values(group_id,
- gpu_index, field,
- start_timestamp, 0,
- start_value, nullptr);
- if (result != RDC_ST_OK) {
- RDC_LOG(RDC_ERROR, "Error get gpu: " << gpu_index << " field: " << field << " history data. Return: " << result);
- return result;
+ rdc_status_t result = RDC_ST_OK;
+ if (nullptr != start_value) {
+ //get the value of the field at start_timestamp
+ result = cache_mgr_->rdc_health_get_values(group_id,
+ gpu_index, field,
+ start_timestamp, 0,
+ start_value, nullptr);
+ if (result != RDC_ST_OK) {
+ RDC_LOG(RDC_ERROR, "Error get gpu: " << gpu_index << " field: " << field << " history data. Return: " << result);
+ return result;
+ }
}
// get end values
@@ -539,9 +539,12 @@ rdc_status_t RdcWatchTableImpl::pcie_check(rdc_gpu_group_t group_id,
rdc_health_response_t* response) {
//get field start/end values
rdc_field_value start = {}, end = {};
+ uint64_t start_timestamp = static_cast(time(nullptr) - 60) * 1000;
+ //get the history data last 1 minute
rdc_status_t result = get_start_end_values(group_id,
gpu_index,
RDC_HEALTH_PCIE_REPLAY_COUNT,
+ start_timestamp,
&start,
&end);
if (result != RDC_ST_OK)
@@ -575,11 +578,12 @@ rdc_status_t RdcWatchTableImpl::xgmi_check(rdc_gpu_group_t group_id,
uint32_t gpu_index,
rdc_health_response_t* response) {
//get field start/end values
- rdc_field_value start = {}, end = {};
+ rdc_field_value end = {};
rdc_status_t result = get_start_end_values(group_id,
gpu_index,
RDC_HEALTH_XGMI_ERROR,
- &start,
+ 0,
+ nullptr,
&end);
if (result != RDC_ST_OK)
return result;
@@ -617,23 +621,24 @@ rdc_status_t RdcWatchTableImpl::memory_check(rdc_gpu_group_t group_id,
uint32_t gpu_index,
rdc_health_response_t* response) {
//get field start/end values
- rdc_field_value start = {}, end = {};
+ rdc_field_value start= {}, end = {};
rdc_status_t result = get_start_end_values(group_id,
gpu_index,
RDC_FI_ECC_UNCORRECT_TOTAL,
- &start,
+ 0,
+ nullptr,
&end);
if (result != RDC_ST_OK)
return result;
uint64_t ecc_uncorrectable_count = 0;
- ecc_uncorrectable_count = end.value.l_int - start.value.l_int;
+ ecc_uncorrectable_count = end.value.l_int;
if (ecc_uncorrectable_count > 0) {
rdc_health_incidents_t *incident = &response->incidents[response->incidents_count];
std::string err_msg = "Detected ";
err_msg += std::to_string(ecc_uncorrectable_count);
- err_msg += " uncorrectable ECC error(s) in the last minute.";
+ err_msg += " uncorrectable ECC error(s) since last GPU reset.";
//add incident
if (add_health_incident(gpu_index,
@@ -649,12 +654,13 @@ rdc_status_t RdcWatchTableImpl::memory_check(rdc_gpu_group_t group_id,
result = get_start_end_values(group_id,
gpu_index,
RDC_HEALTH_PENDING_PAGE_NUM,
- &start,
+ 0,
+ nullptr,
&end);
if (result != RDC_ST_OK)
return result;
- uint64_t num_pages = end.value.l_int - start.value.l_int;
+ uint64_t num_pages = end.value.l_int;
if (num_pages > 0) {
rdc_health_incidents_t *incident = &response->incidents[response->incidents_count];
@@ -673,12 +679,192 @@ rdc_status_t RdcWatchTableImpl::memory_check(rdc_gpu_group_t group_id,
return RDC_ST_MAX_LIMIT;
}
- //To do: RDC_FR_RETIRED_PAGES_LIMIT
- //To do: RDC_FR_RETIRED_PAGES_UNCORRECTABLE_LIMIT
+ //get retired page number
+ result = get_start_end_values(group_id,
+ gpu_index,
+ RDC_HEALTH_RETIRED_PAGE_NUM,
+ 0,
+ nullptr,
+ &end);
+ if (result != RDC_ST_OK)
+ return result;
+ uint64_t retired_page = end.value.l_int;
+
+ //get retired page threshold
+ result = get_start_end_values(group_id,
+ gpu_index,
+ RDC_HEALTH_RETIRED_PAGE_LIMIT,
+ 0,
+ nullptr,
+ &end);
+ if (result != RDC_ST_OK)
+ return result;
+ uint32_t retired_page_threshold = end.value.l_int;
+
+ if (retired_page > retired_page_threshold) {
+ rdc_health_incidents_t *incident = &response->incidents[response->incidents_count];
+
+ std::string err_msg = "Detected ";
+ err_msg += std::to_string(retired_page);
+ err_msg += " retired pages exceeding the max limit: ";
+ err_msg += std::to_string(retired_page_threshold);
+ err_msg += ".";
+
+ //add incident
+ if (add_health_incident(gpu_index,
+ RDC_HEALTH_WATCH_MEM,
+ RDC_HEALTH_RESULT_FAIL,
+ RDC_FR_RETIRED_PAGES_LIMIT,
+ err_msg,
+ incident,
+ response))
+ return RDC_ST_MAX_LIMIT;
+
+ return RDC_ST_OK;
+ }
+
+ if (retired_page > 0) {
+ uint64_t start_timestamp = static_cast(time(nullptr) - 604800) * 1000;
+ //get retired page number last 1 week
+ result = get_start_end_values(group_id,
+ gpu_index,
+ RDC_HEALTH_RETIRED_PAGE_NUM,
+ start_timestamp,
+ &start,
+ &end);
+ if (result != RDC_ST_OK)
+ return result;
+
+ retired_page = end.value.l_int - start.value.l_int;
+ if (retired_page > 1) {
+ rdc_health_incidents_t *incident = &response->incidents[response->incidents_count];
+
+ std::string err_msg = "Detected ";
+ err_msg += std::to_string(retired_page);
+ err_msg += " retired pages more than one in the last week.";
+
+ //add incident
+ if (add_health_incident(gpu_index,
+ RDC_HEALTH_WATCH_MEM,
+ RDC_HEALTH_RESULT_FAIL,
+ RDC_FR_RETIRED_PAGES_UNCORRECTABLE_LIMIT,
+ err_msg,
+ incident,
+ response))
+ return RDC_ST_MAX_LIMIT;
+ }
+ }
return RDC_ST_OK;
}
+// Health check for the EEPROM: adds a WARN incident when the EEPROM is reported corrupted.
+// @param group_id  GPU group whose cached field values are queried.
+// @param gpu_index GPU being checked.
+// @param response  Health response that receives the incident.
+// @return RDC_ST_OK; RDC_ST_MAX_LIMIT if add_health_incident() returns true; else the query error.
+rdc_status_t RdcWatchTableImpl::eeprom_check(rdc_gpu_group_t group_id, uint32_t gpu_index,
+                                             rdc_health_response_t* response) {
+  rdc_field_value end = {};
+  // Read the EEPROM checksum-validation field; only the latest sample is needed (no start value).
+  rdc_status_t result = get_start_end_values(group_id, gpu_index, RDC_HEALTH_EEPROM_CONFIG_VALID,
+                                             0, nullptr, &end);
+  // RDC_ST_CORRUPTED_EEPROM is the condition under test, so it must not abort the check.
+  // NOTE(review): assumes get_start_end_values() propagates the field's status - confirm.
+  if (result != RDC_ST_OK && result != RDC_ST_CORRUPTED_EEPROM)
+    return result;
+
+  if (result == RDC_ST_CORRUPTED_EEPROM) {
+    rdc_health_incidents_t *incident = &response->incidents[response->incidents_count];
+    std::string err_msg = "Detected a corrupt EEPROM since last GPU reset.";
+    //add incident
+    if (add_health_incident(gpu_index,
+                            RDC_HEALTH_WATCH_EEPROM,
+                            RDC_HEALTH_RESULT_WARN,
+                            RDC_FR_CORRUPT_EEPROM,
+                            err_msg,
+                            incident,
+                            response))
+      return RDC_ST_MAX_LIMIT;
+  }
+  return RDC_ST_OK;
+}
+
+// Health check for thermal throttling: adds a WARN incident when the thermal throttle
+// counter increased between the sample at the window start and the latest sample.
+// @param group_id  GPU group whose cached field values are queried.
+// @param gpu_index GPU being checked.
+// @param response  Health response that receives the incident.
+// @return RDC_ST_OK; RDC_ST_MAX_LIMIT if add_health_incident() returns true; else the query error.
+rdc_status_t RdcWatchTableImpl::thermal_check(rdc_gpu_group_t group_id, uint32_t gpu_index,
+                                              rdc_health_response_t* response) {
+  rdc_field_value start = {}, end = {};
+  // Window start: 60 seconds before now, expressed in milliseconds.
+  uint64_t start_timestamp = static_cast<uint64_t>(time(nullptr) - 60) * 1000;
+  rdc_status_t result = get_start_end_values(group_id, gpu_index,
+                                             RDC_HEALTH_THERMAL_THROTTLE_TIME, start_timestamp,
+                                             &start, &end);
+  if (result != RDC_ST_OK)
+    return result;
+
+  // Keep the delta signed: if the counter moved backwards (end < start) an unsigned
+  // difference would wrap to a huge value and raise a false incident.
+  const int64_t acc_socket_thrm = end.value.l_int - start.value.l_int;
+  if (0 < acc_socket_thrm) {
+    rdc_health_incidents_t *incident = &response->incidents[response->incidents_count];
+
+    std::string err_msg = "Detected ";
+    err_msg += std::to_string(acc_socket_thrm);
+    err_msg += " clock throttling due to thermal violation in the last minute.";
+    //add incident
+    if (add_health_incident(gpu_index,
+                            RDC_HEALTH_WATCH_THERMAL,
+                            RDC_HEALTH_RESULT_WARN,
+                            RDC_FR_CLOCKS_THROTTLE_THERMAL,
+                            err_msg,
+                            incident,
+                            response))
+      return RDC_ST_MAX_LIMIT;
+  }
+  return RDC_ST_OK;
+}
+
+// Health check for power throttling: adds a WARN incident when the power throttle
+// counter increased between the sample at the window start and the latest sample.
+// @param group_id  GPU group whose cached field values are queried.
+// @param gpu_index GPU being checked.
+// @param response  Health response that receives the incident.
+// @return RDC_ST_OK; RDC_ST_MAX_LIMIT if add_health_incident() returns true; else the query error.
+rdc_status_t RdcWatchTableImpl::power_check(rdc_gpu_group_t group_id, uint32_t gpu_index,
+                                            rdc_health_response_t* response) {
+  rdc_field_value start = {}, end = {};
+  // Window start: 60 seconds before now, expressed in milliseconds.
+  uint64_t start_timestamp = static_cast<uint64_t>(time(nullptr) - 60) * 1000;
+  rdc_status_t result = get_start_end_values(group_id, gpu_index,
+                                             RDC_HEALTH_POWER_THROTTLE_TIME, start_timestamp,
+                                             &start, &end);
+  if (result != RDC_ST_OK)
+    return result;
+
+  // Keep the delta signed: if the counter moved backwards (end < start) an unsigned
+  // difference would wrap to a huge value and raise a false incident.
+  const int64_t acc_ppt_pwr = end.value.l_int - start.value.l_int;
+  if (0 < acc_ppt_pwr) {
+    rdc_health_incidents_t *incident = &response->incidents[response->incidents_count];
+
+    std::string err_msg = "Detected ";
+    err_msg += std::to_string(acc_ppt_pwr);
+    err_msg += " clock throttling due to power violation in the last minute.";
+    //add incident
+    if (add_health_incident(gpu_index,
+                            RDC_HEALTH_WATCH_POWER,
+                            RDC_HEALTH_RESULT_WARN,
+                            RDC_FR_CLOCKS_THROTTLE_POWER,
+                            err_msg,
+                            incident,
+                            response))
+      return RDC_ST_MAX_LIMIT;
+  }
+  return RDC_ST_OK;
+}
rdc_status_t RdcWatchTableImpl::rdc_health_check(rdc_gpu_group_t group_id,
rdc_health_response_t *response) {
if (nullptr == response)
@@ -739,22 +925,25 @@ rdc_status_t RdcWatchTableImpl::rdc_health_check(rdc_gpu_group_t group_id,
return result;
}
- //InfoROM
- if (components & RDC_HEALTH_WATCH_INFOROM) {
- //To do:
- return RDC_ST_NOT_SUPPORTED;
+ //EEPROM
+ if (components & RDC_HEALTH_WATCH_EEPROM) {
+ result = eeprom_check(group_id, ginfo.entity_ids[gindex], response);
+ if (result == RDC_ST_MAX_LIMIT)
+ return result;
}
//Thermal
if (components & RDC_HEALTH_WATCH_THERMAL) {
- //To do:
- return RDC_ST_NOT_SUPPORTED;
+ result = thermal_check(group_id, ginfo.entity_ids[gindex], response);
+ if (result == RDC_ST_MAX_LIMIT)
+ return result;
}
//Power
if (components & RDC_HEALTH_WATCH_POWER) {
- //To do:
- return RDC_ST_NOT_SUPPORTED;
+ result = power_check(group_id, ginfo.entity_ids[gindex], response);
+ if (result == RDC_ST_MAX_LIMIT)
+ return result;
}
} //end of for gindex
diff --git a/projects/rdc/rdc_libs/rdc/src/SmiUtils.cc b/projects/rdc/rdc_libs/rdc/src/SmiUtils.cc
index 1e03de8490..8f85f591a7 100644
--- a/projects/rdc/rdc_libs/rdc/src/SmiUtils.cc
+++ b/projects/rdc/rdc_libs/rdc/src/SmiUtils.cc
@@ -58,6 +58,9 @@ rdc_status_t Smi2RdcError(amdsmi_status_t rsmi) {
case AMDSMI_STATUS_NO_PERM:
return RDC_ST_PERM_ERROR;
+ case AMDSMI_STATUS_CORRUPTED_EEPROM:
+ return RDC_ST_CORRUPTED_EEPROM;
+
case AMDSMI_STATUS_BUSY:
case AMDSMI_STATUS_UNKNOWN_ERROR:
case AMDSMI_STATUS_INTERNAL_EXCEPTION:
diff --git a/projects/rdc/rdci/src/RdciHealthSubSystem.cc b/projects/rdc/rdci/src/RdciHealthSubSystem.cc
index b2ba82251d..c0aab30585 100644
--- a/projects/rdc/rdci/src/RdciHealthSubSystem.cc
+++ b/projects/rdc/rdci/src/RdciHealthSubSystem.cc
@@ -106,10 +106,9 @@ void RdciHealthSubSystem::parse_cmd_opts(int argc, char** argv) {
components |= RDC_HEALTH_WATCH_PCIE;
components |= RDC_HEALTH_WATCH_XGMI;
components |= RDC_HEALTH_WATCH_MEM;
- //To do:
- //components |= RDC_HEALTH_WATCH_INFOROM;
- //components |= RDC_HEALTH_WATCH_THERMAL;
- //components |= RDC_HEALTH_WATCH_POWER;
+ components |= RDC_HEALTH_WATCH_EEPROM;
+ components |= RDC_HEALTH_WATCH_THERMAL;
+ components |= RDC_HEALTH_WATCH_POWER;
break;
case 'p':
@@ -120,17 +119,13 @@ void RdciHealthSubSystem::parse_cmd_opts(int argc, char** argv) {
components |= RDC_HEALTH_WATCH_MEM;
break;
- case 'i':
- //To do:
- //components |= RDC_HEALTH_WATCH_INFOROM;
- throw RdcException(RDC_ST_NOT_SUPPORTED, "Not supported");
+ case 'e':
+ components |= RDC_HEALTH_WATCH_EEPROM;
break;
case 't':
- //To do:
- //components |= RDC_HEALTH_WATCH_THERMAL;
- //components |= RDC_HEALTH_WATCH_POWER;
- throw RdcException(RDC_ST_NOT_SUPPORTED, "Not supported");
+ components |= RDC_HEALTH_WATCH_THERMAL;
+ components |= RDC_HEALTH_WATCH_POWER;
break;
case 'x':
@@ -187,8 +182,8 @@ void RdciHealthSubSystem::show_help() const {
std::cout << " a - watch all components\n";
std::cout << " p - watch PCIe\n";
std::cout << " m - watch Memory\n";
- //std::cout << " i - watch infoROM\n";
- //std::cout << " t - watch power and thermal\n";
+ std::cout << " e - watch EEPROM\n";
+ std::cout << " t - watch power and thermal\n";
std::cout << " x - watch XGMI\n";
std::cout << " -c --check Check to see if any errors or warnings have "
<< "occurred in the currently monitored watches.\n";
@@ -215,9 +210,9 @@ void RdciHealthSubSystem::get_watches() const {
std::cout << "{\"Component\" : \"PCIe\", \"Status\" : \"" << ((components & RDC_HEALTH_WATCH_PCIE) ? on : off).c_str() << "\"},";
std::cout << "{\"Component\" : \"XGMI\", \"Status\" : \"" << ((components & RDC_HEALTH_WATCH_XGMI) ? on : off).c_str() << "\"},";
std::cout << "{\"Component\" : \"Memory\", \"Status\" : \"" << ((components & RDC_HEALTH_WATCH_MEM) ? on : off).c_str() << "\"},";
- /*std::cout << "{\"Component\" : \"InfoROM\", \"Status\" : \"" << ((components & RDC_HEALTH_WATCH_INFOROM) ? on : off).c_str() << "\"},";
+ std::cout << "{\"Component\" : \"EEPROM\", \"Status\" : \"" << ((components & RDC_HEALTH_WATCH_EEPROM) ? on : off).c_str() << "\"},";
std::cout << "{\"Component\" : \"Thermal\", \"Status\" : \"" << ((components & RDC_HEALTH_WATCH_THERMAL) ? on : off).c_str() << "\"},";
- std::cout << "{\"Component\" : \"Power\", \"Status\" : \"" << ((components & RDC_HEALTH_WATCH_POWER) ? on : off).c_str() << "\"}";*/
+ std::cout << "{\"Component\" : \"Power\", \"Status\" : \"" << ((components & RDC_HEALTH_WATCH_POWER) ? on : off).c_str() << "\"}";
std::cout << "]";
} else {
std::cout << "Health monitor systems status:" << std::endl;
@@ -229,12 +224,12 @@ void RdciHealthSubSystem::get_watches() const {
<< std::setw(50) << std::left << ((components & RDC_HEALTH_WATCH_XGMI) ? on : off).c_str() << "|\n";
std::cout << "|" << std::setw(20) << std::left << " Memory" << "| "
<< std::setw(50) << std::left << ((components & RDC_HEALTH_WATCH_MEM) ? on : off).c_str() << "|\n";
- /*std::cout << "|" << std::setw(20) << std::left << " InfoROM" << "| "
- << std::setw(50) << std::left << ((components & RDC_HEALTH_WATCH_INFOROM) ? on : off).c_str() << "|\n";
+ std::cout << "|" << std::setw(20) << std::left << " EEPROM" << "| "
+ << std::setw(50) << std::left << ((components & RDC_HEALTH_WATCH_EEPROM) ? on : off).c_str() << "|\n";
std::cout << "|" << std::setw(20) << std::left << " Thermal" << "| "
<< std::setw(50) << std::left << ((components & RDC_HEALTH_WATCH_THERMAL) ? on : off).c_str() << "|\n";
std::cout << "|" << std::setw(20) << std::left << " Power" << "| "
- << std::setw(50) << std::left << ((components & RDC_HEALTH_WATCH_POWER) ? on : off).c_str() << "|\n";*/
+ << std::setw(50) << std::left << ((components & RDC_HEALTH_WATCH_POWER) ? on : off).c_str() << "|\n";
std::cout << "+--------------------+" //"-" width :20
<< "---------------------------------------------------+\n"; //-" width :51
}
@@ -282,8 +277,8 @@ std::string RdciHealthSubSystem::component_string(rdc_health_system_t component)
case RDC_HEALTH_WATCH_MEM:
return "Memory system: ";
- case RDC_HEALTH_WATCH_INFOROM:
- return "Inforom system: ";
+ case RDC_HEALTH_WATCH_EEPROM:
+ return "EEPROM system: ";
case RDC_HEALTH_WATCH_THERMAL:
return "Thermal system:";