From 4f3b1147404b20341965bd891e506354d262c968 Mon Sep 17 00:00:00 2001 From: limeng12 Date: Thu, 9 Jan 2025 13:52:56 +0800 Subject: [PATCH] [SWDEV-230863] Improve the functionality of RdcSmiHealth module. Memory check:get the threshold of retired page number EEPROM check:read and verify the checksum Power/Thermal check: power/thermal throttle status counter Signed-off-by: Meng Li Change-Id: Id2c751416eb5bf007e6e1da8dc05966a6ba1324e [ROCm/rdc commit: 016a1d9d391fcef7ec996dc8feb19f846deea4cb] --- projects/rdc/common/rdc_field.data | 9 +- projects/rdc/example/health_example.cc | 14 +- projects/rdc/include/rdc/rdc.h | 9 +- .../include/rdc_lib/impl/RdcWatchTableImpl.h | 7 + projects/rdc/python_binding/rdc_bootstrap.py | 3 +- .../rdc_libs/bootstrap/src/RdcBootStrap.cc | 2 + .../rdc_libs/rdc/src/RdcMetricFetcherImpl.cc | 36 ++- projects/rdc/rdc_libs/rdc/src/RdcSmiLib.cc | 2 +- .../rdc/rdc_libs/rdc/src/RdcWatchTableImpl.cc | 257 +++++++++++++++--- projects/rdc/rdc_libs/rdc/src/SmiUtils.cc | 3 + projects/rdc/rdci/src/RdciHealthSubSystem.cc | 37 ++- 11 files changed, 307 insertions(+), 72 deletions(-) diff --git a/projects/rdc/common/rdc_field.data b/projects/rdc/common/rdc_field.data index 31c9e4de2d..632f64347f 100644 --- a/projects/rdc/common/rdc_field.data +++ b/projects/rdc/common/rdc_field.data @@ -169,8 +169,9 @@ FLD_DESC_ENT(RDC_EVNT_NOTIF_RING_HANG, "GPU ring hang just occured", // RDC health related fields FLD_DESC_ENT(RDC_HEALTH_XGMI_ERROR, "XGMI one or more errors detected", "XGMI_ERROR", true) FLD_DESC_ENT(RDC_HEALTH_PCIE_REPLAY_COUNT, "Total PCIE replay count", "PCIE_REPLAY_COUNT", true) +FLD_DESC_ENT(RDC_HEALTH_RETIRED_PAGE_NUM, "Retired page number", "RETIRED_PAGE_NUM", true) FLD_DESC_ENT(RDC_HEALTH_PENDING_PAGE_NUM, "Pending page number", "PENDING_PAGE_NUM", true) -FLD_DESC_ENT(RDC_HEALTH_RETIRED_PAGE_LIMIT, "Retired page limit", "RETIRED_PAGE_LIMIT", false) -FLD_DESC_ENT(RDC_HEALTH_UNCORRECTABLE_PAGE_LIMIT, "Uncorrectable page limit", 
"UNCORRECTABLE_PAGE_LIMIT", false) -FLD_DESC_ENT(RDC_HEALTH_POWER_THROTTLE_TIME, "Power throttle status counter", "POWER_THROTTLE_TIME", false) -FLD_DESC_ENT(RDC_HEALTH_THERMAL_THROTTLE_TIME, "Total time(ms) in thermal throttle status", "THERMAL_THROTTLE_TIME", false) +FLD_DESC_ENT(RDC_HEALTH_RETIRED_PAGE_LIMIT, "Retired page limit", "RETIRED_PAGE_LIMIT", true) +FLD_DESC_ENT(RDC_HEALTH_EEPROM_CONFIG_VALID, "Verify checksum of EEPROM", "EEPROM_CONFIG_VALID", true) +FLD_DESC_ENT(RDC_HEALTH_POWER_THROTTLE_TIME, "Power throttle status counter", "POWER_THROTTLE_TIME", true) +FLD_DESC_ENT(RDC_HEALTH_THERMAL_THROTTLE_TIME, "Total time(ms) in thermal throttle status", "THERMAL_THROTTLE_TIME", true) diff --git a/projects/rdc/example/health_example.cc b/projects/rdc/example/health_example.cc index d15dfa35a5..a842d2bebb 100644 --- a/projects/rdc/example/health_example.cc +++ b/projects/rdc/example/health_example.cc @@ -45,6 +45,12 @@ rdc_status_t get_watches(rdc_handle_t rdc_handle, rdc_gpu_group_t group_id) { << std::setw(50) << std::left << ((components & RDC_HEALTH_WATCH_XGMI) ? on : off).c_str() << "|\n"; std::cout << "|" << std::setw(20) << std::left << " Memory" << "| " << std::setw(50) << std::left << ((components & RDC_HEALTH_WATCH_MEM) ? on : off).c_str() << "|\n"; + std::cout << "|" << std::setw(20) << std::left << " EEPROM" << "| " + << std::setw(50) << std::left << ((components & RDC_HEALTH_WATCH_EEPROM) ? on : off).c_str() << "|\n"; + std::cout << "|" << std::setw(20) << std::left << " Thermal" << "| " + << std::setw(50) << std::left << ((components & RDC_HEALTH_WATCH_THERMAL) ? on : off).c_str() << "|\n"; + std::cout << "|" << std::setw(20) << std::left << " Power" << "| " + << std::setw(50) << std::left << ((components & RDC_HEALTH_WATCH_POWER) ? 
on : off).c_str() << "|\n"; std::cout << "+--------------------+" //"-" width :20 << "---------------------------------------------------+\n"; //-" width :51 } @@ -79,8 +85,8 @@ std::string component_string(rdc_health_system_t component) { case RDC_HEALTH_WATCH_MEM: return "Memory system: "; - case RDC_HEALTH_WATCH_INFOROM: - return "Inforom system: "; + case RDC_HEALTH_WATCH_EEPROM: + return "EEPROM system: "; case RDC_HEALTH_WATCH_THERMAL: return "Thermal system:"; @@ -280,7 +286,9 @@ int main(int, char**) { // (3) set health watches. unsigned int components; - components = RDC_HEALTH_WATCH_PCIE | RDC_HEALTH_WATCH_XGMI | RDC_HEALTH_WATCH_MEM; + components = RDC_HEALTH_WATCH_PCIE | RDC_HEALTH_WATCH_XGMI | RDC_HEALTH_WATCH_MEM + | RDC_HEALTH_WATCH_EEPROM | RDC_HEALTH_WATCH_THERMAL + | RDC_HEALTH_WATCH_POWER; result = rdc_health_set(rdc_handle, group_id, components); if (result != RDC_ST_OK) { std::cout << "Error setting health watches. Return: " << rdc_status_string(result) diff --git a/projects/rdc/include/rdc/rdc.h b/projects/rdc/include/rdc/rdc.h index d9d2a6f834..de4a90d60f 100644 --- a/projects/rdc/include/rdc/rdc.h +++ b/projects/rdc/include/rdc/rdc.h @@ -80,6 +80,7 @@ typedef enum { //!< but none was found RDC_ST_PERM_ERROR, //!< Insufficient permission to complete //!< operation + RDC_ST_CORRUPTED_EEPROM, //!< EEPROM is corrupted RDC_ST_DISABLED_MODULE, //!< Attempted loading disabled module RDC_ST_UNKNOWN_ERROR = 0xFFFFFFFF //!< Unknown error @@ -353,8 +354,8 @@ typedef enum { RDC_HEALTH_PCIE_REPLAY_COUNT, //!< Total PCIE replay count RDC_HEALTH_RETIRED_PAGE_NUM, //!< Retired page number RDC_HEALTH_PENDING_PAGE_NUM, //!< Pending page number - RDC_HEALTH_RETIRED_PAGE_LIMIT, //!< The threshold of retired page - RDC_HEALTH_UNCORRECTABLE_PAGE_LIMIT, //!< The threshold of uncorrectable page + RDC_HEALTH_RETIRED_PAGE_LIMIT, //!< the threshold of retired page number + RDC_HEALTH_EEPROM_CONFIG_VALID, //!< Reads the EEPROM and verifies the checksums 
RDC_HEALTH_POWER_THROTTLE_TIME, //!< Power throttle status counter RDC_HEALTH_THERMAL_THROTTLE_TIME, //!< Total time in thermal throttle status (microseconds) } rdc_field_t; @@ -681,7 +682,7 @@ typedef enum { RDC_HEALTH_WATCH_PCIE = 0x1, //!< PCIe system watches RDC_HEALTH_WATCH_XGMI = 0x2, //!< XGMI system watches RDC_HEALTH_WATCH_MEM = 0x4, //!< Memory watches - RDC_HEALTH_WATCH_INFOROM = 0x8, //!< Inforom watches + RDC_HEALTH_WATCH_EEPROM = 0x8, //!< EEPROM watches RDC_HEALTH_WATCH_THERMAL = 0x10, //!< Temperature watches RDC_HEALTH_WATCH_POWER = 0x20, //!< Power watches } rdc_health_system_t; @@ -716,7 +717,7 @@ typedef enum { RDC_FR_CLOCKS_THROTTLE_POWER = 1006, RDC_FR_XGMI_SINGLE_ERROR = 1007, RDC_FR_XGMI_MULTIPLE_ERROR = 1008, - RDC_FR_CORRUPT_INFOROM = 1009 + RDC_FR_CORRUPT_EEPROM = 1009 } rdc_health_error_code_t; /** diff --git a/projects/rdc/include/rdc_lib/impl/RdcWatchTableImpl.h b/projects/rdc/include/rdc_lib/impl/RdcWatchTableImpl.h index f9923cc22b..71d02e2910 100644 --- a/projects/rdc/include/rdc_lib/impl/RdcWatchTableImpl.h +++ b/projects/rdc/include/rdc_lib/impl/RdcWatchTableImpl.h @@ -137,6 +137,7 @@ class RdcWatchTableImpl : public RdcWatchTable { rdc_status_t get_start_end_values(rdc_gpu_group_t group_id, uint32_t gpu_index, rdc_field_t field, + uint64_t start_timestamp, rdc_field_value *start_value, rdc_field_value *end_value); rdc_status_t pcie_check(rdc_gpu_group_t group_id, @@ -145,6 +146,12 @@ class RdcWatchTableImpl : public RdcWatchTable { uint32_t gpu_index, rdc_health_response_t* response); rdc_status_t memory_check(rdc_gpu_group_t group_id, uint32_t gpu_index, rdc_health_response_t* response); + rdc_status_t eeprom_check(rdc_gpu_group_t group_id, + uint32_t gpu_index, rdc_health_response_t* response); + rdc_status_t thermal_check(rdc_gpu_group_t group_id, + uint32_t gpu_index, rdc_health_response_t* response); + rdc_status_t power_check(rdc_gpu_group_t group_id, + uint32_t gpu_index, rdc_health_response_t* response); 
RdcGroupSettingsPtr group_settings_; RdcCacheManagerPtr cache_mgr_; diff --git a/projects/rdc/python_binding/rdc_bootstrap.py b/projects/rdc/python_binding/rdc_bootstrap.py index 9a21126d3f..d4fa98a6f0 100644 --- a/projects/rdc/python_binding/rdc_bootstrap.py +++ b/projects/rdc/python_binding/rdc_bootstrap.py @@ -55,6 +55,7 @@ class rdc_status_t(Enum): RDC_ST_FILE_ERROR = 12 RDC_ST_NO_DATA = 13 RDC_ST_PERM_ERROR = 14 + RDC_ST_CORRUPTED_EEPROM = 15 RDC_ST_UNKNOWN_ERROR = 4294967295 class rdc_operation_mode_t(c_int): @@ -173,7 +174,7 @@ class rdc_field_t(c_int): RDC_HEALTH_RETIRED_PAGE_NUM = 3002 RDC_HEALTH_PENDING_PAGE_NUM = 3003 RDC_HEALTH_RETIRED_PAGE_LIMIT = 3004 - RDC_HEALTH_UNCORRECTABLE_PAGE_LIMIT = 3005 + RDC_HEALTH_EEPROM_CONFIG_VALID = 3005 RDC_HEALTH_POWER_THROTTLE_TIME = 3006 RDC_HEALTH_THERMAL_THROTTLE_TIME = 3007 diff --git a/projects/rdc/rdc_libs/bootstrap/src/RdcBootStrap.cc b/projects/rdc/rdc_libs/bootstrap/src/RdcBootStrap.cc index dac201b758..3520041de9 100644 --- a/projects/rdc/rdc_libs/bootstrap/src/RdcBootStrap.cc +++ b/projects/rdc/rdc_libs/bootstrap/src/RdcBootStrap.cc @@ -369,6 +369,8 @@ const char* rdc_status_string(rdc_status_t result) { return "Data was requested, but none was found"; case RDC_ST_PERM_ERROR: return "Insufficient permission to complete operation"; + case RDC_ST_CORRUPTED_EEPROM: + return "EEPROM is corrupted"; case RDC_ST_UNKNOWN_ERROR: return "Unknown error"; default: diff --git a/projects/rdc/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc b/projects/rdc/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc index ab3ecfd592..89d9bca79c 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc +++ b/projects/rdc/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc @@ -886,10 +886,38 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field break; } - case RDC_HEALTH_RETIRED_PAGE_LIMIT: - case RDC_HEALTH_UNCORRECTABLE_PAGE_LIMIT: - case RDC_HEALTH_POWER_THROTTLE_TIME: //gpu_metrics 1.6 - case 
RDC_HEALTH_THERMAL_THROTTLE_TIME: //gpu_metrics 1.6 + case RDC_HEALTH_RETIRED_PAGE_LIMIT: { + uint32_t retired_page_threshold = 0; + ret = amdsmi_get_gpu_bad_page_threshold(processor_handle, &retired_page_threshold); + value->status = Smi2RdcError(ret); + value->type = INTEGER; + if (value->status == AMDSMI_STATUS_SUCCESS) { + value->value.l_int = static_cast(retired_page_threshold); + } + break; + } + + case RDC_HEALTH_EEPROM_CONFIG_VALID: { + ret = amdsmi_gpu_validate_ras_eeprom(processor_handle); + value->status = Smi2RdcError(ret); + break; + } + + case RDC_HEALTH_POWER_THROTTLE_TIME: + case RDC_HEALTH_THERMAL_THROTTLE_TIME: { + amdsmi_violation_status_t violation_status; + ret = amdsmi_get_violation_status(processor_handle, &violation_status); + value->status = Smi2RdcError(ret); + value->type = INTEGER; + if (value->status == AMDSMI_STATUS_SUCCESS) { + if (RDC_HEALTH_POWER_THROTTLE_TIME == field_id) + value->value.l_int = static_cast(violation_status.acc_ppt_pwr); + if (RDC_HEALTH_THERMAL_THROTTLE_TIME == field_id) + value->value.l_int = static_cast(violation_status.acc_socket_thrm); + } + break; + } + default: break; } diff --git a/projects/rdc/rdc_libs/rdc/src/RdcSmiLib.cc b/projects/rdc/rdc_libs/rdc/src/RdcSmiLib.cc index 91d9a4931f..d8313a287d 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcSmiLib.cc +++ b/projects/rdc/rdc_libs/rdc/src/RdcSmiLib.cc @@ -181,7 +181,7 @@ rdc_status_t RdcSmiLib::rdc_telemetry_fields_query(uint32_t field_ids[MAX_NUM_FI RDC_EVNT_XGMI_4_THRPUT, RDC_EVNT_XGMI_5_THRPUT, RDC_FI_OAM_ID, RDC_FI_GPU_MM_ENC_UTIL, RDC_FI_GPU_MM_DEC_UTIL, RDC_FI_GPU_MEMORY_ACTIVITY, RDC_HEALTH_XGMI_ERROR, RDC_HEALTH_PCIE_REPLAY_COUNT, RDC_HEALTH_RETIRED_PAGE_NUM, - RDC_HEALTH_PENDING_PAGE_NUM, RDC_HEALTH_RETIRED_PAGE_LIMIT, RDC_HEALTH_UNCORRECTABLE_PAGE_LIMIT, + RDC_HEALTH_PENDING_PAGE_NUM, RDC_HEALTH_RETIRED_PAGE_LIMIT, RDC_HEALTH_EEPROM_CONFIG_VALID, RDC_HEALTH_POWER_THROTTLE_TIME, RDC_HEALTH_THERMAL_THROTTLE_TIME, RDC_FI_GPU_MEMORY_MAX_BANDWIDTH, 
RDC_FI_GPU_MEMORY_CUR_BANDWIDTH, }; diff --git a/projects/rdc/rdc_libs/rdc/src/RdcWatchTableImpl.cc b/projects/rdc/rdc_libs/rdc/src/RdcWatchTableImpl.cc index ce7c0874d4..8c91d7ba05 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcWatchTableImpl.cc +++ b/projects/rdc/rdc_libs/rdc/src/RdcWatchTableImpl.cc @@ -35,6 +35,7 @@ THE SOFTWARE. #include "rdc_lib/RdcLogger.h" #include "rdc_lib/impl/RdcMetricFetcherImpl.h" #include "rdc_lib/rdc_common.h" +#include "rdc_lib/impl/SmiUtils.h" namespace amd { namespace rdc { @@ -392,10 +393,10 @@ rdc_status_t RdcWatchTableImpl::create_health_field_group(unsigned int component field_ids.push_back(RDC_HEALTH_RETIRED_PAGE_NUM); field_ids.push_back(RDC_HEALTH_PENDING_PAGE_NUM); field_ids.push_back(RDC_HEALTH_RETIRED_PAGE_LIMIT); - field_ids.push_back(RDC_HEALTH_UNCORRECTABLE_PAGE_LIMIT); } - if (components & RDC_HEALTH_WATCH_INFOROM) { + if (components & RDC_HEALTH_WATCH_EEPROM) { + field_ids.push_back(RDC_HEALTH_EEPROM_CONFIG_VALID); } if (components & RDC_HEALTH_WATCH_THERMAL) { @@ -506,24 +507,23 @@ bool RdcWatchTableImpl::add_health_incident(uint32_t gpu_index, rdc_status_t RdcWatchTableImpl::get_start_end_values(rdc_gpu_group_t group_id, uint32_t gpu_index, rdc_field_t field, + uint64_t start_timestamp, rdc_field_value *start_value, rdc_field_value *end_value) { - if ((nullptr == start_value) || (nullptr == end_value)) + if ((nullptr == start_value) && (nullptr == end_value)) return RDC_ST_BAD_PARAMETER; - uint64_t start_timestamp = 0; - - //get the history data last 1 minute - start_timestamp = static_cast(time(nullptr) - 60) * 1000; - - //get the values of the field at the start_timestamp/end_timestampe - rdc_status_t result = cache_mgr_->rdc_health_get_values(group_id, - gpu_index, field, - start_timestamp, 0, - start_value, nullptr); - if (result != RDC_ST_OK) { - RDC_LOG(RDC_ERROR, "Error get gpu: " << gpu_index << " field: " << field << " history data. 
Return: " << result); - return result; + rdc_status_t result = RDC_ST_OK; + if (nullptr != start_value) { + //get the values of the field at the start_timestamp/end_timestampe + result = cache_mgr_->rdc_health_get_values(group_id, + gpu_index, field, + start_timestamp, 0, + start_value, nullptr); + if (result != RDC_ST_OK) { + RDC_LOG(RDC_ERROR, "Error get gpu: " << gpu_index << " field: " << field << " history data. Return: " << result); + return result; + } } // get end values @@ -539,9 +539,12 @@ rdc_status_t RdcWatchTableImpl::pcie_check(rdc_gpu_group_t group_id, rdc_health_response_t* response) { //get field start/end values rdc_field_value start = {}, end = {}; + uint64_t start_timestamp = static_cast(time(nullptr) - 60) * 1000; + //get the history data last 1 minute rdc_status_t result = get_start_end_values(group_id, gpu_index, RDC_HEALTH_PCIE_REPLAY_COUNT, + start_timestamp, &start, &end); if (result != RDC_ST_OK) @@ -575,11 +578,12 @@ rdc_status_t RdcWatchTableImpl::xgmi_check(rdc_gpu_group_t group_id, uint32_t gpu_index, rdc_health_response_t* response) { //get field start/end values - rdc_field_value start = {}, end = {}; + rdc_field_value end = {}; rdc_status_t result = get_start_end_values(group_id, gpu_index, RDC_HEALTH_XGMI_ERROR, - &start, + 0, + nullptr, &end); if (result != RDC_ST_OK) return result; @@ -617,23 +621,24 @@ rdc_status_t RdcWatchTableImpl::memory_check(rdc_gpu_group_t group_id, uint32_t gpu_index, rdc_health_response_t* response) { //get field start/end values - rdc_field_value start = {}, end = {}; + rdc_field_value start= {}, end = {}; rdc_status_t result = get_start_end_values(group_id, gpu_index, RDC_FI_ECC_UNCORRECT_TOTAL, - &start, + 0, + nullptr, &end); if (result != RDC_ST_OK) return result; uint64_t ecc_uncorrectable_count = 0; - ecc_uncorrectable_count = end.value.l_int - start.value.l_int; + ecc_uncorrectable_count = end.value.l_int; if (ecc_uncorrectable_count > 0) { rdc_health_incidents_t *incident = 
&response->incidents[response->incidents_count]; std::string err_msg = "Detected "; err_msg += std::to_string(ecc_uncorrectable_count); - err_msg += " uncorrectable ECC error(s) in the last minute."; + err_msg += " uncorrectable ECC error(s) since last GPU reset."; //add incident if (add_health_incident(gpu_index, @@ -649,12 +654,13 @@ rdc_status_t RdcWatchTableImpl::memory_check(rdc_gpu_group_t group_id, result = get_start_end_values(group_id, gpu_index, RDC_HEALTH_PENDING_PAGE_NUM, - &start, + 0, + nullptr, &end); if (result != RDC_ST_OK) return result; - uint64_t num_pages = end.value.l_int - start.value.l_int; + uint64_t num_pages = end.value.l_int; if (num_pages > 0) { rdc_health_incidents_t *incident = &response->incidents[response->incidents_count]; @@ -673,12 +679,192 @@ rdc_status_t RdcWatchTableImpl::memory_check(rdc_gpu_group_t group_id, return RDC_ST_MAX_LIMIT; } - //To do: RDC_FR_RETIRED_PAGES_LIMIT - //To do: RDC_FR_RETIRED_PAGES_UNCORRECTABLE_LIMIT + //get retired page number + result = get_start_end_values(group_id, + gpu_index, + RDC_HEALTH_RETIRED_PAGE_NUM, + 0, + nullptr, + &end); + if (result != RDC_ST_OK) + return result; + uint64_t retired_page = end.value.l_int; + + //get retired page threshold + result = get_start_end_values(group_id, + gpu_index, + RDC_HEALTH_RETIRED_PAGE_LIMIT, + 0, + nullptr, + &end); + if (result != RDC_ST_OK) + return result; + uint32_t retired_page_threshold = end.value.l_int; + + if (retired_page > retired_page_threshold) { + rdc_health_incidents_t *incident = &response->incidents[response->incidents_count]; + + std::string err_msg = "Detected "; + err_msg += std::to_string(retired_page); + err_msg += " retired pages exceeding the max limit: "; + err_msg += std::to_string(retired_page_threshold); + err_msg += "."; + + //add incident + if (add_health_incident(gpu_index, + RDC_HEALTH_WATCH_MEM, + RDC_HEALTH_RESULT_FAIL, + RDC_FR_RETIRED_PAGES_LIMIT, + err_msg, + incident, + response)) + return RDC_ST_MAX_LIMIT; + + 
return RDC_ST_OK; + } + + if (retired_page > 0) { + uint64_t start_timestamp = static_cast<uint64_t>(time(nullptr) - 604800) * 1000; + //get retired page number last 1 week + result = get_start_end_values(group_id, + gpu_index, + RDC_HEALTH_RETIRED_PAGE_NUM, + start_timestamp, + &start, + &end); + if (result != RDC_ST_OK) + return result; + + retired_page = end.value.l_int - start.value.l_int; + if (retired_page > 1) { + rdc_health_incidents_t *incident = &response->incidents[response->incidents_count]; + + std::string err_msg = "Detected "; + err_msg += std::to_string(retired_page); + err_msg += " retired pages more than one in the last week."; + + //add incident + if (add_health_incident(gpu_index, + RDC_HEALTH_WATCH_MEM, + RDC_HEALTH_RESULT_FAIL, + RDC_FR_RETIRED_PAGES_UNCORRECTABLE_LIMIT, + err_msg, + incident, + response)) + return RDC_ST_MAX_LIMIT; + } + } return RDC_ST_OK; } +rdc_status_t RdcWatchTableImpl::eeprom_check(rdc_gpu_group_t group_id, + uint32_t gpu_index, + rdc_health_response_t* response) { + rdc_field_value end = {}; + rdc_status_t result = get_start_end_values(group_id, + gpu_index, + RDC_HEALTH_EEPROM_CONFIG_VALID, + 0, + nullptr, + &end); + if (result != RDC_ST_OK && result != RDC_ST_CORRUPTED_EEPROM) + return result; + + if (result == RDC_ST_CORRUPTED_EEPROM) { + rdc_health_incidents_t *incident = &response->incidents[response->incidents_count]; + + std::string err_msg = "Detected a corrupt EEPROM since last GPU reset."; + + //add incident + if (add_health_incident(gpu_index, + RDC_HEALTH_WATCH_EEPROM, + RDC_HEALTH_RESULT_WARN, + RDC_FR_CORRUPT_EEPROM, + err_msg, + incident, + response)) + return RDC_ST_MAX_LIMIT; + } + + return RDC_ST_OK; +} + +rdc_status_t RdcWatchTableImpl::thermal_check(rdc_gpu_group_t group_id, + uint32_t gpu_index, + rdc_health_response_t* response) { + //get field start/end values + rdc_field_value start = {}, end = {}; + uint64_t start_timestamp = static_cast<uint64_t>(time(nullptr) - 60) * 1000; + //get the history data last 1 minute + 
rdc_status_t result = get_start_end_values(group_id, + gpu_index, + RDC_HEALTH_THERMAL_THROTTLE_TIME, + start_timestamp, + &start, + &end); + if (result != RDC_ST_OK) + return result; + + uint64_t acc_socket_thrm = end.value.l_int - start.value.l_int; + if (0 < acc_socket_thrm) { + rdc_health_incidents_t *incident = &response->incidents[response->incidents_count]; + + std::string err_msg = "Detected "; + err_msg += std::to_string(acc_socket_thrm); + err_msg += " clock throttling due to thermal violation in the last minute."; + + //add incident + if (add_health_incident(gpu_index, + RDC_HEALTH_WATCH_THERMAL, + RDC_HEALTH_RESULT_WARN, + RDC_FR_CLOCKS_THROTTLE_THERMAL, + err_msg, + incident, + response)) + return RDC_ST_MAX_LIMIT; + } + + return RDC_ST_OK; +} + +rdc_status_t RdcWatchTableImpl::power_check(rdc_gpu_group_t group_id, + uint32_t gpu_index, + rdc_health_response_t* response) { + //get field start/end values + rdc_field_value start = {}, end = {}; + uint64_t start_timestamp = static_cast<uint64_t>(time(nullptr) - 60) * 1000; + //get the history data last 1 minute + rdc_status_t result = get_start_end_values(group_id, + gpu_index, + RDC_HEALTH_POWER_THROTTLE_TIME, + start_timestamp, + &start, + &end); + if (result != RDC_ST_OK) + return result; + + uint64_t acc_ppt_pwr = end.value.l_int - start.value.l_int; + if (0 < acc_ppt_pwr) { + rdc_health_incidents_t *incident = &response->incidents[response->incidents_count]; + + std::string err_msg = "Detected "; + err_msg += std::to_string(acc_ppt_pwr); + err_msg += " clock throttling due to power violation in the last minute."; + + //add incident + if (add_health_incident(gpu_index, + RDC_HEALTH_WATCH_POWER, + RDC_HEALTH_RESULT_WARN, + RDC_FR_CLOCKS_THROTTLE_POWER, + err_msg, + incident, + response)) + return RDC_ST_MAX_LIMIT; + } + + return RDC_ST_OK; +} rdc_status_t RdcWatchTableImpl::rdc_health_check(rdc_gpu_group_t group_id, rdc_health_response_t *response) { if (nullptr == response) @@ -739,22 +925,25 @@ 
rdc_status_t RdcWatchTableImpl::rdc_health_check(rdc_gpu_group_t group_id, return result; } - //InfoROM - if (components & RDC_HEALTH_WATCH_INFOROM) { - //To do: - return RDC_ST_NOT_SUPPORTED; + //EEPROM + if (components & RDC_HEALTH_WATCH_EEPROM) { + result = eeprom_check(group_id, ginfo.entity_ids[gindex], response); + if (result == RDC_ST_MAX_LIMIT) + return result; } //Thermal if (components & RDC_HEALTH_WATCH_THERMAL) { - //To do: - return RDC_ST_NOT_SUPPORTED; + result = thermal_check(group_id, ginfo.entity_ids[gindex], response); + if (result == RDC_ST_MAX_LIMIT) + return result; } //Power if (components & RDC_HEALTH_WATCH_POWER) { - //To do: - return RDC_ST_NOT_SUPPORTED; + result = power_check(group_id, ginfo.entity_ids[gindex], response); + if (result == RDC_ST_MAX_LIMIT) + return result; } } //end of for gindex diff --git a/projects/rdc/rdc_libs/rdc/src/SmiUtils.cc b/projects/rdc/rdc_libs/rdc/src/SmiUtils.cc index 1e03de8490..8f85f591a7 100644 --- a/projects/rdc/rdc_libs/rdc/src/SmiUtils.cc +++ b/projects/rdc/rdc_libs/rdc/src/SmiUtils.cc @@ -58,6 +58,9 @@ rdc_status_t Smi2RdcError(amdsmi_status_t rsmi) { case AMDSMI_STATUS_NO_PERM: return RDC_ST_PERM_ERROR; + case AMDSMI_STATUS_CORRUPTED_EEPROM: + return RDC_ST_CORRUPTED_EEPROM; + case AMDSMI_STATUS_BUSY: case AMDSMI_STATUS_UNKNOWN_ERROR: case AMDSMI_STATUS_INTERNAL_EXCEPTION: diff --git a/projects/rdc/rdci/src/RdciHealthSubSystem.cc b/projects/rdc/rdci/src/RdciHealthSubSystem.cc index b2ba82251d..c0aab30585 100644 --- a/projects/rdc/rdci/src/RdciHealthSubSystem.cc +++ b/projects/rdc/rdci/src/RdciHealthSubSystem.cc @@ -106,10 +106,9 @@ void RdciHealthSubSystem::parse_cmd_opts(int argc, char** argv) { components |= RDC_HEALTH_WATCH_PCIE; components |= RDC_HEALTH_WATCH_XGMI; components |= RDC_HEALTH_WATCH_MEM; - //To do: - //components |= RDC_HEALTH_WATCH_INFOROM; - //components |= RDC_HEALTH_WATCH_THERMAL; - //components |= RDC_HEALTH_WATCH_POWER; + components |= RDC_HEALTH_WATCH_EEPROM; + components |= 
RDC_HEALTH_WATCH_THERMAL; + components |= RDC_HEALTH_WATCH_POWER; break; case 'p': @@ -120,17 +119,13 @@ void RdciHealthSubSystem::parse_cmd_opts(int argc, char** argv) { components |= RDC_HEALTH_WATCH_MEM; break; - case 'i': - //To do: - //components |= RDC_HEALTH_WATCH_INFOROM; - throw RdcException(RDC_ST_NOT_SUPPORTED, "Not supported"); + case 'e': + components |= RDC_HEALTH_WATCH_EEPROM; break; case 't': - //To do: - //components |= RDC_HEALTH_WATCH_THERMAL; - //components |= RDC_HEALTH_WATCH_POWER; - throw RdcException(RDC_ST_NOT_SUPPORTED, "Not supported"); + components |= RDC_HEALTH_WATCH_THERMAL; + components |= RDC_HEALTH_WATCH_POWER; break; case 'x': @@ -187,8 +182,8 @@ void RdciHealthSubSystem::show_help() const { std::cout << " a - watch all components\n"; std::cout << " p - watch PCIe\n"; std::cout << " m - watch Memory\n"; - //std::cout << " i - watch infoROM\n"; - //std::cout << " t - watch power and thermal\n"; + std::cout << " e - watch EEPROM\n"; + std::cout << " t - watch power and thermal\n"; std::cout << " x - watch XGMI\n"; std::cout << " -c --check Check to see if any errors or warnings have " << "occurred in the currently monitored watches.\n"; @@ -215,9 +210,9 @@ void RdciHealthSubSystem::get_watches() const { std::cout << "{\"Component\" : \"PCIe\", \"Status\" : \"" << ((components & RDC_HEALTH_WATCH_PCIE) ? on : off).c_str() << "\"},"; std::cout << "{\"Component\" : \"XGMI\", \"Status\" : \"" << ((components & RDC_HEALTH_WATCH_XGMI) ? on : off).c_str() << "\"},"; std::cout << "{\"Component\" : \"Memory\", \"Status\" : \"" << ((components & RDC_HEALTH_WATCH_MEM) ? on : off).c_str() << "\"},"; - /*std::cout << "{\"Component\" : \"InfoROM\", \"Status\" : \"" << ((components & RDC_HEALTH_WATCH_INFOROM) ? on : off).c_str() << "\"},"; + std::cout << "{\"Component\" : \"EEPROM\", \"Status\" : \"" << ((components & RDC_HEALTH_WATCH_EEPROM) ? 
on : off).c_str() << "\"},"; std::cout << "{\"Component\" : \"Thermal\", \"Status\" : \"" << ((components & RDC_HEALTH_WATCH_THERMAL) ? on : off).c_str() << "\"},"; - std::cout << "{\"Component\" : \"Power\", \"Status\" : \"" << ((components & RDC_HEALTH_WATCH_POWER) ? on : off).c_str() << "\"}";*/ + std::cout << "{\"Component\" : \"Power\", \"Status\" : \"" << ((components & RDC_HEALTH_WATCH_POWER) ? on : off).c_str() << "\"}"; std::cout << "]"; } else { std::cout << "Health monitor systems status:" << std::endl; @@ -229,12 +224,12 @@ void RdciHealthSubSystem::get_watches() const { << std::setw(50) << std::left << ((components & RDC_HEALTH_WATCH_XGMI) ? on : off).c_str() << "|\n"; std::cout << "|" << std::setw(20) << std::left << " Memory" << "| " << std::setw(50) << std::left << ((components & RDC_HEALTH_WATCH_MEM) ? on : off).c_str() << "|\n"; - /*std::cout << "|" << std::setw(20) << std::left << " InfoROM" << "| " - << std::setw(50) << std::left << ((components & RDC_HEALTH_WATCH_INFOROM) ? on : off).c_str() << "|\n"; + std::cout << "|" << std::setw(20) << std::left << " EEPROM" << "| " + << std::setw(50) << std::left << ((components & RDC_HEALTH_WATCH_EEPROM) ? on : off).c_str() << "|\n"; std::cout << "|" << std::setw(20) << std::left << " Thermal" << "| " << std::setw(50) << std::left << ((components & RDC_HEALTH_WATCH_THERMAL) ? on : off).c_str() << "|\n"; std::cout << "|" << std::setw(20) << std::left << " Power" << "| " - << std::setw(50) << std::left << ((components & RDC_HEALTH_WATCH_POWER) ? on : off).c_str() << "|\n";*/ + << std::setw(50) << std::left << ((components & RDC_HEALTH_WATCH_POWER) ? 
on : off).c_str() << "|\n"; std::cout << "+--------------------+" //"-" width :20 << "---------------------------------------------------+\n"; //-" width :51 } @@ -282,8 +277,8 @@ std::string RdciHealthSubSystem::component_string(rdc_health_system_t component) case RDC_HEALTH_WATCH_MEM: return "Memory system: "; - case RDC_HEALTH_WATCH_INFOROM: - return "Inforom system: "; + case RDC_HEALTH_WATCH_EEPROM: + return "EEPROM system: "; case RDC_HEALTH_WATCH_THERMAL: return "Thermal system:";