[SWDEV-230863] Improve the functionality of RdcSmiHealth module.

Memory check: get the threshold of the retired page number
EEPROM check: read the EEPROM and verify its checksum
Power/Thermal check: read the power/thermal throttle status counters

Signed-off-by: Meng Li <li.meng@amd.com>
Change-Id: Id2c751416eb5bf007e6e1da8dc05966a6ba1324e


[ROCm/rdc commit: 016a1d9d39]
This commit is contained in:
limeng12
2025-01-09 13:52:56 +08:00
committed by Meng, Li (Jassmine)
parent 78f37c1784
commit 4f3b114740
11 changed files with 307 additions and 72 deletions
+5 -4
View File
@@ -169,8 +169,9 @@ FLD_DESC_ENT(RDC_EVNT_NOTIF_RING_HANG, "GPU ring hang just occured",
// RDC health related fields
FLD_DESC_ENT(RDC_HEALTH_XGMI_ERROR, "XGMI one or more errors detected", "XGMI_ERROR", true)
FLD_DESC_ENT(RDC_HEALTH_PCIE_REPLAY_COUNT, "Total PCIE replay count", "PCIE_REPLAY_COUNT", true)
FLD_DESC_ENT(RDC_HEALTH_RETIRED_PAGE_NUM, "Retired page number", "RETIRED_PAGE_NUM", true)
FLD_DESC_ENT(RDC_HEALTH_PENDING_PAGE_NUM, "Pending page number", "PENDING_PAGE_NUM", true)
FLD_DESC_ENT(RDC_HEALTH_RETIRED_PAGE_LIMIT, "Retired page limit", "RETIRED_PAGE_LIMIT", false)
FLD_DESC_ENT(RDC_HEALTH_UNCORRECTABLE_PAGE_LIMIT, "Uncorrectable page limit", "UNCORRECTABLE_PAGE_LIMIT", false)
FLD_DESC_ENT(RDC_HEALTH_POWER_THROTTLE_TIME, "Power throttle status counter", "POWER_THROTTLE_TIME", false)
FLD_DESC_ENT(RDC_HEALTH_THERMAL_THROTTLE_TIME, "Total time(ms) in thermal throttle status", "THERMAL_THROTTLE_TIME", false)
FLD_DESC_ENT(RDC_HEALTH_RETIRED_PAGE_LIMIT, "Retired page limit", "RETIRED_PAGE_LIMIT", true)
FLD_DESC_ENT(RDC_HEALTH_EEPROM_CONFIG_VALID, "Verify checksum of EEPROM", "EEPROM_CONFIG_VALID", true)
FLD_DESC_ENT(RDC_HEALTH_POWER_THROTTLE_TIME, "Power throttle status counter", "POWER_THROTTLE_TIME", true)
FLD_DESC_ENT(RDC_HEALTH_THERMAL_THROTTLE_TIME, "Total time(ms) in thermal throttle status", "THERMAL_THROTTLE_TIME", true)
+11 -3
View File
@@ -45,6 +45,12 @@ rdc_status_t get_watches(rdc_handle_t rdc_handle, rdc_gpu_group_t group_id) {
<< std::setw(50) << std::left << ((components & RDC_HEALTH_WATCH_XGMI) ? on : off).c_str() << "|\n";
std::cout << "|" << std::setw(20) << std::left << " Memory" << "| "
<< std::setw(50) << std::left << ((components & RDC_HEALTH_WATCH_MEM) ? on : off).c_str() << "|\n";
std::cout << "|" << std::setw(20) << std::left << " EEPROM" << "| "
<< std::setw(50) << std::left << ((components & RDC_HEALTH_WATCH_EEPROM) ? on : off).c_str() << "|\n";
std::cout << "|" << std::setw(20) << std::left << " Thermal" << "| "
<< std::setw(50) << std::left << ((components & RDC_HEALTH_WATCH_THERMAL) ? on : off).c_str() << "|\n";
std::cout << "|" << std::setw(20) << std::left << " Power" << "| "
<< std::setw(50) << std::left << ((components & RDC_HEALTH_WATCH_POWER) ? on : off).c_str() << "|\n";
std::cout << "+--------------------+" //"-" width :20
<< "---------------------------------------------------+\n"; //-" width :51
}
@@ -79,8 +85,8 @@ std::string component_string(rdc_health_system_t component) {
case RDC_HEALTH_WATCH_MEM:
return "Memory system: ";
case RDC_HEALTH_WATCH_INFOROM:
return "Inforom system: ";
case RDC_HEALTH_WATCH_EEPROM:
return "EEPROM system: ";
case RDC_HEALTH_WATCH_THERMAL:
return "Thermal system:";
@@ -280,7 +286,9 @@ int main(int, char**) {
// (3) set health watches.
unsigned int components;
components = RDC_HEALTH_WATCH_PCIE | RDC_HEALTH_WATCH_XGMI | RDC_HEALTH_WATCH_MEM;
components = RDC_HEALTH_WATCH_PCIE | RDC_HEALTH_WATCH_XGMI | RDC_HEALTH_WATCH_MEM
| RDC_HEALTH_WATCH_EEPROM | RDC_HEALTH_WATCH_THERMAL
| RDC_HEALTH_WATCH_POWER;
result = rdc_health_set(rdc_handle, group_id, components);
if (result != RDC_ST_OK) {
std::cout << "Error setting health watches. Return: " << rdc_status_string(result)
+5 -4
View File
@@ -80,6 +80,7 @@ typedef enum {
//!< but none was found
RDC_ST_PERM_ERROR, //!< Insufficient permission to complete
//!< operation
RDC_ST_CORRUPTED_EEPROM, //!< EEPROM is corrupted
RDC_ST_DISABLED_MODULE, //!< Attempted loading disabled module
RDC_ST_UNKNOWN_ERROR = 0xFFFFFFFF //!< Unknown error
@@ -353,8 +354,8 @@ typedef enum {
RDC_HEALTH_PCIE_REPLAY_COUNT, //!< Total PCIE replay count
RDC_HEALTH_RETIRED_PAGE_NUM, //!< Retired page number
RDC_HEALTH_PENDING_PAGE_NUM, //!< Pending page number
RDC_HEALTH_RETIRED_PAGE_LIMIT, //!< The threshold of retired page
RDC_HEALTH_UNCORRECTABLE_PAGE_LIMIT, //!< The threshold of uncorrectable page
RDC_HEALTH_RETIRED_PAGE_LIMIT, //!< the threshold of retired page number
RDC_HEALTH_EEPROM_CONFIG_VALID, //!< Reads the EEPROM and verifies the checksums
RDC_HEALTH_POWER_THROTTLE_TIME, //!< Power throttle status counter
RDC_HEALTH_THERMAL_THROTTLE_TIME, //!< Total time in thermal throttle status (microseconds)
} rdc_field_t;
@@ -681,7 +682,7 @@ typedef enum {
RDC_HEALTH_WATCH_PCIE = 0x1, //!< PCIe system watches
RDC_HEALTH_WATCH_XGMI = 0x2, //!< XGMI system watches
RDC_HEALTH_WATCH_MEM = 0x4, //!< Memory watches
RDC_HEALTH_WATCH_INFOROM = 0x8, //!< Inforom watches
RDC_HEALTH_WATCH_EEPROM = 0x8, //!< EEPROM watches
RDC_HEALTH_WATCH_THERMAL = 0x10, //!< Temperature watches
RDC_HEALTH_WATCH_POWER = 0x20, //!< Power watches
} rdc_health_system_t;
@@ -716,7 +717,7 @@ typedef enum {
RDC_FR_CLOCKS_THROTTLE_POWER = 1006,
RDC_FR_XGMI_SINGLE_ERROR = 1007,
RDC_FR_XGMI_MULTIPLE_ERROR = 1008,
RDC_FR_CORRUPT_INFOROM = 1009
RDC_FR_CORRUPT_EEPROM = 1009
} rdc_health_error_code_t;
/**
@@ -137,6 +137,7 @@ class RdcWatchTableImpl : public RdcWatchTable {
rdc_status_t get_start_end_values(rdc_gpu_group_t group_id,
uint32_t gpu_index,
rdc_field_t field,
uint64_t start_timestamp,
rdc_field_value *start_value,
rdc_field_value *end_value);
rdc_status_t pcie_check(rdc_gpu_group_t group_id,
@@ -145,6 +146,12 @@ class RdcWatchTableImpl : public RdcWatchTable {
uint32_t gpu_index, rdc_health_response_t* response);
rdc_status_t memory_check(rdc_gpu_group_t group_id,
uint32_t gpu_index, rdc_health_response_t* response);
rdc_status_t eeprom_check(rdc_gpu_group_t group_id,
uint32_t gpu_index, rdc_health_response_t* response);
rdc_status_t thermal_check(rdc_gpu_group_t group_id,
uint32_t gpu_index, rdc_health_response_t* response);
rdc_status_t power_check(rdc_gpu_group_t group_id,
uint32_t gpu_index, rdc_health_response_t* response);
RdcGroupSettingsPtr group_settings_;
RdcCacheManagerPtr cache_mgr_;
+2 -1
View File
@@ -55,6 +55,7 @@ class rdc_status_t(Enum):
RDC_ST_FILE_ERROR = 12
RDC_ST_NO_DATA = 13
RDC_ST_PERM_ERROR = 14
RDC_ST_CORRUPTED_EEPROM = 15
RDC_ST_UNKNOWN_ERROR = 4294967295
class rdc_operation_mode_t(c_int):
@@ -173,7 +174,7 @@ class rdc_field_t(c_int):
RDC_HEALTH_RETIRED_PAGE_NUM = 3002
RDC_HEALTH_PENDING_PAGE_NUM = 3003
RDC_HEALTH_RETIRED_PAGE_LIMIT = 3004
RDC_HEALTH_UNCORRECTABLE_PAGE_LIMIT = 3005
RDC_HEALTH_EEPROM_CONFIG_VALID = 3005
RDC_HEALTH_POWER_THROTTLE_TIME = 3006
RDC_HEALTH_THERMAL_THROTTLE_TIME = 3007
@@ -369,6 +369,8 @@ const char* rdc_status_string(rdc_status_t result) {
return "Data was requested, but none was found";
case RDC_ST_PERM_ERROR:
return "Insufficient permission to complete operation";
case RDC_ST_CORRUPTED_EEPROM:
return "EEPROM is corrupted";
case RDC_ST_UNKNOWN_ERROR:
return "Unknown error";
default:
@@ -886,10 +886,38 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field
break;
}
case RDC_HEALTH_RETIRED_PAGE_LIMIT:
case RDC_HEALTH_UNCORRECTABLE_PAGE_LIMIT:
case RDC_HEALTH_POWER_THROTTLE_TIME: //gpu_metrics 1.6
case RDC_HEALTH_THERMAL_THROTTLE_TIME: //gpu_metrics 1.6
case RDC_HEALTH_RETIRED_PAGE_LIMIT: {
uint32_t retired_page_threshold = 0;
ret = amdsmi_get_gpu_bad_page_threshold(processor_handle, &retired_page_threshold);
value->status = Smi2RdcError(ret);
value->type = INTEGER;
if (value->status == AMDSMI_STATUS_SUCCESS) {
value->value.l_int = static_cast<int64_t>(retired_page_threshold);
}
break;
}
case RDC_HEALTH_EEPROM_CONFIG_VALID: {
ret = amdsmi_gpu_validate_ras_eeprom(processor_handle);
value->status = Smi2RdcError(ret);
break;
}
case RDC_HEALTH_POWER_THROTTLE_TIME:
case RDC_HEALTH_THERMAL_THROTTLE_TIME: {
amdsmi_violation_status_t violation_status;
ret = amdsmi_get_violation_status(processor_handle, &violation_status);
value->status = Smi2RdcError(ret);
value->type = INTEGER;
if (value->status == AMDSMI_STATUS_SUCCESS) {
if (RDC_HEALTH_POWER_THROTTLE_TIME == field_id)
value->value.l_int = static_cast<int64_t>(violation_status.acc_ppt_pwr);
if (RDC_HEALTH_THERMAL_THROTTLE_TIME == field_id)
value->value.l_int = static_cast<int64_t>(violation_status.acc_socket_thrm);
}
break;
}
default:
break;
}
+1 -1
View File
@@ -181,7 +181,7 @@ rdc_status_t RdcSmiLib::rdc_telemetry_fields_query(uint32_t field_ids[MAX_NUM_FI
RDC_EVNT_XGMI_4_THRPUT, RDC_EVNT_XGMI_5_THRPUT, RDC_FI_OAM_ID,
RDC_FI_GPU_MM_ENC_UTIL, RDC_FI_GPU_MM_DEC_UTIL, RDC_FI_GPU_MEMORY_ACTIVITY,
RDC_HEALTH_XGMI_ERROR, RDC_HEALTH_PCIE_REPLAY_COUNT, RDC_HEALTH_RETIRED_PAGE_NUM,
RDC_HEALTH_PENDING_PAGE_NUM, RDC_HEALTH_RETIRED_PAGE_LIMIT, RDC_HEALTH_UNCORRECTABLE_PAGE_LIMIT,
RDC_HEALTH_PENDING_PAGE_NUM, RDC_HEALTH_RETIRED_PAGE_LIMIT, RDC_HEALTH_EEPROM_CONFIG_VALID,
RDC_HEALTH_POWER_THROTTLE_TIME, RDC_HEALTH_THERMAL_THROTTLE_TIME,
RDC_FI_GPU_MEMORY_MAX_BANDWIDTH, RDC_FI_GPU_MEMORY_CUR_BANDWIDTH,
};
@@ -35,6 +35,7 @@ THE SOFTWARE.
#include "rdc_lib/RdcLogger.h"
#include "rdc_lib/impl/RdcMetricFetcherImpl.h"
#include "rdc_lib/rdc_common.h"
#include "rdc_lib/impl/SmiUtils.h"
namespace amd {
namespace rdc {
@@ -392,10 +393,10 @@ rdc_status_t RdcWatchTableImpl::create_health_field_group(unsigned int component
field_ids.push_back(RDC_HEALTH_RETIRED_PAGE_NUM);
field_ids.push_back(RDC_HEALTH_PENDING_PAGE_NUM);
field_ids.push_back(RDC_HEALTH_RETIRED_PAGE_LIMIT);
field_ids.push_back(RDC_HEALTH_UNCORRECTABLE_PAGE_LIMIT);
}
if (components & RDC_HEALTH_WATCH_INFOROM) {
if (components & RDC_HEALTH_WATCH_EEPROM) {
field_ids.push_back(RDC_HEALTH_EEPROM_CONFIG_VALID);
}
if (components & RDC_HEALTH_WATCH_THERMAL) {
@@ -506,24 +507,23 @@ bool RdcWatchTableImpl::add_health_incident(uint32_t gpu_index,
rdc_status_t RdcWatchTableImpl::get_start_end_values(rdc_gpu_group_t group_id,
uint32_t gpu_index,
rdc_field_t field,
uint64_t start_timestamp,
rdc_field_value *start_value,
rdc_field_value *end_value) {
if ((nullptr == start_value) || (nullptr == end_value))
if ((nullptr == start_value) && (nullptr == end_value))
return RDC_ST_BAD_PARAMETER;
uint64_t start_timestamp = 0;
//get the history data last 1 minute
start_timestamp = static_cast<uint64_t>(time(nullptr) - 60) * 1000;
//get the values of the field at the start_timestamp/end_timestampe
rdc_status_t result = cache_mgr_->rdc_health_get_values(group_id,
gpu_index, field,
start_timestamp, 0,
start_value, nullptr);
if (result != RDC_ST_OK) {
RDC_LOG(RDC_ERROR, "Error get gpu: " << gpu_index << " field: " << field << " history data. Return: " << result);
return result;
rdc_status_t result = RDC_ST_OK;
if (nullptr != start_value) {
//get the values of the field at the start_timestamp/end_timestampe
result = cache_mgr_->rdc_health_get_values(group_id,
gpu_index, field,
start_timestamp, 0,
start_value, nullptr);
if (result != RDC_ST_OK) {
RDC_LOG(RDC_ERROR, "Error get gpu: " << gpu_index << " field: " << field << " history data. Return: " << result);
return result;
}
}
// get end values
@@ -539,9 +539,12 @@ rdc_status_t RdcWatchTableImpl::pcie_check(rdc_gpu_group_t group_id,
rdc_health_response_t* response) {
//get field start/end values
rdc_field_value start = {}, end = {};
uint64_t start_timestamp = static_cast<uint64_t>(time(nullptr) - 60) * 1000;
//get the history data last 1 minute
rdc_status_t result = get_start_end_values(group_id,
gpu_index,
RDC_HEALTH_PCIE_REPLAY_COUNT,
start_timestamp,
&start,
&end);
if (result != RDC_ST_OK)
@@ -575,11 +578,12 @@ rdc_status_t RdcWatchTableImpl::xgmi_check(rdc_gpu_group_t group_id,
uint32_t gpu_index,
rdc_health_response_t* response) {
//get field start/end values
rdc_field_value start = {}, end = {};
rdc_field_value end = {};
rdc_status_t result = get_start_end_values(group_id,
gpu_index,
RDC_HEALTH_XGMI_ERROR,
&start,
0,
nullptr,
&end);
if (result != RDC_ST_OK)
return result;
@@ -617,23 +621,24 @@ rdc_status_t RdcWatchTableImpl::memory_check(rdc_gpu_group_t group_id,
uint32_t gpu_index,
rdc_health_response_t* response) {
//get field start/end values
rdc_field_value start = {}, end = {};
rdc_field_value start= {}, end = {};
rdc_status_t result = get_start_end_values(group_id,
gpu_index,
RDC_FI_ECC_UNCORRECT_TOTAL,
&start,
0,
nullptr,
&end);
if (result != RDC_ST_OK)
return result;
uint64_t ecc_uncorrectable_count = 0;
ecc_uncorrectable_count = end.value.l_int - start.value.l_int;
ecc_uncorrectable_count = end.value.l_int;
if (ecc_uncorrectable_count > 0) {
rdc_health_incidents_t *incident = &response->incidents[response->incidents_count];
std::string err_msg = "Detected ";
err_msg += std::to_string(ecc_uncorrectable_count);
err_msg += " uncorrectable ECC error(s) in the last minute.";
err_msg += " uncorrectable ECC error(s) since last GPU reset.";
//add incident
if (add_health_incident(gpu_index,
@@ -649,12 +654,13 @@ rdc_status_t RdcWatchTableImpl::memory_check(rdc_gpu_group_t group_id,
result = get_start_end_values(group_id,
gpu_index,
RDC_HEALTH_PENDING_PAGE_NUM,
&start,
0,
nullptr,
&end);
if (result != RDC_ST_OK)
return result;
uint64_t num_pages = end.value.l_int - start.value.l_int;
uint64_t num_pages = end.value.l_int;
if (num_pages > 0) {
rdc_health_incidents_t *incident = &response->incidents[response->incidents_count];
@@ -673,12 +679,192 @@ rdc_status_t RdcWatchTableImpl::memory_check(rdc_gpu_group_t group_id,
return RDC_ST_MAX_LIMIT;
}
//To do: RDC_FR_RETIRED_PAGES_LIMIT
//To do: RDC_FR_RETIRED_PAGES_UNCORRECTABLE_LIMIT
//get retired page number
result = get_start_end_values(group_id,
gpu_index,
RDC_HEALTH_RETIRED_PAGE_NUM,
0,
nullptr,
&end);
if (result != RDC_ST_OK)
return result;
uint64_t retired_page = end.value.l_int;
//get retired page threshold
result = get_start_end_values(group_id,
gpu_index,
RDC_HEALTH_RETIRED_PAGE_LIMIT,
0,
nullptr,
&end);
if (result != RDC_ST_OK)
return result;
uint32_t retired_page_threshold = end.value.l_int;
if (retired_page > retired_page_threshold) {
rdc_health_incidents_t *incident = &response->incidents[response->incidents_count];
std::string err_msg = "Detected ";
err_msg += std::to_string(retired_page);
err_msg += " retired pages exceeding the max limit: ";
err_msg += std::to_string(retired_page_threshold);
err_msg += ".";
//add incident
if (add_health_incident(gpu_index,
RDC_HEALTH_WATCH_MEM,
RDC_HEALTH_RESULT_FAIL,
RDC_FR_RETIRED_PAGES_LIMIT,
err_msg,
incident,
response))
return RDC_ST_MAX_LIMIT;
return RDC_ST_OK;
}
if (retired_page > 0) {
uint64_t start_timestamp = static_cast<uint64_t>(time(nullptr) - 604800) * 1000;
//get retired page number last 1 week
result = get_start_end_values(group_id,
gpu_index,
RDC_HEALTH_RETIRED_PAGE_NUM,
start_timestamp,
&start,
&end);
if (result != RDC_ST_OK)
return result;
retired_page = end.value.l_int - start.value.l_int;
if (retired_page > 1) {
rdc_health_incidents_t *incident = &response->incidents[response->incidents_count];
std::string err_msg = "Detected ";
err_msg += std::to_string(retired_page);
err_msg += " retired pages more than one in the last week.";
//add incident
if (add_health_incident(gpu_index,
RDC_HEALTH_WATCH_MEM,
RDC_HEALTH_RESULT_FAIL,
RDC_FR_RETIRED_PAGES_UNCORRECTABLE_LIMIT,
err_msg,
incident,
response))
return RDC_ST_MAX_LIMIT;
}
}
return RDC_ST_OK;
}
// EEPROM health check for one GPU of a group.
//
// Reads the latest value of RDC_HEALTH_EEPROM_CONFIG_VALID (the field the
// EEPROM watch registers) and, when the lookup reports
// RDC_ST_CORRUPTED_EEPROM, records a warning incident in `response`.
//
// group_id   group the GPU belongs to
// gpu_index  GPU to check
// response   health response that receives the incident (written through
//            without a null check here)
//
// Returns RDC_ST_OK when the check ran (with or without an incident),
// RDC_ST_MAX_LIMIT when add_health_incident() reports that no further
// incident can be stored, or the error returned by get_start_end_values().
rdc_status_t RdcWatchTableImpl::eeprom_check(rdc_gpu_group_t group_id,
                                             uint32_t gpu_index,
                                             rdc_health_response_t* response) {
  // Only the most recent sample matters, so no start value is requested.
  rdc_field_value end = {};
  // Query the EEPROM validation field. The previous code asked for
  // RDC_FI_ECC_UNCORRECT_TOTAL, which can never yield a corrupted-EEPROM status.
  // NOTE(review): assumes get_start_end_values() propagates the fetch status
  // (RDC_ST_CORRUPTED_EEPROM) as its return value -- confirm against the cache
  // manager; the status may instead be carried in the returned field value.
  rdc_status_t result = get_start_end_values(group_id,
                                             gpu_index,
                                             RDC_HEALTH_EEPROM_CONFIG_VALID,
                                             0,
                                             nullptr,
                                             &end);
  if (result != RDC_ST_OK && result != RDC_ST_CORRUPTED_EEPROM)
    return result;

  if (result == RDC_ST_CORRUPTED_EEPROM) {
    rdc_health_incidents_t *incident = &response->incidents[response->incidents_count];
    std::string err_msg = "Detected a corrupt EEPROM since last GPU reset.";

    //add incident
    if (add_health_incident(gpu_index,
                            RDC_HEALTH_WATCH_EEPROM,
                            RDC_HEALTH_RESULT_WARN,
                            RDC_FR_CORRUPT_EEPROM,
                            err_msg,
                            incident,
                            response))
      return RDC_ST_MAX_LIMIT;
  }

  return RDC_ST_OK;
}
// Thermal health check for one GPU of a group.
//
// Compares the RDC_HEALTH_THERMAL_THROTTLE_TIME value from one minute ago
// with the latest one; if the counter grew, records a warning incident
// (RDC_FR_CLOCKS_THROTTLE_THERMAL) in `response`.
//
// group_id   group the GPU belongs to
// gpu_index  GPU to check
// response   health response that receives the incident (written through
//            without a null check here)
//
// Returns RDC_ST_OK when the check ran (with or without an incident),
// RDC_ST_MAX_LIMIT when add_health_incident() reports that no further
// incident can be stored, or the error returned by get_start_end_values().
rdc_status_t RdcWatchTableImpl::thermal_check(rdc_gpu_group_t group_id,
                                              uint32_t gpu_index,
                                              rdc_health_response_t* response) {
  //get field start/end values
  rdc_field_value start = {}, end = {};
  //get the history data last 1 minute (timestamp in milliseconds)
  uint64_t start_timestamp = static_cast<uint64_t>(time(nullptr) - 60) * 1000;
  rdc_status_t result = get_start_end_values(group_id,
                                             gpu_index,
                                             RDC_HEALTH_THERMAL_THROTTLE_TIME,
                                             start_timestamp,
                                             &start,
                                             &end);
  if (result != RDC_ST_OK)
    return result;

  // Keep the delta signed: if the counter went backwards, the old unsigned
  // subtraction wrapped to a huge value and raised a bogus incident.
  const int64_t acc_socket_thrm = end.value.l_int - start.value.l_int;
  if (acc_socket_thrm > 0) {
    rdc_health_incidents_t *incident = &response->incidents[response->incidents_count];
    std::string err_msg = "Detected ";
    err_msg += std::to_string(acc_socket_thrm);
    err_msg += " clock throttling due to thermal violation in the last minute.";

    //add incident
    if (add_health_incident(gpu_index,
                            RDC_HEALTH_WATCH_THERMAL,
                            RDC_HEALTH_RESULT_WARN,
                            RDC_FR_CLOCKS_THROTTLE_THERMAL,
                            err_msg,
                            incident,
                            response))
      return RDC_ST_MAX_LIMIT;
  }

  return RDC_ST_OK;
}
// Power health check for one GPU of a group.
//
// Compares the RDC_HEALTH_POWER_THROTTLE_TIME value from one minute ago
// with the latest one; if the counter grew, records a warning incident
// (RDC_FR_CLOCKS_THROTTLE_POWER) in `response`.
//
// group_id   group the GPU belongs to
// gpu_index  GPU to check
// response   health response that receives the incident (written through
//            without a null check here)
//
// Returns RDC_ST_OK when the check ran (with or without an incident),
// RDC_ST_MAX_LIMIT when add_health_incident() reports that no further
// incident can be stored, or the error returned by get_start_end_values().
rdc_status_t RdcWatchTableImpl::power_check(rdc_gpu_group_t group_id,
                                            uint32_t gpu_index,
                                            rdc_health_response_t* response) {
  //get field start/end values
  rdc_field_value start = {}, end = {};
  //get the history data last 1 minute (timestamp in milliseconds)
  uint64_t start_timestamp = static_cast<uint64_t>(time(nullptr) - 60) * 1000;
  rdc_status_t result = get_start_end_values(group_id,
                                             gpu_index,
                                             RDC_HEALTH_POWER_THROTTLE_TIME,
                                             start_timestamp,
                                             &start,
                                             &end);
  if (result != RDC_ST_OK)
    return result;

  // Keep the delta signed: if the counter went backwards, the old unsigned
  // subtraction wrapped to a huge value and raised a bogus incident.
  const int64_t acc_ppt_pwr = end.value.l_int - start.value.l_int;
  if (acc_ppt_pwr > 0) {
    rdc_health_incidents_t *incident = &response->incidents[response->incidents_count];
    // The message used to repeat "Detected" after the count; it now matches
    // the wording used by thermal_check().
    std::string err_msg = "Detected ";
    err_msg += std::to_string(acc_ppt_pwr);
    err_msg += " clock throttling due to power violation in the last minute.";

    //add incident
    if (add_health_incident(gpu_index,
                            RDC_HEALTH_WATCH_POWER,
                            RDC_HEALTH_RESULT_WARN,
                            RDC_FR_CLOCKS_THROTTLE_POWER,
                            err_msg,
                            incident,
                            response))
      return RDC_ST_MAX_LIMIT;
  }

  return RDC_ST_OK;
}
rdc_status_t RdcWatchTableImpl::rdc_health_check(rdc_gpu_group_t group_id,
rdc_health_response_t *response) {
if (nullptr == response)
@@ -739,22 +925,25 @@ rdc_status_t RdcWatchTableImpl::rdc_health_check(rdc_gpu_group_t group_id,
return result;
}
//InfoROM
if (components & RDC_HEALTH_WATCH_INFOROM) {
//To do:
return RDC_ST_NOT_SUPPORTED;
//EEPROM
if (components & RDC_HEALTH_WATCH_EEPROM) {
result = eeprom_check(group_id, ginfo.entity_ids[gindex], response);
if (result == RDC_ST_MAX_LIMIT)
return result;
}
//Thermal
if (components & RDC_HEALTH_WATCH_THERMAL) {
//To do:
return RDC_ST_NOT_SUPPORTED;
result = thermal_check(group_id, ginfo.entity_ids[gindex], response);
if (result == RDC_ST_MAX_LIMIT)
return result;
}
//Power
if (components & RDC_HEALTH_WATCH_POWER) {
//To do:
return RDC_ST_NOT_SUPPORTED;
result = power_check(group_id, ginfo.entity_ids[gindex], response);
if (result == RDC_ST_MAX_LIMIT)
return result;
}
} //end of for gindex
@@ -58,6 +58,9 @@ rdc_status_t Smi2RdcError(amdsmi_status_t rsmi) {
case AMDSMI_STATUS_NO_PERM:
return RDC_ST_PERM_ERROR;
case AMDSMI_STATUS_CORRUPTED_EEPROM:
return RDC_ST_CORRUPTED_EEPROM;
case AMDSMI_STATUS_BUSY:
case AMDSMI_STATUS_UNKNOWN_ERROR:
case AMDSMI_STATUS_INTERNAL_EXCEPTION:
+16 -21
View File
@@ -106,10 +106,9 @@ void RdciHealthSubSystem::parse_cmd_opts(int argc, char** argv) {
components |= RDC_HEALTH_WATCH_PCIE;
components |= RDC_HEALTH_WATCH_XGMI;
components |= RDC_HEALTH_WATCH_MEM;
//To do:
//components |= RDC_HEALTH_WATCH_INFOROM;
//components |= RDC_HEALTH_WATCH_THERMAL;
//components |= RDC_HEALTH_WATCH_POWER;
components |= RDC_HEALTH_WATCH_EEPROM;
components |= RDC_HEALTH_WATCH_THERMAL;
components |= RDC_HEALTH_WATCH_POWER;
break;
case 'p':
@@ -120,17 +119,13 @@ void RdciHealthSubSystem::parse_cmd_opts(int argc, char** argv) {
components |= RDC_HEALTH_WATCH_MEM;
break;
case 'i':
//To do:
//components |= RDC_HEALTH_WATCH_INFOROM;
throw RdcException(RDC_ST_NOT_SUPPORTED, "Not supported");
case 'e':
components |= RDC_HEALTH_WATCH_EEPROM;
break;
case 't':
//To do:
//components |= RDC_HEALTH_WATCH_THERMAL;
//components |= RDC_HEALTH_WATCH_POWER;
throw RdcException(RDC_ST_NOT_SUPPORTED, "Not supported");
components |= RDC_HEALTH_WATCH_THERMAL;
components |= RDC_HEALTH_WATCH_POWER;
break;
case 'x':
@@ -187,8 +182,8 @@ void RdciHealthSubSystem::show_help() const {
std::cout << " a - watch all components\n";
std::cout << " p - watch PCIe\n";
std::cout << " m - watch Memory\n";
//std::cout << " i - watch infoROM\n";
//std::cout << " t - watch power and thermal\n";
std::cout << " e - watch EEPROM\n";
std::cout << " t - watch power and thermal\n";
std::cout << " x - watch XGMI\n";
std::cout << " -c --check Check to see if any errors or warnings have "
<< "occurred in the currently monitored watches.\n";
@@ -215,9 +210,9 @@ void RdciHealthSubSystem::get_watches() const {
std::cout << "{\"Component\" : \"PCIe\", \"Status\" : \"" << ((components & RDC_HEALTH_WATCH_PCIE) ? on : off).c_str() << "\"},";
std::cout << "{\"Component\" : \"XGMI\", \"Status\" : \"" << ((components & RDC_HEALTH_WATCH_XGMI) ? on : off).c_str() << "\"},";
std::cout << "{\"Component\" : \"Memory\", \"Status\" : \"" << ((components & RDC_HEALTH_WATCH_MEM) ? on : off).c_str() << "\"},";
/*std::cout << "{\"Component\" : \"InfoROM\", \"Status\" : \"" << ((components & RDC_HEALTH_WATCH_INFOROM) ? on : off).c_str() << "\"},";
std::cout << "{\"Component\" : \"EEPROM\", \"Status\" : \"" << ((components & RDC_HEALTH_WATCH_EEPROM) ? on : off).c_str() << "\"},";
std::cout << "{\"Component\" : \"Thermal\", \"Status\" : \"" << ((components & RDC_HEALTH_WATCH_THERMAL) ? on : off).c_str() << "\"},";
std::cout << "{\"Component\" : \"Power\", \"Status\" : \"" << ((components & RDC_HEALTH_WATCH_POWER) ? on : off).c_str() << "\"}";*/
std::cout << "{\"Component\" : \"Power\", \"Status\" : \"" << ((components & RDC_HEALTH_WATCH_POWER) ? on : off).c_str() << "\"}";
std::cout << "]";
} else {
std::cout << "Health monitor systems status:" << std::endl;
@@ -229,12 +224,12 @@ void RdciHealthSubSystem::get_watches() const {
<< std::setw(50) << std::left << ((components & RDC_HEALTH_WATCH_XGMI) ? on : off).c_str() << "|\n";
std::cout << "|" << std::setw(20) << std::left << " Memory" << "| "
<< std::setw(50) << std::left << ((components & RDC_HEALTH_WATCH_MEM) ? on : off).c_str() << "|\n";
/*std::cout << "|" << std::setw(20) << std::left << " InfoROM" << "| "
<< std::setw(50) << std::left << ((components & RDC_HEALTH_WATCH_INFOROM) ? on : off).c_str() << "|\n";
std::cout << "|" << std::setw(20) << std::left << " EEPROM" << "| "
<< std::setw(50) << std::left << ((components & RDC_HEALTH_WATCH_EEPROM) ? on : off).c_str() << "|\n";
std::cout << "|" << std::setw(20) << std::left << " Thermal" << "| "
<< std::setw(50) << std::left << ((components & RDC_HEALTH_WATCH_THERMAL) ? on : off).c_str() << "|\n";
std::cout << "|" << std::setw(20) << std::left << " Power" << "| "
<< std::setw(50) << std::left << ((components & RDC_HEALTH_WATCH_POWER) ? on : off).c_str() << "|\n";*/
<< std::setw(50) << std::left << ((components & RDC_HEALTH_WATCH_POWER) ? on : off).c_str() << "|\n";
std::cout << "+--------------------+" //"-" width :20
<< "---------------------------------------------------+\n"; //-" width :51
}
@@ -282,8 +277,8 @@ std::string RdciHealthSubSystem::component_string(rdc_health_system_t component)
case RDC_HEALTH_WATCH_MEM:
return "Memory system: ";
case RDC_HEALTH_WATCH_INFOROM:
return "Inforom system: ";
case RDC_HEALTH_WATCH_EEPROM:
return "EEPROM system: ";
case RDC_HEALTH_WATCH_THERMAL:
return "Thermal system:";