From a34832f11e5c2a629bac094eccdd2b7fcde6a129 Mon Sep 17 00:00:00 2001
From: Kent Russell
Date: Thu, 8 Aug 2019 09:00:22 -0400
Subject: [PATCH] Fix RAS change

RAS formatting changed, so get it to handle both types of sysfs output
until it's normalized

Change-Id: I56f2a2495af8ff4d01011bc614283376afb9ad0a
---
Review notes (ignored by git-am; not part of the commit message):

 * tests/rocm_smi_test/test_common.cc: the pre-image line for "Poison" has
   no trailing comma. Appending "off", and "on", after it made the compiler
   concatenate the adjacent literals into "Poisonoff", so the table grew by
   one element only and the sizeof static_assert passed by accident while
   kRasErrStateStrings[RSMI_RAS_ERR_STATE_POISON] held the wrong text.
   This revision adds the comma and appends a single row for the one new
   enumerator, RSMI_RAS_ERR_STATE_ENABLED. The "off" row is dropped because
   RSMI_RAS_ERR_STATE_DISABLED is not a new enumerator and the table is
   indexed by enum value.
 * Hunk headers and the diffstat were recomputed for that change; the
   post-image blob id on the test_common.cc index line is stale as a result.
 * NOTE(review): line breaks, blank context lines and indentation were
   reconstructed from a whitespace-collapsed copy of this patch. Verify the
   context lines against the tree, or apply with --ignore-whitespace.

 include/rocm_smi/rocm_smi.h        |  3 ++-
 src/rocm_smi.cc                    | 10 +++++++---
 tests/rocm_smi_test/test_common.cc |  7 +++++--
 3 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/include/rocm_smi/rocm_smi.h b/include/rocm_smi/rocm_smi.h
index 4434e93f39..a32b7e0809 100755
--- a/include/rocm_smi/rocm_smi.h
+++ b/include/rocm_smi/rocm_smi.h
@@ -383,8 +383,9 @@ typedef enum {
   RSMI_RAS_ERR_STATE_MULT_UC,    //!< Multiple uncorrectable errors
   RSMI_RAS_ERR_STATE_POISON,     //!< Firmware detected error and isolated
                                  //!< page. Treat as uncorrectable.
+  RSMI_RAS_ERR_STATE_ENABLED,    //!< ECC is enabled
 
-  RSMI_RAS_ERR_STATE_LAST = RSMI_RAS_ERR_STATE_POISON,
+  RSMI_RAS_ERR_STATE_LAST = RSMI_RAS_ERR_STATE_ENABLED,
   RSMI_RAS_ERR_STATE_INVALID = 0xFFFFFFFF
 } rsmi_ras_err_state_t;
 
diff --git a/src/rocm_smi.cc b/src/rocm_smi.cc
index 2b3953af80..bf8fb81404 100755
--- a/src/rocm_smi.cc
+++ b/src/rocm_smi.cc
@@ -532,8 +532,10 @@ static const std::map kRocmSMIStateMap = {
   {"single_correctable", RSMI_RAS_ERR_STATE_SING_C},
   {"multi_uncorrectable", RSMI_RAS_ERR_STATE_MULT_UC},
   {"poison", RSMI_RAS_ERR_STATE_POISON},
+  {"off", RSMI_RAS_ERR_STATE_DISABLED},
+  {"on", RSMI_RAS_ERR_STATE_ENABLED},
 };
-static_assert(RSMI_RAS_ERR_STATE_LAST == RSMI_RAS_ERR_STATE_POISON,
+static_assert(RSMI_RAS_ERR_STATE_LAST == RSMI_RAS_ERR_STATE_ENABLED,
               "rsmi_gpu_block_t and/or above name map need to be updated"
               " and then this assert");
 
@@ -562,6 +564,7 @@ rsmi_status_t rsmi_dev_ecc_status_get(uint32_t dv_ind, rsmi_gpu_block_t block,
 
   std::string blk_line;
   std::string search_str = kRocmSMIBlockMap.at(block);
+  std::string sysfs_junk = " ras feature mask:";
   std::string state_str;
 
   search_str += ":";
@@ -570,8 +573,9 @@ rsmi_status_t rsmi_dev_ecc_status_get(uint32_t dv_ind, rsmi_gpu_block_t block,
     std::istringstream fs1(val_vec[i]);
 
     fs1 >> blk_line;
-
-    if (blk_line == search_str) {
+    if (blk_line == search_str || blk_line == kRocmSMIBlockMap.at(block)) {
+      if (blk_line.back() != ':')
+        fs1.ignore(sysfs_junk.length(), ':');
       fs1 >> state_str;
       assert(kRocmSMIStateMap.count(state_str));
       *state = kRocmSMIStateMap.at(state_str);
diff --git a/tests/rocm_smi_test/test_common.cc b/tests/rocm_smi_test/test_common.cc
index 50684065b8..6a7dfd112b 100755
--- a/tests/rocm_smi_test/test_common.cc
+++ b/tests/rocm_smi_test/test_common.cc
@@ -70,6 +70,7 @@ static const char * kRasErrStateStrings[] = {
   "Single, Correctable",           // RSMI_RAS_ERR_STATE_SING_C
   "Multiple, Uncorrectable",       // RSMI_RAS_ERR_STATE_MULT_UC
-  "Poison"                         // RSMI_RAS_ERR_STATE_POISON
+  "Poison",                        // RSMI_RAS_ERR_STATE_POISON
+  "on",                            // RSMI_RAS_ERR_STATE_ENABLED
 };
 static_assert(
   sizeof(kRasErrStateStrings)/sizeof(char *) == (RSMI_RAS_ERR_STATE_LAST + 1),
@@ -89,8 +90,10 @@ static const std::map kErrStateNameMap = {
    kRasErrStateStrings[RSMI_RAS_ERR_STATE_MULT_UC]},
   {RSMI_RAS_ERR_STATE_POISON,
    kRasErrStateStrings[RSMI_RAS_ERR_STATE_POISON]},
+  {RSMI_RAS_ERR_STATE_ENABLED,
+   kRasErrStateStrings[RSMI_RAS_ERR_STATE_ENABLED]},
 };
-static_assert(RSMI_RAS_ERR_STATE_LAST == RSMI_RAS_ERR_STATE_POISON,
+static_assert(RSMI_RAS_ERR_STATE_LAST == RSMI_RAS_ERR_STATE_ENABLED,
               "kErrStateNameMap needs to be updated");
 
 static const struct option long_options[] = {