From 6d6689b02c2b5399e970c77ffca620732acaa0dd Mon Sep 17 00:00:00 2001 From: Chris Freehill Date: Fri, 9 Aug 2019 11:32:16 -0500 Subject: [PATCH] Adjust how we read ECC block counter status This change corresponds to kernel changes. Change-Id: Ibd977e8b3338349036cb16e55fb0b2c9c187726d [ROCm/amdsmi commit: aaecfd6fff28c14be4890d6c6e68cbd736d9cb16] --- projects/amdsmi/include/rocm_smi/rocm_smi.h | 38 +++++++--- projects/amdsmi/src/rocm_smi.cc | 75 +++++-------------- projects/amdsmi/src/rocm_smi_device.cc | 2 +- .../rocm_smi_test/functional/err_cnt_read.cc | 19 ++--- .../amdsmi/tests/rocm_smi_test/test_common.cc | 17 ++++- 5 files changed, 68 insertions(+), 83 deletions(-) diff --git a/projects/amdsmi/include/rocm_smi/rocm_smi.h b/projects/amdsmi/include/rocm_smi/rocm_smi.h index a32b7e0809..6f7850aea8 100755 --- a/projects/amdsmi/include/rocm_smi/rocm_smi.h +++ b/projects/amdsmi/include/rocm_smi/rocm_smi.h @@ -361,10 +361,19 @@ typedef enum { RSMI_GPU_BLOCK_UMC = RSMI_GPU_BLOCK_FIRST, //!< UMC block RSMI_GPU_BLOCK_SDMA = 0x0000000000000002, //!< SDMA block RSMI_GPU_BLOCK_GFX = 0x0000000000000004, //!< GFX block + RSMI_GPU_BLOCK_MMHUB = 0x0000000000000008, //!< MMHUB block + RSMI_GPU_BLOCK_ATHUB = 0x0000000000000010, //!< ATHUB block + RSMI_GPU_BLOCK_PCIE_BIF = 0x0000000000000020, //!< PCIE_BIF block + RSMI_GPU_BLOCK_HDP = 0x0000000000000040, //!< HDP block + RSMI_GPU_BLOCK_XGMI_WAFL = 0x0000000000000080, //!< XGMI block + RSMI_GPU_BLOCK_DF = 0x0000000000000100, //!< DF block + RSMI_GPU_BLOCK_SMN = 0x0000000000000200, //!< SMN block + RSMI_GPU_BLOCK_SEM = 0x0000000000000400, //!< SEM block + RSMI_GPU_BLOCK_MP0 = 0x0000000000000800, //!< MP0 block + RSMI_GPU_BLOCK_MP1 = 0x0000000000001000, //!< MP1 block + RSMI_GPU_BLOCK_FUSE = 0x0000000000002000, //!< Fuse block - // New enum elements will be added as support is added for other blocks - - RSMI_GPU_BLOCK_LAST = RSMI_GPU_BLOCK_GFX, //!< The highest bit position + RSMI_GPU_BLOCK_LAST = RSMI_GPU_BLOCK_FUSE, //!< The highest bit position //!< for supported blocks RSMI_GPU_BLOCK_RESERVED = 0x8000000000000000 } rsmi_gpu_block_t; @@ -1818,6 +1827,10 @@ rsmi_dev_firmware_version_get(uint32_t dv_ind, rsmi_fw_block_t block, * * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. * + * ::RSMI_NOT_SUPPORTED will be returned if either ECC is not enabled for the + * specified block @p block, or if there is no kernel support for ECC for + * that block. + * */ rsmi_status_t rsmi_dev_ecc_count_get(uint32_t dv_ind, rsmi_gpu_block_t block, rsmi_error_count_t *ec); @@ -1826,22 +1839,25 @@ rsmi_status_t rsmi_dev_ecc_count_get(uint32_t dv_ind, * @brief Retrieve the enabled ECC bit-mask * * @details Given a device index @p dv_ind, and a pointer to a uint64_t @p - * enabled_mask, this function will write a bit_mask to memory pointed to by - * @p enabled_mask. Upon a successful call, the bitmask can then be AND'd with - * elements of the ::rsmi_gpu_block_t ennumeration to determine if the - * corresponding block has ECC enabled. Note that the bits above - * ::RSMI_GPU_BLOCK_LAST correspond to blocks that do not yet have ECC support. + * enabled_mask, this function will write bits to memory pointed to by + * @p enabled_blocks. Upon a successful call, @p enabled_blocks can then be + * AND'd with elements of the ::rsmi_gpu_block_t ennumeration to determine if + * the corresponding block has ECC enabled. Note that whether a block has ECC + * enabled or not in the device is independent of whether there is kernel + * support for error counting for that block. Although a block may be enabled, + * but there may not be kernel support for reading error counters for that + * block. * * @param[in] dv_ind a device index * - * @param[inout] enabled_mask A pointer to a uint64_t to which the enabled - * mask will be written + * @param[inout] enabled_blocks A pointer to a uint64_t to which the enabled + * blocks bits will be written * * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. * */ rsmi_status_t rsmi_dev_ecc_enabled_get(uint32_t dv_ind, - uint64_t *enabled_mask); + uint64_t *enabled_blocks); /** * @brief Retrieve the ECC status for a GPU block diff --git a/projects/amdsmi/src/rocm_smi.cc b/projects/amdsmi/src/rocm_smi.cc index bf8fb81404..3960e0f53c 100755 --- a/projects/amdsmi/src/rocm_smi.cc +++ b/projects/amdsmi/src/rocm_smi.cc @@ -471,20 +471,19 @@ rsmi_num_monitor_devices(uint32_t *num_devices) { } rsmi_status_t rsmi_dev_ecc_enabled_get(uint32_t dv_ind, - uint64_t *enabled_mask) { + uint64_t *enabled_blks) { TRY - rsmi_status_t ret; - - if (enabled_mask == nullptr) { + if (enabled_blks == nullptr) { return RSMI_STATUS_INVALID_ARGS; } + rsmi_status_t ret; + std::string feature_line; + std::string tmp_str; + DEVICE_MUTEX - std::vector val_vec; - - ret = get_dev_value_vec(amd::smi::kDevErrCntFeatures, dv_ind, &val_vec); - + ret = get_dev_value_line(amd::smi::kDevErrCntFeatures, dv_ind, &feature_line); if (ret == RSMI_STATUS_FILE_ERROR) { return RSMI_STATUS_NOT_SUPPORTED; } @@ -492,39 +491,22 @@ rsmi_status_t rsmi_dev_ecc_enabled_get(uint32_t dv_ind, return ret; } - std::string junk; - std::istringstream fs1(val_vec[0]); - std::string mask_str; + std::istringstream fs1(feature_line); - fs1 >> junk; - assert(junk == "feature"); - fs1 >> junk; - assert(junk == "mask:"); - fs1 >> mask_str; + fs1 >> tmp_str; // ignore + assert(tmp_str == "feature"); + fs1 >> tmp_str; // ignore + assert(tmp_str == "mask:"); + fs1 >> tmp_str; errno = 0; - *enabled_mask = strtoul(mask_str.c_str(), nullptr, 16); + *enabled_blks = strtoul(tmp_str.c_str(), nullptr, 16); assert(errno == 0); return errno_to_rsmi_status(errno); - CATCH } - -static const char *kRSMIGpuBlkUMCFName = "umc"; -static const char *kRSMIGpuBlkSDMAFName = "sdma"; -static const char *kRSMIGpuBlkGFXFName = "gfx"; - -static const std::map kRocmSMIBlockMap = { - {RSMI_GPU_BLOCK_UMC, kRSMIGpuBlkUMCFName}, - {RSMI_GPU_BLOCK_SDMA, kRSMIGpuBlkSDMAFName}, - {RSMI_GPU_BLOCK_GFX, kRSMIGpuBlkGFXFName}, -}; -static_assert(RSMI_GPU_BLOCK_LAST == RSMI_GPU_BLOCK_GFX, - "rsmi_gpu_block_t and/or above name map need to be updated" - " and then this assert"); - static const std::map kRocmSMIStateMap = { {"none", RSMI_RAS_ERR_STATE_NONE}, {"disabled", RSMI_RAS_ERR_STATE_DISABLED}, @@ -549,11 +531,11 @@ rsmi_status_t rsmi_dev_ecc_status_get(uint32_t dv_ind, rsmi_gpu_block_t block, return RSMI_STATUS_INVALID_ARGS; } rsmi_status_t ret; - std::vector val_vec; + uint64_t features_mask; DEVICE_MUTEX - ret = get_dev_value_vec(amd::smi::kDevErrCntFeatures, dv_ind, &val_vec); + ret = rsmi_dev_ecc_enabled_get(dv_ind, &features_mask); if (ret == RSMI_STATUS_FILE_ERROR) { return RSMI_STATUS_NOT_SUPPORTED; @@ -562,29 +544,10 @@ rsmi_status_t rsmi_dev_ecc_status_get(uint32_t dv_ind, rsmi_gpu_block_t block, return ret; } - std::string blk_line; - std::string search_str = kRocmSMIBlockMap.at(block); - std::string sysfs_junk = " ras feature mask:"; - std::string state_str; + *state = (features_mask & block) ? + RSMI_RAS_ERR_STATE_ENABLED : RSMI_RAS_ERR_STATE_DISABLED; - search_str += ":"; - - for (uint32_t i = 1; i < val_vec.size(); ++i) { // Skip features line - std::istringstream fs1(val_vec[i]); - - fs1 >> blk_line; - if (blk_line == search_str || blk_line == kRocmSMIBlockMap.at(block)) { - if (blk_line.back() != ':') - fs1.ignore(sysfs_junk.length(), ':'); - fs1 >> state_str; - assert(kRocmSMIStateMap.count(state_str)); - *state = kRocmSMIStateMap.at(state_str); - return RSMI_STATUS_SUCCESS; - } - } - assert(!"Block was not found"); - *state = RSMI_RAS_ERR_STATE_INVALID; - return RSMI_STATUS_NOT_FOUND; + return RSMI_STATUS_SUCCESS; CATCH } diff --git a/projects/amdsmi/src/rocm_smi_device.cc b/projects/amdsmi/src/rocm_smi_device.cc index 5d4ce010d2..dc6c0a5b53 100755 --- a/projects/amdsmi/src/rocm_smi_device.cc +++ b/projects/amdsmi/src/rocm_smi_device.cc @@ -415,6 +415,7 @@ int Device::readDevInfo(DevInfoTypes type, uint64_t *val) { case kDevSubSysDevID: case kDevSubSysVendorID: case kDevVendorID: + case kDevErrCntFeatures: ret = readDevInfoStr(type, &tempStr); RET_IF_NONZERO(ret); *val = std::stoi(tempStr, 0, 16); @@ -485,7 +486,6 @@ int Device::readDevInfo(DevInfoTypes type, std::vector *val) { case kDevErrCntSDMA: case kDevErrCntUMC: case kDevErrCntGFX: - case kDevErrCntFeatures: case kDevMemPageBad: return readDevInfoMultiLineStr(type, val); break; diff --git a/projects/amdsmi/tests/rocm_smi_test/functional/err_cnt_read.cc b/projects/amdsmi/tests/rocm_smi_test/functional/err_cnt_read.cc index 03c5b70df5..834914ba9c 100755 --- a/projects/amdsmi/tests/rocm_smi_test/functional/err_cnt_read.cc +++ b/projects/amdsmi/tests/rocm_smi_test/functional/err_cnt_read.cc @@ -98,6 +98,7 @@ void TestErrCntRead::Run(void) { std::cout << "\t**Error Count Enabled Mask get is not supported on this machine" << std::endl; + continue; } else { CHK_ERR_ASRT(err) IF_VERB(STANDARD) { @@ -109,17 +110,11 @@ void TestErrCntRead::Run(void) { b <= RSMI_GPU_BLOCK_LAST; b = b*2) { err = rsmi_dev_ecc_status_get(i, static_cast(b), &err_state); - if (err == RSMI_STATUS_NOT_SUPPORTED) { - std::cout << "\t**Error Count Status for " << - GetBlockNameStr(static_cast(b)) << - ": Not supported on this machine" << std::endl; - } else { - CHK_ERR_ASRT(err) - IF_VERB(STANDARD) { - std::cout << "\t**Error Count status for " << - GetBlockNameStr(static_cast(b)) << - " block: " << GetErrStateNameStr(err_state) << std::endl; - } + CHK_ERR_ASRT(err) + IF_VERB(STANDARD) { + std::cout << "\t**Error Count status for " << + GetBlockNameStr(static_cast(b)) << + " block: " << GetErrStateNameStr(err_state) << std::endl; } err = rsmi_dev_ecc_count_get(i, static_cast(b), &ec); @@ -127,7 +122,7 @@ void TestErrCntRead::Run(void) { if (err == RSMI_STATUS_NOT_SUPPORTED) { std::cout << "\t**Error Count for " << GetBlockNameStr(static_cast(b)) << - ": Not supported on this machine" << std::endl; + ": Not supported for this device" << std::endl; } else { CHK_ERR_ASRT(err) IF_VERB(STANDARD) { diff --git a/projects/amdsmi/tests/rocm_smi_test/test_common.cc b/projects/amdsmi/tests/rocm_smi_test/test_common.cc index 6a7dfd112b..16c97832c1 100755 --- a/projects/amdsmi/tests/rocm_smi_test/test_common.cc +++ b/projects/amdsmi/tests/rocm_smi_test/test_common.cc @@ -59,8 +59,19 @@ static const std::map kBlockNameMap = { {RSMI_GPU_BLOCK_UMC, "UMC"}, {RSMI_GPU_BLOCK_SDMA, "SDMA"}, {RSMI_GPU_BLOCK_GFX, "GFX"}, + {RSMI_GPU_BLOCK_MMHUB, "MMHUB"}, + {RSMI_GPU_BLOCK_ATHUB, "ATHUB"}, + {RSMI_GPU_BLOCK_PCIE_BIF, "PCIE_BIF"}, + {RSMI_GPU_BLOCK_HDP, "HDP"}, + {RSMI_GPU_BLOCK_XGMI_WAFL, "XGMI_WAFL"}, + {RSMI_GPU_BLOCK_DF, "DF"}, + {RSMI_GPU_BLOCK_SMN, "SMN"}, + {RSMI_GPU_BLOCK_SEM, "SEM"}, + {RSMI_GPU_BLOCK_MP0, "MP0"}, + {RSMI_GPU_BLOCK_MP1, "MP1"}, + {RSMI_GPU_BLOCK_FUSE, "FUSE"}, }; -static_assert(RSMI_GPU_BLOCK_LAST == RSMI_GPU_BLOCK_GFX, +static_assert(RSMI_GPU_BLOCK_LAST == RSMI_GPU_BLOCK_FUSE, "kBlockNameMap needs to be updated"); static const char * kRasErrStateStrings[] = { @@ -70,8 +81,8 @@ static const char * kRasErrStateStrings[] = { "Single, Correctable", // RSMI_RAS_ERR_STATE_SING_C "Multiple, Uncorrectable", // RSMI_RAS_ERR_STATE_MULT_UC "Poison" // RSMI_RAS_ERR_STATE_POISON - "off", // RSMI_RAS_ERR_STATE_DISABLED - "on", // RSMI_RAS_ERR_STATE_ENABLED + "Off", // RSMI_RAS_ERR_STATE_DISABLED + "On", // RSMI_RAS_ERR_STATE_ENABLED }; static_assert( sizeof(kRasErrStateStrings)/sizeof(char *) == (RSMI_RAS_ERR_STATE_LAST + 1),