Adjust how we read ECC block counter status

This change corresponds to kernel changes.

Change-Id: Ibd977e8b3338349036cb16e55fb0b2c9c187726d


[ROCm/amdsmi commit: aaecfd6fff]
This commit is contained in:
Chris Freehill
2019-08-09 11:32:16 -05:00
parent fe56aa131f
commit 6d6689b02c
5 changed files with 68 additions and 83 deletions
+27 -11
View File
@@ -361,10 +361,19 @@ typedef enum {
RSMI_GPU_BLOCK_UMC = RSMI_GPU_BLOCK_FIRST, //!< UMC block
RSMI_GPU_BLOCK_SDMA = 0x0000000000000002, //!< SDMA block
RSMI_GPU_BLOCK_GFX = 0x0000000000000004, //!< GFX block
RSMI_GPU_BLOCK_MMHUB = 0x0000000000000008, //!< MMHUB block
RSMI_GPU_BLOCK_ATHUB = 0x0000000000000010, //!< ATHUB block
RSMI_GPU_BLOCK_PCIE_BIF = 0x0000000000000020, //!< PCIE_BIF block
RSMI_GPU_BLOCK_HDP = 0x0000000000000040, //!< HDP block
RSMI_GPU_BLOCK_XGMI_WAFL = 0x0000000000000080, //!< XGMI block
RSMI_GPU_BLOCK_DF = 0x0000000000000100, //!< DF block
RSMI_GPU_BLOCK_SMN = 0x0000000000000200, //!< SMN block
RSMI_GPU_BLOCK_SEM = 0x0000000000000400, //!< SEM block
RSMI_GPU_BLOCK_MP0 = 0x0000000000000800, //!< MP0 block
RSMI_GPU_BLOCK_MP1 = 0x0000000000001000, //!< MP1 block
RSMI_GPU_BLOCK_FUSE = 0x0000000000002000, //!< Fuse block
// New enum elements will be added as support is added for other blocks
RSMI_GPU_BLOCK_LAST = RSMI_GPU_BLOCK_GFX, //!< The highest bit position
RSMI_GPU_BLOCK_LAST = RSMI_GPU_BLOCK_FUSE, //!< The highest bit position
//!< for supported blocks
RSMI_GPU_BLOCK_RESERVED = 0x8000000000000000
} rsmi_gpu_block_t;
@@ -1818,6 +1827,10 @@ rsmi_dev_firmware_version_get(uint32_t dv_ind, rsmi_fw_block_t block,
*
* @retval ::RSMI_STATUS_SUCCESS is returned upon successful call.
*
* ::RSMI_NOT_SUPPORTED will be returned if either ECC is not enabled for the
* specified block @p block, or if there is no kernel support for ECC for
* that block.
*
*/
rsmi_status_t rsmi_dev_ecc_count_get(uint32_t dv_ind,
rsmi_gpu_block_t block, rsmi_error_count_t *ec);
@@ -1826,22 +1839,25 @@ rsmi_status_t rsmi_dev_ecc_count_get(uint32_t dv_ind,
* @brief Retrieve the enabled ECC bit-mask
*
* @details Given a device index @p dv_ind, and a pointer to a uint64_t @p
* enabled_mask, this function will write a bit_mask to memory pointed to by
* @p enabled_mask. Upon a successful call, the bitmask can then be AND'd with
* elements of the ::rsmi_gpu_block_t ennumeration to determine if the
* corresponding block has ECC enabled. Note that the bits above
* ::RSMI_GPU_BLOCK_LAST correspond to blocks that do not yet have ECC support.
* enabled_mask, this function will write bits to memory pointed to by
* @p enabled_blocks. Upon a successful call, @p enabled_blocks can then be
* AND'd with elements of the ::rsmi_gpu_block_t ennumeration to determine if
* the corresponding block has ECC enabled. Note that whether a block has ECC
* enabled or not in the device is independent of whether there is kernel
* support for error counting for that block. Although a block may be enabled,
* but there may not be kernel support for reading error counters for that
* block.
*
* @param[in] dv_ind a device index
*
* @param[inout] enabled_mask A pointer to a uint64_t to which the enabled
* mask will be written
* @param[inout] enabled_blocks A pointer to a uint64_t to which the enabled
* blocks bits will be written
*
* @retval ::RSMI_STATUS_SUCCESS is returned upon successful call.
*
*/
rsmi_status_t rsmi_dev_ecc_enabled_get(uint32_t dv_ind,
uint64_t *enabled_mask);
uint64_t *enabled_blocks);
/**
* @brief Retrieve the ECC status for a GPU block
+19 -56
View File
@@ -471,20 +471,19 @@ rsmi_num_monitor_devices(uint32_t *num_devices) {
}
rsmi_status_t rsmi_dev_ecc_enabled_get(uint32_t dv_ind,
uint64_t *enabled_mask) {
uint64_t *enabled_blks) {
TRY
rsmi_status_t ret;
if (enabled_mask == nullptr) {
if (enabled_blks == nullptr) {
return RSMI_STATUS_INVALID_ARGS;
}
rsmi_status_t ret;
std::string feature_line;
std::string tmp_str;
DEVICE_MUTEX
std::vector<std::string> val_vec;
ret = get_dev_value_vec(amd::smi::kDevErrCntFeatures, dv_ind, &val_vec);
ret = get_dev_value_line(amd::smi::kDevErrCntFeatures, dv_ind, &feature_line);
if (ret == RSMI_STATUS_FILE_ERROR) {
return RSMI_STATUS_NOT_SUPPORTED;
}
@@ -492,39 +491,22 @@ rsmi_status_t rsmi_dev_ecc_enabled_get(uint32_t dv_ind,
return ret;
}
std::string junk;
std::istringstream fs1(val_vec[0]);
std::string mask_str;
std::istringstream fs1(feature_line);
fs1 >> junk;
assert(junk == "feature");
fs1 >> junk;
assert(junk == "mask:");
fs1 >> mask_str;
fs1 >> tmp_str; // ignore
assert(tmp_str == "feature");
fs1 >> tmp_str; // ignore
assert(tmp_str == "mask:");
fs1 >> tmp_str;
errno = 0;
*enabled_mask = strtoul(mask_str.c_str(), nullptr, 16);
*enabled_blks = strtoul(tmp_str.c_str(), nullptr, 16);
assert(errno == 0);
return errno_to_rsmi_status(errno);
CATCH
}
static const char *kRSMIGpuBlkUMCFName = "umc";
static const char *kRSMIGpuBlkSDMAFName = "sdma";
static const char *kRSMIGpuBlkGFXFName = "gfx";
static const std::map<rsmi_gpu_block_t, const char *> kRocmSMIBlockMap = {
{RSMI_GPU_BLOCK_UMC, kRSMIGpuBlkUMCFName},
{RSMI_GPU_BLOCK_SDMA, kRSMIGpuBlkSDMAFName},
{RSMI_GPU_BLOCK_GFX, kRSMIGpuBlkGFXFName},
};
static_assert(RSMI_GPU_BLOCK_LAST == RSMI_GPU_BLOCK_GFX,
"rsmi_gpu_block_t and/or above name map need to be updated"
" and then this assert");
static const std::map<std::string, rsmi_ras_err_state_t> kRocmSMIStateMap = {
{"none", RSMI_RAS_ERR_STATE_NONE},
{"disabled", RSMI_RAS_ERR_STATE_DISABLED},
@@ -549,11 +531,11 @@ rsmi_status_t rsmi_dev_ecc_status_get(uint32_t dv_ind, rsmi_gpu_block_t block,
return RSMI_STATUS_INVALID_ARGS;
}
rsmi_status_t ret;
std::vector<std::string> val_vec;
uint64_t features_mask;
DEVICE_MUTEX
ret = get_dev_value_vec(amd::smi::kDevErrCntFeatures, dv_ind, &val_vec);
ret = rsmi_dev_ecc_enabled_get(dv_ind, &features_mask);
if (ret == RSMI_STATUS_FILE_ERROR) {
return RSMI_STATUS_NOT_SUPPORTED;
@@ -562,29 +544,10 @@ rsmi_status_t rsmi_dev_ecc_status_get(uint32_t dv_ind, rsmi_gpu_block_t block,
return ret;
}
std::string blk_line;
std::string search_str = kRocmSMIBlockMap.at(block);
std::string sysfs_junk = " ras feature mask:";
std::string state_str;
*state = (features_mask & block) ?
RSMI_RAS_ERR_STATE_ENABLED : RSMI_RAS_ERR_STATE_DISABLED;
search_str += ":";
for (uint32_t i = 1; i < val_vec.size(); ++i) { // Skip features line
std::istringstream fs1(val_vec[i]);
fs1 >> blk_line;
if (blk_line == search_str || blk_line == kRocmSMIBlockMap.at(block)) {
if (blk_line.back() != ':')
fs1.ignore(sysfs_junk.length(), ':');
fs1 >> state_str;
assert(kRocmSMIStateMap.count(state_str));
*state = kRocmSMIStateMap.at(state_str);
return RSMI_STATUS_SUCCESS;
}
}
assert(!"Block was not found");
*state = RSMI_RAS_ERR_STATE_INVALID;
return RSMI_STATUS_NOT_FOUND;
return RSMI_STATUS_SUCCESS;
CATCH
}
+1 -1
View File
@@ -415,6 +415,7 @@ int Device::readDevInfo(DevInfoTypes type, uint64_t *val) {
case kDevSubSysDevID:
case kDevSubSysVendorID:
case kDevVendorID:
case kDevErrCntFeatures:
ret = readDevInfoStr(type, &tempStr);
RET_IF_NONZERO(ret);
*val = std::stoi(tempStr, 0, 16);
@@ -485,7 +486,6 @@ int Device::readDevInfo(DevInfoTypes type, std::vector<std::string> *val) {
case kDevErrCntSDMA:
case kDevErrCntUMC:
case kDevErrCntGFX:
case kDevErrCntFeatures:
case kDevMemPageBad:
return readDevInfoMultiLineStr(type, val);
break;
@@ -98,6 +98,7 @@ void TestErrCntRead::Run(void) {
std::cout <<
"\t**Error Count Enabled Mask get is not supported on this machine"
<< std::endl;
continue;
} else {
CHK_ERR_ASRT(err)
IF_VERB(STANDARD) {
@@ -109,17 +110,11 @@ void TestErrCntRead::Run(void) {
b <= RSMI_GPU_BLOCK_LAST; b = b*2) {
err = rsmi_dev_ecc_status_get(i, static_cast<rsmi_gpu_block_t>(b),
&err_state);
if (err == RSMI_STATUS_NOT_SUPPORTED) {
std::cout << "\t**Error Count Status for " <<
GetBlockNameStr(static_cast<rsmi_gpu_block_t>(b)) <<
": Not supported on this machine" << std::endl;
} else {
CHK_ERR_ASRT(err)
IF_VERB(STANDARD) {
std::cout << "\t**Error Count status for " <<
GetBlockNameStr(static_cast<rsmi_gpu_block_t>(b)) <<
" block: " << GetErrStateNameStr(err_state) << std::endl;
}
CHK_ERR_ASRT(err)
IF_VERB(STANDARD) {
std::cout << "\t**Error Count status for " <<
GetBlockNameStr(static_cast<rsmi_gpu_block_t>(b)) <<
" block: " << GetErrStateNameStr(err_state) << std::endl;
}
err = rsmi_dev_ecc_count_get(i, static_cast<rsmi_gpu_block_t>(b), &ec);
@@ -127,7 +122,7 @@ void TestErrCntRead::Run(void) {
if (err == RSMI_STATUS_NOT_SUPPORTED) {
std::cout << "\t**Error Count for " <<
GetBlockNameStr(static_cast<rsmi_gpu_block_t>(b)) <<
": Not supported on this machine" << std::endl;
": Not supported for this device" << std::endl;
} else {
CHK_ERR_ASRT(err)
IF_VERB(STANDARD) {
@@ -59,8 +59,19 @@ static const std::map<rsmi_gpu_block_t, const char *> kBlockNameMap = {
{RSMI_GPU_BLOCK_UMC, "UMC"},
{RSMI_GPU_BLOCK_SDMA, "SDMA"},
{RSMI_GPU_BLOCK_GFX, "GFX"},
{RSMI_GPU_BLOCK_MMHUB, "MMHUB"},
{RSMI_GPU_BLOCK_ATHUB, "ATHUB"},
{RSMI_GPU_BLOCK_PCIE_BIF, "PCIE_BIF"},
{RSMI_GPU_BLOCK_HDP, "HDP"},
{RSMI_GPU_BLOCK_XGMI_WAFL, "XGMI_WAFL"},
{RSMI_GPU_BLOCK_DF, "DF"},
{RSMI_GPU_BLOCK_SMN, "SMN"},
{RSMI_GPU_BLOCK_SEM, "SEM"},
{RSMI_GPU_BLOCK_MP0, "MP0"},
{RSMI_GPU_BLOCK_MP1, "MP1"},
{RSMI_GPU_BLOCK_FUSE, "FUSE"},
};
static_assert(RSMI_GPU_BLOCK_LAST == RSMI_GPU_BLOCK_GFX,
static_assert(RSMI_GPU_BLOCK_LAST == RSMI_GPU_BLOCK_FUSE,
"kBlockNameMap needs to be updated");
static const char * kRasErrStateStrings[] = {
@@ -70,8 +81,8 @@ static const char * kRasErrStateStrings[] = {
"Single, Correctable", // RSMI_RAS_ERR_STATE_SING_C
"Multiple, Uncorrectable", // RSMI_RAS_ERR_STATE_MULT_UC
"Poison" // RSMI_RAS_ERR_STATE_POISON
"off", // RSMI_RAS_ERR_STATE_DISABLED
"on", // RSMI_RAS_ERR_STATE_ENABLED
"Off", // RSMI_RAS_ERR_STATE_DISABLED
"On", // RSMI_RAS_ERR_STATE_ENABLED
};
static_assert(
sizeof(kRasErrStateStrings)/sizeof(char *) == (RSMI_RAS_ERR_STATE_LAST + 1),