Adjust how we read ECC block counter status
This change corresponds to kernel changes.
Change-Id: Ibd977e8b3338349036cb16e55fb0b2c9c187726d
[ROCm/amdsmi commit: aaecfd6fff]
This commit is contained in:
@@ -361,10 +361,19 @@ typedef enum {
|
||||
RSMI_GPU_BLOCK_UMC = RSMI_GPU_BLOCK_FIRST, //!< UMC block
|
||||
RSMI_GPU_BLOCK_SDMA = 0x0000000000000002, //!< SDMA block
|
||||
RSMI_GPU_BLOCK_GFX = 0x0000000000000004, //!< GFX block
|
||||
RSMI_GPU_BLOCK_MMHUB = 0x0000000000000008, //!< MMHUB block
|
||||
RSMI_GPU_BLOCK_ATHUB = 0x0000000000000010, //!< ATHUB block
|
||||
RSMI_GPU_BLOCK_PCIE_BIF = 0x0000000000000020, //!< PCIE_BIF block
|
||||
RSMI_GPU_BLOCK_HDP = 0x0000000000000040, //!< HDP block
|
||||
RSMI_GPU_BLOCK_XGMI_WAFL = 0x0000000000000080, //!< XGMI block
|
||||
RSMI_GPU_BLOCK_DF = 0x0000000000000100, //!< DF block
|
||||
RSMI_GPU_BLOCK_SMN = 0x0000000000000200, //!< SMN block
|
||||
RSMI_GPU_BLOCK_SEM = 0x0000000000000400, //!< SEM block
|
||||
RSMI_GPU_BLOCK_MP0 = 0x0000000000000800, //!< MP0 block
|
||||
RSMI_GPU_BLOCK_MP1 = 0x0000000000001000, //!< MP1 block
|
||||
RSMI_GPU_BLOCK_FUSE = 0x0000000000002000, //!< Fuse block
|
||||
|
||||
// New enum elements will be added as support is added for other blocks
|
||||
|
||||
RSMI_GPU_BLOCK_LAST = RSMI_GPU_BLOCK_GFX, //!< The highest bit position
|
||||
RSMI_GPU_BLOCK_LAST = RSMI_GPU_BLOCK_FUSE, //!< The highest bit position
|
||||
//!< for supported blocks
|
||||
RSMI_GPU_BLOCK_RESERVED = 0x8000000000000000
|
||||
} rsmi_gpu_block_t;
|
||||
@@ -1818,6 +1827,10 @@ rsmi_dev_firmware_version_get(uint32_t dv_ind, rsmi_fw_block_t block,
|
||||
*
|
||||
* @retval ::RSMI_STATUS_SUCCESS is returned upon successful call.
|
||||
*
|
||||
* ::RSMI_NOT_SUPPORTED will be returned if either ECC is not enabled for the
|
||||
* specified block @p block, or if there is no kernel support for ECC for
|
||||
* that block.
|
||||
*
|
||||
*/
|
||||
rsmi_status_t rsmi_dev_ecc_count_get(uint32_t dv_ind,
|
||||
rsmi_gpu_block_t block, rsmi_error_count_t *ec);
|
||||
@@ -1826,22 +1839,25 @@ rsmi_status_t rsmi_dev_ecc_count_get(uint32_t dv_ind,
|
||||
* @brief Retrieve the enabled ECC bit-mask
|
||||
*
|
||||
* @details Given a device index @p dv_ind, and a pointer to a uint64_t @p
|
||||
* enabled_mask, this function will write a bit_mask to memory pointed to by
|
||||
* @p enabled_mask. Upon a successful call, the bitmask can then be AND'd with
|
||||
* elements of the ::rsmi_gpu_block_t ennumeration to determine if the
|
||||
* corresponding block has ECC enabled. Note that the bits above
|
||||
* ::RSMI_GPU_BLOCK_LAST correspond to blocks that do not yet have ECC support.
|
||||
* enabled_mask, this function will write bits to memory pointed to by
|
||||
* @p enabled_blocks. Upon a successful call, @p enabled_blocks can then be
|
||||
* AND'd with elements of the ::rsmi_gpu_block_t ennumeration to determine if
|
||||
* the corresponding block has ECC enabled. Note that whether a block has ECC
|
||||
* enabled or not in the device is independent of whether there is kernel
|
||||
* support for error counting for that block. Although a block may be enabled,
|
||||
* but there may not be kernel support for reading error counters for that
|
||||
* block.
|
||||
*
|
||||
* @param[in] dv_ind a device index
|
||||
*
|
||||
* @param[inout] enabled_mask A pointer to a uint64_t to which the enabled
|
||||
* mask will be written
|
||||
* @param[inout] enabled_blocks A pointer to a uint64_t to which the enabled
|
||||
* blocks bits will be written
|
||||
*
|
||||
* @retval ::RSMI_STATUS_SUCCESS is returned upon successful call.
|
||||
*
|
||||
*/
|
||||
rsmi_status_t rsmi_dev_ecc_enabled_get(uint32_t dv_ind,
|
||||
uint64_t *enabled_mask);
|
||||
uint64_t *enabled_blocks);
|
||||
|
||||
/**
|
||||
* @brief Retrieve the ECC status for a GPU block
|
||||
|
||||
@@ -471,20 +471,19 @@ rsmi_num_monitor_devices(uint32_t *num_devices) {
|
||||
}
|
||||
|
||||
rsmi_status_t rsmi_dev_ecc_enabled_get(uint32_t dv_ind,
|
||||
uint64_t *enabled_mask) {
|
||||
uint64_t *enabled_blks) {
|
||||
TRY
|
||||
rsmi_status_t ret;
|
||||
|
||||
if (enabled_mask == nullptr) {
|
||||
if (enabled_blks == nullptr) {
|
||||
return RSMI_STATUS_INVALID_ARGS;
|
||||
}
|
||||
rsmi_status_t ret;
|
||||
std::string feature_line;
|
||||
std::string tmp_str;
|
||||
|
||||
|
||||
DEVICE_MUTEX
|
||||
|
||||
std::vector<std::string> val_vec;
|
||||
|
||||
ret = get_dev_value_vec(amd::smi::kDevErrCntFeatures, dv_ind, &val_vec);
|
||||
|
||||
ret = get_dev_value_line(amd::smi::kDevErrCntFeatures, dv_ind, &feature_line);
|
||||
if (ret == RSMI_STATUS_FILE_ERROR) {
|
||||
return RSMI_STATUS_NOT_SUPPORTED;
|
||||
}
|
||||
@@ -492,39 +491,22 @@ rsmi_status_t rsmi_dev_ecc_enabled_get(uint32_t dv_ind,
|
||||
return ret;
|
||||
}
|
||||
|
||||
std::string junk;
|
||||
std::istringstream fs1(val_vec[0]);
|
||||
std::string mask_str;
|
||||
std::istringstream fs1(feature_line);
|
||||
|
||||
fs1 >> junk;
|
||||
assert(junk == "feature");
|
||||
fs1 >> junk;
|
||||
assert(junk == "mask:");
|
||||
fs1 >> mask_str;
|
||||
fs1 >> tmp_str; // ignore
|
||||
assert(tmp_str == "feature");
|
||||
fs1 >> tmp_str; // ignore
|
||||
assert(tmp_str == "mask:");
|
||||
fs1 >> tmp_str;
|
||||
|
||||
errno = 0;
|
||||
*enabled_mask = strtoul(mask_str.c_str(), nullptr, 16);
|
||||
*enabled_blks = strtoul(tmp_str.c_str(), nullptr, 16);
|
||||
assert(errno == 0);
|
||||
|
||||
return errno_to_rsmi_status(errno);
|
||||
|
||||
CATCH
|
||||
}
|
||||
|
||||
|
||||
static const char *kRSMIGpuBlkUMCFName = "umc";
|
||||
static const char *kRSMIGpuBlkSDMAFName = "sdma";
|
||||
static const char *kRSMIGpuBlkGFXFName = "gfx";
|
||||
|
||||
static const std::map<rsmi_gpu_block_t, const char *> kRocmSMIBlockMap = {
|
||||
{RSMI_GPU_BLOCK_UMC, kRSMIGpuBlkUMCFName},
|
||||
{RSMI_GPU_BLOCK_SDMA, kRSMIGpuBlkSDMAFName},
|
||||
{RSMI_GPU_BLOCK_GFX, kRSMIGpuBlkGFXFName},
|
||||
};
|
||||
static_assert(RSMI_GPU_BLOCK_LAST == RSMI_GPU_BLOCK_GFX,
|
||||
"rsmi_gpu_block_t and/or above name map need to be updated"
|
||||
" and then this assert");
|
||||
|
||||
static const std::map<std::string, rsmi_ras_err_state_t> kRocmSMIStateMap = {
|
||||
{"none", RSMI_RAS_ERR_STATE_NONE},
|
||||
{"disabled", RSMI_RAS_ERR_STATE_DISABLED},
|
||||
@@ -549,11 +531,11 @@ rsmi_status_t rsmi_dev_ecc_status_get(uint32_t dv_ind, rsmi_gpu_block_t block,
|
||||
return RSMI_STATUS_INVALID_ARGS;
|
||||
}
|
||||
rsmi_status_t ret;
|
||||
std::vector<std::string> val_vec;
|
||||
uint64_t features_mask;
|
||||
|
||||
DEVICE_MUTEX
|
||||
|
||||
ret = get_dev_value_vec(amd::smi::kDevErrCntFeatures, dv_ind, &val_vec);
|
||||
ret = rsmi_dev_ecc_enabled_get(dv_ind, &features_mask);
|
||||
|
||||
if (ret == RSMI_STATUS_FILE_ERROR) {
|
||||
return RSMI_STATUS_NOT_SUPPORTED;
|
||||
@@ -562,29 +544,10 @@ rsmi_status_t rsmi_dev_ecc_status_get(uint32_t dv_ind, rsmi_gpu_block_t block,
|
||||
return ret;
|
||||
}
|
||||
|
||||
std::string blk_line;
|
||||
std::string search_str = kRocmSMIBlockMap.at(block);
|
||||
std::string sysfs_junk = " ras feature mask:";
|
||||
std::string state_str;
|
||||
*state = (features_mask & block) ?
|
||||
RSMI_RAS_ERR_STATE_ENABLED : RSMI_RAS_ERR_STATE_DISABLED;
|
||||
|
||||
search_str += ":";
|
||||
|
||||
for (uint32_t i = 1; i < val_vec.size(); ++i) { // Skip features line
|
||||
std::istringstream fs1(val_vec[i]);
|
||||
|
||||
fs1 >> blk_line;
|
||||
if (blk_line == search_str || blk_line == kRocmSMIBlockMap.at(block)) {
|
||||
if (blk_line.back() != ':')
|
||||
fs1.ignore(sysfs_junk.length(), ':');
|
||||
fs1 >> state_str;
|
||||
assert(kRocmSMIStateMap.count(state_str));
|
||||
*state = kRocmSMIStateMap.at(state_str);
|
||||
return RSMI_STATUS_SUCCESS;
|
||||
}
|
||||
}
|
||||
assert(!"Block was not found");
|
||||
*state = RSMI_RAS_ERR_STATE_INVALID;
|
||||
return RSMI_STATUS_NOT_FOUND;
|
||||
return RSMI_STATUS_SUCCESS;
|
||||
CATCH
|
||||
}
|
||||
|
||||
|
||||
@@ -415,6 +415,7 @@ int Device::readDevInfo(DevInfoTypes type, uint64_t *val) {
|
||||
case kDevSubSysDevID:
|
||||
case kDevSubSysVendorID:
|
||||
case kDevVendorID:
|
||||
case kDevErrCntFeatures:
|
||||
ret = readDevInfoStr(type, &tempStr);
|
||||
RET_IF_NONZERO(ret);
|
||||
*val = std::stoi(tempStr, 0, 16);
|
||||
@@ -485,7 +486,6 @@ int Device::readDevInfo(DevInfoTypes type, std::vector<std::string> *val) {
|
||||
case kDevErrCntSDMA:
|
||||
case kDevErrCntUMC:
|
||||
case kDevErrCntGFX:
|
||||
case kDevErrCntFeatures:
|
||||
case kDevMemPageBad:
|
||||
return readDevInfoMultiLineStr(type, val);
|
||||
break;
|
||||
|
||||
@@ -98,6 +98,7 @@ void TestErrCntRead::Run(void) {
|
||||
std::cout <<
|
||||
"\t**Error Count Enabled Mask get is not supported on this machine"
|
||||
<< std::endl;
|
||||
continue;
|
||||
} else {
|
||||
CHK_ERR_ASRT(err)
|
||||
IF_VERB(STANDARD) {
|
||||
@@ -109,17 +110,11 @@ void TestErrCntRead::Run(void) {
|
||||
b <= RSMI_GPU_BLOCK_LAST; b = b*2) {
|
||||
err = rsmi_dev_ecc_status_get(i, static_cast<rsmi_gpu_block_t>(b),
|
||||
&err_state);
|
||||
if (err == RSMI_STATUS_NOT_SUPPORTED) {
|
||||
std::cout << "\t**Error Count Status for " <<
|
||||
GetBlockNameStr(static_cast<rsmi_gpu_block_t>(b)) <<
|
||||
": Not supported on this machine" << std::endl;
|
||||
} else {
|
||||
CHK_ERR_ASRT(err)
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**Error Count status for " <<
|
||||
GetBlockNameStr(static_cast<rsmi_gpu_block_t>(b)) <<
|
||||
" block: " << GetErrStateNameStr(err_state) << std::endl;
|
||||
}
|
||||
CHK_ERR_ASRT(err)
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**Error Count status for " <<
|
||||
GetBlockNameStr(static_cast<rsmi_gpu_block_t>(b)) <<
|
||||
" block: " << GetErrStateNameStr(err_state) << std::endl;
|
||||
}
|
||||
|
||||
err = rsmi_dev_ecc_count_get(i, static_cast<rsmi_gpu_block_t>(b), &ec);
|
||||
@@ -127,7 +122,7 @@ void TestErrCntRead::Run(void) {
|
||||
if (err == RSMI_STATUS_NOT_SUPPORTED) {
|
||||
std::cout << "\t**Error Count for " <<
|
||||
GetBlockNameStr(static_cast<rsmi_gpu_block_t>(b)) <<
|
||||
": Not supported on this machine" << std::endl;
|
||||
": Not supported for this device" << std::endl;
|
||||
} else {
|
||||
CHK_ERR_ASRT(err)
|
||||
IF_VERB(STANDARD) {
|
||||
|
||||
@@ -59,8 +59,19 @@ static const std::map<rsmi_gpu_block_t, const char *> kBlockNameMap = {
|
||||
{RSMI_GPU_BLOCK_UMC, "UMC"},
|
||||
{RSMI_GPU_BLOCK_SDMA, "SDMA"},
|
||||
{RSMI_GPU_BLOCK_GFX, "GFX"},
|
||||
{RSMI_GPU_BLOCK_MMHUB, "MMHUB"},
|
||||
{RSMI_GPU_BLOCK_ATHUB, "ATHUB"},
|
||||
{RSMI_GPU_BLOCK_PCIE_BIF, "PCIE_BIF"},
|
||||
{RSMI_GPU_BLOCK_HDP, "HDP"},
|
||||
{RSMI_GPU_BLOCK_XGMI_WAFL, "XGMI_WAFL"},
|
||||
{RSMI_GPU_BLOCK_DF, "DF"},
|
||||
{RSMI_GPU_BLOCK_SMN, "SMN"},
|
||||
{RSMI_GPU_BLOCK_SEM, "SEM"},
|
||||
{RSMI_GPU_BLOCK_MP0, "MP0"},
|
||||
{RSMI_GPU_BLOCK_MP1, "MP1"},
|
||||
{RSMI_GPU_BLOCK_FUSE, "FUSE"},
|
||||
};
|
||||
static_assert(RSMI_GPU_BLOCK_LAST == RSMI_GPU_BLOCK_GFX,
|
||||
static_assert(RSMI_GPU_BLOCK_LAST == RSMI_GPU_BLOCK_FUSE,
|
||||
"kBlockNameMap needs to be updated");
|
||||
|
||||
static const char * kRasErrStateStrings[] = {
|
||||
@@ -70,8 +81,8 @@ static const char * kRasErrStateStrings[] = {
|
||||
"Single, Correctable", // RSMI_RAS_ERR_STATE_SING_C
|
||||
"Multiple, Uncorrectable", // RSMI_RAS_ERR_STATE_MULT_UC
|
||||
"Poison" // RSMI_RAS_ERR_STATE_POISON
|
||||
"off", // RSMI_RAS_ERR_STATE_DISABLED
|
||||
"on", // RSMI_RAS_ERR_STATE_ENABLED
|
||||
"Off", // RSMI_RAS_ERR_STATE_DISABLED
|
||||
"On", // RSMI_RAS_ERR_STATE_ENABLED
|
||||
};
|
||||
static_assert(
|
||||
sizeof(kRasErrStateStrings)/sizeof(char *) == (RSMI_RAS_ERR_STATE_LAST + 1),
|
||||
|
||||
Reference in New Issue
Block a user