diff --git a/docs/ROCm_SMI_Manual.pdf b/docs/ROCm_SMI_Manual.pdf index cd8544cd1b..eddd3b9eba 100644 Binary files a/docs/ROCm_SMI_Manual.pdf and b/docs/ROCm_SMI_Manual.pdf differ diff --git a/include/rocm_smi/rocm_smi.h b/include/rocm_smi/rocm_smi.h index 2a58a984eb..c54c1b4f91 100755 --- a/include/rocm_smi/rocm_smi.h +++ b/include/rocm_smi/rocm_smi.h @@ -57,14 +57,14 @@ extern "C" { * Main header file for the ROCm SMI library. * All required function, structure, enum, etc. definitions should be defined * in this file. - * + * * @brief The rocm_smi library api is new, and therefore subject to change * either at the ABI or API level. Instead of marking every function prototype as "unstable", we are * instead saying the API is unstable (i.e., changes are possible) while the * major version remains 0. This means that if the API/ABI changes, we will * not increment the major version to 1. Once the ABI stabilizes, we will * increment the major version to 1, and thereafter increment it on all ABI - * breaks. + * breaks. */ //! Guaranteed maximum possible number of supported frequencies @@ -230,18 +230,39 @@ typedef rsmi_power_profile_preset_masks_t rsmi_power_profile_preset_masks; * @brief This enum is used to identify different GPU blocks. 
*/ typedef enum { - RSMI_GPU_BLOCK_FIRST = 0, + RSMI_GPU_BLOCK_INVALID = 0x0000000000000000, //!< Used to indicate an + //!< invalid block + RSMI_GPU_BLOCK_FIRST = 0x0000000000000001, - RSMI_GPU_BLOCK_UMC = RSMI_GPU_BLOCK_FIRST, - RSMI_GPU_BLOCK_SDMA, - RSMI_GPU_BLOCK_GFX, + RSMI_GPU_BLOCK_UMC = RSMI_GPU_BLOCK_FIRST, //!< UMC block + RSMI_GPU_BLOCK_SDMA = 0x0000000000000002, //!< SDMA block + RSMI_GPU_BLOCK_GFX = 0x0000000000000004, //!< GFX block - RSMI_GPU_BLOCK_LAST = RSMI_GPU_BLOCK_GFX + // New enum elements will be added as support is added for other blocks + + RSMI_GPU_BLOCK_LAST = RSMI_GPU_BLOCK_GFX, //!< The highest bit position + //!< for supported blocks + RSMI_GPU_BLOCK_RESERVED = 0x8000000000000000 } rsmi_gpu_block_t; /// \cond Ignore in docs. typedef rsmi_gpu_block_t rsmi_gpu_block; /// \endcond +/** + * @brief The current ECC state + */ +typedef enum { + RSMI_RAS_ERR_STATE_NONE = 0, //!< No current errors + RSMI_RAS_ERR_STATE_PARITY, //!< ECC errors present, but type unknown + RSMI_RAS_ERR_STATE_SING_C, //!< Single correctable error + RSMI_RAS_ERR_STATE_MULT_UC, //!< Multiple uncorrectable errors + RSMI_RAS_ERR_STATE_POISON, //!< Firmware detected error and isolated + //!< page. Treat as uncorrectable. + + RSMI_RAS_ERR_STATE_LAST = RSMI_RAS_ERR_STATE_POISON, + RSMI_RAS_ERR_STATE_INVALID = 0xFFFFFFFF +} rsmi_ras_err_state_t; + /** * @brief Types of memory */ @@ -952,7 +973,7 @@ rsmi_status_t rsmi_dev_fan_rpms_get(uint32_t dv_ind, uint32_t sensor_ind, /** * @brief Get the fan speed for the specified device in RPMs. * - * @details Given a device index @p dv_ind + * @details Given a device index @p dv_ind * this function will get the fan speed. 
* * @param[in] dv_ind a device index @@ -1158,7 +1179,7 @@ rsmi_status_t rsmi_dev_gpu_clk_freq_get(uint32_t dv_ind, * * @param[in] dv_ind a device index * - * @param[in] odv a pointer to an ::rsmi_od_volt_freq_data_t structure + * @param[in] odv a pointer to an ::rsmi_od_volt_freq_data_t structure * * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. */ @@ -1166,7 +1187,7 @@ rsmi_status_t rsmi_dev_od_volt_info_get(uint32_t dv_ind, rsmi_od_volt_freq_data_t *odv); /** - * @brief This function will retrieve the current valid regions in the + * @brief This function will retrieve the current valid regions in the * frequency/voltage space. * * @details Given a device index @p dv_ind, a pointer to an unsigned integer @@ -1176,7 +1197,7 @@ rsmi_status_t rsmi_dev_od_volt_info_get(uint32_t dv_ind, * that can be written to by this function. The caller should also * indicate the number of ::rsmi_freq_volt_region_t structures that can safely * be written to @p buffer in @p num_regions. - * + * * The number of regions to expect this function provide (@p num_regions) can * be obtained by calling ::rsmi_dev_od_volt_info_get(). * @@ -1186,7 +1207,7 @@ rsmi_status_t rsmi_dev_od_volt_info_get(uint32_t dv_ind, * ::rsmi_freq_volt_region_t structures that can be written to @p buffer. As * output, this is the number of ::rsmi_freq_volt_region_t structures that were * actually written. - * + * * @param[inout] buffer a caller provided buffer to which * ::rsmi_freq_volt_region_t structures will be written * @@ -1352,15 +1373,15 @@ rsmi_status_t rsmi_dev_od_freq_range_set(uint32_t dv_ind, rsmi_clk_type_t clk, /** * @brief Get the build version information for the currently running build of * RSMI. - * + * * @details Get the major, minor, patch and build string for RSMI build * currently in use through @p version - * + * * @param[inout] version A pointer to an ::rsmi_version_t structure that will * be updated with the version information upon return. 
- * + * * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call - * + * */ rsmi_status_t rsmi_version_get(rsmi_version_t *version); @@ -1398,25 +1419,66 @@ rsmi_dev_vbios_version_get(uint32_t dv_ind, char *vbios, uint32_t len); /** * @brief Retrieve the error counts for a GPU block - * + * * @details Given a device index @p dv_ind, an ::rsmi_gpu_block_t @p block and a * pointer to an ::rsmi_error_count_t @p ec, this function will write the error * count values for the GPU block indicated by @p block to memory pointed to by * @p ec. - * + * * @param[in] dv_ind a device index - * + * * @param[in] block The block for which error counts should be retrieved - * + * * @param[inout] ec A pointer to an ::rsmi_error_count_t to which the error * counts should be written - * + * * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. - * + * */ -rsmi_status_t rsmi_dev_error_count_get(uint32_t dv_ind, +rsmi_status_t rsmi_dev_ecc_count_get(uint32_t dv_ind, rsmi_gpu_block_t block, rsmi_error_count_t *ec); +/** + * @brief Retrieve the enabled ECC bit-mask + * + * @details Given a device index @p dv_ind, and a pointer to a uint64_t @p + * enabled_mask, this function will write a bit_mask to memory pointed to by + * @p enabled_mask. Upon a successful call, the bitmask can then be AND'd with + * elements of the ::rsmi_gpu_block_t enumeration to determine if the + * corresponding block has ECC enabled. Note that the bits above + * ::RSMI_GPU_BLOCK_LAST correspond to blocks that do not yet have ECC support. + * + * @param[in] dv_ind a device index + * + * @param[inout] enabled_mask A pointer to a uint64_t to which the enabled + * mask will be written + * + * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. 
+ * + */ +rsmi_status_t rsmi_dev_ecc_enabled_get(uint32_t dv_ind, + uint64_t *enabled_mask); + +/** + * @brief Retrieve the ECC status for a GPU block + * + * @details Given a device index @p dv_ind, an ::rsmi_gpu_block_t @p block and + * a pointer to an ::rsmi_ras_err_state_t @p state, this function will write + * the current state for the GPU block indicated by @p block to memory pointed + * to by @p state. + * + * @param[in] dv_ind a device index + * + * @param[in] block The block for which the ECC state should be retrieved + * + * @param[inout] state A pointer to an ::rsmi_ras_err_state_t to which the + * ECC state should be written + * + * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. + * + */ +rsmi_status_t rsmi_dev_ecc_status_get(uint32_t dv_ind, rsmi_gpu_block_t block, + rsmi_ras_err_state_t *state); /** * @brief Get a description of a provided RSMI error status * diff --git a/include/rocm_smi/rocm_smi_device.h b/include/rocm_smi/rocm_smi_device.h index be4a74a3bb..94557204c0 100755 --- a/include/rocm_smi/rocm_smi_device.h +++ b/include/rocm_smi/rocm_smi_device.h @@ -77,6 +77,7 @@ enum DevInfoTypes { kDevErrCntSDMA, kDevErrCntUMC, kDevErrCntGFX, + kDevErrCntFeatures, kDevMemTotGTT, kDevMemTotVisVRAM, kDevMemTotVRAM, diff --git a/src/rocm_smi.cc b/src/rocm_smi.cc index 2fbf95c1a8..4fd9ae5fc9 100755 --- a/src/rocm_smi.cc +++ b/src/rocm_smi.cc @@ -394,6 +394,9 @@ static rsmi_status_t get_dev_value_vec(amd::smi::DevInfoTypes type, int ret = dev->readDevInfo(type, val_vec); return errno_to_rsmi_status(ret); } +static bool is_power_of_2(uint64_t n) { + return n && !(n & (n - 1)); +} rsmi_status_t rsmi_init(uint64_t init_flags) { @@ -428,8 +431,116 @@ rsmi_num_monitor_devices(uint32_t *num_devices) { CATCH } +rsmi_status_t rsmi_dev_ecc_enabled_get(uint32_t dv_ind, + uint64_t *enabled_mask) { + TRY + rsmi_status_t ret; + + if (enabled_mask == nullptr) { + return RSMI_STATUS_INVALID_ARGS; + } + std::vector val_vec; + + ret = 
get_dev_value_vec(amd::smi::kDevErrCntFeatures, dv_ind, &val_vec); + + if (ret == RSMI_STATUS_FILE_ERROR) { + return RSMI_STATUS_NOT_SUPPORTED; + } + if (ret != RSMI_STATUS_SUCCESS) { + return ret; + } + + std::string junk; + std::istringstream fs1(val_vec[0]); + std::string mask_str; + + fs1 >> junk; + assert(junk == "feature"); + fs1 >> junk; + assert(junk == "mask:"); + fs1 >> mask_str; + + errno = 0; + *enabled_mask = strtoull(mask_str.c_str(), nullptr, 16); + assert(errno == 0); + + return errno_to_rsmi_status(errno); + + CATCH +} + + +static const char *kRSMIGpuBlkUMCFName = "umc"; +static const char *kRSMIGpuBlkSDMAFName = "sdma"; +static const char *kRSMIGpuBlkGFXFName = "gfx"; + +static const std::map kRocmSMIBlockMap = { + {RSMI_GPU_BLOCK_UMC, kRSMIGpuBlkUMCFName}, + {RSMI_GPU_BLOCK_SDMA, kRSMIGpuBlkSDMAFName}, + {RSMI_GPU_BLOCK_GFX, kRSMIGpuBlkGFXFName}, +}; +static_assert(RSMI_GPU_BLOCK_LAST == RSMI_GPU_BLOCK_GFX, + "rsmi_gpu_block_t and/or above name map need to be updated" + " and then this assert"); + +static const std::map kRocmSMIStateMap = { + {"none", RSMI_RAS_ERR_STATE_NONE}, + {"parity", RSMI_RAS_ERR_STATE_PARITY}, + {"single_correctable", RSMI_RAS_ERR_STATE_SING_C}, + {"multi_uncorrectable", RSMI_RAS_ERR_STATE_MULT_UC}, + {"poison", RSMI_RAS_ERR_STATE_POISON}, +}; +static_assert(RSMI_RAS_ERR_STATE_LAST == RSMI_RAS_ERR_STATE_POISON, + "rsmi_ras_err_state_t and/or above name map need to be updated" + " and then this assert"); + +rsmi_status_t rsmi_dev_ecc_status_get(uint32_t dv_ind, rsmi_gpu_block_t block, + rsmi_ras_err_state_t *state) { + TRY + if (state == nullptr) { + return RSMI_STATUS_INVALID_ARGS; + } + if (!is_power_of_2(block)) { + return RSMI_STATUS_INVALID_ARGS; + } + rsmi_status_t ret; + std::vector val_vec; + + ret = get_dev_value_vec(amd::smi::kDevErrCntFeatures, dv_ind, &val_vec); + + if (ret == RSMI_STATUS_FILE_ERROR) { + return RSMI_STATUS_NOT_SUPPORTED; + } + if (ret != RSMI_STATUS_SUCCESS) { + return ret; + } + + std::string 
blk_line; + std::string search_str = kRocmSMIBlockMap.at(block); + std::string state_str; + + search_str += ":"; + + for (uint32_t i = 1; i < val_vec.size(); ++i) { // Skip features line + std::istringstream fs1(val_vec[i]); + + fs1 >> blk_line; + + if (blk_line == search_str) { + fs1 >> state_str; + assert(kRocmSMIStateMap.count(state_str)); + *state = kRocmSMIStateMap.at(state_str); + return RSMI_STATUS_SUCCESS; + } + } + assert(!"Block was not found"); + *state = RSMI_RAS_ERR_STATE_INVALID; + return RSMI_STATUS_NOT_FOUND; + CATCH +} + rsmi_status_t -rsmi_dev_error_count_get(uint32_t dv_ind, rsmi_gpu_block_t block, +rsmi_dev_ecc_count_get(uint32_t dv_ind, rsmi_gpu_block_t block, rsmi_error_count_t *ec) { std::vector val_vec; rsmi_status_t ret; @@ -454,7 +565,6 @@ rsmi_dev_error_count_get(uint32_t dv_ind, rsmi_gpu_block_t block, break; default: - assert(!"Unsupported block provided to rsmi_dev_error_count_get()"); return RSMI_STATUS_NOT_SUPPORTED; } ret = get_dev_value_vec(type, dv_ind, &val_vec); @@ -512,7 +622,7 @@ get_id(uint32_t dv_ind, amd::smi::DevInfoTypes typ, uint16_t *id) { *id = strtoul(val_str.c_str(), nullptr, 16); assert(errno == 0); - return RSMI_STATUS_SUCCESS; + return errno_to_rsmi_status(errno); CATCH } @@ -832,10 +942,6 @@ static rsmi_status_t get_od_clk_volt_curve_regions(uint32_t dv_ind, return RSMI_STATUS_SUCCESS; CATCH } - -static bool is_power_of_2(uint64_t n) { - return n && !(n & (n - 1)); -} static rsmi_status_t set_power_profile(uint32_t dv_ind, rsmi_power_profile_preset_masks_t profile) { TRY diff --git a/src/rocm_smi_device.cc b/src/rocm_smi_device.cc index 748267b449..2a8cbc3f23 100755 --- a/src/rocm_smi_device.cc +++ b/src/rocm_smi_device.cc @@ -80,6 +80,7 @@ static const char *kDevPCIEThruPutFName = "pcie_bw"; static const char *kDevErrCntSDMAFName = "ras/sdma_err_count"; static const char *kDevErrCntUMCFName = "ras/umc_err_count"; static const char *kDevErrCntGFXFName = "ras/gfx_err_count"; +static const char 
*kDevErrCntFeaturesFName = "ras/features"; static const char *kDevMemTotGTTFName = "mem_info_gtt_total"; static const char *kDevMemTotVisVRAMFName = "mem_info_vis_vram_total"; static const char *kDevMemTotVRAMFName = "mem_info_vram_total"; @@ -119,6 +120,7 @@ static const std::map kDevAttribNameMap = { {kDevErrCntSDMA, kDevErrCntSDMAFName}, {kDevErrCntUMC, kDevErrCntUMCFName}, {kDevErrCntGFX, kDevErrCntGFXFName}, + {kDevErrCntFeatures, kDevErrCntFeaturesFName}, {kDevMemTotGTT, kDevMemTotGTTFName}, {kDevMemTotVisVRAM, kDevMemTotVisVRAMFName}, {kDevMemTotVRAM, kDevMemTotVRAMFName}, @@ -365,6 +367,7 @@ int Device::readDevInfo(DevInfoTypes type, std::vector *val) { case kDevErrCntSDMA: case kDevErrCntUMC: case kDevErrCntGFX: + case kDevErrCntFeatures: return readDevInfoMultiLineStr(type, val); break; diff --git a/tests/rocm_smi_test/functional/err_cnt_read.cc b/tests/rocm_smi_test/functional/err_cnt_read.cc index 15b90da948..675ce17f42 100755 --- a/tests/rocm_smi_test/functional/err_cnt_read.cc +++ b/tests/rocm_smi_test/functional/err_cnt_read.cc @@ -85,14 +85,44 @@ void TestErrCntRead::Close() { void TestErrCntRead::Run(void) { rsmi_status_t err; rsmi_error_count_t ec; + uint64_t enabled_mask; + rsmi_ras_err_state_t err_state; TestBase::Run(); for (uint32_t i = 0; i < num_monitor_devs(); ++i) { PrintDeviceHeader(i); - for (uint32_t b = RSMI_GPU_BLOCK_FIRST; b <= RSMI_GPU_BLOCK_LAST; ++b) { - err = rsmi_dev_error_count_get(i, static_cast(b), &ec); + err = rsmi_dev_ecc_enabled_get(i, &enabled_mask); + if (err == RSMI_STATUS_NOT_SUPPORTED) { + std::cout << + "\t**Error Count Enabled Mask for is not supported on this machine" + << std::endl; + } else { + CHK_ERR_ASRT(err) + IF_VERB(STANDARD) { + std::cout << "Block Error Mask: 0x" << std::hex << enabled_mask << + std::endl; + } + } + for (uint32_t b = RSMI_GPU_BLOCK_FIRST; + b <= RSMI_GPU_BLOCK_LAST; b = b*2) { + err = rsmi_dev_ecc_status_get(i, static_cast(b), + &err_state); + if (err == RSMI_STATUS_NOT_SUPPORTED) { + 
std::cout << "\t**Error Count Status for " << + GetBlockNameStr(static_cast(b)) << + ": Not supported on this machine" << std::endl; + } else { + CHK_ERR_ASRT(err) + IF_VERB(STANDARD) { + std::cout << "\t**Error count status for " << + GetBlockNameStr(static_cast(b)) << + " block: " << GetErrStateNameStr(err_state) << std::endl; + } + } + + err = rsmi_dev_ecc_count_get(i, static_cast(b), &ec); if (err == RSMI_STATUS_NOT_SUPPORTED) { std::cout << "\t**Error Count for " << diff --git a/tests/rocm_smi_test/test_common.cc b/tests/rocm_smi_test/test_common.cc index 42c153de7f..c8688b23b8 100755 --- a/tests/rocm_smi_test/test_common.cc +++ b/tests/rocm_smi_test/test_common.cc @@ -60,6 +60,18 @@ static const std::map kBlockNameMap = { {RSMI_GPU_BLOCK_SDMA, "SDMA"}, {RSMI_GPU_BLOCK_GFX, "GFX"}, }; +static_assert(RSMI_GPU_BLOCK_LAST == RSMI_GPU_BLOCK_GFX, + "kBlockNameMap needs to be updated"); + +static const std::map kErrStateNameMap = { + {RSMI_RAS_ERR_STATE_NONE, "None"}, + {RSMI_RAS_ERR_STATE_PARITY, "Error Unknown"}, + {RSMI_RAS_ERR_STATE_SING_C, "Single, Correctable"}, + {RSMI_RAS_ERR_STATE_MULT_UC, "Multiple, Uncorrectable"}, + {RSMI_RAS_ERR_STATE_POISON, "Poison"}, +}; +static_assert(RSMI_RAS_ERR_STATE_LAST == RSMI_RAS_ERR_STATE_POISON, + "kErrStateNameMap needs to be updated"); static const struct option long_options[] = { {"iterations", required_argument, nullptr, 'i'}, @@ -134,6 +146,9 @@ uint32_t ProcessCmdline(RSMITstGlobals* test, int arg_cnt, char** arg_list) { const char *GetBlockNameStr(rsmi_gpu_block_t id) { return kBlockNameMap.at(id); } +const char *GetErrStateNameStr(rsmi_ras_err_state_t st) { + return kErrStateNameMap.at(st); +} const char *FreqEnumToStr(rsmi_clk_type rsmi_clk) { static_assert(RSMI_CLK_TYPE_LAST == RSMI_CLK_TYPE_MEM, diff --git a/tests/rocm_smi_test/test_common.h b/tests/rocm_smi_test/test_common.h index df95fe2f27..8033a4fb23 100755 --- a/tests/rocm_smi_test/test_common.h +++ b/tests/rocm_smi_test/test_common.h @@ -62,6 +62,7 @@ 
uint32_t ProcessCmdline(RSMITstGlobals* test, int arg_cnt, char** arg_list); void PrintTestHeader(uint32_t dv_ind); const char *GetBlockNameStr(rsmi_gpu_block_t id); +const char *GetErrStateNameStr(rsmi_ras_err_state_t st); const char *FreqEnumToStr(rsmi_clk_type rsmi_clk); #if ENABLE_SMI