Added ECC enabled, status and get functions

This commit is contained in:
Chris Freehill
2019-04-03 11:17:43 -05:00
parent c77f3c0ebd
commit 4e679b9324
8 ha cambiato i file con 250 aggiunte e 32 eliminazioni
File binario non mostrato.
+85 -23
Vedi File
@@ -57,14 +57,14 @@ extern "C" {
* Main header file for the ROCm SMI library.
* All required function, structure, enum, etc. definitions should be defined
* in this file.
*
*
* @brief The rocm_smi library api is new, and therefore subject to change
* either at the ABI or API level. Instead of marking every function prototype as "unstable", we are
* instead saying the API is unstable (i.e., changes are possible) while the
* major version remains 0. This means that if the API/ABI changes, we will
* not increment the major version to 1. Once the ABI stabilizes, we will
* increment the major version to 1, and thereafter increment it on all ABI
* breaks.
* breaks.
*/
//! Guaranteed maximum possible number of supported frequencies
@@ -230,18 +230,39 @@ typedef rsmi_power_profile_preset_masks_t rsmi_power_profile_preset_masks;
* @brief This enum is used to identify different GPU blocks.
*/
typedef enum {
RSMI_GPU_BLOCK_FIRST = 0,
RSMI_GPU_BLOCK_INVALID = 0x0000000000000000, //!< Used to indicate an
//!< invalid block
RSMI_GPU_BLOCK_FIRST = 0x0000000000000001,
RSMI_GPU_BLOCK_UMC = RSMI_GPU_BLOCK_FIRST,
RSMI_GPU_BLOCK_SDMA,
RSMI_GPU_BLOCK_GFX,
RSMI_GPU_BLOCK_UMC = RSMI_GPU_BLOCK_FIRST, //!< UMC block
RSMI_GPU_BLOCK_SDMA = 0x0000000000000002, //!< SDMA block
RSMI_GPU_BLOCK_GFX = 0x0000000000000004, //!< GFX block
RSMI_GPU_BLOCK_LAST = RSMI_GPU_BLOCK_GFX
// New enum elements will be added as support is added for other blocks
RSMI_GPU_BLOCK_LAST = RSMI_GPU_BLOCK_GFX, //!< The highest bit position
//!< for supported blocks
RSMI_GPU_BLOCK_RESERVED = 0x8000000000000000
} rsmi_gpu_block_t;
/// \cond Ignore in docs.
typedef rsmi_gpu_block_t rsmi_gpu_block;
/// \endcond
/**
 * @brief ECC/RAS error state that can be reported for a GPU block
 */
typedef enum {
  //! No current errors
  RSMI_RAS_ERR_STATE_NONE = 0,
  //! ECC errors present, but type unknown
  RSMI_RAS_ERR_STATE_PARITY = 1,
  //! Single correctable error
  RSMI_RAS_ERR_STATE_SING_C = 2,
  //! Multiple uncorrectable errors
  RSMI_RAS_ERR_STATE_MULT_UC = 3,
  //! Firmware detected error and isolated page. Treat as uncorrectable.
  RSMI_RAS_ERR_STATE_POISON = 4,

  //! Alias of the last valid state; keep in sync when states are added
  RSMI_RAS_ERR_STATE_LAST = RSMI_RAS_ERR_STATE_POISON,
  //! Sentinel used when no valid state could be determined
  RSMI_RAS_ERR_STATE_INVALID = 0xFFFFFFFF
} rsmi_ras_err_state_t;
/**
* @brief Types of memory
*/
@@ -952,7 +973,7 @@ rsmi_status_t rsmi_dev_fan_rpms_get(uint32_t dv_ind, uint32_t sensor_ind,
/**
* @brief Get the fan speed for the specified device in RPMs.
*
* @details Given a device index @p dv_ind
* @details Given a device index @p dv_ind
* this function will get the fan speed.
*
* @param[in] dv_ind a device index
@@ -1158,7 +1179,7 @@ rsmi_status_t rsmi_dev_gpu_clk_freq_get(uint32_t dv_ind,
*
* @param[in] dv_ind a device index
*
* @param[in] odv a pointer to an ::rsmi_od_volt_freq_data_t structure
* @param[in] odv a pointer to an ::rsmi_od_volt_freq_data_t structure
*
* @retval ::RSMI_STATUS_SUCCESS is returned upon successful call.
*/
@@ -1166,7 +1187,7 @@ rsmi_status_t rsmi_dev_od_volt_info_get(uint32_t dv_ind,
rsmi_od_volt_freq_data_t *odv);
/**
* @brief This function will retrieve the current valid regions in the
* @brief This function will retrieve the current valid regions in the
* frequency/voltage space.
*
* @details Given a device index @p dv_ind, a pointer to an unsigned integer
@@ -1176,7 +1197,7 @@ rsmi_status_t rsmi_dev_od_volt_info_get(uint32_t dv_ind,
* that can be written to by this function. The caller should also
* indicate the number of ::rsmi_freq_volt_region_t structures that can safely
* be written to @p buffer in @p num_regions.
*
*
* The number of regions to expect this function provide (@p num_regions) can
* be obtained by calling ::rsmi_dev_od_volt_info_get().
*
@@ -1186,7 +1207,7 @@ rsmi_status_t rsmi_dev_od_volt_info_get(uint32_t dv_ind,
* ::rsmi_freq_volt_region_t structures that can be written to @p buffer. As
* output, this is the number of ::rsmi_freq_volt_region_t structures that were
* actually written.
*
*
* @param[inout] buffer a caller provided buffer to which
* ::rsmi_freq_volt_region_t structures will be written
*
@@ -1352,15 +1373,15 @@ rsmi_status_t rsmi_dev_od_freq_range_set(uint32_t dv_ind, rsmi_clk_type_t clk,
/**
* @brief Get the build version information for the currently running build of
* RSMI.
*
*
* @details Get the major, minor, patch and build string for RSMI build
* currently in use through @p version
*
*
* @param[inout] version A pointer to an ::rsmi_version_t structure that will
* be updated with the version information upon return.
*
*
* @retval ::RSMI_STATUS_SUCCESS is returned upon successful call
*
*
*/
rsmi_status_t
rsmi_version_get(rsmi_version_t *version);
@@ -1398,25 +1419,66 @@ rsmi_dev_vbios_version_get(uint32_t dv_ind, char *vbios, uint32_t len);
/**
* @brief Retrieve the error counts for a GPU block
*
*
* @details Given a device index @p dv_ind, an ::rsmi_gpu_block_t @p block and a
* pointer to an ::rsmi_error_count_t @p ec, this function will write the error
* count values for the GPU block indicated by @p block to memory pointed to by
* @p ec.
*
*
* @param[in] dv_ind a device index
*
*
* @param[in] block The block for which error counts should be retrieved
*
*
* @param[inout] ec A pointer to an ::rsmi_error_count_t to which the error
* counts should be written
*
*
* @retval ::RSMI_STATUS_SUCCESS is returned upon successful call.
*
*
*/
rsmi_status_t rsmi_dev_error_count_get(uint32_t dv_ind,
rsmi_status_t rsmi_dev_ecc_count_get(uint32_t dv_ind,
rsmi_gpu_block_t block, rsmi_error_count_t *ec);
/**
* @brief Retrieve the enabled ECC bit-mask
*
* @details Given a device index @p dv_ind, and a pointer to a uint64_t @p
* enabled_mask, this function will write a bit_mask to memory pointed to by
* @p enabled_mask. Upon a successful call, the bitmask can then be AND'd with
* elements of the ::rsmi_gpu_block_t enumeration to determine if the
* corresponding block has ECC enabled. Note that the bits above
* ::RSMI_GPU_BLOCK_LAST correspond to blocks that do not yet have ECC support.
*
* @param[in] dv_ind a device index
*
* @param[inout] enabled_mask A pointer to a uint64_t to which the enabled
* mask will be written
*
* @retval ::RSMI_STATUS_SUCCESS is returned upon successful call.
*
*/
rsmi_status_t rsmi_dev_ecc_enabled_get(uint32_t dv_ind,
uint64_t *enabled_mask);
/**
* @brief Retrieve the ECC status for a GPU block
*
* @details Given a device index @p dv_ind, an ::rsmi_gpu_block_t @p block and
* a pointer to an ::rsmi_ras_err_state_t @p state, this function will write
* the current state for the GPU block indicated by @p block to memory pointed
* to by @p state.
*
* @param[in] dv_ind a device index
*
* @param[in] block The block for which the ECC state should be retrieved
*
* @param[inout] state A pointer to an ::rsmi_ras_err_state_t to which the
* ECC state should be written
*
* @retval ::RSMI_STATUS_SUCCESS is returned upon successful call.
*
*/
rsmi_status_t rsmi_dev_ecc_status_get(uint32_t dv_ind, rsmi_gpu_block_t block,
rsmi_ras_err_state_t *state);
/**
* @brief Get a description of a provided RSMI error status
*
+1
Vedi File
@@ -77,6 +77,7 @@ enum DevInfoTypes {
kDevErrCntSDMA,
kDevErrCntUMC,
kDevErrCntGFX,
kDevErrCntFeatures,
kDevMemTotGTT,
kDevMemTotVisVRAM,
kDevMemTotVRAM,
+113 -7
Vedi File
@@ -394,6 +394,9 @@ static rsmi_status_t get_dev_value_vec(amd::smi::DevInfoTypes type,
int ret = dev->readDevInfo(type, val_vec);
return errno_to_rsmi_status(ret);
}
// Returns true when exactly one bit of n is set, i.e. n is a non-zero
// power of 2; returns false for 0.
static bool is_power_of_2(uint64_t n) {
  if (n == 0) {
    return false;
  }
  // Clearing the lowest set bit leaves zero only if it was the sole set bit.
  return (n & (n - 1)) == 0;
}
rsmi_status_t
rsmi_init(uint64_t init_flags) {
@@ -428,8 +431,116 @@ rsmi_num_monitor_devices(uint32_t *num_devices) {
CATCH
}
// Retrieve the bit-mask of GPU blocks that have ECC enabled.
//
// Reads the multi-line kDevErrCntFeatures device attribute for device
// dv_ind and parses its first line, which is expected to have the form
// "feature mask: <hex value>". The parsed value is written to *enabled_mask.
//
// Returns:
//   RSMI_STATUS_INVALID_ARGS   if enabled_mask is nullptr
//   RSMI_STATUS_NOT_SUPPORTED  if reading the attribute reports a file error
//   RSMI_STATUS_NOT_FOUND      if the attribute is empty or no hexadecimal
//                              mask value could be parsed from its first line
//   otherwise the status from get_dev_value_vec(), or errno mapped through
//   errno_to_rsmi_status() after the numeric conversion.
//
// NOTE(review): TRY/CATCH are macros defined elsewhere in this file;
// presumably they translate exceptions into a status code -- confirm.
rsmi_status_t rsmi_dev_ecc_enabled_get(uint32_t dv_ind,
                                                    uint64_t *enabled_mask) {
  TRY
  if (enabled_mask == nullptr) {
    return RSMI_STATUS_INVALID_ARGS;
  }

  std::vector<std::string> val_vec;
  rsmi_status_t ret =
         get_dev_value_vec(amd::smi::kDevErrCntFeatures, dv_ind, &val_vec);

  if (ret == RSMI_STATUS_FILE_ERROR) {
    return RSMI_STATUS_NOT_SUPPORTED;
  }
  if (ret != RSMI_STATUS_SUCCESS) {
    return ret;
  }

  // Never index into an empty result; there is no mask line to parse.
  if (val_vec.empty()) {
    return RSMI_STATUS_NOT_FOUND;
  }

  std::string junk;
  std::istringstream fs1(val_vec[0]);
  std::string mask_str;

  fs1 >> junk;
  assert(junk == "feature");
  fs1 >> junk;
  assert(junk == "mask:");
  fs1 >> mask_str;

  // strtoull (not strtoul) so the full 64-bit mask survives on platforms
  // where unsigned long is only 32 bits wide.
  char *end = nullptr;
  errno = 0;
  const unsigned long long mask = strtoull(mask_str.c_str(), &end, 16);

  // No characters consumed means the mask token was missing or not a hex
  // number; report that instead of returning a bogus mask of 0.
  if (end == mask_str.c_str()) {
    return RSMI_STATUS_NOT_FOUND;
  }

  *enabled_mask = static_cast<uint64_t>(mask);
  assert(errno == 0);

  return errno_to_rsmi_status(errno);
  CATCH
}
// Labels used for each GPU block; rsmi_dev_ecc_status_get() appends ':' to
// a label and matches it against the first token of each line read from the
// kDevErrCntFeatures attribute.
static const char *kRSMIGpuBlkGFXFName = "gfx";
static const char *kRSMIGpuBlkSDMAFName = "sdma";
static const char *kRSMIGpuBlkUMCFName = "umc";

// Lookup table: rsmi_gpu_block_t value -> block label.
static const std::map<rsmi_gpu_block_t, const char *> kRocmSMIBlockMap = {
  {RSMI_GPU_BLOCK_GFX, kRSMIGpuBlkGFXFName},
  {RSMI_GPU_BLOCK_SDMA, kRSMIGpuBlkSDMAFName},
  {RSMI_GPU_BLOCK_UMC, kRSMIGpuBlkUMCFName},
};

// Compile-time reminder: trips when RSMI_GPU_BLOCK_LAST moves, i.e. when a
// block has been added to the enum without revisiting the table above.
static_assert(RSMI_GPU_BLOCK_LAST == RSMI_GPU_BLOCK_GFX,
              "rsmi_gpu_block_t and/or above name map need to be updated"
              " and then this assert");
// Lookup table: state string (the token following a block label in the
// kDevErrCntFeatures attribute) -> rsmi_ras_err_state_t value.
static const std::map<std::string, rsmi_ras_err_state_t> kRocmSMIStateMap = {
  {"none", RSMI_RAS_ERR_STATE_NONE},
  {"parity", RSMI_RAS_ERR_STATE_PARITY},
  {"single_correctable", RSMI_RAS_ERR_STATE_SING_C},
  {"multi_uncorrectable", RSMI_RAS_ERR_STATE_MULT_UC},
  {"poison", RSMI_RAS_ERR_STATE_POISON},
};

// Compile-time reminder: trips when RSMI_RAS_ERR_STATE_LAST moves, i.e. when
// a state has been added to the enum without revisiting the table above.
// The message names rsmi_ras_err_state_t, the enum this assert guards.
static_assert(RSMI_RAS_ERR_STATE_LAST == RSMI_RAS_ERR_STATE_POISON,
              "rsmi_ras_err_state_t and/or above name map need to be updated"
              " and then this assert");
// Retrieve the current ECC state of one GPU block.
//
// Reads the multi-line kDevErrCntFeatures device attribute for device
// dv_ind, skips its first (feature mask) line, and looks for the line whose
// first token is "<block label>:". The token that follows is translated via
// kRocmSMIStateMap and written to *state.
//
// Returns:
//   RSMI_STATUS_SUCCESS        state found and written to *state
//   RSMI_STATUS_INVALID_ARGS   state is nullptr, or block is not a single bit
//   RSMI_STATUS_NOT_SUPPORTED  reading the attribute reports a file error, or
//                              block is a single bit with no entry in
//                              kRocmSMIBlockMap (e.g. a reserved/future block)
//   RSMI_STATUS_NOT_FOUND      no line for the block, or its state string is
//                              not recognised; *state is set to
//                              RSMI_RAS_ERR_STATE_INVALID
//   otherwise the status from get_dev_value_vec().
//
// NOTE(review): TRY/CATCH are macros defined elsewhere in this file;
// presumably they translate exceptions into a status code -- confirm.
rsmi_status_t rsmi_dev_ecc_status_get(uint32_t dv_ind, rsmi_gpu_block_t block,
                                                rsmi_ras_err_state_t *state) {
  TRY
  if (state == nullptr) {
    return RSMI_STATUS_INVALID_ARGS;
  }
  if (!is_power_of_2(block)) {
    return RSMI_STATUS_INVALID_ARGS;
  }

  std::vector<std::string> val_vec;
  rsmi_status_t ret =
         get_dev_value_vec(amd::smi::kDevErrCntFeatures, dv_ind, &val_vec);

  if (ret == RSMI_STATUS_FILE_ERROR) {
    return RSMI_STATUS_NOT_SUPPORTED;
  }
  if (ret != RSMI_STATUS_SUCCESS) {
    return ret;
  }

  // A single-bit value can still be a block this library has no label for;
  // use find() so that case yields a status code rather than the
  // std::out_of_range that map::at() would throw.
  const auto blk_it = kRocmSMIBlockMap.find(block);
  if (blk_it == kRocmSMIBlockMap.end()) {
    return RSMI_STATUS_NOT_SUPPORTED;
  }

  std::string search_str = blk_it->second;
  search_str += ":";

  for (size_t i = 1; i < val_vec.size(); ++i) {  // Skip features line
    std::istringstream fs1(val_vec[i]);
    std::string blk_label;

    fs1 >> blk_label;
    if (blk_label != search_str) {
      continue;
    }

    std::string state_str;
    fs1 >> state_str;

    const auto st_it = kRocmSMIStateMap.find(state_str);
    assert(st_it != kRocmSMIStateMap.end());
    if (st_it == kRocmSMIStateMap.end()) {
      // Unrecognised state string: report it rather than throwing when
      // asserts are compiled out.
      *state = RSMI_RAS_ERR_STATE_INVALID;
      return RSMI_STATUS_NOT_FOUND;
    }

    *state = st_it->second;
    return RSMI_STATUS_SUCCESS;
  }

  assert(!"Block was not found");
  *state = RSMI_RAS_ERR_STATE_INVALID;
  return RSMI_STATUS_NOT_FOUND;
  CATCH
}
rsmi_status_t
rsmi_dev_error_count_get(uint32_t dv_ind, rsmi_gpu_block_t block,
rsmi_dev_ecc_count_get(uint32_t dv_ind, rsmi_gpu_block_t block,
rsmi_error_count_t *ec) {
std::vector<std::string> val_vec;
rsmi_status_t ret;
@@ -454,7 +565,6 @@ rsmi_dev_error_count_get(uint32_t dv_ind, rsmi_gpu_block_t block,
break;
default:
assert(!"Unsupported block provided to rsmi_dev_error_count_get()");
return RSMI_STATUS_NOT_SUPPORTED;
}
ret = get_dev_value_vec(type, dv_ind, &val_vec);
@@ -512,7 +622,7 @@ get_id(uint32_t dv_ind, amd::smi::DevInfoTypes typ, uint16_t *id) {
*id = strtoul(val_str.c_str(), nullptr, 16);
assert(errno == 0);
return RSMI_STATUS_SUCCESS;
return errno_to_rsmi_status(errno);
CATCH
}
@@ -832,10 +942,6 @@ static rsmi_status_t get_od_clk_volt_curve_regions(uint32_t dv_ind,
return RSMI_STATUS_SUCCESS;
CATCH
}
// True when n has exactly one bit set (a non-zero power of 2).
static bool is_power_of_2(uint64_t n) {
  const bool non_zero = (n != 0);
  // n & (n - 1) clears the lowest set bit; the result is zero only when no
  // other bit was set.
  const bool at_most_one_bit = ((n & (n - 1)) == 0);
  return non_zero && at_most_one_bit;
}
static rsmi_status_t set_power_profile(uint32_t dv_ind,
rsmi_power_profile_preset_masks_t profile) {
TRY
+3
Vedi File
@@ -80,6 +80,7 @@ static const char *kDevPCIEThruPutFName = "pcie_bw";
static const char *kDevErrCntSDMAFName = "ras/sdma_err_count";
static const char *kDevErrCntUMCFName = "ras/umc_err_count";
static const char *kDevErrCntGFXFName = "ras/gfx_err_count";
static const char *kDevErrCntFeaturesFName = "ras/features";
static const char *kDevMemTotGTTFName = "mem_info_gtt_total";
static const char *kDevMemTotVisVRAMFName = "mem_info_vis_vram_total";
static const char *kDevMemTotVRAMFName = "mem_info_vram_total";
@@ -119,6 +120,7 @@ static const std::map<DevInfoTypes, const char *> kDevAttribNameMap = {
{kDevErrCntSDMA, kDevErrCntSDMAFName},
{kDevErrCntUMC, kDevErrCntUMCFName},
{kDevErrCntGFX, kDevErrCntGFXFName},
{kDevErrCntFeatures, kDevErrCntFeaturesFName},
{kDevMemTotGTT, kDevMemTotGTTFName},
{kDevMemTotVisVRAM, kDevMemTotVisVRAMFName},
{kDevMemTotVRAM, kDevMemTotVRAMFName},
@@ -365,6 +367,7 @@ int Device::readDevInfo(DevInfoTypes type, std::vector<std::string> *val) {
case kDevErrCntSDMA:
case kDevErrCntUMC:
case kDevErrCntGFX:
case kDevErrCntFeatures:
return readDevInfoMultiLineStr(type, val);
break;
@@ -85,14 +85,44 @@ void TestErrCntRead::Close() {
void TestErrCntRead::Run(void) {
rsmi_status_t err;
rsmi_error_count_t ec;
uint64_t enabled_mask;
rsmi_ras_err_state_t err_state;
TestBase::Run();
for (uint32_t i = 0; i < num_monitor_devs(); ++i) {
PrintDeviceHeader(i);
for (uint32_t b = RSMI_GPU_BLOCK_FIRST; b <= RSMI_GPU_BLOCK_LAST; ++b) {
err = rsmi_dev_error_count_get(i, static_cast<rsmi_gpu_block_t>(b), &ec);
err = rsmi_dev_ecc_enabled_get(i, &enabled_mask);
if (err == RSMI_STATUS_NOT_SUPPORTED) {
std::cout <<
"\t**Error Count Enabled Mask for is not supported on this machine"
<< std::endl;
} else {
CHK_ERR_ASRT(err)
IF_VERB(STANDARD) {
std::cout << "Block Error Mask: 0x" << std::hex << enabled_mask <<
std::endl;
}
}
for (uint32_t b = RSMI_GPU_BLOCK_FIRST;
b <= RSMI_GPU_BLOCK_LAST; b = b*2) {
err = rsmi_dev_ecc_status_get(i, static_cast<rsmi_gpu_block_t>(b),
&err_state);
if (err == RSMI_STATUS_NOT_SUPPORTED) {
std::cout << "\t**Error Count Status for " <<
GetBlockNameStr(static_cast<rsmi_gpu_block_t>(b)) <<
": Not supported on this machine" << std::endl;
} else {
CHK_ERR_ASRT(err)
IF_VERB(STANDARD) {
std::cout << "\t**Error count status for " <<
GetBlockNameStr(static_cast<rsmi_gpu_block_t>(b)) <<
" block: " << GetErrStateNameStr(err_state) << std::endl;
}
}
err = rsmi_dev_ecc_count_get(i, static_cast<rsmi_gpu_block_t>(b), &ec);
if (err == RSMI_STATUS_NOT_SUPPORTED) {
std::cout << "\t**Error Count for " <<
+15
Vedi File
@@ -60,6 +60,18 @@ static const std::map<rsmi_gpu_block_t, const char *> kBlockNameMap = {
{RSMI_GPU_BLOCK_SDMA, "SDMA"},
{RSMI_GPU_BLOCK_GFX, "GFX"},
};
static_assert(RSMI_GPU_BLOCK_LAST == RSMI_GPU_BLOCK_GFX,
"kBlockNameMap needs to be updated");
// Printable names for each rsmi_ras_err_state_t value, used by
// GetErrStateNameStr() when reporting test output.
static const std::map<rsmi_ras_err_state_t, const char *> kErrStateNameMap = {
  {RSMI_RAS_ERR_STATE_POISON, "Poison"},
  {RSMI_RAS_ERR_STATE_MULT_UC, "Multiple, Uncorrectable"},
  {RSMI_RAS_ERR_STATE_SING_C, "Single, Correctable"},
  {RSMI_RAS_ERR_STATE_PARITY, "Error Unknown"},
  {RSMI_RAS_ERR_STATE_NONE, "None"},
};

// Compile-time reminder to extend the table when a new state is added.
static_assert(RSMI_RAS_ERR_STATE_LAST == RSMI_RAS_ERR_STATE_POISON,
              "kErrStateNameMap needs to be updated");
static const struct option long_options[] = {
{"iterations", required_argument, nullptr, 'i'},
@@ -134,6 +146,9 @@ uint32_t ProcessCmdline(RSMITstGlobals* test, int arg_cnt, char** arg_list) {
// Returns the printable name registered in kBlockNameMap for the given GPU
// block. Uses map::at(), so an id with no entry throws std::out_of_range.
const char *GetBlockNameStr(rsmi_gpu_block_t id) {
  const char *const block_name = kBlockNameMap.at(id);
  return block_name;
}
// Returns the printable name registered in kErrStateNameMap for the given
// ECC error state. Uses map::at(), so a state with no entry throws
// std::out_of_range.
const char *GetErrStateNameStr(rsmi_ras_err_state_t st) {
  const char *const state_name = kErrStateNameMap.at(st);
  return state_name;
}
const char *FreqEnumToStr(rsmi_clk_type rsmi_clk) {
static_assert(RSMI_CLK_TYPE_LAST == RSMI_CLK_TYPE_MEM,
+1
Vedi File
@@ -62,6 +62,7 @@ uint32_t ProcessCmdline(RSMITstGlobals* test, int arg_cnt, char** arg_list);
void PrintTestHeader(uint32_t dv_ind);
const char *GetBlockNameStr(rsmi_gpu_block_t id);
const char *GetErrStateNameStr(rsmi_ras_err_state_t st);
const char *FreqEnumToStr(rsmi_clk_type rsmi_clk);
#if ENABLE_SMI