Add new API for RAS-related information
Add an API to get the RAS EEPROM version and the ECC correction schema. Change-Id: Iee6b3c555541a33bf16bf9ac1fd60100dfff5643
This commit is contained in:
کامیت شده توسط
Maisam Arif
والد
69c35a4cff
کامیت
d92d4e4b38
@@ -122,6 +122,16 @@ int main() {
|
||||
return AMDSMI_STATUS_NOT_SUPPORTED;
|
||||
}
|
||||
|
||||
amdsmi_ras_feature_t ras_feature;
|
||||
ret = amdsmi_get_gpu_ras_feature_info(
|
||||
processor_handles[j] ,&ras_feature);
|
||||
if (ret != AMDSMI_STATUS_NOT_SUPPORTED) {
|
||||
CHK_AMDSMI_RET(ret)
|
||||
printf("\tras_feature: version: %x, schema: %x\n",
|
||||
ras_feature.ras_eeprom_version, ras_feature.ecc_correction_schema_flag);
|
||||
}
|
||||
|
||||
|
||||
amdsmi_bdf_t bdf = {};
|
||||
ret = amdsmi_get_gpu_device_bdf(processor_handles[j], &bdf);
|
||||
CHK_AMDSMI_RET(ret)
|
||||
|
||||
@@ -1163,6 +1163,16 @@ typedef struct {
|
||||
/// @endcond
|
||||
} amdsmi_gpu_metrics_t;
|
||||
|
||||
/**
 * @brief RAS feature information: the RAS EEPROM version and the ECC
 * correction schema bitmask.
 */
typedef struct {
  uint32_t ras_eeprom_version;  //!< RAS EEPROM version
  /**
   * ECC correction schema mask. Bit assignment:
   *   bit 0 - parity error
   *   bit 1 - single-bit correctable
   *   bit 2 - double-bit error detection
   *   bit 3 - poison
   */
  uint32_t ecc_correction_schema_flag;
} amdsmi_ras_feature_t;
|
||||
|
||||
/**
|
||||
* @brief This structure holds error counts.
|
||||
*/
|
||||
@@ -2004,6 +2014,20 @@ amdsmi_get_gpu_memory_usage(amdsmi_processor_handle processor_handle, amdsmi_mem
|
||||
amdsmi_status_t
|
||||
amdsmi_get_gpu_bad_page_info(amdsmi_processor_handle processor_handle, uint32_t *num_pages, amdsmi_retired_page_record_t *info);
|
||||
|
||||
/**
|
||||
* @brief Returns RAS features info.
|
||||
*
|
||||
* @param[in] processor_handle Device handle which to query
|
||||
*
|
||||
* @param[out] ras_feature RAS features that are currently enabled and supported on
|
||||
* the processor. Must be allocated by user.
|
||||
*
|
||||
* @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail
|
||||
*/
|
||||
amdsmi_status_t amdsmi_get_gpu_ras_feature_info(
|
||||
amdsmi_processor_handle processor_handle, amdsmi_ras_feature_t *ras_feature);
|
||||
|
||||
|
||||
/**
|
||||
* @brief Returns if RAS features are enabled or disabled for given block. It is not
|
||||
* supported on virtual machine guest
|
||||
|
||||
@@ -1059,6 +1059,16 @@ typedef struct {
|
||||
uint64_t uncorrectable_err; //!< Accumulated uncorrectable errors
|
||||
} rsmi_error_count_t;
|
||||
|
||||
/**
 * @brief RAS feature information: the RAS EEPROM version and the ECC
 * correction schema bitmask.
 */
typedef struct {
  uint32_t ras_eeprom_version;  //!< RAS EEPROM version
  /**
   * ECC correction schema mask. Bit assignment:
   *   bit 0 - parity error
   *   bit 1 - single-bit correctable
   *   bit 2 - double-bit error detection
   *   bit 3 - poison
   */
  uint32_t ecc_correction_schema_flag;
} rsmi_ras_feature_info_t;
|
||||
|
||||
/**
|
||||
* @brief This structure contains information specific to a process.
|
||||
*/
|
||||
@@ -3279,6 +3289,33 @@ rsmi_status_t rsmi_dev_ecc_enabled_get(uint32_t dv_ind,
|
||||
*/
|
||||
rsmi_status_t rsmi_dev_ecc_status_get(uint32_t dv_ind, rsmi_gpu_block_t block,
|
||||
rsmi_ras_err_state_t *state);
|
||||
|
||||
|
||||
/**
|
||||
* @brief Returns RAS features info.
|
||||
*
|
||||
* @details Given a device index @p dv_ind, and
|
||||
* a pointer to an ::rsmi_ras_feature_info_t @p ras_feature, this function will write
|
||||
* the ras feature info to memory pointed to by @p ras_feature.
|
||||
*
|
||||
* @param[in] dv_ind a device index
|
||||
*
|
||||
* @param[inout] ras_feature A pointer to an ::rsmi_ras_feature_info_t to which the
|
||||
* RAS info should be written
|
||||
* If this parameter is nullptr, this function will return
|
||||
* ::RSMI_STATUS_INVALID_ARGS if the function is supported with the provided,
|
||||
* arguments and ::RSMI_STATUS_NOT_SUPPORTED if it is not supported with the
|
||||
* provided arguments.
|
||||
*
|
||||
* @retval ::RSMI_STATUS_SUCCESS call was successful
|
||||
* @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not
|
||||
* support this function with the given arguments
|
||||
* @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid
|
||||
*/
|
||||
rsmi_status_t rsmi_ras_feature_info_get(
|
||||
uint32_t dv_ind, rsmi_ras_feature_info_t *ras_feature);
|
||||
|
||||
|
||||
/**
|
||||
* @brief Get a description of a provided RSMI error status
|
||||
*
|
||||
|
||||
@@ -129,6 +129,8 @@ enum DevInfoTypes {
|
||||
kDevErrCntPCIEBIF,
|
||||
kDevErrCntHDP,
|
||||
kDevErrCntXGMIWAFL,
|
||||
kDevErrTableVersion,
|
||||
kDevErrRASSchema,
|
||||
kDevErrCntFeatures,
|
||||
kDevMemTotGTT,
|
||||
kDevMemTotVisVRAM,
|
||||
|
||||
@@ -849,6 +849,73 @@ get_id(uint32_t dv_ind, amd::smi::DevInfoTypes typ, uint16_t *id) {
|
||||
CATCH
|
||||
}
|
||||
|
||||
// Parses the hexadecimal number that follows @p key at the very start of
// @p line (for example "table version: 0x10000" or "schema: 0xf").
//
// Returns true and stores the number in @p value only when:
//   - @p line begins with @p key,
//   - strtoul() consumed at least one digit (errno alone does not reveal an
//     empty conversion, so the end pointer is checked as well),
//   - strtoul() reported no error through errno, and
//   - the result fits in 32 bits.
// Characters after the number (such as a trailing newline) are ignored.
// @p value is left untouched on failure.
static bool parse_ras_hex_field(const std::string &line, const char *key,
                                uint32_t *value) {
  const auto key_len = strlen(key);
  if (line.compare(0, key_len, key) != 0) {
    return false;
  }

  const char *digits = line.c_str() + key_len;
  char *end = nullptr;
  errno = 0;
  const unsigned long parsed = strtoul(digits, &end, 16);
  if (errno != 0 || end == digits) {
    return false;
  }
  // Reject values that would be silently truncated by the uint32_t fields.
  if (parsed != static_cast<uint32_t>(parsed)) {
    return false;
  }

  *value = static_cast<uint32_t>(parsed);
  return true;
}

// Fills @p ras_feature with the RAS EEPROM table version and the ECC
// correction schema of device @p dv_ind.
//
// Two lines are fetched with get_dev_value_line(): one for
// amd::smi::kDevErrTableVersion, expected to look like
// "table version: 0x10000", and one for amd::smi::kDevErrRASSchema, expected
// to look like "schema: 0xf".
//
// Returns:
//   - the status of get_dev_value_line() when either read fails (logged),
//   - RSMI_STATUS_NOT_SUPPORTED when a line does not have the expected
//     format or its value cannot be parsed,
//   - RSMI_STATUS_SUCCESS otherwise.
// The CHK_SUPPORT_NAME_ONLY / DEVICE_MUTEX / TRY / CATCH macros may produce
// further return values; their definitions are outside this block.
//
// @p ras_feature is written only after both values were parsed, so a failure
// never leaves it partially updated.
rsmi_status_t rsmi_ras_feature_info_get(
    uint32_t dv_ind, rsmi_ras_feature_info_t *ras_feature) {
  TRY
  rsmi_status_t ret;
  std::string feature_line;
  std::ostringstream ss;
  ss << __PRETTY_FUNCTION__ << " | ======= start =======";
  LOG_TRACE(ss);

  CHK_SUPPORT_NAME_ONLY(ras_feature)

  DEVICE_MUTEX

  ret = get_dev_value_line(amd::smi::kDevErrTableVersion,
                           dv_ind, &feature_line);
  if (ret != RSMI_STATUS_SUCCESS) {
    ss << __PRETTY_FUNCTION__ << " | ======= end ======="
       << ", returning get_dev_value_line() response = "
       << amd::smi::getRSMIStatusString(ret);
    LOG_ERROR(ss);
    return ret;
  }

  // table version: 0x10000
  uint32_t eeprom_version = 0;
  if (!parse_ras_hex_field(feature_line, "table version: ",
                           &eeprom_version)) {
    return RSMI_STATUS_NOT_SUPPORTED;
  }

  ret = get_dev_value_line(amd::smi::kDevErrRASSchema,
                           dv_ind, &feature_line);
  if (ret != RSMI_STATUS_SUCCESS) {
    ss << __PRETTY_FUNCTION__ << " | ======= end ======="
       << ", returning get_dev_value_line() response = "
       << amd::smi::getRSMIStatusString(ret);
    LOG_ERROR(ss);
    return ret;
  }

  // schema: 0xf
  uint32_t schema = 0;
  if (!parse_ras_hex_field(feature_line, "schema: ", &schema)) {
    return RSMI_STATUS_NOT_SUPPORTED;
  }

  // Commit the output only now that both fields are known to be valid.
  ras_feature->ras_eeprom_version = eeprom_version;
  ras_feature->ecc_correction_schema_flag = schema;

  return RSMI_STATUS_SUCCESS;
  CATCH
}
|
||||
|
||||
rsmi_status_t
|
||||
rsmi_dev_id_get(uint32_t dv_ind, uint16_t *id) {
|
||||
std::ostringstream ss;
|
||||
|
||||
@@ -112,6 +112,8 @@ static const char *kDevErrCntPCIEBIFFName = "ras/pcie_bif_err_count";
|
||||
static const char *kDevErrCntHDPFName = "ras/hdp_err_count";
|
||||
static const char *kDevErrCntXGMIWAFLFName = "ras/xgmi_wafl_err_count";
|
||||
static const char *kDevErrCntFeaturesFName = "ras/features";
|
||||
static const char *kDevErrRASSchemaFName = "ras/schema";
|
||||
static const char *kDevErrTableVersionFName = "ras/version";
|
||||
static const char *kDevMemPageBadFName = "ras/gpu_vram_bad_pages";
|
||||
static const char *kDevMemTotGTTFName = "mem_info_gtt_total";
|
||||
static const char *kDevMemTotVisVRAMFName = "mem_info_vis_vram_total";
|
||||
@@ -269,6 +271,8 @@ static const std::map<DevInfoTypes, const char *> kDevAttribNameMap = {
|
||||
{kDevErrCntHDP, kDevErrCntHDPFName},
|
||||
{kDevErrCntXGMIWAFL, kDevErrCntXGMIWAFLFName},
|
||||
{kDevErrCntFeatures, kDevErrCntFeaturesFName},
|
||||
{kDevErrTableVersion, kDevErrTableVersionFName},
|
||||
{kDevErrRASSchema, kDevErrRASSchemaFName},
|
||||
{kDevMemTotGTT, kDevMemTotGTTFName},
|
||||
{kDevMemTotVisVRAM, kDevMemTotVisVRAMFName},
|
||||
{kDevMemBusyPercent, kDevMemBusyPercentFName},
|
||||
@@ -432,6 +436,8 @@ static const std::map<const char *, dev_depends_t> kDevFuncDependsMap = {
|
||||
{"rsmi_dev_od_volt_curve_regions_get", {{kDevPowerODVoltageFName}, {}}},
|
||||
{"rsmi_dev_ecc_enabled_get", {{kDevErrCntFeaturesFName}, {}}},
|
||||
{"rsmi_dev_ecc_status_get", {{kDevErrCntFeaturesFName}, {}}},
|
||||
{"rsmi_ras_feature_info_get", {{kDevErrRASSchemaFName,
|
||||
kDevErrTableVersionFName}, {}}},
|
||||
{"rsmi_dev_counter_group_supported", {{}, {}}},
|
||||
{"rsmi_dev_counter_create", {{}, {}}},
|
||||
{"rsmi_dev_xgmi_error_status", {{kDevXGMIErrorFName}, {}}},
|
||||
@@ -933,6 +939,8 @@ int Device::readDevInfo(DevInfoTypes type, uint64_t *val) {
|
||||
case kDevPCieVendorID:
|
||||
case kDevErrCntFeatures:
|
||||
case kDevXGMIPhysicalID:
|
||||
case kDevErrRASSchema:
|
||||
case kDevErrTableVersion:
|
||||
ret = readDevInfoStr(type, &tempStr);
|
||||
RET_IF_NONZERO(ret);
|
||||
|
||||
|
||||
@@ -112,6 +112,8 @@ amd::smi::RocmSMI::devInfoTypesStrings = {
|
||||
{amd::smi::kDevErrCntHDP, amdSMI + "kDevErrCntHDP"},
|
||||
{amd::smi::kDevErrCntXGMIWAFL, amdSMI + "kDevErrCntXGMIWAFL"},
|
||||
{amd::smi::kDevErrCntFeatures, amdSMI + "kDevErrCntFeatures"},
|
||||
{amd::smi::kDevErrRASSchema, amdSMI + "kDevErrRASSchema"},
|
||||
{amd::smi::kDevErrTableVersion, amdSMI + "kDevErrTableVersion"},
|
||||
{amd::smi::kDevMemTotGTT, amdSMI + "kDevMemTotGTT"},
|
||||
{amd::smi::kDevMemTotVisVRAM, amdSMI + "kDevMemTotVisVRAM"},
|
||||
{amd::smi::kDevMemTotVRAM, amdSMI + "kDevMemTotVRAM"},
|
||||
|
||||
@@ -1600,6 +1600,34 @@ amdsmi_get_gpu_bad_page_info(amdsmi_processor_handle processor_handle, uint32_t
|
||||
return AMDSMI_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
// Queries the RAS feature information (EEPROM version and ECC correction
// schema mask) for the given processor handle through
// rsmi_ras_feature_info_get and copies it into @p ras_feature.
//
// Returns AMDSMI_STATUS_INVAL when @p ras_feature is null, the status of
// get_gpu_device_from_handle() or rsmi_wrapper() when either of them fails,
// and AMDSMI_STATUS_SUCCESS otherwise. @p ras_feature is only written on
// success.
amdsmi_status_t amdsmi_get_gpu_ras_feature_info(
    amdsmi_processor_handle processor_handle, amdsmi_ras_feature_t *ras_feature) {
  AMDSMI_CHECK_INIT();

  if (!ras_feature) {
    return AMDSMI_STATUS_INVAL;
  }

  // The device object is looked up but not used afterwards; a failure of the
  // lookup is returned to the caller unchanged.
  amd::smi::AMDSmiGPUDevice* device = nullptr;
  amdsmi_status_t status =
      get_gpu_device_from_handle(processor_handle, &device);
  if (status != AMDSMI_STATUS_SUCCESS) {
    return status;
  }

  rsmi_ras_feature_info_t rsmi_info;
  status = rsmi_wrapper(rsmi_ras_feature_info_get, processor_handle,
                        &rsmi_info);
  if (status != AMDSMI_STATUS_SUCCESS) {
    return status;
  }

  // Field-by-field copy from the ROCm SMI structure to the AMD SMI one.
  ras_feature->ras_eeprom_version = rsmi_info.ras_eeprom_version;
  ras_feature->ecc_correction_schema_flag =
      rsmi_info.ecc_correction_schema_flag;

  return AMDSMI_STATUS_SUCCESS;
}
|
||||
|
||||
amdsmi_status_t
|
||||
amdsmi_get_gpu_total_ecc_count(amdsmi_processor_handle processor_handle, amdsmi_error_count_t *ec) {
|
||||
AMDSMI_CHECK_INIT();
|
||||
|
||||
مرجع در شماره جدید
Block a user