Add new API for RAS related information

This adds an API to get the RAS EEPROM version and the ECC correction schema.

Change-Id: Iee6b3c555541a33bf16bf9ac1fd60100dfff5643
This commit is contained in:
Bill(Shuzhou) Liu
2023-10-10 10:11:16 -05:00
کامیت شده توسط Maisam Arif
والد 69c35a4cff
کامیت d92d4e4b38
8فایلهای تغییر یافته به همراه178 افزوده شده و 0 حذف شده
@@ -122,6 +122,16 @@ int main() {
return AMDSMI_STATUS_NOT_SUPPORTED;
}
amdsmi_ras_feature_t ras_feature;
ret = amdsmi_get_gpu_ras_feature_info(
processor_handles[j] ,&ras_feature);
if (ret != AMDSMI_STATUS_NOT_SUPPORTED) {
CHK_AMDSMI_RET(ret)
printf("\tras_feature: version: %x, schema: %x\n",
ras_feature.ras_eeprom_version, ras_feature.ecc_correction_schema_flag);
}
amdsmi_bdf_t bdf = {};
ret = amdsmi_get_gpu_device_bdf(processor_handles[j], &bdf);
CHK_AMDSMI_RET(ret)
+24
مشاهده پرونده
@@ -1163,6 +1163,16 @@ typedef struct {
/// @endcond
} amdsmi_gpu_metrics_t;
/**
 * @brief This structure holds RAS feature information: the RAS EEPROM
 * version and the ECC correction schema mask.
 */
typedef struct {
uint32_t ras_eeprom_version;  //!< RAS EEPROM version
// Bit layout of ecc_correction_schema_flag:
//   bit 0: parity error
//   bit 1: single bit correctable
//   bit 2: double bit error detection
//   bit 3: poison
uint32_t ecc_correction_schema_flag; //!< ECC correction schema bit mask
} amdsmi_ras_feature_t;
/**
* @brief This structure holds error counts.
*/
@@ -2004,6 +2014,20 @@ amdsmi_get_gpu_memory_usage(amdsmi_processor_handle processor_handle, amdsmi_mem
amdsmi_status_t
amdsmi_get_gpu_bad_page_info(amdsmi_processor_handle processor_handle, uint32_t *num_pages, amdsmi_retired_page_record_t *info);
/**
 * @brief Returns RAS feature info: the RAS EEPROM version and the ECC
 * correction schema mask of the given processor.
 *
 * @param[in] processor_handle Device handle which to query
 *
 * @param[out] ras_feature Pointer to an ::amdsmi_ras_feature_t that receives
 * the RAS EEPROM version and ECC correction schema mask. Must be allocated by
 * user; passing a null pointer yields ::AMDSMI_STATUS_INVAL.
 *
 * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail
 */
amdsmi_status_t amdsmi_get_gpu_ras_feature_info(
amdsmi_processor_handle processor_handle, amdsmi_ras_feature_t *ras_feature);
/**
* @brief Returns if RAS features are enabled or disabled for given block. It is not
* supported on virtual machine guest
@@ -1059,6 +1059,16 @@ typedef struct {
uint64_t uncorrectable_err; //!< Accumulated uncorrectable errors
} rsmi_error_count_t;
/**
 * @brief This structure holds RAS feature information: the RAS EEPROM
 * version and the ECC correction schema mask.
 */
typedef struct {
uint32_t ras_eeprom_version;  //!< RAS EEPROM version
// Bit layout of ecc_correction_schema_flag:
//   bit 0: parity error
//   bit 1: single bit correctable
//   bit 2: double bit error detection
//   bit 3: poison
uint32_t ecc_correction_schema_flag; //!< ECC correction schema bit mask
} rsmi_ras_feature_info_t;
/**
* @brief This structure contains information specific to a process.
*/
@@ -3279,6 +3289,33 @@ rsmi_status_t rsmi_dev_ecc_enabled_get(uint32_t dv_ind,
*/
rsmi_status_t rsmi_dev_ecc_status_get(uint32_t dv_ind, rsmi_gpu_block_t block,
rsmi_ras_err_state_t *state);
/**
 * @brief Returns RAS features info.
 *
 * @details Given a device index @p dv_ind, and
 * a pointer to an ::rsmi_ras_feature_info_t @p ras_feature, this function will write
 * the RAS EEPROM version and the ECC correction schema mask to the memory
 * pointed to by @p ras_feature.
 *
 * @param[in] dv_ind a device index
 *
 * @param[inout] ras_feature A pointer to an ::rsmi_ras_feature_info_t to which the
 * RAS info should be written.
 * If this parameter is nullptr, this function will return
 * ::RSMI_STATUS_INVALID_ARGS if the function is supported with the provided
 * arguments, and ::RSMI_STATUS_NOT_SUPPORTED if it is not supported with the
 * provided arguments.
 *
 * @retval ::RSMI_STATUS_SUCCESS call was successful
 * @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not
 * support this function with the given arguments
 * @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid
 */
rsmi_status_t rsmi_ras_feature_info_get(
uint32_t dv_ind, rsmi_ras_feature_info_t *ras_feature);
/**
* @brief Get a description of a provided RSMI error status
*
@@ -129,6 +129,8 @@ enum DevInfoTypes {
kDevErrCntPCIEBIF,
kDevErrCntHDP,
kDevErrCntXGMIWAFL,
kDevErrTableVersion,
kDevErrRASSchema,
kDevErrCntFeatures,
kDevMemTotGTT,
kDevMemTotVisVRAM,
+67
مشاهده پرونده
@@ -849,6 +849,73 @@ get_id(uint32_t dv_ind, amd::smi::DevInfoTypes typ, uint16_t *id) {
CATCH
}
// Parses the hexadecimal number that follows `key` at the very start of
// `line` (for example "table version: 0x10000" with key "table version: ").
// Returns true and stores the number in *value only when `line` begins with
// `key`, at least one digit was consumed, strtoul() reported no error, and
// the result fits in 32 bits. *value is left untouched on failure.
static bool parse_ras_hex_field(const std::string &line, const char *key,
                                uint32_t *value) {
  if (line.rfind(key, 0) != 0) {
    return false;
  }

  const std::string digits = line.substr(strlen(key));
  char *end = nullptr;
  errno = 0;
  const unsigned long parsed = strtoul(digits.c_str(), &end, 16);

  // end == start means nothing was converted; strtoul() then returns 0,
  // which must not be mistaken for a real value.
  if (errno != 0 || end == digits.c_str()) {
    return false;
  }
  // Reject values that would be silently truncated by the 32-bit fields.
  if (parsed != static_cast<uint32_t>(parsed)) {
    return false;
  }

  *value = static_cast<uint32_t>(parsed);
  return true;
}

// Reads the RAS EEPROM table version and the ECC correction schema of device
// `dv_ind` and stores them in *ras_feature.
//
// Each value is read as one text line (via kDevErrTableVersion and
// kDevErrRASSchema) of the form "table version: 0x10000" / "schema: 0xf".
//
// Returns RSMI_STATUS_SUCCESS when both values were read and parsed, the
// status of get_dev_value_line() when a read fails, and
// RSMI_STATUS_NOT_SUPPORTED when a line does not have the expected form.
// *ras_feature is only written when both values were obtained, so callers
// never observe a half-filled structure.
rsmi_status_t rsmi_ras_feature_info_get(
    uint32_t dv_ind, rsmi_ras_feature_info_t *ras_feature) {
  TRY
  rsmi_status_t ret;
  std::string feature_line;
  std::ostringstream ss;
  ss << __PRETTY_FUNCTION__ << " | ======= start =======";
  LOG_TRACE(ss);

  CHK_SUPPORT_NAME_ONLY(ras_feature)
  DEVICE_MUTEX

  ret = get_dev_value_line(amd::smi::kDevErrTableVersion,
                           dv_ind, &feature_line);
  if (ret != RSMI_STATUS_SUCCESS) {
    ss << __PRETTY_FUNCTION__ << " | ======= end ======="
       << ", returning get_dev_value_line() response = "
       << amd::smi::getRSMIStatusString(ret);
    LOG_ERROR(ss);
    return ret;
  }

  // table version: 0x10000
  uint32_t parsed_version = 0;
  if (!parse_ras_hex_field(feature_line, "table version: ",
                           &parsed_version)) {
    return RSMI_STATUS_NOT_SUPPORTED;
  }

  ret = get_dev_value_line(amd::smi::kDevErrRASSchema,
                           dv_ind, &feature_line);
  if (ret != RSMI_STATUS_SUCCESS) {
    ss << __PRETTY_FUNCTION__ << " | ======= end ======="
       << ", returning get_dev_value_line() response = "
       << amd::smi::getRSMIStatusString(ret);
    LOG_ERROR(ss);
    return ret;
  }

  // schema: 0xf
  uint32_t parsed_schema = 0;
  if (!parse_ras_hex_field(feature_line, "schema: ", &parsed_schema)) {
    return RSMI_STATUS_NOT_SUPPORTED;
  }

  // Publish both fields together, only after everything succeeded.
  ras_feature->ras_eeprom_version = parsed_version;
  ras_feature->ecc_correction_schema_flag = parsed_schema;

  return RSMI_STATUS_SUCCESS;
  CATCH
}
rsmi_status_t
rsmi_dev_id_get(uint32_t dv_ind, uint16_t *id) {
std::ostringstream ss;
@@ -112,6 +112,8 @@ static const char *kDevErrCntPCIEBIFFName = "ras/pcie_bif_err_count";
static const char *kDevErrCntHDPFName = "ras/hdp_err_count";
static const char *kDevErrCntXGMIWAFLFName = "ras/xgmi_wafl_err_count";
static const char *kDevErrCntFeaturesFName = "ras/features";
// Attribute file name mapped to kDevErrRASSchema; rsmi_ras_feature_info_get()
// expects its line to look like "schema: 0xf".
// NOTE(review): presumably relative to the device's sysfs directory, like the
// neighbouring "ras/..." names - confirm.
static const char *kDevErrRASSchemaFName = "ras/schema";
// Attribute file name mapped to kDevErrTableVersion;
// rsmi_ras_feature_info_get() expects its line to look like
// "table version: 0x10000".
static const char *kDevErrTableVersionFName = "ras/version";
static const char *kDevMemPageBadFName = "ras/gpu_vram_bad_pages";
static const char *kDevMemTotGTTFName = "mem_info_gtt_total";
static const char *kDevMemTotVisVRAMFName = "mem_info_vis_vram_total";
@@ -269,6 +271,8 @@ static const std::map<DevInfoTypes, const char *> kDevAttribNameMap = {
{kDevErrCntHDP, kDevErrCntHDPFName},
{kDevErrCntXGMIWAFL, kDevErrCntXGMIWAFLFName},
{kDevErrCntFeatures, kDevErrCntFeaturesFName},
{kDevErrTableVersion, kDevErrTableVersionFName},
{kDevErrRASSchema, kDevErrRASSchemaFName},
{kDevMemTotGTT, kDevMemTotGTTFName},
{kDevMemTotVisVRAM, kDevMemTotVisVRAMFName},
{kDevMemBusyPercent, kDevMemBusyPercentFName},
@@ -432,6 +436,8 @@ static const std::map<const char *, dev_depends_t> kDevFuncDependsMap = {
{"rsmi_dev_od_volt_curve_regions_get", {{kDevPowerODVoltageFName}, {}}},
{"rsmi_dev_ecc_enabled_get", {{kDevErrCntFeaturesFName}, {}}},
{"rsmi_dev_ecc_status_get", {{kDevErrCntFeaturesFName}, {}}},
{"rsmi_ras_feature_info_get", {{kDevErrRASSchemaFName,
kDevErrTableVersionFName}, {}}},
{"rsmi_dev_counter_group_supported", {{}, {}}},
{"rsmi_dev_counter_create", {{}, {}}},
{"rsmi_dev_xgmi_error_status", {{kDevXGMIErrorFName}, {}}},
@@ -933,6 +939,8 @@ int Device::readDevInfo(DevInfoTypes type, uint64_t *val) {
case kDevPCieVendorID:
case kDevErrCntFeatures:
case kDevXGMIPhysicalID:
case kDevErrRASSchema:
case kDevErrTableVersion:
ret = readDevInfoStr(type, &tempStr);
RET_IF_NONZERO(ret);
@@ -112,6 +112,8 @@ amd::smi::RocmSMI::devInfoTypesStrings = {
{amd::smi::kDevErrCntHDP, amdSMI + "kDevErrCntHDP"},
{amd::smi::kDevErrCntXGMIWAFL, amdSMI + "kDevErrCntXGMIWAFL"},
{amd::smi::kDevErrCntFeatures, amdSMI + "kDevErrCntFeatures"},
{amd::smi::kDevErrRASSchema, amdSMI + "kDevErrRASSchema"},
{amd::smi::kDevErrTableVersion, amdSMI + "kDevErrTableVersion"},
{amd::smi::kDevMemTotGTT, amdSMI + "kDevMemTotGTT"},
{amd::smi::kDevMemTotVisVRAM, amdSMI + "kDevMemTotVisVRAM"},
{amd::smi::kDevMemTotVRAM, amdSMI + "kDevMemTotVRAM"},
+28
مشاهده پرونده
@@ -1600,6 +1600,34 @@ amdsmi_get_gpu_bad_page_info(amdsmi_processor_handle processor_handle, uint32_t
return AMDSMI_STATUS_SUCCESS;
}
// Fills *ras_feature with the RAS EEPROM version and ECC correction schema
// mask obtained from rsmi_ras_feature_info_get() for the given processor.
//
// Returns AMDSMI_STATUS_INVAL when ras_feature is null, the failing status
// when the handle lookup or the underlying query fails, and
// AMDSMI_STATUS_SUCCESS otherwise. *ras_feature is written only on success.
amdsmi_status_t amdsmi_get_gpu_ras_feature_info(
    amdsmi_processor_handle processor_handle, amdsmi_ras_feature_t *ras_feature) {
  AMDSMI_CHECK_INIT();

  if (!ras_feature) {
    return AMDSMI_STATUS_INVAL;
  }

  // The lookup validates the handle; the device object itself is not used
  // any further in this function.
  amd::smi::AMDSmiGPUDevice* device = nullptr;
  const amdsmi_status_t lookup_status =
      get_gpu_device_from_handle(processor_handle, &device);
  if (lookup_status != AMDSMI_STATUS_SUCCESS) {
    return lookup_status;
  }

  rsmi_ras_feature_info_t rsmi_info;
  const amdsmi_status_t query_status = rsmi_wrapper(
      rsmi_ras_feature_info_get, processor_handle, &rsmi_info);
  if (query_status != AMDSMI_STATUS_SUCCESS) {
    return query_status;
  }

  // Copy field by field from the rsmi structure into the amdsmi one.
  ras_feature->ras_eeprom_version = rsmi_info.ras_eeprom_version;
  ras_feature->ecc_correction_schema_flag =
      rsmi_info.ecc_correction_schema_flag;
  return AMDSMI_STATUS_SUCCESS;
}
amdsmi_status_t
amdsmi_get_gpu_total_ecc_count(amdsmi_processor_handle processor_handle, amdsmi_error_count_t *ec) {
AMDSMI_CHECK_INIT();