diff --git a/example/amd_smi_nodrm_example.cc b/example/amd_smi_nodrm_example.cc
index ec02ec53d8..3b4adce76a 100644
--- a/example/amd_smi_nodrm_example.cc
+++ b/example/amd_smi_nodrm_example.cc
@@ -122,6 +122,16 @@ int main() {
             return AMDSMI_STATUS_NOT_SUPPORTED;
         }
 
+        // RAS feature info is optional: print it only where it is supported.
+        amdsmi_ras_feature_t ras_feature = {};
+        ret = amdsmi_get_gpu_ras_feature_info(
+                processor_handles[j], &ras_feature);
+        if (ret != AMDSMI_STATUS_NOT_SUPPORTED) {
+            CHK_AMDSMI_RET(ret)
+            printf("\tras_feature: version: %x, schema: %x\n",
+                ras_feature.ras_eeprom_version, ras_feature.ecc_correction_schema_flag);
+        }
+
         amdsmi_bdf_t bdf = {};
         ret = amdsmi_get_gpu_device_bdf(processor_handles[j], &bdf);
         CHK_AMDSMI_RET(ret)
diff --git a/include/amd_smi/amdsmi.h b/include/amd_smi/amdsmi.h
index 5546bc46c7..6de6ba028f 100644
--- a/include/amd_smi/amdsmi.h
+++ b/include/amd_smi/amdsmi.h
@@ -1163,6 +1163,16 @@ typedef struct {
   /// @endcond
 } amdsmi_gpu_metrics_t;
 
+/**
+ * @brief This structure holds RAS feature information.
+ */
+typedef struct {
+  uint32_t ras_eeprom_version;  //!< RAS EEPROM table version
+  //! ECC correction schema mask: parity error (bit 0), single-bit correctable
+  //! (bit 1), double-bit error detection (bit 2), poison (bit 3).
+  uint32_t ecc_correction_schema_flag;
+} amdsmi_ras_feature_t;
+
 /**
  * @brief This structure holds error counts.
  */
@@ -2004,6 +2014,20 @@ amdsmi_get_gpu_memory_usage(amdsmi_processor_handle processor_handle, amdsmi_mem
 amdsmi_status_t
 amdsmi_get_gpu_bad_page_info(amdsmi_processor_handle processor_handle, uint32_t *num_pages, amdsmi_retired_page_record_t *info);
 
+/**
+ * @brief Returns RAS features info.
+ *
+ * @param[in] processor_handle Device handle which to query
+ *
+ * @param[out] ras_feature RAS features that are currently enabled and supported on
+ *                         the processor. Must be allocated by user and must not
+ *                         be nullptr (::AMDSMI_STATUS_INVAL is returned if it is).
+ *
+ * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail
+ */
+amdsmi_status_t amdsmi_get_gpu_ras_feature_info(
+  amdsmi_processor_handle processor_handle, amdsmi_ras_feature_t *ras_feature);
+
 /**
  * @brief Returns if RAS features are enabled or disabled for given block. It is not
  * supported on virtual machine guest
diff --git a/rocm_smi/include/rocm_smi/rocm_smi.h b/rocm_smi/include/rocm_smi/rocm_smi.h
index 3e2735f870..67607fa186 100755
--- a/rocm_smi/include/rocm_smi/rocm_smi.h
+++ b/rocm_smi/include/rocm_smi/rocm_smi.h
@@ -1059,6 +1059,16 @@ typedef struct {
     uint64_t uncorrectable_err;  //!< Accumulated uncorrectable errors
 } rsmi_error_count_t;
 
+/**
+ * @brief This structure holds RAS feature information.
+ */
+typedef struct {
+  uint32_t ras_eeprom_version;  //!< RAS EEPROM table version
+  //! ECC correction schema mask: parity error (bit 0), single-bit correctable
+  //! (bit 1), double-bit error detection (bit 2), poison (bit 3).
+  uint32_t ecc_correction_schema_flag;
+} rsmi_ras_feature_info_t;
+
 /**
  * @brief This structure contains information specific to a process.
  */
@@ -3279,6 +3289,32 @@ rsmi_status_t rsmi_dev_ecc_enabled_get(uint32_t dv_ind,
  */
 rsmi_status_t rsmi_dev_ecc_status_get(uint32_t dv_ind, rsmi_gpu_block_t block,
                                                   rsmi_ras_err_state_t *state);
+
+/**
+ * @brief Returns RAS features info.
+ *
+ * @details Given a device index @p dv_ind, and
+ * a pointer to an ::rsmi_ras_feature_info_t @p ras_feature, this function will write
+ * the ras feature info to memory pointed to by @p ras_feature.
+ *
+ * @param[in] dv_ind a device index
+ *
+ * @param[inout] ras_feature A pointer to an ::rsmi_ras_feature_info_t to which the
+ * RAS info should be written.
+ * If this parameter is nullptr, this function will return
+ * ::RSMI_STATUS_INVALID_ARGS if the function is supported with the provided
+ * arguments, and ::RSMI_STATUS_NOT_SUPPORTED if it is not supported with the
+ * provided arguments.
+ *
+ * @retval ::RSMI_STATUS_SUCCESS call was successful
+ * @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not
+ * support this function with the given arguments, or the RAS version/schema
+ * data read from the device could not be parsed
+ * @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid
+ */
+rsmi_status_t rsmi_ras_feature_info_get(
+  uint32_t dv_ind, rsmi_ras_feature_info_t *ras_feature);
+
 /**
  * @brief Get a description of a provided RSMI error status
  *
diff --git a/rocm_smi/include/rocm_smi/rocm_smi_device.h b/rocm_smi/include/rocm_smi/rocm_smi_device.h
index 6aedfb0c1a..c728326691 100755
--- a/rocm_smi/include/rocm_smi/rocm_smi_device.h
+++ b/rocm_smi/include/rocm_smi/rocm_smi_device.h
@@ -129,6 +129,8 @@ enum DevInfoTypes {
   kDevErrCntPCIEBIF,
   kDevErrCntHDP,
   kDevErrCntXGMIWAFL,
+  kDevErrTableVersion,
+  kDevErrRASSchema,
   kDevErrCntFeatures,
   kDevMemTotGTT,
   kDevMemTotVisVRAM,
diff --git a/rocm_smi/src/rocm_smi.cc b/rocm_smi/src/rocm_smi.cc
index 521c615016..6fe0c41d26 100755
--- a/rocm_smi/src/rocm_smi.cc
+++ b/rocm_smi/src/rocm_smi.cc
@@ -849,6 +849,81 @@ get_id(uint32_t dv_ind, amd::smi::DevInfoTypes typ, uint16_t *id) {
   CATCH
 }
 
+// Parses a RAS sysfs line such as "schema: 0xf" into *value: key is the
+// expected prefix and the rest is read as a hexadecimal number. Returns false
+// when the line does not start with key, when no hex digits follow it, or when
+// the number does not fit in 32 bits, so malformed input is never reported.
+static bool parse_ras_hex_field(const std::string &line, const char *key,
+                                uint32_t *value) {
+  if (line.rfind(key, 0) != 0) {
+    return false;
+  }
+
+  const char *digits = line.c_str() + strlen(key);
+  char *end = nullptr;
+  errno = 0;
+  const auto parsed = strtoul(digits, &end, 16);
+  // With no digits strtoul() returns 0 and may leave errno untouched.
+  if (errno != 0 || end == digits || parsed > UINT32_MAX) {
+    return false;
+  }
+
+  *value = static_cast<uint32_t>(parsed);
+  return true;
+}
+
+rsmi_status_t rsmi_ras_feature_info_get(
+  uint32_t dv_ind, rsmi_ras_feature_info_t *ras_feature) {
+  TRY
+  rsmi_status_t ret;
+  std::string feature_line;
+  std::ostringstream ss;
+  ss << __PRETTY_FUNCTION__ << " | ======= start =======";
+  LOG_TRACE(ss);
+
+  CHK_SUPPORT_NAME_ONLY(ras_feature)
+
+  DEVICE_MUTEX
+
+  // Expected content, e.g.: "table version: 0x10000"
+  ret = get_dev_value_line(amd::smi::kDevErrTableVersion,
+                          dv_ind, &feature_line);
+  if (ret != RSMI_STATUS_SUCCESS) {
+    ss << __PRETTY_FUNCTION__ << " | ======= end ======="
+       << ", returning get_dev_value_line() response = "
+       << amd::smi::getRSMIStatusString(ret);
+    LOG_ERROR(ss);
+    return ret;
+  }
+  uint32_t eeprom_version = 0;
+  if (!parse_ras_hex_field(feature_line, "table version: ",
+                           &eeprom_version)) {
+    return RSMI_STATUS_NOT_SUPPORTED;
+  }
+
+  // Expected content, e.g.: "schema: 0xf"
+  ret = get_dev_value_line(amd::smi::kDevErrRASSchema,
+                          dv_ind, &feature_line);
+  if (ret != RSMI_STATUS_SUCCESS) {
+    ss << __PRETTY_FUNCTION__ << " | ======= end ======="
+       << ", returning get_dev_value_line() response = "
+       << amd::smi::getRSMIStatusString(ret);
+    LOG_ERROR(ss);
+    return ret;
+  }
+  uint32_t schema = 0;
+  if (!parse_ras_hex_field(feature_line, "schema: ", &schema)) {
+    return RSMI_STATUS_NOT_SUPPORTED;
+  }
+
+  // Write the outputs only once both values are known to be valid, so a
+  // failure never leaves *ras_feature partially updated.
+  ras_feature->ras_eeprom_version = eeprom_version;
+  ras_feature->ecc_correction_schema_flag = schema;
+  return RSMI_STATUS_SUCCESS;
+  CATCH
+}
+
 rsmi_status_t
 rsmi_dev_id_get(uint32_t dv_ind, uint16_t *id) {
   std::ostringstream ss;
diff --git a/rocm_smi/src/rocm_smi_device.cc b/rocm_smi/src/rocm_smi_device.cc
index 85aebef6f4..61fd9a3885 100755
--- a/rocm_smi/src/rocm_smi_device.cc
+++ b/rocm_smi/src/rocm_smi_device.cc
@@ -112,6 +112,8 @@ static const char *kDevErrCntPCIEBIFFName = "ras/pcie_bif_err_count";
 static const char *kDevErrCntHDPFName = "ras/hdp_err_count";
 static const char *kDevErrCntXGMIWAFLFName = "ras/xgmi_wafl_err_count";
 static const char *kDevErrCntFeaturesFName = "ras/features";
+static const char *kDevErrRASSchemaFName = "ras/schema";
+static const char *kDevErrTableVersionFName = "ras/version";
 static const char *kDevMemPageBadFName = "ras/gpu_vram_bad_pages";
 static const char *kDevMemTotGTTFName = "mem_info_gtt_total";
 static const char *kDevMemTotVisVRAMFName = "mem_info_vis_vram_total";
@@ -269,6 +271,8 @@ static const std::map kDevAttribNameMap = {
     {kDevErrCntHDP, kDevErrCntHDPFName},
     {kDevErrCntXGMIWAFL, kDevErrCntXGMIWAFLFName},
     {kDevErrCntFeatures, kDevErrCntFeaturesFName},
+    {kDevErrTableVersion, kDevErrTableVersionFName},
+    {kDevErrRASSchema, kDevErrRASSchemaFName},
     {kDevMemTotGTT, kDevMemTotGTTFName},
     {kDevMemTotVisVRAM, kDevMemTotVisVRAMFName},
     {kDevMemBusyPercent, kDevMemBusyPercentFName},
@@ -432,6 +436,8 @@ static const std::map kDevFuncDependsMap = {
   {"rsmi_dev_od_volt_curve_regions_get", {{kDevPowerODVoltageFName}, {}}},
   {"rsmi_dev_ecc_enabled_get", {{kDevErrCntFeaturesFName}, {}}},
   {"rsmi_dev_ecc_status_get", {{kDevErrCntFeaturesFName}, {}}},
+  {"rsmi_ras_feature_info_get", {{kDevErrRASSchemaFName,
+                                  kDevErrTableVersionFName}, {}}},
   {"rsmi_dev_counter_group_supported", {{}, {}}},
   {"rsmi_dev_counter_create", {{}, {}}},
   {"rsmi_dev_xgmi_error_status", {{kDevXGMIErrorFName}, {}}},
@@ -933,6 +939,8 @@ int Device::readDevInfo(DevInfoTypes type, uint64_t *val) {
     case kDevPCieVendorID:
     case kDevErrCntFeatures:
     case kDevXGMIPhysicalID:
+    case kDevErrRASSchema:
+    case kDevErrTableVersion:
       ret = readDevInfoStr(type, &tempStr);
       RET_IF_NONZERO(ret);
 
diff --git a/rocm_smi/src/rocm_smi_main.cc b/rocm_smi/src/rocm_smi_main.cc
index 3b7293cd49..0a07449b8b 100755
--- a/rocm_smi/src/rocm_smi_main.cc
+++ b/rocm_smi/src/rocm_smi_main.cc
@@ -112,6 +112,8 @@ amd::smi::RocmSMI::devInfoTypesStrings = {
   {amd::smi::kDevErrCntHDP, amdSMI + "kDevErrCntHDP"},
   {amd::smi::kDevErrCntXGMIWAFL, amdSMI + "kDevErrCntXGMIWAFL"},
   {amd::smi::kDevErrCntFeatures, amdSMI + "kDevErrCntFeatures"},
+  {amd::smi::kDevErrRASSchema, amdSMI + "kDevErrRASSchema"},
+  {amd::smi::kDevErrTableVersion, amdSMI + "kDevErrTableVersion"},
   {amd::smi::kDevMemTotGTT, amdSMI + "kDevMemTotGTT"},
   {amd::smi::kDevMemTotVisVRAM, amdSMI + "kDevMemTotVisVRAM"},
   {amd::smi::kDevMemTotVRAM, amdSMI + "kDevMemTotVRAM"},
diff --git a/src/amd_smi/amd_smi.cc b/src/amd_smi/amd_smi.cc
index ff5988c126..1d1bae6231 100644
--- a/src/amd_smi/amd_smi.cc
+++ b/src/amd_smi/amd_smi.cc
@@ -1600,6 +1600,36 @@ amdsmi_get_gpu_bad_page_info(amdsmi_processor_handle processor_handle, uint32_t
     return AMDSMI_STATUS_SUCCESS;
 }
 
+// Checks the arguments, forwards the query to rsmi_ras_feature_info_get()
+// through rsmi_wrapper() and copies the result into *ras_feature.
+amdsmi_status_t amdsmi_get_gpu_ras_feature_info(
+  amdsmi_processor_handle processor_handle, amdsmi_ras_feature_t *ras_feature) {
+    AMDSMI_CHECK_INIT();
+
+    if (ras_feature == nullptr) {
+        return AMDSMI_STATUS_INVAL;
+    }
+
+    // gpu_device is not used below; the lookup is kept for its status check.
+    amd::smi::AMDSmiGPUDevice* gpu_device = nullptr;
+    amdsmi_status_t r = get_gpu_device_from_handle(processor_handle,
+                            &gpu_device);
+    if (r != AMDSMI_STATUS_SUCCESS)
+        return r;
+
+    rsmi_ras_feature_info_t rsmi_ras_feature = {};
+    r = rsmi_wrapper(rsmi_ras_feature_info_get, processor_handle,
+                &rsmi_ras_feature);
+    if (r != AMDSMI_STATUS_SUCCESS)
+        return r;
+
+    ras_feature->ecc_correction_schema_flag
+        = rsmi_ras_feature.ecc_correction_schema_flag;
+    ras_feature->ras_eeprom_version = rsmi_ras_feature.ras_eeprom_version;
+
+    return AMDSMI_STATUS_SUCCESS;
+}
+
 amdsmi_status_t amdsmi_get_gpu_total_ecc_count(amdsmi_processor_handle processor_handle, amdsmi_error_count_t *ec) {
     AMDSMI_CHECK_INIT();
 