diff --git a/include/amd_smi/amdsmi.h b/include/amd_smi/amdsmi.h
index eb41484e88..75da929b3c 100644
--- a/include/amd_smi/amdsmi.h
+++ b/include/amd_smi/amdsmi.h
@@ -260,6 +260,8 @@ typedef enum {
     AMDSMI_STATUS_ARG_PTR_NULL = 53,  //!< Parsed argument is invalid
     AMDSMI_STATUS_AMDGPU_RESTART_ERR = 54,  //!< AMDGPU restart failed
     AMDSMI_STATUS_SETTING_UNAVAILABLE = 55,  //!< Setting is not available
+    AMDSMI_STATUS_CORRUPTED_EEPROM = 56,  //!< EEPROM is corrupted
+
     // General errors
     AMDSMI_STATUS_MAP_ERROR = 0xFFFFFFFE,  //!< The internal library error did not map to a status code
     AMDSMI_STATUS_UNKNOWN_ERROR = 0xFFFFFFFF,  //!< An unknown error occurred
@@ -2640,6 +2642,50 @@ amdsmi_status_t
 amdsmi_get_gpu_bad_page_info(amdsmi_processor_handle processor_handle, uint32_t *num_pages,
                              amdsmi_retired_page_record_t *info);
 
+
+/**
+ * @brief Get the bad pages threshold of a processor. It is not supported on virtual
+ * machine guest
+ *
+ * @platform{gpu_bm_linux} @platform{host}
+ *
+ * @details This call will query the device @p processor_handle for the
+ * threshold of bad pages (written to @p threshold address).
+ *
+ * @param[in] processor_handle a processor handle
+ * @param[out] threshold pointer to a uint32_t that receives the bad page count
+ * threshold. Must not be null.
+ *
+ * @note This function requires root access
+ *
+ * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success,
+ * ::AMDSMI_STATUS_INVAL if @p threshold is null, non-zero on fail
+ */
+amdsmi_status_t
+amdsmi_get_gpu_bad_page_threshold(amdsmi_processor_handle processor_handle, uint32_t *threshold);
+
+/**
+ * @brief Verify the checksum of RAS EEPROM. It is not supported on virtual
+ * machine guest
+ *
+ * @platform{gpu_bm_linux} @platform{host}
+ *
+ * @details This call will verify the device @p processor_handle for the
+ * checksum of RAS EEPROM.
+ *
+ * @param[in] processor_handle a processor handle
+ *
+ * @note This function requires root access
+ * @note The checksum validation itself is not implemented yet; for a valid GPU
+ * handle the call currently returns ::AMDSMI_STATUS_NOT_SUPPORTED
+ *
+ * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success
+ * AMDSMI_STATUS_CORRUPTED_EEPROM on the device's EEPROM corruption
+ * others on fail
+ */
+amdsmi_status_t
+amdsmi_gpu_validate_ras_eeprom(amdsmi_processor_handle processor_handle);
+
 /**
  * @brief Returns RAS features info.
  *
diff --git a/include/amd_smi/impl/amd_smi_utils.h b/include/amd_smi/impl/amd_smi_utils.h
index 2749db8f04..97b9fa2756 100644
--- a/include/amd_smi/impl/amd_smi_utils.h
+++ b/include/amd_smi/impl/amd_smi_utils.h
@@ -45,6 +45,8 @@ amdsmi_status_t smi_amdgpu_get_power_cap(amd::smi::AMDSmiGPUDevice* device, int
 amdsmi_status_t smi_amdgpu_get_ranges(amd::smi::AMDSmiGPUDevice* device, amdsmi_clk_type_t domain, int *max_freq, int *min_freq, int *num_dpm, int *sleep_state_freq);
 amdsmi_status_t smi_amdgpu_get_enabled_blocks(amd::smi::AMDSmiGPUDevice* device, uint64_t *enabled_blocks);
 amdsmi_status_t smi_amdgpu_get_bad_page_info(amd::smi::AMDSmiGPUDevice* device, uint32_t *num_pages, amdsmi_retired_page_record_t *info);
+amdsmi_status_t smi_amdgpu_get_bad_page_threshold(amd::smi::AMDSmiGPUDevice* device, uint32_t *threshold);
+amdsmi_status_t smi_amdgpu_validate_ras_eeprom(amd::smi::AMDSmiGPUDevice* device);
 amdsmi_status_t smi_amdgpu_get_ecc_error_count(amd::smi::AMDSmiGPUDevice* device, amdsmi_error_count_t *err_cnt);
 amdsmi_status_t smi_amdgpu_get_driver_version(amd::smi::AMDSmiGPUDevice* device, int *length, char *version);
 amdsmi_status_t smi_amdgpu_get_pcie_speed_from_pcie_type(uint16_t pcie_type, uint32_t *pcie_speed);
diff --git a/src/amd_smi/amd_smi.cc b/src/amd_smi/amd_smi.cc
index 7b1ba9ad13..69f049f0af 100644
--- a/src/amd_smi/amd_smi.cc
+++ b/src/amd_smi/amd_smi.cc
@@ -2354,6 +2354,38 @@ amdsmi_get_gpu_bad_page_info(amdsmi_processor_handle processor_handle, uint32_t
     return AMDSMI_STATUS_SUCCESS;
 }
 
+// Public entry point: reads the RAS bad-page threshold of the GPU behind
+// processor_handle into *threshold. Rejects a null threshold pointer.
+amdsmi_status_t
+amdsmi_get_gpu_bad_page_threshold(amdsmi_processor_handle processor_handle, uint32_t *threshold) {
+    AMDSMI_CHECK_INIT();
+
+    if (threshold == nullptr) {
+        return AMDSMI_STATUS_INVAL;
+    }
+
+    amd::smi::AMDSmiGPUDevice* gpu_device = nullptr;
+    amdsmi_status_t r = get_gpu_device_from_handle(processor_handle, &gpu_device);
+    if (r != AMDSMI_STATUS_SUCCESS)
+        return r;
+
+    return smi_amdgpu_get_bad_page_threshold(gpu_device, threshold);
+}
+
+// Public entry point: resolves processor_handle to a GPU device and forwards
+// to the RAS EEPROM validation helper.
+amdsmi_status_t
+amdsmi_gpu_validate_ras_eeprom(amdsmi_processor_handle processor_handle) {
+    AMDSMI_CHECK_INIT();
+
+    amd::smi::AMDSmiGPUDevice* gpu_device = nullptr;
+    amdsmi_status_t r = get_gpu_device_from_handle(processor_handle, &gpu_device);
+    if (r != AMDSMI_STATUS_SUCCESS)
+        return r;
+
+    return smi_amdgpu_validate_ras_eeprom(gpu_device);
+}
+
 amdsmi_status_t amdsmi_get_gpu_ras_feature_info(
   amdsmi_processor_handle processor_handle, amdsmi_ras_feature_t *ras_feature) {
     AMDSMI_CHECK_INIT();
diff --git a/src/amd_smi/amd_smi_utils.cc b/src/amd_smi/amd_smi_utils.cc
index 08b224c6ef..26be69e8e3 100644
--- a/src/amd_smi/amd_smi_utils.cc
+++ b/src/amd_smi/amd_smi_utils.cc
@@ -433,6 +433,82 @@ amdsmi_status_t smi_amdgpu_get_bad_page_info(amd::smi::AMDSmiGPUDevice* device,
     return AMDSMI_STATUS_SUCCESS;
 }
 
+// Parses the trailing decimal digits of |path| into |*index|.
+// Returns false when |path| has no numeric suffix or the value does not fit
+// in a uint32_t; |*index| is left untouched in that case.
+static bool GetDeviceIndex(const std::string& path, uint32_t *index) {
+    // npos + 1 wraps to 0, so an all-digit string is kept whole.
+    const size_t first_digit = path.find_last_not_of("0123456789") + 1;
+    if (first_digit >= path.size()) {
+        return false;
+    }
+
+    // Accumulate by hand so a malformed or oversized suffix is reported to the
+    // caller instead of throwing (as std::stoi would).
+    uint64_t value = 0;
+    for (size_t i = first_digit; i < path.size(); ++i) {
+        value = value * 10 + static_cast<uint64_t>(path[i] - '0');
+        if (value > UINT32_MAX) {
+            return false;
+        }
+    }
+
+    *index = static_cast<uint32_t>(value);
+    return true;
+}
+
+// Reads the RAS bad-page-count threshold of |device| from debugfs into
+// |*threshold|. Returns AMDSMI_STATUS_NOT_SUPPORTED when DRM is unsupported,
+// the device index cannot be derived, or the node cannot be opened, and
+// AMDSMI_STATUS_API_FAILED when the node content is not an unsigned number.
+amdsmi_status_t smi_amdgpu_get_bad_page_threshold(amd::smi::AMDSmiGPUDevice* device,
+        uint32_t *threshold) {
+    if (!device->check_if_drm_is_supported()) {
+        return AMDSMI_STATUS_NOT_SUPPORTED;
+    }
+    SMIGPUDEVICE_MUTEX(device->get_mutex())
+
+    //TODO: Accessing the node requires root privileges, and its interface may need to be exposed in another path
+    uint32_t index = 0;
+    if (!GetDeviceIndex(device->get_gpu_path(), &index)) {
+        return AMDSMI_STATUS_NOT_SUPPORTED;
+    }
+    const std::string fullpath = "/sys/kernel/debug/dri/" + std::to_string(index) + "/ras/bad_page_cnt_threshold";
+    std::ifstream fs(fullpath.c_str());
+    if (fs.fail()) {
+        return AMDSMI_STATUS_NOT_SUPPORTED;
+    }
+
+    std::string line;
+    if (!std::getline(fs, line)) {
+        return AMDSMI_STATUS_API_FAILED;
+    }
+
+    // sscanf returns the number of converted fields: 0 (no digits) must be
+    // treated as a failure too, otherwise *threshold would stay unset.
+    unsigned int value = 0;
+    if (sscanf(line.c_str(), "%u", &value) != 1) {
+        return AMDSMI_STATUS_API_FAILED;
+    }
+    *threshold = value;
+
+    // fs is closed by its destructor on every return path.
+    return AMDSMI_STATUS_SUCCESS;
+}
+
+// Placeholder for RAS EEPROM checksum validation of |device|; currently
+// always reports AMDSMI_STATUS_NOT_SUPPORTED.
+amdsmi_status_t smi_amdgpu_validate_ras_eeprom(amd::smi::AMDSmiGPUDevice* device) {
+    if (!device->check_if_drm_is_supported()) {
+        return AMDSMI_STATUS_NOT_SUPPORTED;
+    }
+    SMIGPUDEVICE_MUTEX(device->get_mutex())
+
+    //TODO: need to expose the corresponding interface to validate the checksum of ras eeprom table.
+    //verify fail: return AMDSMI_STATUS_CORRUPTED_EEPROM
+    return AMDSMI_STATUS_NOT_SUPPORTED;
+}
+
 amdsmi_status_t smi_amdgpu_get_ecc_error_count(amd::smi::AMDSmiGPUDevice* device, amdsmi_error_count_t *err_cnt) {
     if (!device->check_if_drm_is_supported()) {
         return AMDSMI_STATUS_NOT_SUPPORTED;