[SWDEV-230863] add two new interfaces for background health check (#4)

1. Get the bad page threshold of a processor.
2. Verify the checksum of the RAS EEPROM.

Signed-off-by: Meng Li <li.meng@amd.com>
Этот коммит содержится в:
Meng, Li (Jassmine)
2025-01-08 07:26:55 +08:00
коммит произвёл GitHub
родитель d32f2a109a
Коммит dc400d916e
4 изменённых файлов: 124 добавлений и 0 удалений
+40
Просмотреть файл
@@ -260,6 +260,8 @@ typedef enum {
AMDSMI_STATUS_ARG_PTR_NULL = 53, //!< Parsed argument is invalid
AMDSMI_STATUS_AMDGPU_RESTART_ERR = 54, //!< AMDGPU restart failed
AMDSMI_STATUS_SETTING_UNAVAILABLE = 55, //!< Setting is not available
AMDSMI_STATUS_CORRUPTED_EEPROM = 56, //!< EEPROM is corrupted
// General errors
AMDSMI_STATUS_MAP_ERROR = 0xFFFFFFFE, //!< The internal library error did not map to a status code
AMDSMI_STATUS_UNKNOWN_ERROR = 0xFFFFFFFF, //!< An unknown error occurred
@@ -2640,6 +2642,44 @@ amdsmi_status_t
amdsmi_get_gpu_bad_page_info(amdsmi_processor_handle processor_handle, uint32_t *num_pages,
amdsmi_retired_page_record_t *info);
/**
* @brief Get the bad page threshold of a processor. It is not supported on
* virtual machine guests.
*
* @platform{gpu_bm_linux} @platform{host}
*
* @details This call queries the device @p processor_handle for its bad page
* count threshold and writes the value to the location pointed to by
* @p threshold.
*
* @param[in] processor_handle a processor handle
*
* @param[out] threshold pointer to a uint32_t that receives the bad page count
* threshold. If this pointer is null, ::AMDSMI_STATUS_INVAL is returned.
*
* @note This function requires root access
*
* @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail
*/
amdsmi_status_t
amdsmi_get_gpu_bad_page_threshold(amdsmi_processor_handle processor_handle, uint32_t *threshold);
/**
* @brief Verify the checksum of the RAS EEPROM. It is not supported on virtual
* machine guests.
*
* @platform{gpu_bm_linux} @platform{host}
*
* @details This call asks the device @p processor_handle to validate the
* checksum of its RAS EEPROM table.
*
* @param[in] processor_handle a processor handle
*
* @note This function requires root access
*
* @note NOTE(review): the backend in this change is a placeholder that reports
* ::AMDSMI_STATUS_NOT_SUPPORTED until a driver interface for the checksum
* validation is exposed - confirm before relying on the result.
*
* @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success,
* ::AMDSMI_STATUS_CORRUPTED_EEPROM if the device's EEPROM is corrupted,
* any other value on failure
*/
amdsmi_status_t
amdsmi_gpu_validate_ras_eeprom(amdsmi_processor_handle processor_handle);
/**
* @brief Returns RAS features info.
*
+2
Просмотреть файл
@@ -45,6 +45,8 @@ amdsmi_status_t smi_amdgpu_get_power_cap(amd::smi::AMDSmiGPUDevice* device, int
amdsmi_status_t smi_amdgpu_get_ranges(amd::smi::AMDSmiGPUDevice* device, amdsmi_clk_type_t domain, int *max_freq, int *min_freq, int *num_dpm, int *sleep_state_freq);
amdsmi_status_t smi_amdgpu_get_enabled_blocks(amd::smi::AMDSmiGPUDevice* device, uint64_t *enabled_blocks);
amdsmi_status_t smi_amdgpu_get_bad_page_info(amd::smi::AMDSmiGPUDevice* device, uint32_t *num_pages, amdsmi_retired_page_record_t *info);
// Reads the bad page count threshold of `device` from its RAS debugfs node and stores it in *threshold.
amdsmi_status_t smi_amdgpu_get_bad_page_threshold(amd::smi::AMDSmiGPUDevice* device, uint32_t *threshold);
// Meant to validate the RAS EEPROM checksum of `device`; the implementation is still a TODO placeholder.
amdsmi_status_t smi_amdgpu_validate_ras_eeprom(amd::smi::AMDSmiGPUDevice* device);
amdsmi_status_t smi_amdgpu_get_ecc_error_count(amd::smi::AMDSmiGPUDevice* device, amdsmi_error_count_t *err_cnt);
amdsmi_status_t smi_amdgpu_get_driver_version(amd::smi::AMDSmiGPUDevice* device, int *length, char *version);
amdsmi_status_t smi_amdgpu_get_pcie_speed_from_pcie_type(uint16_t pcie_type, uint32_t *pcie_speed);
+34
Просмотреть файл
@@ -2354,6 +2354,40 @@ amdsmi_get_gpu_bad_page_info(amdsmi_processor_handle processor_handle, uint32_t
return AMDSMI_STATUS_SUCCESS;
}
// Public entry point: reports the bad page count threshold of the GPU behind
// processor_handle through the threshold out-parameter.
amdsmi_status_t
amdsmi_get_gpu_bad_page_threshold(amdsmi_processor_handle processor_handle, uint32_t *threshold) {
    AMDSMI_CHECK_INIT();

    // The result is written through threshold, so a null pointer is rejected up front.
    if (!threshold)
        return AMDSMI_STATUS_INVAL;

    // Resolve the opaque handle to the GPU device object; lookup errors go straight to the caller.
    amd::smi::AMDSmiGPUDevice* gpu = nullptr;
    const amdsmi_status_t lookup = get_gpu_device_from_handle(processor_handle, &gpu);
    if (lookup != AMDSMI_STATUS_SUCCESS) {
        return lookup;
    }

    // Both success and failure codes from the backend are passed through unchanged.
    return smi_amdgpu_get_bad_page_threshold(gpu, threshold);
}
// Public entry point: asks the backend to validate the RAS EEPROM checksum of
// the GPU behind processor_handle and returns the backend's status as-is.
amdsmi_status_t
amdsmi_gpu_validate_ras_eeprom(amdsmi_processor_handle processor_handle) {
    AMDSMI_CHECK_INIT();

    // Resolve the opaque handle to the GPU device object.
    amd::smi::AMDSmiGPUDevice* gpu = nullptr;
    const amdsmi_status_t lookup = get_gpu_device_from_handle(processor_handle, &gpu);

    if (lookup == AMDSMI_STATUS_SUCCESS) {
        return smi_amdgpu_validate_ras_eeprom(gpu);
    }
    return lookup;
}
amdsmi_status_t amdsmi_get_gpu_ras_feature_info(
amdsmi_processor_handle processor_handle, amdsmi_ras_feature_t *ras_feature) {
AMDSMI_CHECK_INIT();
+48
Просмотреть файл
@@ -433,6 +433,54 @@ amdsmi_status_t smi_amdgpu_get_bad_page_info(amd::smi::AMDSmiGPUDevice* device,
return AMDSMI_STATUS_SUCCESS;
}
// Extracts the trailing decimal number from a device path/name, e.g.
// "card0" -> 0, "renderD128" -> 128.
//
// Returns the parsed index, or the all-ones uint32_t value (UINT32_MAX) when
// `s` does not end in a decimal number or the number does not fit. The parse
// never throws, so a malformed path cannot raise an exception through the C
// API; callers that build a filesystem path from the sentinel simply fail to
// open it.
static uint32_t GetDeviceIndex(const std::string& s) {
    const uint32_t kInvalidIndex = static_cast<uint32_t>(-1);

    // Position of the first character of the trailing run of digits.
    const size_t last_non_digit = s.find_last_not_of("0123456789");
    const size_t first_digit =
        (last_non_digit == std::string::npos) ? 0 : last_non_digit + 1;

    // No trailing digits at all (this also covers the empty string).
    if (first_digit >= s.size()) {
        return kInvalidIndex;
    }

    // Accumulate in 64 bits so an over-long number is detected instead of wrapping.
    uint64_t value = 0;
    for (size_t i = first_digit; i < s.size(); ++i) {
        value = value * 10 + static_cast<uint64_t>(s[i] - '0');
        if (value >= kInvalidIndex) {
            return kInvalidIndex;
        }
    }
    return static_cast<uint32_t>(value);
}
// Reads the RAS bad page count threshold of `device` from debugfs.
//
// device:    GPU device to query.
// threshold: receives the parsed threshold; left untouched unless
//            AMDSMI_STATUS_SUCCESS is returned.
//
// Returns AMDSMI_STATUS_NOT_SUPPORTED when DRM is not supported for the device
// or the debugfs node cannot be opened, AMDSMI_STATUS_API_FAILED when the node
// content is not an unsigned decimal number, AMDSMI_STATUS_SUCCESS otherwise.
amdsmi_status_t smi_amdgpu_get_bad_page_threshold(amd::smi::AMDSmiGPUDevice* device,
        uint32_t *threshold) {
    if (!device->check_if_drm_is_supported()) {
        return AMDSMI_STATUS_NOT_SUPPORTED;
    }
    SMIGPUDEVICE_MUTEX(device->get_mutex())

    //TODO: Accessing the node requires root privileges, and its interface may need to be exposed in another path
    uint32_t index = GetDeviceIndex(device->get_gpu_path());
    std::string fullpath = "/sys/kernel/debug/dri/" + std::to_string(index) + std::string("/ras/bad_page_cnt_threshold");
    std::ifstream fs(fullpath.c_str());
    if (fs.fail()) {
        return AMDSMI_STATUS_NOT_SUPPORTED;
    }

    std::string line;
    getline(fs, line);

    // sscanf returns the number of successful conversions: 0 for a
    // non-numeric line and EOF for an empty one. Anything other than exactly
    // one conversion means no value was read, so it must not be reported as
    // success. The value is unsigned, hence "%u" into an unsigned local.
    unsigned int parsed = 0;
    if (sscanf(line.c_str(), "%u", &parsed) != 1) {
        return AMDSMI_STATUS_API_FAILED;
    }
    *threshold = static_cast<uint32_t>(parsed);

    // The stream is closed by its destructor on every return path.
    return AMDSMI_STATUS_SUCCESS;
}
// Placeholder for validating the checksum of the RAS EEPROM table of `device`.
//
// No driver interface for the validation is wired up yet, so after the DRM
// support check this always returns AMDSMI_STATUS_NOT_SUPPORTED.
amdsmi_status_t smi_amdgpu_validate_ras_eeprom(amd::smi::AMDSmiGPUDevice* device) {
    if (!device->check_if_drm_is_supported()) {
        return AMDSMI_STATUS_NOT_SUPPORTED;
    }
    SMIGPUDEVICE_MUTEX(device->get_mutex())

    //TODO: need to expose the corresponding interface to validate the checksum of ras eeprom table.
    //      The device index (GetDeviceIndex(device->get_gpu_path())) is not computed here
    //      until there is a node to read with it.
    //verify fail: return AMDSMI_STATUS_CORRUPTED_EEPROM
    return AMDSMI_STATUS_NOT_SUPPORTED;
}
amdsmi_status_t smi_amdgpu_get_ecc_error_count(amd::smi::AMDSmiGPUDevice* device, amdsmi_error_count_t *err_cnt) {
if (!device->check_if_drm_is_supported()) {
return AMDSMI_STATUS_NOT_SUPPORTED;