[SWDEV-230863] add two new interfaces for background health check (#4)
1. Get the bad pages threshold of a processor. 2. Verify the checksum of RAS EEPROM Signed-off-by: Meng Li <li.meng@amd.com>
Этот коммит содержится в:
коммит произвёл
GitHub
родитель
d32f2a109a
Коммит
dc400d916e
@@ -260,6 +260,8 @@ typedef enum {
|
||||
AMDSMI_STATUS_ARG_PTR_NULL = 53, //!< Parsed argument is invalid
|
||||
AMDSMI_STATUS_AMDGPU_RESTART_ERR = 54, //!< AMDGPU restart failed
|
||||
AMDSMI_STATUS_SETTING_UNAVAILABLE = 55, //!< Setting is not available
|
||||
AMDSMI_STATUS_CORRUPTED_EEPROM = 56, //!< EEPROM is corrupted
|
||||
|
||||
// General errors
|
||||
AMDSMI_STATUS_MAP_ERROR = 0xFFFFFFFE, //!< The internal library error did not map to a status code
|
||||
AMDSMI_STATUS_UNKNOWN_ERROR = 0xFFFFFFFF, //!< An unknown error occurred
|
||||
@@ -2640,6 +2642,44 @@ amdsmi_status_t
|
||||
amdsmi_get_gpu_bad_page_info(amdsmi_processor_handle processor_handle, uint32_t *num_pages,
|
||||
amdsmi_retired_page_record_t *info);
|
||||
|
||||
|
||||
/**
|
||||
* @brief Get the bad pages threshold of a processor. It is not supported on virtual
|
||||
* machine guest
|
||||
*
|
||||
* @platform{gpu_bm_linux} @platform{host}
|
||||
*
|
||||
* @details This call will query the device @p processor_handle for the
|
||||
* threshold of bad pages (written to @p threshold address).
|
||||
* @param[in] processor_handle a processor handle
|
||||
* @param[out] threshold pointer to the location that receives the bad page count threshold.
|
||||
*
|
||||
* @note This function requires root access
|
||||
*
|
||||
* @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on failure
|
||||
*/
|
||||
amdsmi_status_t
|
||||
amdsmi_get_gpu_bad_page_threshold(amdsmi_processor_handle processor_handle, uint32_t *threshold);
|
||||
|
||||
/**
|
||||
* @brief Verify the checksum of RAS EEPROM. It is not supported on virtual
|
||||
* machine guest
|
||||
*
|
||||
* @platform{gpu_bm_linux} @platform{host}
|
||||
*
|
||||
* @details This call will verify the device @p processor_handle for the
|
||||
* checksum of RAS EEPROM.
|
||||
* @param[in] processor_handle a processor handle
|
||||
*
|
||||
* @note This function requires root access
|
||||
*
|
||||
* @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success
|
||||
* ::AMDSMI_STATUS_CORRUPTED_EEPROM if the device's EEPROM is corrupted
|
||||
* any other status code on failure
|
||||
*/
|
||||
amdsmi_status_t
|
||||
amdsmi_gpu_validate_ras_eeprom(amdsmi_processor_handle processor_handle);
|
||||
|
||||
/**
|
||||
* @brief Returns RAS features info.
|
||||
*
|
||||
|
||||
@@ -45,6 +45,8 @@ amdsmi_status_t smi_amdgpu_get_power_cap(amd::smi::AMDSmiGPUDevice* device, int
|
||||
amdsmi_status_t smi_amdgpu_get_ranges(amd::smi::AMDSmiGPUDevice* device, amdsmi_clk_type_t domain, int *max_freq, int *min_freq, int *num_dpm, int *sleep_state_freq);
|
||||
amdsmi_status_t smi_amdgpu_get_enabled_blocks(amd::smi::AMDSmiGPUDevice* device, uint64_t *enabled_blocks);
|
||||
amdsmi_status_t smi_amdgpu_get_bad_page_info(amd::smi::AMDSmiGPUDevice* device, uint32_t *num_pages, amdsmi_retired_page_record_t *info);
|
||||
amdsmi_status_t smi_amdgpu_get_bad_page_threshold(amd::smi::AMDSmiGPUDevice* device, uint32_t *threshold);
|
||||
amdsmi_status_t smi_amdgpu_validate_ras_eeprom(amd::smi::AMDSmiGPUDevice* device);
|
||||
amdsmi_status_t smi_amdgpu_get_ecc_error_count(amd::smi::AMDSmiGPUDevice* device, amdsmi_error_count_t *err_cnt);
|
||||
amdsmi_status_t smi_amdgpu_get_driver_version(amd::smi::AMDSmiGPUDevice* device, int *length, char *version);
|
||||
amdsmi_status_t smi_amdgpu_get_pcie_speed_from_pcie_type(uint16_t pcie_type, uint32_t *pcie_speed);
|
||||
|
||||
@@ -2354,6 +2354,40 @@ amdsmi_get_gpu_bad_page_info(amdsmi_processor_handle processor_handle, uint32_t
|
||||
return AMDSMI_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
// Public entry point: validates the output pointer, resolves the processor
// handle to a GPU device, and forwards to the amdgpu helper that reads the
// bad page threshold. The helper's status is returned unchanged.
amdsmi_status_t
amdsmi_get_gpu_bad_page_threshold(amdsmi_processor_handle processor_handle, uint32_t *threshold) {
    AMDSMI_CHECK_INIT();

    // The result is written through this pointer, so it has to be valid.
    if (!threshold) {
        return AMDSMI_STATUS_INVAL;
    }

    amd::smi::AMDSmiGPUDevice* target = nullptr;
    const amdsmi_status_t lookup = get_gpu_device_from_handle(processor_handle, &target);
    if (lookup != AMDSMI_STATUS_SUCCESS) {
        return lookup;
    }

    // Success and failure are both reported by the helper's own status code.
    return smi_amdgpu_get_bad_page_threshold(target, threshold);
}
|
||||
|
||||
// Public entry point: resolves the processor handle to a GPU device and
// forwards to the amdgpu helper that validates the RAS EEPROM. A failed
// handle lookup is reported with the lookup's own status code.
amdsmi_status_t
amdsmi_gpu_validate_ras_eeprom(amdsmi_processor_handle processor_handle) {
    AMDSMI_CHECK_INIT();

    amd::smi::AMDSmiGPUDevice* target = nullptr;
    const amdsmi_status_t lookup = get_gpu_device_from_handle(processor_handle, &target);
    if (lookup == AMDSMI_STATUS_SUCCESS) {
        return smi_amdgpu_validate_ras_eeprom(target);
    }

    return lookup;
}
|
||||
|
||||
amdsmi_status_t amdsmi_get_gpu_ras_feature_info(
|
||||
amdsmi_processor_handle processor_handle, amdsmi_ras_feature_t *ras_feature) {
|
||||
AMDSMI_CHECK_INIT();
|
||||
|
||||
@@ -433,6 +433,54 @@ amdsmi_status_t smi_amdgpu_get_bad_page_info(amd::smi::AMDSmiGPUDevice* device,
|
||||
return AMDSMI_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
/**
 * @brief Extract the trailing decimal number from a string.
 *
 * @details Everything up to and including the last non-digit character is
 * discarded and the remaining digits are parsed as a decimal integer, e.g.
 * "card12" yields 12. A string made up only of digits is parsed as a whole.
 *
 * @param[in] s string ending in a decimal number.
 *
 * @return the trailing number of @p s.
 *
 * @throws std::invalid_argument if @p s does not end in a digit (thrown by std::stoi).
 * @throws std::out_of_range if the number does not fit in an int (thrown by std::stoi).
 */
static uint32_t GetDeviceIndex(const std::string& s) {
    const size_t last_non_digit = s.find_last_not_of("0123456789");

    // npos means the whole string is digits; otherwise keep only the tail
    // that follows the last non-digit character.
    const std::string digits =
        (last_non_digit == std::string::npos) ? s : s.substr(last_non_digit + 1);

    // Parse once and reuse the value for both the sanity check and the result.
    const int index = std::stoi(digits);
    assert(index >= 0);

    return static_cast<uint32_t>(index);
}
|
||||
|
||||
/**
 * @brief Read the RAS bad page count threshold of a GPU from debugfs.
 *
 * @details Reads the first line of
 * /sys/kernel/debug/dri/<index>/ras/bad_page_cnt_threshold, where <index> is
 * the trailing number of the device's GPU path, and parses it as an unsigned
 * decimal integer.
 *
 * @param[in] device GPU device to query; dereferenced without a null check.
 * @param[out] threshold receives the parsed threshold; written only on success.
 *
 * @return ::AMDSMI_STATUS_SUCCESS on success,
 *         ::AMDSMI_STATUS_NOT_SUPPORTED if DRM is not supported or the node
 *         cannot be opened,
 *         ::AMDSMI_STATUS_API_FAILED if the node content is not a number.
 */
amdsmi_status_t smi_amdgpu_get_bad_page_threshold(amd::smi::AMDSmiGPUDevice* device,
                                                   uint32_t *threshold) {
    if (!device->check_if_drm_is_supported()) {
        return AMDSMI_STATUS_NOT_SUPPORTED;
    }
    SMIGPUDEVICE_MUTEX(device->get_mutex())

    // TODO: Accessing the node requires root privileges, and its interface may need to be exposed in another path
    uint32_t index = GetDeviceIndex(device->get_gpu_path());
    std::string fullpath = "/sys/kernel/debug/dri/" + std::to_string(index) + std::string("/ras/bad_page_cnt_threshold");
    std::ifstream fs(fullpath.c_str());

    if (fs.fail()) {
        return AMDSMI_STATUS_NOT_SUPPORTED;
    }

    std::string line;
    getline(fs, line);

    // Parse into a local with the matching unsigned conversion and require
    // exactly one converted field: sscanf returns 0 (not a negative value)
    // when the line is not a number, which must not be reported as success.
    unsigned int parsed = 0;
    if (sscanf(line.c_str(), "%u", &parsed) != 1) {
        return AMDSMI_STATUS_API_FAILED;
    }
    *threshold = static_cast<uint32_t>(parsed);

    // The stream is closed by its destructor on every return path.
    return AMDSMI_STATUS_SUCCESS;
}
|
||||
|
||||
/**
 * @brief Validate the checksum of the RAS EEPROM table of a GPU.
 *
 * @details Not implemented yet: after the DRM support check this always
 * reports ::AMDSMI_STATUS_NOT_SUPPORTED.
 *
 * @param[in] device GPU device to validate; dereferenced without a null check.
 *
 * @return ::AMDSMI_STATUS_NOT_SUPPORTED in every case for now.
 */
amdsmi_status_t smi_amdgpu_validate_ras_eeprom(amd::smi::AMDSmiGPUDevice* device) {
    if (!device->check_if_drm_is_supported()) {
        return AMDSMI_STATUS_NOT_SUPPORTED;
    }
    SMIGPUDEVICE_MUTEX(device->get_mutex())

    // TODO: need to expose the corresponding interface to validate the checksum of ras eeprom table.
    // The device index (GetDeviceIndex(device->get_gpu_path())) is not computed
    // here until that interface exists and actually needs it.
    // verify fail: return AMDSMI_STATUS_CORRUPTED_EEPROM
    return AMDSMI_STATUS_NOT_SUPPORTED;
}
|
||||
|
||||
amdsmi_status_t smi_amdgpu_get_ecc_error_count(amd::smi::AMDSmiGPUDevice* device, amdsmi_error_count_t *err_cnt) {
|
||||
if (!device->check_if_drm_is_supported()) {
|
||||
return AMDSMI_STATUS_NOT_SUPPORTED;
|
||||
|
||||
Ссылка в новой задаче
Block a user