From dc400d916ea3dc3fc7b4fe07c4a2c2a82b6bfa77 Mon Sep 17 00:00:00 2001
From: "Meng, Li (Jassmine)"
Date: Wed, 8 Jan 2025 07:26:55 +0800
Subject: [PATCH] [SWDEV-230863] add two new interfaces for background health
check (#4)
1. Get the bad pages threshold of a processor.
2. Verify the checksum of the RAS EEPROM.
Signed-off-by: Meng Li
---
include/amd_smi/amdsmi.h | 40 +++++++++++++++++++++++
include/amd_smi/impl/amd_smi_utils.h | 2 ++
src/amd_smi/amd_smi.cc | 34 ++++++++++++++++++++
src/amd_smi/amd_smi_utils.cc | 48 ++++++++++++++++++++++++++++
4 files changed, 124 insertions(+)
diff --git a/include/amd_smi/amdsmi.h b/include/amd_smi/amdsmi.h
index eb41484e88..75da929b3c 100644
--- a/include/amd_smi/amdsmi.h
+++ b/include/amd_smi/amdsmi.h
@@ -260,6 +260,8 @@ typedef enum {
AMDSMI_STATUS_ARG_PTR_NULL = 53, //!< Parsed argument is invalid
AMDSMI_STATUS_AMDGPU_RESTART_ERR = 54, //!< AMDGPU restart failed
AMDSMI_STATUS_SETTING_UNAVAILABLE = 55, //!< Setting is not available
+ AMDSMI_STATUS_CORRUPTED_EEPROM = 56, //!< EEPROM is corrupted
+
// General errors
AMDSMI_STATUS_MAP_ERROR = 0xFFFFFFFE, //!< The internal library error did not map to a status code
AMDSMI_STATUS_UNKNOWN_ERROR = 0xFFFFFFFF, //!< An unknown error occurred
@@ -2640,6 +2642,44 @@ amdsmi_status_t
amdsmi_get_gpu_bad_page_info(amdsmi_processor_handle processor_handle, uint32_t *num_pages,
amdsmi_retired_page_record_t *info);
+
+/**
+ * @brief Get the bad page threshold of a processor. Not supported on virtual
+ * machine guests.
+ *
+ * @platform{gpu_bm_linux} @platform{host}
+ *
+ * @details This call will query the device @p processor_handle for its
+ * bad page count threshold and write it to the address given by @p threshold.
+ * @param[in] processor_handle a processor handle
+ * @param[out] threshold pointer to the variable that receives the bad page count threshold.
+ *
+ * @note This function requires root access
+ *
+ * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail
+ */
+amdsmi_status_t
+amdsmi_get_gpu_bad_page_threshold(amdsmi_processor_handle processor_handle, uint32_t *threshold);
+
+/**
+ * @brief Verify the checksum of the RAS EEPROM. Not supported on virtual
+ * machine guests.
+ *
+ * @platform{gpu_bm_linux} @platform{host}
+ *
+ * @details This call will verify the checksum of the RAS EEPROM of the
+ * device @p processor_handle.
+ * @param[in] processor_handle a processor handle
+ *
+ * @note This function requires root access
+ *
+ * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success,
+ * ::AMDSMI_STATUS_CORRUPTED_EEPROM if the device's EEPROM is corrupted,
+ * other status codes on failure
+ */
+amdsmi_status_t
+amdsmi_gpu_validate_ras_eeprom(amdsmi_processor_handle processor_handle);
+
/**
* @brief Returns RAS features info.
*
diff --git a/include/amd_smi/impl/amd_smi_utils.h b/include/amd_smi/impl/amd_smi_utils.h
index 2749db8f04..97b9fa2756 100644
--- a/include/amd_smi/impl/amd_smi_utils.h
+++ b/include/amd_smi/impl/amd_smi_utils.h
@@ -45,6 +45,8 @@ amdsmi_status_t smi_amdgpu_get_power_cap(amd::smi::AMDSmiGPUDevice* device, int
amdsmi_status_t smi_amdgpu_get_ranges(amd::smi::AMDSmiGPUDevice* device, amdsmi_clk_type_t domain, int *max_freq, int *min_freq, int *num_dpm, int *sleep_state_freq);
amdsmi_status_t smi_amdgpu_get_enabled_blocks(amd::smi::AMDSmiGPUDevice* device, uint64_t *enabled_blocks);
amdsmi_status_t smi_amdgpu_get_bad_page_info(amd::smi::AMDSmiGPUDevice* device, uint32_t *num_pages, amdsmi_retired_page_record_t *info);
+amdsmi_status_t smi_amdgpu_get_bad_page_threshold(amd::smi::AMDSmiGPUDevice* device, uint32_t *threshold);
+amdsmi_status_t smi_amdgpu_validate_ras_eeprom(amd::smi::AMDSmiGPUDevice* device);
amdsmi_status_t smi_amdgpu_get_ecc_error_count(amd::smi::AMDSmiGPUDevice* device, amdsmi_error_count_t *err_cnt);
amdsmi_status_t smi_amdgpu_get_driver_version(amd::smi::AMDSmiGPUDevice* device, int *length, char *version);
amdsmi_status_t smi_amdgpu_get_pcie_speed_from_pcie_type(uint16_t pcie_type, uint32_t *pcie_speed);
diff --git a/src/amd_smi/amd_smi.cc b/src/amd_smi/amd_smi.cc
index 7b1ba9ad13..69f049f0af 100644
--- a/src/amd_smi/amd_smi.cc
+++ b/src/amd_smi/amd_smi.cc
@@ -2354,6 +2354,40 @@ amdsmi_get_gpu_bad_page_info(amdsmi_processor_handle processor_handle, uint32_t
return AMDSMI_STATUS_SUCCESS;
}
+// Public entry point: reads the RAS bad page threshold of the GPU behind
+// processor_handle into *threshold.
+// Returns AMDSMI_STATUS_INVAL when threshold is null; otherwise forwards the
+// status of the handle lookup or of smi_amdgpu_get_bad_page_threshold().
+amdsmi_status_t
+amdsmi_get_gpu_bad_page_threshold(amdsmi_processor_handle processor_handle, uint32_t *threshold) {
+    AMDSMI_CHECK_INIT();
+
+    if (!threshold) {
+        return AMDSMI_STATUS_INVAL;
+    }
+
+    amd::smi::AMDSmiGPUDevice* dev = nullptr;
+    const amdsmi_status_t lookup = get_gpu_device_from_handle(processor_handle, &dev);
+    if (lookup != AMDSMI_STATUS_SUCCESS) {
+        return lookup;
+    }
+
+    // The helper's status is already the final result of this call.
+    return smi_amdgpu_get_bad_page_threshold(dev, threshold);
+}
+
+// Validates the RAS EEPROM of the GPU identified by processor_handle.
+amdsmi_status_t
+amdsmi_gpu_validate_ras_eeprom(amdsmi_processor_handle processor_handle) {
+    AMDSMI_CHECK_INIT();
+
+    amd::smi::AMDSmiGPUDevice* dev = nullptr;
+    const amdsmi_status_t lookup = get_gpu_device_from_handle(processor_handle, &dev);
+    // A failed handle lookup is reported as-is; otherwise defer to the helper.
+    return (lookup != AMDSMI_STATUS_SUCCESS) ? lookup
+                                             : smi_amdgpu_validate_ras_eeprom(dev);
+}
+
amdsmi_status_t amdsmi_get_gpu_ras_feature_info(
amdsmi_processor_handle processor_handle, amdsmi_ras_feature_t *ras_feature) {
AMDSMI_CHECK_INIT();
diff --git a/src/amd_smi/amd_smi_utils.cc b/src/amd_smi/amd_smi_utils.cc
index 08b224c6ef..26be69e8e3 100644
--- a/src/amd_smi/amd_smi_utils.cc
+++ b/src/amd_smi/amd_smi_utils.cc
@@ -433,6 +433,54 @@ amdsmi_status_t smi_amdgpu_get_bad_page_info(amd::smi::AMDSmiGPUDevice* device,
return AMDSMI_STATUS_SUCCESS;
}
+// Returns the trailing run of decimal digits of s as an unsigned index
+// (e.g. "card0" -> 0, "renderD128" -> 128). std::stoi throws when s does not
+// end in a digit or the number does not fit in an int.
+static uint32_t GetDeviceIndex(const std::string& s) {
+    // npos + 1 wraps to 0, so an all-digit string is parsed as a whole.
+    const size_t last_non_digit = s.find_last_not_of("0123456789");
+    return static_cast<uint32_t>(std::stoi(s.substr(last_non_digit + 1)));
+}
+
+// Reads the RAS bad page count threshold of `device` from debugfs into *threshold.
+// Returns NOT_SUPPORTED without DRM support or when the node cannot be opened,
+// API_FAILED when its first line is not a number, SUCCESS otherwise.
+amdsmi_status_t smi_amdgpu_get_bad_page_threshold(amd::smi::AMDSmiGPUDevice* device,
+                                                  uint32_t *threshold) {
+    if (!device->check_if_drm_is_supported()) {
+        return AMDSMI_STATUS_NOT_SUPPORTED;
+    }
+    SMIGPUDEVICE_MUTEX(device->get_mutex())
+
+    //TODO: Accessing the node requires root privileges, and its interface may need to be exposed in another path
+    uint32_t index = GetDeviceIndex(device->get_gpu_path());
+    std::string fullpath = "/sys/kernel/debug/dri/" + std::to_string(index) + std::string("/ras/bad_page_cnt_threshold");
+    std::ifstream fs(fullpath.c_str());
+    if (fs.fail()) {
+        return AMDSMI_STATUS_NOT_SUPPORTED;
+    }
+    std::string line;
+    getline(fs, line);
+    // sscanf() returns 0 (not a negative value) when the text is not numeric, so
+    // require exactly one conversion; "%u" matches the unsigned output type.
+    if (sscanf(line.c_str(), "%u", threshold) != 1) {
+        return AMDSMI_STATUS_API_FAILED;
+    }
+    return AMDSMI_STATUS_SUCCESS;  // fs is closed by its destructor
+}
+
+// Placeholder for validating the RAS EEPROM checksum of `device`.
+// No validation is performed yet: every path returns AMDSMI_STATUS_NOT_SUPPORTED.
+amdsmi_status_t smi_amdgpu_validate_ras_eeprom(amd::smi::AMDSmiGPUDevice* device) {
+    if (!device->check_if_drm_is_supported()) {
+        return AMDSMI_STATUS_NOT_SUPPORTED;
+    }
+    SMIGPUDEVICE_MUTEX(device->get_mutex())
+    //TODO: need to expose the corresponding interface to validate the checksum of ras eeprom table.
+    //verify fail: return AMDSMI_STATUS_CORRUPTED_EEPROM
+    return AMDSMI_STATUS_NOT_SUPPORTED;
+}
+
amdsmi_status_t smi_amdgpu_get_ecc_error_count(amd::smi::AMDSmiGPUDevice* device, amdsmi_error_count_t *err_cnt) {
if (!device->check_if_drm_is_supported()) {
return AMDSMI_STATUS_NOT_SUPPORTED;