[SWDEV-556483] Fix runtime PM suspend causing test failures (#1931)

Added runtime PM detection and DRM ioctl-based device wake
to handle GPUs in BACO state. Modified tests to wake
suspended devices before reading sysfs files.

---------

Signed-off-by: Bindhiya Kanangot Balakrishnan <Bindhiya.KanangotBalakrishnan@amd.com>
This commit is contained in:
Bindhiya Kanangot Balakrishnan
2025-11-25 13:36:45 -06:00
zatwierdzone przez GitHub
rodzic 47e53ec6f3
commit e8c3b22734
6 zmienionych plików z 134 dodań i 0 usunięć
+10
Wyświetl plik
@@ -4,6 +4,16 @@ Full documentation for rocm_smi_lib is available at [https://rocm.docs.amd.com/]
***All information listed below is for reference and subject to change.***
## rocm_smi_lib for ROCm 7.2.0
### Added
- **Added runtime power management detection and device wake support**.
- Implemented a DRM ioctl-based device wake mechanism to handle GPUs in the BACO (Bus Active, Chip Off) state.
- Added `check_runtime_pm_status()` to detect runtime PM suspended devices.
- Added `wake_device()` to wake devices using a DRM ioctl.
- Modified frequency and power tests to automatically wake suspended devices before reading sysfs files to prevent test failures caused by reading from devices in low-power states.
## rocm_smi_lib for ROCm 7.0.0
### Added
@@ -147,6 +147,8 @@ int countDigit(uint64_t n);
std::string find_file_in_folder(const std::string& folder,
const std::string& regex);
uint64_t get_multiplier_from_char(char units_char);
rsmi_status_t check_runtime_pm_status(uint32_t dv_ind, bool *is_suspended);
rsmi_status_t wake_device(uint32_t dv_ind);
template <typename T>
std::string print_int_as_hex(T i, bool showHexNotation = true,
int overloadBitSize = 0) {
@@ -49,6 +49,7 @@
#include <dirent.h>
#include <glob.h>
#include <sys/utsname.h>
#include <sys/ioctl.h>
#include <dlfcn.h>
#include <algorithm>
@@ -64,6 +65,7 @@
#include <string>
#include <vector>
#include <cmath>
#include <fcntl.h>
#include "rocm_smi/rocm_smi.h"
#include "rocm_smi/rocm_smi_utils.h"
@@ -71,6 +73,7 @@
#include "rocm_smi/rocm_smi_main.h"
#include "rocm_smi/rocm_smi_device.h"
#include "rocm_smi/rocm_smi_logger.h"
#include <libdrm/amdgpu_drm.h>
namespace amd {
namespace smi {
@@ -1317,5 +1320,88 @@ uint64_t get_multiplier_from_char(char units_char) {
return multiplier;
}
/**
 * @brief Report whether a device is currently runtime-PM suspended.
 *
 * Reads `power/runtime_enabled` and `power/runtime_status` below
 * `/sys/class/drm/card<N>/device/`, where N is `dev->index()`.
 *
 * The check is best-effort: if either sysfs file cannot be read, or runtime
 * PM is reported as "disabled" or "forbidden", the device is treated as not
 * suspended and RSMI_STATUS_SUCCESS is still returned.
 *
 * @param dv_ind        Device index used to look up the device.
 * @param is_suspended  Out-parameter; set to true only when runtime_status
 *                      contains "suspended". Must not be null.
 *
 * @return RSMI_STATUS_INVALID_ARGS if @p is_suspended is null,
 *         RSMI_STATUS_SUCCESS otherwise. GET_DEV_FROM_INDX may also return
 *         early on its own (presumably for an out-of-range @p dv_ind - see
 *         the macro definition).
 */
rsmi_status_t check_runtime_pm_status(uint32_t dv_ind, bool *is_suspended) {
  // Every path below writes through the out-parameter, so reject null
  // up front instead of crashing on the first store.
  if (is_suspended == nullptr) {
    return RSMI_STATUS_INVALID_ARGS;
  }

  GET_DEV_FROM_INDX

  // Default for all best-effort fallback paths below.
  *is_suspended = false;

  std::string runtime_status = "/sys/class/drm/card" +
      std::to_string(dev->index()) + "/device/power/runtime_status";
  std::string runtime_enabled = "/sys/class/drm/card" +
      std::to_string(dev->index()) + "/device/power/runtime_enabled";

  std::string status;
  std::string enabled;

  // Unreadable runtime_enabled: report "not suspended".
  int ret = amd::smi::ReadSysfsStr(runtime_enabled, &enabled);
  if (ret != 0) {
    return RSMI_STATUS_SUCCESS;
  }

  // When runtime PM is disabled or forbidden, runtime_status is not
  // consulted and the device is reported as not suspended.
  if (enabled.find("disabled") != std::string::npos ||
      enabled.find("forbidden") != std::string::npos) {
    return RSMI_STATUS_SUCCESS;
  }

  // Unreadable runtime_status: report "not suspended".
  ret = amd::smi::ReadSysfsStr(runtime_status, &status);
  if (ret != 0) {
    return RSMI_STATUS_SUCCESS;
  }

  *is_suspended = (status.find("suspended") != std::string::npos);

  std::ostringstream ss;
  ss << __PRETTY_FUNCTION__ << " | device index: " << dv_ind
     << " | runtime_enabled: " << enabled
     << " | runtime_status: " << status
     << " | is_suspended: " << *is_suspended;
  LOG_DEBUG(ss);

  return RSMI_STATUS_SUCCESS;
}
/**
 * @brief Wake a device from runtime suspend by issuing a DRM ioctl.
 *
 * Finds the `renderD<N>` entry next to the device's `card<N>` sysfs
 * directory, opens the matching `/dev/dri/renderD<N>` node and issues
 * DRM_IOCTL_AMDGPU_INFO with a zero-initialized request.
 *
 * @param dv_ind  Device index used to look up the device.
 *
 * @return RSMI_STATUS_SUCCESS if the ioctl succeeded or failed with EINVAL,
 *         RSMI_STATUS_NOT_FOUND if no renderD entry was found,
 *         RSMI_STATUS_FILE_ERROR if the node could not be opened or the
 *         ioctl failed with any other error. GET_DEV_FROM_INDX may also
 *         return early on its own (presumably for an out-of-range
 *         @p dv_ind - see the macro definition).
 */
rsmi_status_t wake_device(uint32_t dv_ind) {
  GET_DEV_FROM_INDX

  std::ostringstream ss;

  // Search the parent of card<N> for the render node name.
  // NOTE(review): presumably relies on card<N> being a symlink into the
  // device's own drm/ directory, so that "card<N>/../" lists only that
  // device's card and renderD entries - confirm.
  const std::string regex("renderD([0-9]+)");
  const std::string renderD_folder = "/sys/class/drm/card" +
      std::to_string(dev->index()) + "/../";
  std::string render_name =
      amd::smi::find_file_in_folder(renderD_folder, regex);

  if (render_name.empty()) {
    ss << __PRETTY_FUNCTION__ << " | Failed to find renderD device for card"
       << dev->index();
    LOG_ERROR(ss);
    return RSMI_STATUS_NOT_FOUND;
  }

  std::string render_path = "/dev/dri/" + render_name;

  // Open the DRM device node
  int fd = open(render_path.c_str(), O_RDWR | O_CLOEXEC);
  if (fd < 0) {
    // Capture errno before any further call can overwrite it.
    const int open_errno = errno;
    ss << __PRETTY_FUNCTION__ << " | Failed to open DRM device: "
       << render_path << " | error: " << std::strerror(open_errno);
    LOG_ERROR(ss);
    return RSMI_STATUS_FILE_ERROR;
  }

  struct drm_amdgpu_info request = {};

  // ioctl to wake the device from runtime suspend
  const int ret = ioctl(fd, DRM_IOCTL_AMDGPU_INFO, &request);
  // Save errno now: close() below may change it, which would corrupt both
  // the EINVAL check and the logged error text.
  const int ioctl_errno = errno;
  close(fd);

  // EINVAL is tolerated: the request is left empty on purpose, so the
  // driver may reject it; the call itself is what is relied on to wake
  // the device.
  if (ret < 0 && ioctl_errno != EINVAL) {
    ss << __PRETTY_FUNCTION__ << " | ioctl failed | device index: " << dv_ind
       << " | error: " << std::strerror(ioctl_errno);
    LOG_ERROR(ss);
    return RSMI_STATUS_FILE_ERROR;
  }

  ss << __PRETTY_FUNCTION__ << " | Successfully woke device using DRM ioctl"
     << " | device index: " << dv_ind;
  LOG_INFO(ss);

  return RSMI_STATUS_SUCCESS;
}
} // namespace smi
} // namespace amd
@@ -54,6 +54,7 @@
#include "gtest/gtest.h"
#include "rocm_smi/rocm_smi.h"
#include "rocm_smi/rocm_smi_utils.h"
#include "rocm_smi_test/functional/frequencies_read_write.h"
#include "rocm_smi_test/test_common.h"
@@ -104,6 +105,17 @@ void TestFrequenciesReadWrite::Run(void) {
for (uint32_t dv_ind = 0; dv_ind < num_monitor_devs(); ++dv_ind) {
PrintDeviceHeader(dv_ind);
// Check and wake the device in runtime suspend
bool is_suspended = false;
ret = amd::smi::check_runtime_pm_status(dv_ind, &is_suspended);
if (ret == RSMI_STATUS_SUCCESS && is_suspended) {
ret = amd::smi::wake_device(dv_ind);
if (ret != RSMI_STATUS_SUCCESS) {
std::cout << "Failed to wake device, cannot read clock frequencies" << std::endl;
CHK_ERR_ASRT(ret)
}
}
for (uint32_t clk = RSMI_CLK_TYPE_FIRST; clk <= RSMI_CLK_TYPE_LAST; ++clk) {
rsmi_clk = (rsmi_clk_type)clk;
@@ -101,6 +101,17 @@ void TestPowerRead::Run(void) {
for (uint32_t i = 0; i < num_monitor_devs(); ++i) {
PrintDeviceHeader(i);
// Check and wake the device in runtime suspend
bool is_suspended = false;
err = amd::smi::check_runtime_pm_status(i, &is_suspended);
if (err == RSMI_STATUS_SUCCESS && is_suspended) {
err = amd::smi::wake_device(i);
if (err != RSMI_STATUS_SUCCESS) {
std::cout << "Failed to wake device, cannot read clock frequencies" << std::endl;
CHK_ERR_ASRT(err)
}
}
err = rsmi_dev_power_cap_get(i, 0, &val_ui64);
if (err == RSMI_STATUS_NOT_SUPPORTED) {
std::cout << "\t**Power Cap not supported on this device." << std::endl;
@@ -53,6 +53,7 @@
#include "gtest/gtest.h"
#include "rocm_smi/rocm_smi.h"
#include "rocm_smi/rocm_smi_utils.h"
#include "rocm_smi_test/functional/power_read_write.h"
#include "rocm_smi_test/test_common.h"
@@ -122,6 +123,18 @@ void TestPowerReadWrite::Run(void) {
for (uint32_t dv_ind = 0; dv_ind < num_monitor_devs(); ++dv_ind) {
PrintDeviceHeader(dv_ind);
// Check and wake the device in runtime suspend
bool is_suspended = false;
ret = amd::smi::check_runtime_pm_status(dv_ind, &is_suspended);
if (ret == RSMI_STATUS_SUCCESS && is_suspended) {
ret = amd::smi::wake_device(dv_ind);
if (ret != RSMI_STATUS_SUCCESS) {
std::cout << "Failed to wake device, cannot read clock frequencies"
<< std::endl;
CHK_ERR_ASRT(ret)
}
}
ret = rsmi_dev_power_profile_presets_get(dv_ind, 0, &status);
if (ret == RSMI_STATUS_NOT_SUPPORTED) {
std::cout <<