diff --git a/projects/rocm-smi-lib/CHANGELOG.md b/projects/rocm-smi-lib/CHANGELOG.md index ab2eac0cd6..5da2efdbb0 100644 --- a/projects/rocm-smi-lib/CHANGELOG.md +++ b/projects/rocm-smi-lib/CHANGELOG.md @@ -4,6 +4,16 @@ Full documentation for rocm_smi_lib is available at [https://rocm.docs.amd.com/] ***All information listed below is for reference and subject to change.*** +## rocm_smi_lib for ROCm 7.2.0 + +### Added + +- **Added runtime power management detection and device wake support**. + - Implemented DRM ioctl-based device wake mechanism to handle GPUs in BACO state. + - Added `check_runtime_pm_status()` to detect runtime PM suspended devices. + - Added `wake_device()` to wake devices using DRM ioctl. + - Modified frequency and power tests to automatically wake suspended devices before reading sysfs files to prevent test failures caused by reading from devices in low-power states. + ## rocm_smi_lib for ROCm 7.0.0 ### Added diff --git a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_utils.h b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_utils.h index 42d009d876..06af68370f 100644 --- a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_utils.h +++ b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_utils.h @@ -147,6 +147,8 @@ int countDigit(uint64_t n); std::string find_file_in_folder(const std::string& folder, const std::string& regex); uint64_t get_multiplier_from_char(char units_char); +rsmi_status_t check_runtime_pm_status(uint32_t dv_ind, bool *is_suspended); +rsmi_status_t wake_device(uint32_t dv_ind); template std::string print_int_as_hex(T i, bool showHexNotation = true, int overloadBitSize = 0) { diff --git a/projects/rocm-smi-lib/src/rocm_smi_utils.cc b/projects/rocm-smi-lib/src/rocm_smi_utils.cc index 0800f081c1..b6ff53a428 100644 --- a/projects/rocm-smi-lib/src/rocm_smi_utils.cc +++ b/projects/rocm-smi-lib/src/rocm_smi_utils.cc @@ -49,6 +49,7 @@ #include #include #include +#include #include #include @@ -64,6 +65,7 @@ #include #include #include 
+#include
 
 #include "rocm_smi/rocm_smi.h"
 #include "rocm_smi/rocm_smi_utils.h"
@@ -71,6 +73,7 @@
 #include "rocm_smi/rocm_smi_main.h"
 #include "rocm_smi/rocm_smi_device.h"
 #include "rocm_smi/rocm_smi_logger.h"
+#include
 
 namespace amd {
 namespace smi {
@@ -1317,5 +1320,116 @@ uint64_t get_multiplier_from_char(char units_char) {
   return multiplier;
 }
 
+// Report whether the GPU at dv_ind is currently runtime-PM suspended.
+//
+// Reads power/runtime_enabled and power/runtime_status under
+// /sys/class/drm/card<index>/device. *is_suspended is set to true only when
+// runtime_enabled contains neither "disabled" nor "forbidden" and
+// runtime_status contains "suspended". A sysfs file that cannot be read is
+// treated as "not suspended" and the call still returns RSMI_STATUS_SUCCESS.
+//
+// Returns RSMI_STATUS_INVALID_ARGS when is_suspended is null.
+rsmi_status_t check_runtime_pm_status(uint32_t dv_ind, bool *is_suspended) {
+  if (is_suspended == nullptr) {
+    return RSMI_STATUS_INVALID_ARGS;
+  }
+  GET_DEV_FROM_INDX
+
+  std::string runtime_status = "/sys/class/drm/card" +
+      std::to_string(dev->index()) + "/device/power/runtime_status";
+  std::string runtime_enabled = "/sys/class/drm/card" +
+      std::to_string(dev->index()) + "/device/power/runtime_enabled";
+
+  std::string status;
+  std::string enabled;
+
+  // Default answer for every early exit below.
+  *is_suspended = false;
+
+  int ret = amd::smi::ReadSysfsStr(runtime_enabled, &enabled);
+  if (ret != 0) {
+    return RSMI_STATUS_SUCCESS;
+  }
+  if (enabled.find("disabled") != std::string::npos ||
+      enabled.find("forbidden") != std::string::npos) {
+    return RSMI_STATUS_SUCCESS;
+  }
+
+  ret = amd::smi::ReadSysfsStr(runtime_status, &status);
+  if (ret != 0) {
+    return RSMI_STATUS_SUCCESS;
+  }
+  *is_suspended = (status.find("suspended") != std::string::npos);
+
+  std::ostringstream ss;
+  ss << __PRETTY_FUNCTION__ << " | device index: " << dv_ind
+     << " | runtime_enabled: " << enabled
+     << " | runtime_status: " << status
+     << " | is_suspended: " << *is_suspended;
+  LOG_DEBUG(ss);
+
+  return RSMI_STATUS_SUCCESS;
+}
+
+// Wake the GPU at dv_ind from runtime suspend using a DRM ioctl.
+//
+// Searches /sys/class/drm/card<index>/../ for an entry matching
+// renderD([0-9]+), opens /dev/dri/<that entry>, and issues
+// DRM_IOCTL_AMDGPU_INFO with a zeroed request. An EINVAL result from the
+// ioctl is not treated as a failure (NOTE(review): presumably the zeroed
+// request is only rejected after the driver has resumed the device --
+// confirm against the amdgpu driver).
+//
+// Returns RSMI_STATUS_NOT_FOUND when no renderD entry is found,
+// RSMI_STATUS_FILE_ERROR when the node cannot be opened or the ioctl fails
+// with an errno other than EINVAL, and RSMI_STATUS_SUCCESS otherwise.
+rsmi_status_t wake_device(uint32_t dv_ind) {
+  GET_DEV_FROM_INDX
+
+  std::ostringstream ss;
+
+  const std::string regex("renderD([0-9]+)");
+  const std::string renderD_folder = "/sys/class/drm/card" +
+      std::to_string(dev->index()) + "/../";
+  std::string render_name = amd::smi::find_file_in_folder(renderD_folder, regex);
+  if (render_name.empty()) {
+    ss << __PRETTY_FUNCTION__ << " | Failed to find renderD device for card"
+       << dev->index();
+    LOG_ERROR(ss);
+    return RSMI_STATUS_NOT_FOUND;
+  }
+  std::string render_path = "/dev/dri/" + render_name;
+
+  // Open the DRM device node
+  int fd = open(render_path.c_str(), O_RDWR | O_CLOEXEC);
+  if (fd < 0) {
+    // Snapshot errno before the stream insertions below can disturb it.
+    const int open_errno = errno;
+    ss << __PRETTY_FUNCTION__ << " | Failed to open DRM device: " << render_path
+       << " | error: " << std::strerror(open_errno);
+    LOG_ERROR(ss);
+    return RSMI_STATUS_FILE_ERROR;
+  }
+
+  struct drm_amdgpu_info request = {};
+  // ioctl to wake the device from runtime suspend
+  const int ret = ioctl(fd, DRM_IOCTL_AMDGPU_INFO, &request);
+  // Snapshot errno now: close() below may overwrite it, which would make the
+  // EINVAL check and the logged error refer to the wrong call.
+  const int ioctl_errno = errno;
+  close(fd);
+
+  if (ret < 0 && ioctl_errno != EINVAL) {
+    ss << __PRETTY_FUNCTION__ << " | ioctl failed | device index: " << dv_ind
+       << " | error: " << std::strerror(ioctl_errno);
+    LOG_ERROR(ss);
+    return RSMI_STATUS_FILE_ERROR;
+  }
+  ss << __PRETTY_FUNCTION__ << " | Successfully woke device using DRM ioctl"
+     << " | device index: " << dv_ind;
+  LOG_INFO(ss);
+
+  return RSMI_STATUS_SUCCESS;
+}
+
 }  // namespace smi
 }  // namespace amd
diff --git a/projects/rocm-smi-lib/tests/rocm_smi_test/functional/frequencies_read_write.cc b/projects/rocm-smi-lib/tests/rocm_smi_test/functional/frequencies_read_write.cc
index d43a1c0fa0..f1e5c287f9 100644
--- a/projects/rocm-smi-lib/tests/rocm_smi_test/functional/frequencies_read_write.cc
+++ b/projects/rocm-smi-lib/tests/rocm_smi_test/functional/frequencies_read_write.cc
@@ -54,6 +54,7 @@
 
 #include "gtest/gtest.h"
 #include "rocm_smi/rocm_smi.h"
+#include "rocm_smi/rocm_smi_utils.h"
 #include "rocm_smi_test/functional/frequencies_read_write.h"
 #include "rocm_smi_test/test_common.h"
 
@@ -104,6 +105,17 @@ void TestFrequenciesReadWrite::Run(void) {
   for (uint32_t dv_ind = 0; dv_ind < num_monitor_devs(); ++dv_ind) {
     PrintDeviceHeader(dv_ind);
 
+    // Check and wake the device in runtime suspend
+    bool is_suspended = false;
+    ret = amd::smi::check_runtime_pm_status(dv_ind, &is_suspended);
+    if (ret == RSMI_STATUS_SUCCESS && is_suspended) {
+      ret = amd::smi::wake_device(dv_ind);
+      if (ret != RSMI_STATUS_SUCCESS)
{
+        std::cout << "Failed to wake device, cannot read clock frequencies" << std::endl;
+        CHK_ERR_ASRT(ret)
+      }
+    }
+
     for (uint32_t clk = RSMI_CLK_TYPE_FIRST; clk <= RSMI_CLK_TYPE_LAST; ++clk) {
       rsmi_clk = (rsmi_clk_type)clk;
 
diff --git a/projects/rocm-smi-lib/tests/rocm_smi_test/functional/power_read.cc b/projects/rocm-smi-lib/tests/rocm_smi_test/functional/power_read.cc
index 243ecc95f5..93398fe424 100644
--- a/projects/rocm-smi-lib/tests/rocm_smi_test/functional/power_read.cc
+++ b/projects/rocm-smi-lib/tests/rocm_smi_test/functional/power_read.cc
@@ -101,6 +101,17 @@ void TestPowerRead::Run(void) {
   for (uint32_t i = 0; i < num_monitor_devs(); ++i) {
     PrintDeviceHeader(i);
 
+    // Check and wake the device in runtime suspend
+    bool is_suspended = false;
+    err = amd::smi::check_runtime_pm_status(i, &is_suspended);
+    if (err == RSMI_STATUS_SUCCESS && is_suspended) {
+      err = amd::smi::wake_device(i);
+      if (err != RSMI_STATUS_SUCCESS) {
+        std::cout << "Failed to wake device, cannot read power data" << std::endl;
+        CHK_ERR_ASRT(err)
+      }
+    }
+
     err = rsmi_dev_power_cap_get(i, 0, &val_ui64);
     if (err == RSMI_STATUS_NOT_SUPPORTED) {
       std::cout << "\t**Power Cap not supported on this device." << std::endl;
diff --git a/projects/rocm-smi-lib/tests/rocm_smi_test/functional/power_read_write.cc b/projects/rocm-smi-lib/tests/rocm_smi_test/functional/power_read_write.cc
index ac4dc2ff40..554a72cfd1 100644
--- a/projects/rocm-smi-lib/tests/rocm_smi_test/functional/power_read_write.cc
+++ b/projects/rocm-smi-lib/tests/rocm_smi_test/functional/power_read_write.cc
@@ -53,6 +53,7 @@
 
 #include "gtest/gtest.h"
 #include "rocm_smi/rocm_smi.h"
+#include "rocm_smi/rocm_smi_utils.h"
 #include "rocm_smi_test/functional/power_read_write.h"
 #include "rocm_smi_test/test_common.h"
 
@@ -122,6 +123,18 @@ void TestPowerReadWrite::Run(void) {
   for (uint32_t dv_ind = 0; dv_ind < num_monitor_devs(); ++dv_ind) {
     PrintDeviceHeader(dv_ind);
 
+    // Check and wake the device in runtime suspend
+    bool is_suspended = false;
+    ret = amd::smi::check_runtime_pm_status(dv_ind, &is_suspended);
+    if (ret == RSMI_STATUS_SUCCESS && is_suspended) {
+      ret = amd::smi::wake_device(dv_ind);
+      if (ret != RSMI_STATUS_SUCCESS) {
+        std::cout << "Failed to wake device, cannot read/write power settings"
+                  << std::endl;
+        CHK_ERR_ASRT(ret)
+      }
+    }
+
     ret = rsmi_dev_power_profile_presets_get(dv_ind, 0, &status);
     if (ret == RSMI_STATUS_NOT_SUPPORTED) {
       std::cout <<