From 2630bf0a8c2ce64b76a6a258efadc523fbada768 Mon Sep 17 00:00:00 2001 From: "Castillo, Juan" Date: Mon, 14 Apr 2025 13:05:22 -0500 Subject: [PATCH] [SWDEV-516013]-rocm-smi runtime status check fix (#28) rocm-smi is not working in mGPU, Blocking DLM tests Updates include: - Creating check_runtime_status function to check for device status of active. - Added warning to users that No AMD GPUs are available, check power status/control. - Added check for empty string coming from HWMON, if empty, returns unexpected data. --------- Signed-off-by: Juan Castillo --- python_smi_tools/rocm_smi.py | 22 +++++++++++++++- src/rocm_smi.cc | 50 +++++++++++++++++++++++++++++++++--- 2 files changed, 67 insertions(+), 5 deletions(-) diff --git a/python_smi_tools/rocm_smi.py b/python_smi_tools/rocm_smi.py index 7ad404cf7d..ada3b8e98e 100755 --- a/python_smi_tools/rocm_smi.py +++ b/python_smi_tools/rocm_smi.py @@ -21,6 +21,7 @@ import _thread import time import multiprocessing import trace +from os.path import exists from io import StringIO from time import ctime from subprocess import check_output @@ -3919,6 +3920,24 @@ def checkAmdGpus(deviceList): return False +def check_runtime_status()->bool: + """Check the runtime status of all paths along /sys/bus/pci/drivers/amdgpu/*/power/runtime_status. + + Returns: + bool: True if any status is "active", False if any status is "unsupported". 
+ """ + base_path = "/sys/bus/pci/drivers/amdgpu" + for device in os.listdir(base_path): + if os.path.isdir(os.path.join(base_path, device)): + runtime_status_path = os.path.join(base_path, device, "power", "runtime_status") + if os.path.exists(runtime_status_path): + with open(runtime_status_path, 'r') as file: + status = file.read().strip() + if status == "active": + return True + return False + + def component_str(component): """ Returns the component String value @@ -4485,7 +4504,8 @@ if __name__ == '__main__': if not checkAmdGpus(deviceList): logging.warning('No AMD GPUs specified') - + if not check_runtime_status(): + logging.warning('AMD GPUs visible, but data is inaccessible. Check power control/runtime_status\n') if isConciseInfoRequested(args): showAllConcise(deviceList) if args.showhw: diff --git a/src/rocm_smi.cc b/src/rocm_smi.cc index 218e2af391..f9ff48cdfb 100755 --- a/src/rocm_smi.cc +++ b/src/rocm_smi.cc @@ -368,10 +368,31 @@ static rsmi_status_t get_dev_mon_value(amd::smi::MonitorTypes type, return amd::smi::ErrnoToRsmiStatus(ret); } + if (val_str.empty()) { + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << dv_ind + << " | Type: " << monitorTypesToString.at(type) + << " | Cause: SYSFS read was empty" + << " | Returning = " + << getRSMIStatusString(RSMI_STATUS_UNEXPECTED_DATA) << " |"; + LOG_INFO(ss); + return RSMI_STATUS_UNEXPECTED_DATA; + } + if (!amd::smi::IsInteger(val_str)) { std::ostringstream ss; - ss << "Expected integer value from monitor, but got \"" << val_str << "\""; - LOG_ERROR(ss); + ss << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << dv_ind + << " | Type: " << monitorTypesToString.at(type) + << " | Cause: Expected integer value from monitor, but got "<< val_str + << " | Returning = " + << getRSMIStatusString(RSMI_STATUS_UNEXPECTED_DATA) << " |"; + LOG_INFO(ss); return RSMI_STATUS_UNEXPECTED_DATA; } @@ -398,10 
+419,31 @@ static rsmi_status_t get_dev_mon_value(amd::smi::MonitorTypes type, return amd::smi::ErrnoToRsmiStatus(ret); } + if (val_str.empty()) { + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << dv_ind + << " | Type: " << monitorTypesToString.at(type) + << " | Cause: SYSFS read was empty" + << " | Returning = " + << getRSMIStatusString(RSMI_STATUS_UNEXPECTED_DATA) << " |"; + LOG_INFO(ss); + return RSMI_STATUS_UNEXPECTED_DATA; + } + if (!amd::smi::IsInteger(val_str)) { std::ostringstream ss; - ss << "Expected integer value from monitor, but got \"" << val_str << "\""; - LOG_ERROR(ss); + ss << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << dv_ind + << " | Type: " << monitorTypesToString.at(type) + << " | Cause: Expected integer value from monitor, but got "<< val_str + << " | Returning = " + << getRSMIStatusString(RSMI_STATUS_UNEXPECTED_DATA) << " |"; + LOG_INFO(ss); return RSMI_STATUS_UNEXPECTED_DATA; }