[SWDEV-516013]-rocm-smi runtime status check fix (#28)

rocm-smi is not working in mGPU, Blocking DLM tests
Updates include:
 - Created check_runtime_status function to check whether any device reports a runtime status of "active".
 - Added warning to users that No AMD GPUs are available, check power status/control.
 - Added check for empty string coming from HWMON; if empty, returns unexpected data.

---------

Signed-off-by: Juan Castillo <juan.castillo@amd.com>

[ROCm/rocm_smi_lib commit: 2630bf0a8c]
This commit is contained in:
Castillo, Juan
2025-04-14 13:05:22 -05:00
zatwierdzone przez GitHub
rodzic 4d8e9cfa1d
commit 07c06318ad
2 zmienionych plików z 67 dodań i 5 usunięć
@@ -21,6 +21,7 @@ import _thread
import time
import multiprocessing
import trace
from os.path import exists
from io import StringIO
from time import ctime
from subprocess import check_output
@@ -3919,6 +3920,24 @@ def checkAmdGpus(deviceList):
return False
def check_runtime_status(base_path: str = "/sys/bus/pci/drivers/amdgpu") -> bool:
    """Report whether any device under ``base_path`` has a runtime status of "active".

    For every directory entry under ``base_path`` the file
    ``<entry>/power/runtime_status`` is read, when it exists, and its
    stripped content is compared against the string "active".

    Args:
        base_path: Directory containing one sub-directory (or symlink to a
            directory) per device. Defaults to the amdgpu PCI driver
            directory in sysfs.

    Returns:
        bool: True as soon as one device reports "active". False when no
        device does, which includes the cases where ``base_path`` is missing
        or unreadable and where no ``runtime_status`` file can be read.
    """
    try:
        devices = os.listdir(base_path)
    except OSError:
        # The driver directory is absent (e.g. amdgpu not loaded) or cannot be
        # listed: nothing can be reported as active, so do not crash the caller.
        return False

    for device in devices:
        device_path = os.path.join(base_path, device)
        if not os.path.isdir(device_path):
            # Plain files in the driver directory are not device entries.
            continue
        runtime_status_path = os.path.join(device_path, "power", "runtime_status")
        try:
            with open(runtime_status_path, 'r') as file:
                status = file.read().strip()
        except OSError:
            # Missing or unreadable status file: treat this device as not
            # active and keep scanning the remaining devices.
            continue
        if status == "active":
            return True
    return False
def component_str(component):
""" Returns the component String value
@@ -4485,7 +4504,8 @@ if __name__ == '__main__':
if not checkAmdGpus(deviceList):
logging.warning('No AMD GPUs specified')
if not check_runtime_status():
logging.warning('AMD GPUs visible, but data is inaccessible. Check power control/runtime_status\n')
if isConciseInfoRequested(args):
showAllConcise(deviceList)
if args.showhw:
+46 -4
Wyświetl plik
@@ -368,10 +368,31 @@ static rsmi_status_t get_dev_mon_value(amd::smi::MonitorTypes type,
return amd::smi::ErrnoToRsmiStatus(ret);
}
if (val_str.empty()) {
std::ostringstream ss;
ss << __PRETTY_FUNCTION__
<< " | ======= end ======= "
<< " | Fail "
<< " | Device #: " << dv_ind
<< " | Type: " << monitorTypesToString.at(type)
<< " | Cause: SYSFS read was empty"
<< " | Returning = "
<< getRSMIStatusString(RSMI_STATUS_UNEXPECTED_DATA) << " |";
LOG_INFO(ss);
return RSMI_STATUS_UNEXPECTED_DATA;
}
if (!amd::smi::IsInteger(val_str)) {
std::ostringstream ss;
ss << "Expected integer value from monitor, but got \"" << val_str << "\"";
LOG_ERROR(ss);
ss << __PRETTY_FUNCTION__
<< " | ======= end ======= "
<< " | Fail "
<< " | Device #: " << dv_ind
<< " | Type: " << monitorTypesToString.at(type)
<< " | Cause: Expected integer value from monitor, but got "<< val_str
<< " | Returning = "
<< getRSMIStatusString(RSMI_STATUS_UNEXPECTED_DATA) << " |";
LOG_INFO(ss);
return RSMI_STATUS_UNEXPECTED_DATA;
}
@@ -398,10 +419,31 @@ static rsmi_status_t get_dev_mon_value(amd::smi::MonitorTypes type,
return amd::smi::ErrnoToRsmiStatus(ret);
}
if (val_str.empty()) {
std::ostringstream ss;
ss << __PRETTY_FUNCTION__
<< " | ======= end ======= "
<< " | Fail "
<< " | Device #: " << dv_ind
<< " | Type: " << monitorTypesToString.at(type)
<< " | Cause: SYSFS read was empty"
<< " | Returning = "
<< getRSMIStatusString(RSMI_STATUS_UNEXPECTED_DATA) << " |";
LOG_INFO(ss);
return RSMI_STATUS_UNEXPECTED_DATA;
}
if (!amd::smi::IsInteger(val_str)) {
std::ostringstream ss;
ss << "Expected integer value from monitor, but got \"" << val_str << "\"";
LOG_ERROR(ss);
ss << __PRETTY_FUNCTION__
<< " | ======= end ======= "
<< " | Fail "
<< " | Device #: " << dv_ind
<< " | Type: " << monitorTypesToString.at(type)
<< " | Cause: Expected integer value from monitor, but got "<< val_str
<< " | Returning = "
<< getRSMIStatusString(RSMI_STATUS_UNEXPECTED_DATA) << " |";
LOG_INFO(ss);
return RSMI_STATUS_UNEXPECTED_DATA;
}