[SWDEV-516013]-rocm-smi runtime status check fix (#28)
rocm-smi is not working in mGPU, Blocking DLM tests
Updates include:
- Created check_runtime_status function to check whether any device has a runtime status of active.
- Added warning to users that No AMD GPUs are available, check power status/control.
- Added check for empty string coming from HWMON; if empty, returns unexpected data.
---------
Signed-off-by: Juan Castillo <juan.castillo@amd.com>
[ROCm/rocm_smi_lib commit: 2630bf0a8c]
This commit is contained in:
zatwierdzone przez
GitHub
rodzic
4d8e9cfa1d
commit
07c06318ad
@@ -21,6 +21,7 @@ import _thread
|
||||
import time
|
||||
import multiprocessing
|
||||
import trace
|
||||
from os.path import exists
|
||||
from io import StringIO
|
||||
from time import ctime
|
||||
from subprocess import check_output
|
||||
@@ -3919,6 +3920,24 @@ def checkAmdGpus(deviceList):
|
||||
return False
|
||||
|
||||
|
||||
def check_runtime_status(base_path: str = "/sys/bus/pci/drivers/amdgpu") -> bool:
    """Report whether at least one device under the amdgpu driver is runtime-active.

    Every entry of ``base_path`` is probed for a ``power/runtime_status``
    file; the file's stripped content is compared against ``"active"``.

    Args:
        base_path: Directory whose entries are probed. Defaults to the
            amdgpu PCI driver directory in sysfs.

    Returns:
        bool: True as soon as one ``runtime_status`` file reads "active".
        False otherwise -- no entry is active, an entry has no readable
        ``runtime_status`` file, or ``base_path`` itself cannot be listed.
    """
    try:
        devices = os.listdir(base_path)
    except OSError:
        # Driver directory missing or unreadable (e.g. amdgpu not loaded):
        # there is nothing that could be active, so report False instead of
        # letting the exception abort the caller.
        return False

    for device in devices:
        runtime_status_path = os.path.join(base_path, device, "power", "runtime_status")
        try:
            # Opening directly (rather than isdir()/exists() followed by
            # open()) also covers entries that are not directories, files
            # that vanish between the check and the read, and permission
            # errors -- all of which simply mean "not known to be active".
            with open(runtime_status_path, 'r') as file:
                status = file.read().strip()
        except OSError:
            continue
        if status == "active":
            return True
    return False
|
||||
|
||||
|
||||
def component_str(component):
|
||||
""" Returns the component String value
|
||||
|
||||
@@ -4485,7 +4504,8 @@ if __name__ == '__main__':
|
||||
|
||||
if not checkAmdGpus(deviceList):
|
||||
logging.warning('No AMD GPUs specified')
|
||||
|
||||
if not check_runtime_status():
|
||||
logging.warning('AMD GPUs visible, but data is inaccessible. Check power control/runtime_status\n')
|
||||
if isConciseInfoRequested(args):
|
||||
showAllConcise(deviceList)
|
||||
if args.showhw:
|
||||
|
||||
@@ -368,10 +368,31 @@ static rsmi_status_t get_dev_mon_value(amd::smi::MonitorTypes type,
|
||||
return amd::smi::ErrnoToRsmiStatus(ret);
|
||||
}
|
||||
|
||||
if (val_str.empty()) {
|
||||
std::ostringstream ss;
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | ======= end ======= "
|
||||
<< " | Fail "
|
||||
<< " | Device #: " << dv_ind
|
||||
<< " | Type: " << monitorTypesToString.at(type)
|
||||
<< " | Cause: SYSFS read was empty"
|
||||
<< " | Returning = "
|
||||
<< getRSMIStatusString(RSMI_STATUS_UNEXPECTED_DATA) << " |";
|
||||
LOG_INFO(ss);
|
||||
return RSMI_STATUS_UNEXPECTED_DATA;
|
||||
}
|
||||
|
||||
if (!amd::smi::IsInteger(val_str)) {
|
||||
std::ostringstream ss;
|
||||
ss << "Expected integer value from monitor, but got \"" << val_str << "\"";
|
||||
LOG_ERROR(ss);
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | ======= end ======= "
|
||||
<< " | Fail "
|
||||
<< " | Device #: " << dv_ind
|
||||
<< " | Type: " << monitorTypesToString.at(type)
|
||||
<< " | Cause: Expected integer value from monitor, but got "<< val_str
|
||||
<< " | Returning = "
|
||||
<< getRSMIStatusString(RSMI_STATUS_UNEXPECTED_DATA) << " |";
|
||||
LOG_INFO(ss);
|
||||
return RSMI_STATUS_UNEXPECTED_DATA;
|
||||
}
|
||||
|
||||
@@ -398,10 +419,31 @@ static rsmi_status_t get_dev_mon_value(amd::smi::MonitorTypes type,
|
||||
return amd::smi::ErrnoToRsmiStatus(ret);
|
||||
}
|
||||
|
||||
if (val_str.empty()) {
|
||||
std::ostringstream ss;
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | ======= end ======= "
|
||||
<< " | Fail "
|
||||
<< " | Device #: " << dv_ind
|
||||
<< " | Type: " << monitorTypesToString.at(type)
|
||||
<< " | Cause: SYSFS read was empty"
|
||||
<< " | Returning = "
|
||||
<< getRSMIStatusString(RSMI_STATUS_UNEXPECTED_DATA) << " |";
|
||||
LOG_INFO(ss);
|
||||
return RSMI_STATUS_UNEXPECTED_DATA;
|
||||
}
|
||||
|
||||
if (!amd::smi::IsInteger(val_str)) {
|
||||
std::ostringstream ss;
|
||||
ss << "Expected integer value from monitor, but got \"" << val_str << "\"";
|
||||
LOG_ERROR(ss);
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | ======= end ======= "
|
||||
<< " | Fail "
|
||||
<< " | Device #: " << dv_ind
|
||||
<< " | Type: " << monitorTypesToString.at(type)
|
||||
<< " | Cause: Expected integer value from monitor, but got "<< val_str
|
||||
<< " | Returning = "
|
||||
<< getRSMIStatusString(RSMI_STATUS_UNEXPECTED_DATA) << " |";
|
||||
LOG_INFO(ss);
|
||||
return RSMI_STATUS_UNEXPECTED_DATA;
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user