ROCm SMI Python CLI: --rasinject partial support

This implementation is copied directly from the previous rocm_smi.py
script. This feature is experimental and will be updated or removed in
future releases.

Signed-off-by: Elena Saknovitch
Change-Id: I5cd38266946302bc4123aeafaa825e13f704235e


[ROCm/rocm_smi_lib commit: 4117719edd]
This commit is contained in:
Elena Sakhnovitch
2020-10-22 17:12:32 -04:00
committad av Elena Sakhnovitch
förälder bbbdd0cb2c
incheckning c17f9e05e1
2 ändrade filer med 90 tillägg och 11 borttagningar
@@ -889,6 +889,80 @@ def resetGpu(device):
printLogSpacer()
def isRasControlAvailable(device):
    """ Check if RAS control is available for a specified device.

    Parameters:
    device -- DRM device identifier

    Returns True when the device exists and its debugfs 'ras_ctrl' file is
    present. Otherwise a warning is logged and False is returned.
    """
    path = os.path.join('/sys/kernel/debug/dri', 'card%d' % device, 'device', 'ras_ctrl')
    if not doesDeviceExist(device) or not os.path.isfile(path):
        # Pass the device id so the GPU[%s] placeholder is actually filled in
        logging.warning('GPU[%s]\t: RAS control is not available', device)
        return False
    return True
def setRas(deviceList, rasAction, rasBlock, rasType):
    """ Perform a RAS action on the devices

    Parameters:
    deviceList -- List of DRM devices (can be a single-item list)
    rasAction -- [enable|disable|inject] RAS Action to perform
    rasBlock -- [$validRasBlocks] RAS block
    rasType -- [ce|ue] Error type to enable/disable

    The arguments are validated against validRasActions, validRasBlocks and
    validRasTypes; on an invalid argument a message is printed and nothing is
    written. For every device whose 'ras_ctrl' debugfs file is available, the
    command '<rasAction> <rasBlock> <rasType>' is written to that file.
    RETCODE is set to 1 if a write fails.
    """
    global RETCODE
    printLog(None, "This is experimental feature, use 'amdgpuras' tool for ras error manipulations for newer vbios")

    # Shared first line of every validation failure report
    failMsg = 'Unable to perform RAS command %s on block %s for type %s' % (rasAction, rasBlock, rasType)
    if rasAction not in validRasActions:
        printLog(None, failMsg, None)
        logging.debug('Action %s is not a valid RAS command' % rasAction)
        return
    if rasBlock not in validRasBlocks:
        printLog(None, failMsg, None)
        printLog(None, 'Block %s is not a valid RAS block' % rasBlock)
        return
    if rasType not in validRasTypes:
        printLog(None, failMsg, None)
        # Report the offending memory type (the action was shown here before)
        printLog(None, 'Memory error type %s is not a valid RAS memory type' % rasType)
        return

    printLogSpacer()
    # NOTE PSP FW doesn't support enabling disabled counters yet
    for device in deviceList:
        if not isRasControlAvailable(device):
            continue
        rasFilePath = os.path.join('/sys/kernel/debug/dri', 'card%d' % device, 'device', 'ras_ctrl')
        rasCmd = '%s %s %s' % (rasAction, rasBlock, rasType)

        # writeToSysfs analog to old cli
        if not os.path.isfile(rasFilePath):
            # The file vanished between the availability check and now
            printLog(None, 'Unable to write to sysfs file', None)
            logging.debug('%s does not exist', rasFilePath)
            return False
        try:
            logging.debug('Writing value \'%s\' to file \'%s\'', rasCmd, rasFilePath)
            with open(rasFilePath, 'w') as fs:
                # Write the RAS command itself, not the path of the control file
                fs.write(rasCmd + '\n')  # Certain sysfs files require \n at the end
        except (IOError, OSError):
            printLog(None, 'Unable to write to sysfs file %s' % rasFilePath, None)
            logging.warning('IO or OS error')
            RETCODE = 1

    printLogSpacer()
    return
def setFanSpeed(deviceList, fan):
""" Set fan speed for a list of devices.
@@ -2698,14 +2772,11 @@ if __name__ == '__main__':
if args.resetxgmierr:
resetXgmiErr(deviceList)
if args.rasenable:
pass
# TODO: setRas(deviceList, \'enable\', args.rasenable[0], args.rasenable[1])
setRas(deviceList, 'enable', args.rasenable[0], args.rasenable[1])
if args.rasdisable:
pass
# TODO: setRas(deviceList, \'disable\', args.rasdisable[0], args.rasdisable[1])
setRas(deviceList, 'disable', args.rasdisable[0], args.rasdisable[1])
if args.rasinject:
pass
# TODO: setRas(deviceList, \'inject\', args.rasinject[0], args.rasinject[1])
setRas(deviceList, 'inject', args.rasinject[0], args.rasinject[1])
if args.load:
load(args.load, args.autorespond)
if args.save:
@@ -291,14 +291,22 @@ class rsmi_ras_err_state_t(c_int):
# Error type list correlates to rsmi_ras_err_state_t
rsmi_ras_err_stale_readable = ['no errors', 'ECC disabled',\
'unknown type err', 'single correctable err',\
'multiple uncorrectable err',\
'page isolated, treat as uncorrectable err',\
rsmi_ras_err_stale_readable = ['no errors', 'ECC disabled',
'unknown type err', 'single correctable err',
'multiple uncorrectable err',
'page isolated, treat as uncorrectable err',
'ECC enabled', 'status invalid']
rsmi_ras_err_stale_machine = ['none', 'disabled', 'unknown error',\
rsmi_ras_err_stale_machine = ['none', 'disabled', 'unknown error',
'sing', 'mult', 'position', 'enabled']
# Error types accepted as the rasType argument; setRas rejects any value not listed here
validRasTypes = ['ue', 'ce']
# Actions accepted as the rasAction argument; setRas rejects any value not listed here
validRasActions = ['disable', 'enable', 'inject']
# Block names accepted as the rasBlock argument; setRas rejects any value not listed here
validRasBlocks = ['fuse', 'mp1', 'mp0', 'sem', 'smn', 'df', 'xgmi_wafl', 'hdp', 'pcie_bif',
'athub', 'mmhub', 'gfx', 'sdma', 'umc']
class rsmi_memory_type_t(c_int):
RSMI_MEM_TYPE_FIRST = 0