From c17f9e05e1367d02449bbfb1331ed19ccbade84d Mon Sep 17 00:00:00 2001
From: Elena Sakhnovitch
Date: Thu, 22 Oct 2020 17:12:32 -0400
Subject: [PATCH] ROCm SMI Python CLI: --rasinject partial support

This implementation is copied directly from the previous rocm_smi.py script;
This feature is experimental and will be updated or removed with future releases.

Signed-off-by: Elena Saknovitch
Change-Id: I5cd38266946302bc4123aeafaa825e13f704235e

[ROCm/rocm_smi_lib commit: 4117719edd092e8928c448f7540f62f8d9a4faa8]
---
 .../rocm-smi-lib/python_smi_tools/rocm_smi.py | 83 +++++++++++++++++--
 .../python_smi_tools/rsmiBindings.py          | 18 ++--
 2 files changed, 90 insertions(+), 11 deletions(-)

diff --git a/projects/rocm-smi-lib/python_smi_tools/rocm_smi.py b/projects/rocm-smi-lib/python_smi_tools/rocm_smi.py
index b84953e4e5..9e8c77e1c8 100755
--- a/projects/rocm-smi-lib/python_smi_tools/rocm_smi.py
+++ b/projects/rocm-smi-lib/python_smi_tools/rocm_smi.py
@@ -889,6 +889,80 @@ def resetGpu(device):
     printLogSpacer()
 
 
+def isRasControlAvailable(device):
+    """ Check if RAS control is available for a specified device.
+
+    Returns True when the device exists and its ras_ctrl file is present
+    under /sys/kernel/debug/dri; otherwise logs a warning and returns False.
+
+    Parameters:
+    device -- DRM device identifier
+    """
+    path = os.path.join('/sys/kernel/debug/dri', 'card%d' % device, 'device', 'ras_ctrl')
+
+    if not doesDeviceExist(device) or not os.path.isfile(path):
+        logging.warning('GPU[%s]\t: RAS control is not available', device)
+        return False
+
+    return True
+
+
+def setRas(deviceList, rasAction, rasBlock, rasType):
+    """ Perform a RAS action on the devices
+
+    Invalid arguments are reported and nothing is written. RETCODE is set
+    to 1 when the command cannot be written for a device.
+
+    Parameters:
+    deviceList -- List of DRM devices (can be a single-item list)
+    rasAction -- [enable|disable|inject] RAS Action to perform
+    rasBlock -- [$validRasBlocks] RAS block
+    rasType -- [ce|ue] Error type to enable/disable
+    """
+    global RETCODE
+    printLog(None, "This is experimental feature, use 'amdgpuras' tool for ras error manipulations for newer vbios")
+
+    if rasAction not in validRasActions:
+        printLog(None, 'Unable to perform RAS command %s on block %s for type %s' % (rasAction, rasBlock, rasType),
+                 None)
+        logging.debug('Action %s is not a valid RAS command', rasAction)
+        return
+    if rasBlock not in validRasBlocks:
+        printLog(None, 'Unable to perform RAS command %s on block %s for type %s' % (rasAction, rasBlock, rasType),
+                 None)
+        printLog(None, 'Block %s is not a valid RAS block' % rasBlock)
+        return
+
+    if rasType not in validRasTypes:
+        printLog(None, 'Unable to perform RAS command %s on block %s for type %s' % (rasAction, rasBlock, rasType),
+                 None)
+        printLog(None, 'Memory error type %s is not a valid RAS memory type' % rasType)
+        return
+
+    printLogSpacer()
+    rasCmd = '%s %s %s' % (rasAction, rasBlock, rasType)
+    # NOTE PSP FW doesn't support enabling disabled counters yet
+    for device in deviceList:
+        if isRasControlAvailable(device):
+            rasFilePath = os.path.join('/sys/kernel/debug/dri', 'card%d' % device, 'device', 'ras_ctrl')
+            # writeToSysfs analog to old cli
+            if not os.path.isfile(rasFilePath):
+                printLog(None, 'Unable to write to sysfs file', None)
+                logging.debug('%s does not exist', rasFilePath)
+                RETCODE = 1
+                continue
+            try:
+                logging.debug('Writing value \'%s\' to file \'%s\'', rasCmd, rasFilePath)
+                with open(rasFilePath, 'w') as fs:
+                    fs.write(rasCmd + '\n')  # Certain sysfs files require \n at the end
+            except (IOError, OSError):
+                printLog(None, 'Unable to write to sysfs file %s' % rasFilePath, None)
+                logging.warning('IO or OS error')
+                RETCODE = 1
+
+    printLogSpacer()
+
+
 def setFanSpeed(deviceList, fan):
     """ Set fan speed for a list of devices.
 
@@ -2698,14 +2772,11 @@ if __name__ == '__main__':
     if args.resetxgmierr:
         resetXgmiErr(deviceList)
     if args.rasenable:
-        pass
-        # TODO: setRas(deviceList, \'enable\', args.rasenable[0], args.rasenable[1])
+        setRas(deviceList, 'enable', args.rasenable[0], args.rasenable[1])
     if args.rasdisable:
-        pass
-        # TODO: setRas(deviceList, \'disable\', args.rasdisable[0], args.rasdisable[1])
+        setRas(deviceList, 'disable', args.rasdisable[0], args.rasdisable[1])
     if args.rasinject:
-        pass
-        # TODO: setRas(deviceList, \'inject\', args.rasinject[0], args.rasinject[1])
+        setRas(deviceList, 'inject', args.rasinject[0], args.rasinject[1])
     if args.load:
         load(args.load, args.autorespond)
     if args.save:
diff --git a/projects/rocm-smi-lib/python_smi_tools/rsmiBindings.py b/projects/rocm-smi-lib/python_smi_tools/rsmiBindings.py
index 61f9d24a3a..87f0bb951a 100644
--- a/projects/rocm-smi-lib/python_smi_tools/rsmiBindings.py
+++ b/projects/rocm-smi-lib/python_smi_tools/rsmiBindings.py
@@ -291,14 +291,22 @@ class rsmi_ras_err_state_t(c_int):
 
 
 # Error type list correlates to rsmi_ras_err_state_t
-rsmi_ras_err_stale_readable = ['no errors', 'ECC disabled',\
-                               'unknown type err', 'single correctable err',\
-                               'multiple uncorrectable err',\
-                               'page isolated, treat as uncorrectable err',\
+rsmi_ras_err_stale_readable = ['no errors', 'ECC disabled',
+                               'unknown type err', 'single correctable err',
+                               'multiple uncorrectable err',
+                               'page isolated, treat as uncorrectable err',
                                'ECC enabled', 'status invalid']
-rsmi_ras_err_stale_machine = ['none', 'disabled', 'unknown error',\
+rsmi_ras_err_stale_machine = ['none', 'disabled', 'unknown error',
                               'sing', 'mult', 'position', 'enabled']
 
+# Accepted values for the RAS control commands: error types, actions, blocks
+validRasTypes = ['ue', 'ce']
+
+validRasActions = ['disable', 'enable', 'inject']
+
+validRasBlocks = ['fuse', 'mp1', 'mp0', 'sem', 'smn', 'df', 'xgmi_wafl', 'hdp', 'pcie_bif',
+                  'athub', 'mmhub', 'gfx', 'sdma', 'umc']
+
 
 class rsmi_memory_type_t(c_int):
     RSMI_MEM_TYPE_FIRST = 0