diff --git a/projects/rocm-smi-lib/python_smi_tools/rocm_smi.py b/projects/rocm-smi-lib/python_smi_tools/rocm_smi.py
index 05aad724df..05b049b044 100755
--- a/projects/rocm-smi-lib/python_smi_tools/rocm_smi.py
+++ b/projects/rocm-smi-lib/python_smi_tools/rocm_smi.py
@@ -16,6 +16,9 @@ import logging
 import os
 import sys
 import subprocess
+import _thread
+import time
+from time import ctime
 from subprocess import check_output
 from rsmiBindings import *
 
@@ -49,7 +52,7 @@ OUTPUT_SERIALIZATION = False
 
 # These are the valid clock types that can be returned/modified:
 # TODO: "clk_type_names" from rsmiBindings.py should fetch valid clocks from
-# the same location asrocm_smi_device.cc instead of hardcoding the values
+# the same location as rocm_smi_device.cc instead of hardcoding the values
 validClockNames = clk_type_names[1:-2]
 # The purpose of the [1:-2] here ^^^^ is to remove the duplicate elements at the
 # beginning and end of the clk_type_names list (specifically sclk and mclk)
@@ -430,6 +433,44 @@ def printErrLog(device, err):
     logging.debug(errstr)
 
 
+def printEventList(device, delay, eventList):
+    """ Print out notification events for a specified device
+
+    Runs until the process exits; showEvents() starts it on a worker thread.
+
+    @param device: DRM device identifier
+    @param delay: Notification delay in ms
+    @param eventList: List of event type names (can be a single-item list)
+    """
+    print2DArray([['DEVICE\t', 'TIME\t', 'TYPE\t', 'DESCRIPTION']])
+    ret = rocmsmi.rsmi_event_notification_init(device)
+    if not rsmi_ret_ok(ret, device):
+        printErrLog(device, 'Unable to initialize event notifications.')
+        return
+    mask = 0
+    for eventType in eventList:
+        mask |= 1 << notification_type_names.index(eventType.upper())
+    ret = rocmsmi.rsmi_event_notification_mask_set(device, mask)
+    if not rsmi_ret_ok(ret, device):
+        printErrLog(device, 'Unable to set event notification mask.')
+        return
+    while 1:  # Exit condition from user keyboard input of 'q' or 'ctrl + c'
+        num_elements = c_uint32(1)
+        data = rsmi_evt_notification_data_t(1)
+        rocmsmi.rsmi_event_notification_get(delay, byref(num_elements), byref(data))
+        if len(data.message) == 0:
+            continue
+        # The get call is not tied to this thread's device, so report the
+        # device index carried by the event itself
+        typeIndex = data.event.value - 1
+        if 0 <= typeIndex < len(notification_type_names):
+            typeName = notification_type_names[typeIndex]
+        else:  # Unknown type: show the raw value rather than a wrong name
+            typeName = str(data.event.value)
+        print2DArray([['\rGPU[%d]:\t' % (data.dv_ind), ctime().split()[3],
+                       typeName, data.message.decode('utf8') + '\r']])
+
+
 def printLog(device, metricName, value):
     """ Print out to the SMI log
 
@@ -2053,6 +2094,59 @@ def showVbiosVersion(deviceList):
     printLogSpacer()
 
 
+class _Getch:
+    """
+    Get a single character from standard input
+    """
+    def __call__(self):
+        import termios, tty
+        fd = sys.stdin.fileno()
+        old_settings = termios.tcgetattr(fd)
+        try:
+            tty.setraw(fd)
+            ch = sys.stdin.read(1)
+        finally:
+            termios.tcsetattr(fd, termios.TCSADRAIN, old_settings)
+        return ch
+
+
+def showEvents(deviceList, eventTypes):
+    """ Display a blocking list of events for a list of devices
+
+    @param deviceList: List of DRM devices (can be a single-item list)
+    @param eventTypes: List of event type names (can be a single-item list)
+    """
+    printLogSpacer(' Show Events ')
+    printLog(None, 'press \'q\' or \'ctrl + c\' to quit', None)
+    eventTypeList = []
+    for event in eventTypes:  # Cleaning list from wrong values
+        eventName = event.replace(',', '')
+        if eventName.upper() in notification_type_names:
+            eventTypeList.append(eventName.upper())
+        else:
+            printErrLog(None, 'Ignoring unrecognized event type %s' % (eventName))
+    if not eventTypeList:
+        eventTypeList = notification_type_names
+    try:  # Create a separate thread for each GPU
+        for device in deviceList:
+            _thread.start_new_thread(printEventList, (device, 1000, eventTypeList))
+            time.sleep(0.25)
+    except Exception as e:
+        printErrLog(device, 'Unable to start new thread. %s' % (e))
+        return
+    getch = _Getch()
+    while 1:  # Exit condition from user keyboard input of 'q' or 'ctrl + c'
+        user_input = getch()
+        # Catch user input for q or Ctrl + c
+        if user_input == 'q' or user_input == '\x03':
+            for device in deviceList:
+                ret = rocmsmi.rsmi_event_notification_stop(device)
+                if not rsmi_ret_ok(ret, device):
+                    printErrLog(device, 'Unable to end event notifications.')
+            print('\r')
+            break
+
+
 def showVersion(deviceList, component):
     """ Display the software version for the specified component
 
@@ -2580,6 +2674,7 @@ if __name__ == '__main__':
                                  action='store_true')
     groupDisplayTop.add_argument('-i', '--showid', help='Show GPU ID', action='store_true')
     groupDisplayTop.add_argument('-v', '--showvbios', help='Show VBIOS version', action='store_true')
+    groupDisplayTop.add_argument('-e', '--showevents', help='Show event list', metavar='EVENT', type=str, nargs='*')
     groupDisplayTop.add_argument('--showdriverversion', help='Show kernel driver version', action='store_true')
     groupDisplayTop.add_argument('--showfwinfo', help='Show FW information', metavar='BLOCK', type=str, nargs='*')
     groupDisplayTop.add_argument('--showmclkrange', help='Show mclk range', action='store_true')
@@ -2808,6 +2903,8 @@ if __name__ == '__main__':
         showUId(deviceList)
     if args.showvbios:
         showVbiosVersion(deviceList)
+    if args.showevents is not None:
+        showEvents(deviceList, args.showevents)
     if args.resetclocks:
         resetClocks(deviceList)
     if args.showtemp:
diff --git a/projects/rocm-smi-lib/python_smi_tools/rsmiBindings.py b/projects/rocm-smi-lib/python_smi_tools/rsmiBindings.py
index 616b0bd960..d11ab62058 100644
--- a/projects/rocm-smi-lib/python_smi_tools/rsmiBindings.py
+++ b/projects/rocm-smi-lib/python_smi_tools/rsmiBindings.py
@@ -86,6 +86,19 @@ class rsmi_dev_perf_level_t(c_int):
     RSMI_DEV_PERF_LEVEL_UNKNOWN = 0x100
 
 
+# Event type names accepted by --showevents; list order sets each type's mask bit
+notification_type_names = ['VM_FAULT', 'THERMAL_THROTTLE', 'GPU_RESET']
+
+
+class rsmi_evt_notification_type_t(c_int):
+    RSMI_EVT_NOTIF_VMFAULT = 0
+    RSMI_EVT_NOTIF_FIRST = RSMI_EVT_NOTIF_VMFAULT
+    RSMI_EVT_NOTIF_THERMAL_THROTTLE = 1
+    RSMI_EVT_NOTIF_GPU_PRE_RESET = 2
+    RSMI_EVT_NOTIF_GPU_POST_RESET = 3
+    RSMI_EVT_NOTIF_LAST = RSMI_EVT_NOTIF_GPU_POST_RESET
+
+
 class rsmi_voltage_metric_t(c_int):
     RSMI_VOLT_CURRENT = 0
     RSMI_VOLT_FIRST = RSMI_VOLT_CURRENT
@@ -506,6 +519,12 @@ class rsmi_error_count_t(Structure):
                 ('uncorrectable_err', c_uint64)]
 
 
+class rsmi_evt_notification_data_t(Structure):
+    _fields_ = [('dv_ind', c_uint32),
+                ('event', rsmi_evt_notification_type_t),
+                ('message', c_char*64)]
+
+
 class rsmi_process_info_t(Structure):
     _fields_ = [('process_id', c_uint32),
                 ('pasid', c_uint32),