ROCm SMI Python CLI: Add showevent Functionality

Implement the showevents functionality (-e / --showevents) in the ROCm SMI Python CLI.

It is invoked with --showevents followed by any combination of the event types
VM_FAULT, THERMAL_THROTTLE, and/or GPU_RESET (with no event type given, all of them are shown).
For example:
./rocm-smi --showevents VM_FAULT, THERMAL_THROTTLE, GPU_RESET

Signed-off-by: Ori Messinger <Ori.Messinger@amd.com>
Change-Id: I905fd9c949e91423b79833a04ab89d6ba3760e62


[ROCm/rocm_smi_lib commit: a9e7e5a475]
This commit is contained in:
Ori Messinger
2021-03-17 00:24:29 -04:00
szülő 6f751e3fd5
commit 6b4889a3a4
2 fájl változott, egészen pontosan 108 új sor hozzáadva és 1 régi sor törölve
@@ -16,6 +16,9 @@ import logging
import os
import sys
import subprocess
import _thread
import time
from time import ctime
from subprocess import check_output
from rsmiBindings import *
@@ -49,7 +52,7 @@ OUTPUT_SERIALIZATION = False
# These are the valid clock types that can be returned/modified:
# TODO: "clk_type_names" from rsmiBindings.py should fetch valid clocks from
# the same location as rocm_smi_device.cc instead of hardcoding the values
validClockNames = clk_type_names[1:-2]
# The [1:-2] slice here ^^^^ drops the first element and the last two elements
# of the clk_type_names list; it is there to remove the duplicate entries at
# the beginning and end of that list (specifically sclk and mclk)
@@ -430,6 +433,35 @@ def printErrLog(device, err):
logging.debug(errstr)
def printEventList(device, delay, eventList):
    """ Print out notification events for a specified device

    Initializes event notifications for the device, sets the notification
    mask built from eventList, then polls for events forever and prints one
    table row per received event. The polling loop has no exit of its own;
    showEvents runs this function in a worker thread and the loop ends when
    the process exits.

    @param device: DRM device identifier
    @param delay: Notification delay in ms
    @param eventList: List of event type names (can be a single-item list)
    """
    print2DArray([['DEVICE\t', 'TIME\t', 'TYPE\t', 'DESCRIPTION']])
    ret = rocmsmi.rsmi_event_notification_init(device)
    if not rsmi_ret_ok(ret, device):
        printErrLog(device, 'Unable to initialize event notifications.')
        return
    # One mask bit per requested event type; the bit position is the name's
    # position in notification_type_names
    mask = 0
    for eventType in eventList:
        mask |= 1 << notification_type_names.index(eventType.upper())
    ret = rocmsmi.rsmi_event_notification_mask_set(device, mask)
    if not rsmi_ret_ok(ret, device):
        printErrLog(device, 'Unable to set event notification mask.')
        return
    while True:  # Exit condition from user keyboard input of 'q' or 'ctrl + c'
        num_elements = c_uint32(1)
        data = rsmi_evt_notification_data_t(1)
        # The return code is deliberately not checked: an empty message below
        # is what signals that nothing was received during this poll
        rocmsmi.rsmi_event_notification_get(delay, byref(num_elements), byref(data))
        if len(data.message) > 0:
            # Event values are mapped to names with an offset of one. Guard the
            # lookup: an unguarded index raises IndexError (ending this thread)
            # for a value past the end of the list, and silently wraps around
            # to the last name for a value of 0
            nameIndex = data.event.value - 1
            if 0 <= nameIndex < len(notification_type_names):
                eventName = notification_type_names[nameIndex]
            else:
                eventName = 'UNKNOWN(%d)' % (data.event.value)
            # Label the row with the device index carried in the notification
            # record itself rather than with the device this thread was
            # started for
            print2DArray([['\rGPU[%d]:\t' % (data.dv_ind), ctime().split()[3],
                           eventName,
                           data.message.decode('utf8') + '\r']])
def printLog(device, metricName, value):
""" Print out to the SMI log
@@ -2053,6 +2085,60 @@ def showVbiosVersion(deviceList):
printLogSpacer()
class _Getch:
"""
Get a single character from standard input
"""
def __init__(self):
import sys, tty
def __call__(self):
import sys, termios, tty
fd = sys.stdin.fileno()
old_settings = termios.tcgetattr(fd)
try:
tty.setraw(sys.stdin.fileno())
ch = sys.stdin.read(1)
finally:
termios.tcsetattr(fd, termios.TCSADRAIN, old_settings)
return ch
def showEvents(deviceList, eventTypes):
    """ Display a blocking list of events for a list of devices

    Starts one printEventList thread per device and then blocks reading
    single key presses until 'q' or 'ctrl + c' is entered, at which point
    event notifications are stopped for every device.

    Event type names are matched case-insensitively and may be separated by
    spaces and/or commas (for example 'VM_FAULT, GPU_RESET' or
    'VM_FAULT,GPU_RESET'). Unrecognized names are reported and ignored; if
    no valid name remains, all known event types are shown.

    @param deviceList: List of DRM devices (can be a single-item list)
    @param eventTypes: List of event type names (can be a single-item list)
    """
    printLogSpacer(' Show Events ')
    printLog(None, 'press \'q\' or \'ctrl + c\' to quit', None)
    eventTypeList = []
    for argument in eventTypes:  # Cleaning list from wrong values
        # Split on commas instead of merely deleting them, so that
        # 'A,B' given as a single argument yields both names
        for name in argument.split(','):
            name = name.strip()
            if not name:
                continue  # Left over from a trailing or doubled comma
            if name.upper() in notification_type_names:
                eventTypeList.append(name.upper())
            else:
                printErrLog(None, 'Ignoring unrecognized event type %s' % (name))
    if not eventTypeList:
        # Copy so the module-level list is never shared with the worker threads
        eventTypeList = list(notification_type_names)
    try:  # Create a separate thread for each GPU
        for device in deviceList:
            _thread.start_new_thread(printEventList, (device, 1000, eventTypeList))
            # NOTE(review): presumably staggers thread start-up so the
            # per-thread header output does not interleave - confirm
            time.sleep(0.25)
    except Exception as e:
        printErrLog(device, 'Unable to start new thread. %s' % (e))
        return
    getch = _Getch()  # One reader is enough; no need to rebuild it per key press
    while True:  # Exit condition from user keyboard input of 'q' or 'ctrl + c'
        user_input = getch()
        # Catch user input for q or Ctrl + c; an empty read means stdin hit
        # end-of-file, which would otherwise spin this loop forever
        if user_input in ('q', '\x03', ''):
            for device in deviceList:
                ret = rocmsmi.rsmi_event_notification_stop(device)
                if not rsmi_ret_ok(ret, device):
                    printErrLog(device, 'Unable to end event notifications.')
            print('\r')
            break
def showVersion(deviceList, component):
""" Display the software version for the specified component
@@ -2580,6 +2666,7 @@ if __name__ == '__main__':
action='store_true')
groupDisplayTop.add_argument('-i', '--showid', help='Show GPU ID', action='store_true')
groupDisplayTop.add_argument('-v', '--showvbios', help='Show VBIOS version', action='store_true')
# nargs='*' makes the event names optional: the attribute is None when the
# option is absent, [] when -e/--showevents is given without names, and a
# list of the supplied strings otherwise
groupDisplayTop.add_argument('-e', '--showevents', help='Show event list', metavar='EVENT', type=str, nargs='*')
groupDisplayTop.add_argument('--showdriverversion', help='Show kernel driver version', action='store_true')
groupDisplayTop.add_argument('--showfwinfo', help='Show FW information', metavar='BLOCK', type=str, nargs='*')
groupDisplayTop.add_argument('--showmclkrange', help='Show mclk range', action='store_true')
@@ -2808,6 +2895,8 @@ if __name__ == '__main__':
showUId(deviceList)
if args.showvbios:
showVbiosVersion(deviceList)
# --showevents is registered with nargs='*', so the attribute is None when the
# option was not given and a (possibly empty) list when it was; an empty list
# must still trigger the display
if args.showevents is not None:
    showEvents(deviceList, args.showevents)
if args.resetclocks:
resetClocks(deviceList)
if args.showtemp:
@@ -86,6 +86,18 @@ class rsmi_dev_perf_level_t(c_int):
RSMI_DEV_PERF_LEVEL_UNKNOWN = 0x100
# Event type names accepted by the CLI. printEventList uses a name's position
# in this list both to build the notification mask (2 ** index) and to label a
# received event (index = event value - 1).
# NOTE(review): three names are listed here while rsmi_evt_notification_type_t
# declares four event types (GPU pre-reset and post-reset) - confirm the
# intended name <-> value mapping
notification_type_names = ['VM_FAULT', 'THERMAL_THROTTLE', 'GPU_RESET']
class rsmi_evt_notification_type_t(c_int):
    """ Event notification type, represented as a C int

    NOTE(review): presumably mirrors the enum of the same name in the ROCm SMI
    C library - confirm the numeric values against the C header
    """
    RSMI_EVT_NOTIF_VMFAULT = 0
    # Alias for the lowest-valued member
    RSMI_EVT_NOTIF_FIRST = RSMI_EVT_NOTIF_VMFAULT
    RSMI_EVT_NOTIF_THERMAL_THROTTLE = 1
    RSMI_EVT_NOTIF_GPU_PRE_RESET = 2
    RSMI_EVT_NOTIF_GPU_POST_RESET = 3
    # Alias for the highest-valued member
    RSMI_EVT_NOTIF_LAST = RSMI_EVT_NOTIF_GPU_POST_RESET
class rsmi_voltage_metric_t(c_int):
RSMI_VOLT_CURRENT = 0
RSMI_VOLT_FIRST = RSMI_VOLT_CURRENT
@@ -506,6 +518,12 @@ class rsmi_error_count_t(Structure):
('uncorrectable_err', c_uint64)]
class rsmi_evt_notification_data_t(Structure):
    """ ctypes layout of a single event notification record

    printEventList passes an instance by reference to
    rocmsmi.rsmi_event_notification_get and then reads 'event' and 'message'
    from it.
    """
    _fields_ = [('dv_ind', c_uint32),  # presumably the index of the device the event belongs to - confirm
                ('event', rsmi_evt_notification_type_t),  # type of the event
                ('message', c_char*64)]  # 64-byte character buffer; decoded as UTF-8 when printed
class rsmi_process_info_t(Structure):
_fields_ = [('process_id', c_uint32),
('pasid', c_uint32),