SWDEV-342812- Add NPS support

Updates:
    * Added rsmi_dev_nps_mode_set and rsmi_dev_nps_mode_get
    * Added ability to set multiple SYSFS files in debug build
    * Added ability to see user's env variables set for debug build
    * Added tests for rsmi_dev_nps_mode_set and rsmi_dev_nps_mode_get
    * Added ability to restart AMD GPU driver, used in nps_mode_set
    * Updated ROCm_SMI_Manual.pdf to include new APIs
    * Added progress bar for long-running python_smi_tools operations; used
      when setting nps_mode if the call runs longer than 0.1 seconds

Change-Id: I6d61bedd28d7cba6aff432ad2d127ba741b7d15a
Signed-off-by: Charis Poag <Charis.Poag@amd.com>
Bu işleme şunda yer alıyor:
Charis Poag
2023-01-30 15:58:03 -06:00
ebeveyn ae10e842af
işleme 9ef376cd61
19 değiştirilmiş dosya ile 1198 ekleme ve 87 silme
+113 -6
Dosyayı Görüntüle
@@ -18,6 +18,9 @@ import sys
import subprocess
import _thread
import time
import multiprocessing
import trace
from io import StringIO
from time import ctime
from subprocess import check_output
from rsmiBindings import *
@@ -509,7 +512,7 @@ def printEventList(device, delay, eventList):
data.message.decode('utf8') + '\r']])
def printLog(device, metricName, value):
def printLog(device, metricName, value, extraSpace=False):
""" Print out to the SMI log
@param device: DRM device identifier
@@ -530,7 +533,13 @@ def printLog(device, metricName, value):
if device is None:
logstr = logstr[13:]
# Force thread safe printing
print(logstr + '\n', end='')
lock = multiprocessing.Lock()
lock.acquire()
if extraSpace:
print('\n' + logstr + '\n', end='', flush=True)
else:
print(logstr + '\n', end='', flush=True)
lock.release()
def printListLog(metricName, valuesList):
@@ -1336,6 +1345,76 @@ def setComputePartition(deviceList, computePartitionType):
printLogSpacer()
def progressbar(it, prefix="", size=60, out=sys.stdout):
    """ Wrap a sized iterable and draw a text progress bar while it is consumed

    The bar is redrawn in place (carriage return, no newline) once before the
    first item and again after each item has been handed back to the caller.
    When the iterable is exhausted a newline is printed so later output does
    not overwrite the finished bar.

    @param it: Sized iterable to walk; len(it) is used as the total
    @param prefix: Text printed in front of the bar
    @param size: Width of the bar in characters
    @param out: File-like object the bar is written to
    """
    count = len(it)

    def show(j):
        # An empty iterable is trivially complete; this also avoids a
        # division by zero when count is 0.
        x = size if count == 0 else int(size * j / count)
        # No lock is taken here: a lock created per call is never shared with
        # any other writer, so it could not serialize output anyway.
        print("{}[{}{}] {}/{} secs remain".format(prefix, u"\u2588" * x, "." * (size - x), j, count),
              end='\r', file=out, flush=True)

    show(0)
    for i, item in enumerate(it):
        yield item
        show(i + 1)
    print("\n", flush=True, file=out)
def showProgressbar(title="", timeInSeconds=13):
    """ Draw a 40-character progress bar that advances once per second

    @param title: Optional label shown before the bar; ': ' is appended
    to it when it is not empty
    @param timeInSeconds: Number of one-second steps to display
    """
    prefix = title + ": " if title != "" else title
    for _ in progressbar(range(timeInSeconds), prefix, 40):
        time.sleep(1)
def setNPSMode(deviceList, npsMode):
    """ Sets nps mode (memory partition) for a list of devices

    While the library call for a device is running, a progress bar is drawn
    from a helper process. The helper is always stopped once the call
    finishes, including when the call raises.

    @param deviceList: List of DRM devices (can be a single-item list)
    @param npsMode: NPS Mode type to set as (case-insensitive)
    @return: (None, None) if npsMode is not a valid mode name, otherwise None
    """
    # Normalize once; the requested mode is the same for every device
    npsMode = str(npsMode).upper()
    printLogSpacer(' Set nps mode to %s ' % (npsMode))
    for device in deviceList:
        if npsMode not in nps_mode_type_l:
            printErrLog(device, 'Invalid nps mode type %s'
                        '\nValid nps mode types are %s'
                        % (npsMode,
                           (', '.join(map(str, nps_mode_type_l)))))
            return (None, None)

        t1 = multiprocessing.Process(target=showProgressbar,
                                     args=("Updating NPS mode", 13,))
        t1.start()
        start = time.time()
        try:
            ret = rocmsmi.rsmi_dev_nps_mode_set(device,
                                                rsmi_nps_mode_type_dict[npsMode])
            duration = time.time() - start
        finally:
            # Never leave the progress bar process running, even on error
            if t1.is_alive():
                t1.terminate()
            t1.join()

        # For longer runs, add an extra line before the output so the
        # result does not overwrite the progress bar
        addExtraLine = duration >= 0.1

        if rsmi_ret_ok(ret, device, silent=True):
            printLog(device,
                     'Successfully set nps mode to %s' % (npsMode),
                     None, addExtraLine)
        elif ret == rsmi_status_t.RSMI_STATUS_PERMISSION:
            printLog(device, 'Permission denied', None, addExtraLine)
        elif ret == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED:
            printLog(device, 'Not supported on the given system', None, addExtraLine)
        else:
            # Non-silent call so the underlying status is reported as well
            rsmi_ret_ok(ret, device)
            printErrLog(device, 'Failed to set NPS mode, even though device supports it.')
    printLogSpacer()
def showAllConcise(deviceList):
""" Display critical info for all devices in a concise format
@@ -2780,9 +2859,28 @@ def showComputePartition(deviceList):
printLog(device, 'Not supported on the given system', None)
else:
rsmi_ret_ok(ret, device)
printErrLog(device, 'Failed to retrieve compute partition, even though device supports it.', None)
printErrLog(device, 'Failed to retrieve compute partition, even though device supports it.')
printLogSpacer()
def showNPSMode(deviceList):
    """ Display the current NPS mode for a list of devices

    @param deviceList: List of DRM devices (can be a single-item list)
    """
    # One 256-byte buffer is reused for every device query
    modeBuffer = create_string_buffer(256)
    printLogSpacer(' Current NPS Mode ')
    for device in deviceList:
        ret = rocmsmi.rsmi_dev_nps_mode_get(device, modeBuffer, 256)
        # Success requires both an OK status and a non-empty mode string
        modeName = rsmi_ret_ok(ret, device, silent=True) and modeBuffer.value.decode()
        if modeName:
            printLog(device, 'NPS Mode', modeName)
            continue
        if ret == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED:
            printLog(device, 'Not supported on the given system', None)
            continue
        # Non-silent call so the underlying status is reported as well
        rsmi_ret_ok(ret, device)
        printErrLog(device, 'Failed to retrieve NPS mode, even though device supports it.')
    printLogSpacer()
def checkAmdGpus(deviceList):
""" Check if there are any AMD GPUs being queried,
return False if there are none
@@ -3130,6 +3228,7 @@ if __name__ == '__main__':
action='store_true')
groupDisplay.add_argument('--shownodesbw', help='Shows the numa nodes ', action='store_true')
groupDisplay.add_argument('--showcomputepartition', help='Shows current compute partitioning ', action='store_true')
groupDisplay.add_argument('--shownpsmode', help='Shows current nps mode ', action='store_true')
groupActionReset.add_argument('-r', '--resetclocks', help='Reset clocks and OverDrive to default',
action='store_true')
@@ -3176,8 +3275,10 @@ if __name__ == '__main__':
metavar='SCLK', nargs=1)
groupAction.add_argument('--setcomputepartition', help='Set compute partition',
choices=compute_partition_type_l + [x.lower() for x in compute_partition_type_l],
type=str, nargs=1
)
type=str, nargs=1)
groupAction.add_argument('--setnpsmode', help='Set nps mode',
choices=nps_mode_type_l + [x.lower() for x in nps_mode_type_l],
type=str, nargs=1)
groupAction.add_argument('--rasenable', help='Enable RAS for specified block and error type', type=str, nargs=2,
metavar=('BLOCK', 'ERRTYPE'))
groupAction.add_argument('--rasdisable', help='Disable RAS for specified block and error type', type=str, nargs=2,
@@ -3215,7 +3316,8 @@ if __name__ == '__main__':
or args.resetclocks or args.setprofile or args.resetprofile or args.setoverdrive or args.setmemoverdrive \
or args.setpoweroverdrive or args.resetpoweroverdrive or args.rasenable or args.rasdisable or \
args.rasinject or args.gpureset or args.setperfdeterminism or args.setslevel or args.setmlevel or \
args.setvc or args.setsrange or args.setmrange or args.setclock or args.setcomputepartition:
args.setvc or args.setsrange or args.setmrange or args.setclock or \
args.setcomputepartition or args.setnpsmode:
relaunchAsSudo()
# If there is one or more device specified, use that for all commands, otherwise use a
@@ -3278,6 +3380,7 @@ if __name__ == '__main__':
args.showreplaycount = True
args.showvc = True
args.showcomputepartition = True
args.shownpsmode = True
if not PRINT_JSON:
args.showprofile = True
@@ -3408,6 +3511,8 @@ if __name__ == '__main__':
showEnergy(deviceList)
if args.showcomputepartition:
showComputePartition(deviceList)
if args.shownpsmode:
showNPSMode(deviceList)
if args.setclock:
setClocks(deviceList, args.setclock[0], [int(args.setclock[1])])
if args.setsclk:
@@ -3448,6 +3553,8 @@ if __name__ == '__main__':
setPerfDeterminism(deviceList, args.setperfdeterminism[0])
if args.setcomputepartition:
setComputePartition(deviceList, args.setcomputepartition[0])
if args.setnpsmode:
setNPSMode(deviceList, args.setnpsmode[0])
if args.resetprofile:
resetProfile(deviceList)
if args.resetxgmierr:
+31 -1
Dosyayı Görüntüle
@@ -66,6 +66,10 @@ class rsmi_status_t(c_int):
RSMI_STATUS_INTERRUPT = 0xC
RSMI_STATUS_UNEXPECTED_SIZE = 0xD
RSMI_STATUS_NO_DATA = 0xE
RSMI_STATUS_UNEXPECTED_DATA = 0xF
RSMI_STATUS_BUSY = 0x10
RSMI_STATUS_REFCOUNT_OVERFLOW = 0x11
RSMI_STATUS_AMDGPU_RESTART_ERR = 0x12
RSMI_STATUS_UNKNOWN_ERROR = 0xFFFFFFFF
@@ -86,6 +90,10 @@ rsmi_status_verbose_err_out = {
rsmi_status_t.RSMI_STATUS_INTERRUPT: 'Interrupt occured during execution',
rsmi_status_t.RSMI_STATUS_UNEXPECTED_SIZE: 'Unexpected amount of data read',
rsmi_status_t.RSMI_STATUS_NO_DATA: 'No data found for the given input',
rsmi_status_t.RSMI_STATUS_UNEXPECTED_DATA: 'Unexpected data received',
rsmi_status_t.RSMI_STATUS_BUSY: 'Busy - resources are preventing call the ability to execute',
rsmi_status_t.RSMI_STATUS_REFCOUNT_OVERFLOW: 'Data overflow - data exceeded INT32_MAX',
rsmi_status_t.RSMI_STATUS_AMDGPU_RESTART_ERR: 'Could not successfully restart the amdgpu driver',
rsmi_status_t.RSMI_STATUS_UNKNOWN_ERROR: 'Unknown error occured'
}
@@ -606,4 +614,26 @@ rsmi_compute_partition_type = rsmi_compute_partition_type_t
# Usage example to get corresponding names:
# compute_partition_type_l[rsmi_compute_partition_type_t.RSMI_COMPUTE_PARTITION_CPX]
# will return string 'CPX'
compute_partition_type_l = ['CPX', 'SPX', 'DPX', 'TPX', 'QPX']
compute_partition_type_l = ['CPX', 'SPX', 'DPX', 'TPX', 'QPX']
class rsmi_nps_mode_type_t(c_int):
    # Memory partition (NPS) mode identifiers
    RSMI_MEMORY_PARTITION_UNKNOWN = 0
    RSMI_MEMORY_PARTITION_NPS1 = 1
    RSMI_MEMORY_PARTITION_NPS2 = 2
    RSMI_MEMORY_PARTITION_NPS4 = 3
    RSMI_MEMORY_PARTITION_NPS8 = 4


# Maps a mode name to its rsmi_nps_mode_type_t value. The values reference the
# enum constants so the two cannot drift apart.
rsmi_nps_mode_type_dict = {
    'NPS1': rsmi_nps_mode_type_t.RSMI_MEMORY_PARTITION_NPS1,
    'NPS2': rsmi_nps_mode_type_t.RSMI_MEMORY_PARTITION_NPS2,
    'NPS4': rsmi_nps_mode_type_t.RSMI_MEMORY_PARTITION_NPS4,
    'NPS8': rsmi_nps_mode_type_t.RSMI_MEMORY_PARTITION_NPS8
}

rsmi_nps_mode_type = rsmi_nps_mode_type_t

# nps_mode_type_l includes string names for the rsmi_nps_mode_type_t values
# other than UNKNOWN, in ascending order. Because UNKNOWN (0) is not listed,
# list indices are offset by one from the enum values.
# Usage example to get corresponding names:
# nps_mode_type_l[rsmi_nps_mode_type_t.RSMI_MEMORY_PARTITION_NPS2 - 1]
# will return string 'NPS2'
nps_mode_type_l = ['NPS1', 'NPS2', 'NPS4', 'NPS8']