SWDEV-342812- Add NPS support
Updates:
* Added rsmi_dev_nps_mode_set and rsmi_dev_nps_mode_get
* Added ability to set multiple SYSFS files in debug build
* Added ability to see user's env variables set for debug build
* Added tests for rsmi_dev_nps_mode_set and rsmi_dev_nps_mode_get
* Added ability to restart AMD GPU driver, used in nps_mode_set
* Updated ROCm_SMI_Manual.pdf to include new APIs
* Added a progress bar for long-running python_smi_tools operations; it is
used when setting nps_mode and the call runs longer than 0.1 seconds
Change-Id: I6d61bedd28d7cba6aff432ad2d127ba741b7d15a
Signed-off-by: Charis Poag <Charis.Poag@amd.com>
Bu işleme şunda yer alıyor:
@@ -18,6 +18,9 @@ import sys
|
||||
import subprocess
|
||||
import _thread
|
||||
import time
|
||||
import multiprocessing
|
||||
import trace
|
||||
from io import StringIO
|
||||
from time import ctime
|
||||
from subprocess import check_output
|
||||
from rsmiBindings import *
|
||||
@@ -509,7 +512,7 @@ def printEventList(device, delay, eventList):
|
||||
data.message.decode('utf8') + '\r']])
|
||||
|
||||
|
||||
def printLog(device, metricName, value):
|
||||
def printLog(device, metricName, value, extraSpace=False):
|
||||
""" Print out to the SMI log
|
||||
|
||||
@param device: DRM device identifier
|
||||
@@ -530,7 +533,13 @@ def printLog(device, metricName, value):
|
||||
if device is None:
|
||||
logstr = logstr[13:]
|
||||
# Force thread safe printing
|
||||
print(logstr + '\n', end='')
|
||||
lock = multiprocessing.Lock()
|
||||
lock.acquire()
|
||||
if extraSpace:
|
||||
print('\n' + logstr + '\n', end='', flush=True)
|
||||
else:
|
||||
print(logstr + '\n', end='', flush=True)
|
||||
lock.release()
|
||||
|
||||
|
||||
def printListLog(metricName, valuesList):
|
||||
@@ -1336,6 +1345,76 @@ def setComputePartition(deviceList, computePartitionType):
|
||||
printLogSpacer()
|
||||
|
||||
|
||||
def progressbar(it, prefix="", size=60, out=sys.stdout):
    """ Yield the items of a sized iterable while drawing a text progress bar

    The bar is redrawn in place before the first item and after each item
    has been consumed; a blank line is written once the iterable is
    exhausted.

    @param it: Sized iterable to walk (len() is taken once, up front)
    @param prefix: Text printed in front of the bar
    @param size: Width of the bar in characters
    @param out: Stream the bar is written to
    """
    count = len(it)

    def show(j):
        # An empty iterable has no work left: draw a full bar instead of
        # dividing by zero.
        x = int(size * j / count) if count else size
        # No lock is taken here: a lock created inside this call would be
        # private to the call and could not exclude any other writer. Each
        # update is a single flushed print. The '\r' ending makes the next
        # update overwrite this line.
        print("{}[{}{}] {}/{} secs remain".format(prefix, u"█"*x, "."*(size-x), j, count),
              end='\r', file=out, flush=True)

    show(0)
    for i, item in enumerate(it):
        yield item
        show(i + 1)
    # Move past the bar so later output does not overwrite it
    print("\n", flush=True, file=out)
|
||||
|
||||
def showProgressbar(title="", timeInSeconds=13):
    """ Draw a progress bar that advances one step per second

    @param title: Optional label; when non-empty it is shown as 'title: '
    @param timeInSeconds: Number of one-second steps to run
    """
    prefix = title + ": " if title != "" else title
    # Bar width is fixed at 40 characters; each step sleeps for a second
    for _ in progressbar(range(timeInSeconds), prefix, 40):
        time.sleep(1)
|
||||
|
||||
|
||||
def setNPSMode(deviceList, npsMode):
    """ Sets nps mode (memory partition) for a list of devices

    While each device is being updated, a progress bar is drawn from a
    separate process; it is stopped as soon as the library call returns
    (or raises).

    @param deviceList: List of DRM devices (can be a single-item list)
    @param npsMode: NPS Mode type to set as (matched case-insensitively)
    @return: (None, None) if npsMode is not a valid mode, otherwise None
    """
    # Normalise once; the value does not change between devices
    npsMode = str(npsMode).upper()
    printLogSpacer(' Set nps mode to %s ' % (npsMode))
    for device in deviceList:
        if npsMode not in nps_mode_type_l:
            printErrLog(device, 'Invalid nps mode type %s'
                                '\nValid nps mode types are %s'
                        % (npsMode, ', '.join(map(str, nps_mode_type_l))))
            return (None, None)

        t1 = multiprocessing.Process(target=showProgressbar,
                                     args=("Updating NPS mode", 13,))
        t1.start()
        start = time.time()
        try:
            ret = rocmsmi.rsmi_dev_nps_mode_set(device,
                                                rsmi_nps_mode_type_dict[npsMode])
            duration = time.time() - start
        finally:
            # Always stop and reap the progress-bar process, even when the
            # library call raises, so it cannot keep drawing over later output
            if t1.is_alive():
                t1.terminate()
            t1.join()
        # For longer runs, add an extra line before the output so the
        # message does not overwrite the progress bar
        addExtraLine = duration >= 0.1

        if rsmi_ret_ok(ret, device, silent=True):
            printLog(device,
                     'Successfully set nps mode to %s' % (npsMode),
                     None, addExtraLine)
        elif ret == rsmi_status_t.RSMI_STATUS_PERMISSION:
            printLog(device, 'Permission denied', None, addExtraLine)
        elif ret == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED:
            printLog(device, 'Not supported on the given system', None, addExtraLine)
        else:
            # Second, non-silent call lets rsmi_ret_ok report the status itself
            rsmi_ret_ok(ret, device)
            printErrLog(device, 'Failed to set NPS mode, even though device supports it.')
    printLogSpacer()
|
||||
|
||||
|
||||
def showAllConcise(deviceList):
|
||||
""" Display critical info for all devices in a concise format
|
||||
|
||||
@@ -2780,9 +2859,28 @@ def showComputePartition(deviceList):
|
||||
printLog(device, 'Not supported on the given system', None)
|
||||
else:
|
||||
rsmi_ret_ok(ret, device)
|
||||
printErrLog(device, 'Failed to retrieve compute partition, even though device supports it.', None)
|
||||
printErrLog(device, 'Failed to retrieve compute partition, even though device supports it.')
|
||||
printLogSpacer()
|
||||
|
||||
def showNPSMode(deviceList):
    """ Display the current NPS mode for a list of devices

    @param deviceList: List of DRM devices (can be a single-item list)
    """
    bufferLen = 256
    # One buffer is allocated up front and reused for every device
    modeBuffer = create_string_buffer(bufferLen)
    printLogSpacer(' Current NPS Mode ')
    for device in deviceList:
        ret = rocmsmi.rsmi_dev_nps_mode_get(device, modeBuffer, bufferLen)
        # Success only counts when the library also wrote a non-empty string
        if rsmi_ret_ok(ret, device, silent=True) and modeBuffer.value.decode():
            printLog(device, 'NPS Mode', modeBuffer.value.decode())
            continue
        if ret == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED:
            printLog(device, 'Not supported on the given system', None)
            continue
        # Anything else: repeat the status check non-silently, then log the
        # failure
        rsmi_ret_ok(ret, device)
        printErrLog(device, 'Failed to retrieve NPS mode, even though device supports it.')
    printLogSpacer()
|
||||
|
||||
|
||||
def checkAmdGpus(deviceList):
|
||||
""" Check if there are any AMD GPUs being queried,
|
||||
return False if there are none
|
||||
@@ -3130,6 +3228,7 @@ if __name__ == '__main__':
|
||||
action='store_true')
|
||||
groupDisplay.add_argument('--shownodesbw', help='Shows the numa nodes ', action='store_true')
|
||||
groupDisplay.add_argument('--showcomputepartition', help='Shows current compute partitioning ', action='store_true')
|
||||
groupDisplay.add_argument('--shownpsmode', help='Shows current nps mode ', action='store_true')
|
||||
|
||||
groupActionReset.add_argument('-r', '--resetclocks', help='Reset clocks and OverDrive to default',
|
||||
action='store_true')
|
||||
@@ -3176,8 +3275,10 @@ if __name__ == '__main__':
|
||||
metavar='SCLK', nargs=1)
|
||||
groupAction.add_argument('--setcomputepartition', help='Set compute partition',
|
||||
choices=compute_partition_type_l + [x.lower() for x in compute_partition_type_l],
|
||||
type=str, nargs=1
|
||||
)
|
||||
type=str, nargs=1)
|
||||
groupAction.add_argument('--setnpsmode', help='Set nps mode',
|
||||
choices=nps_mode_type_l + [x.lower() for x in nps_mode_type_l],
|
||||
type=str, nargs=1)
|
||||
groupAction.add_argument('--rasenable', help='Enable RAS for specified block and error type', type=str, nargs=2,
|
||||
metavar=('BLOCK', 'ERRTYPE'))
|
||||
groupAction.add_argument('--rasdisable', help='Disable RAS for specified block and error type', type=str, nargs=2,
|
||||
@@ -3215,7 +3316,8 @@ if __name__ == '__main__':
|
||||
or args.resetclocks or args.setprofile or args.resetprofile or args.setoverdrive or args.setmemoverdrive \
|
||||
or args.setpoweroverdrive or args.resetpoweroverdrive or args.rasenable or args.rasdisable or \
|
||||
args.rasinject or args.gpureset or args.setperfdeterminism or args.setslevel or args.setmlevel or \
|
||||
args.setvc or args.setsrange or args.setmrange or args.setclock or args.setcomputepartition:
|
||||
args.setvc or args.setsrange or args.setmrange or args.setclock or \
|
||||
args.setcomputepartition or args.setnpsmode:
|
||||
relaunchAsSudo()
|
||||
|
||||
# If there is one or more device specified, use that for all commands, otherwise use a
|
||||
@@ -3278,6 +3380,7 @@ if __name__ == '__main__':
|
||||
args.showreplaycount = True
|
||||
args.showvc = True
|
||||
args.showcomputepartition = True
|
||||
args.shownpsmode = True
|
||||
|
||||
if not PRINT_JSON:
|
||||
args.showprofile = True
|
||||
@@ -3408,6 +3511,8 @@ if __name__ == '__main__':
|
||||
showEnergy(deviceList)
|
||||
if args.showcomputepartition:
|
||||
showComputePartition(deviceList)
|
||||
if args.shownpsmode:
|
||||
showNPSMode(deviceList)
|
||||
if args.setclock:
|
||||
setClocks(deviceList, args.setclock[0], [int(args.setclock[1])])
|
||||
if args.setsclk:
|
||||
@@ -3448,6 +3553,8 @@ if __name__ == '__main__':
|
||||
setPerfDeterminism(deviceList, args.setperfdeterminism[0])
|
||||
if args.setcomputepartition:
|
||||
setComputePartition(deviceList, args.setcomputepartition[0])
|
||||
if args.setnpsmode:
|
||||
setNPSMode(deviceList, args.setnpsmode[0])
|
||||
if args.resetprofile:
|
||||
resetProfile(deviceList)
|
||||
if args.resetxgmierr:
|
||||
|
||||
@@ -66,6 +66,10 @@ class rsmi_status_t(c_int):
|
||||
RSMI_STATUS_INTERRUPT = 0xC
|
||||
RSMI_STATUS_UNEXPECTED_SIZE = 0xD
|
||||
RSMI_STATUS_NO_DATA = 0xE
|
||||
RSMI_STATUS_UNEXPECTED_DATA = 0xF
|
||||
RSMI_STATUS_BUSY = 0x10
|
||||
RSMI_STATUS_REFCOUNT_OVERFLOW = 0x11
|
||||
RSMI_STATUS_AMDGPU_RESTART_ERR = 0x12
|
||||
RSMI_STATUS_UNKNOWN_ERROR = 0xFFFFFFFF
|
||||
|
||||
|
||||
@@ -86,6 +90,10 @@ rsmi_status_verbose_err_out = {
|
||||
rsmi_status_t.RSMI_STATUS_INTERRUPT: 'Interrupt occured during execution',
|
||||
rsmi_status_t.RSMI_STATUS_UNEXPECTED_SIZE: 'Unexpected amount of data read',
|
||||
rsmi_status_t.RSMI_STATUS_NO_DATA: 'No data found for the given input',
|
||||
rsmi_status_t.RSMI_STATUS_UNEXPECTED_DATA: 'Unexpected data received',
|
||||
rsmi_status_t.RSMI_STATUS_BUSY: 'Busy - resources are preventing call the ability to execute',
|
||||
rsmi_status_t.RSMI_STATUS_REFCOUNT_OVERFLOW: 'Data overflow - data exceeded INT32_MAX',
|
||||
rsmi_status_t.RSMI_STATUS_AMDGPU_RESTART_ERR: 'Could not successfully restart the amdgpu driver',
|
||||
rsmi_status_t.RSMI_STATUS_UNKNOWN_ERROR: 'Unknown error occured'
|
||||
}
|
||||
|
||||
@@ -606,4 +614,26 @@ rsmi_compute_partition_type = rsmi_compute_partition_type_t
|
||||
# Usage example to get corresponding names:
|
||||
# compute_partition_type_l[rsmi_compute_partition_type_t.RSMI_COMPUTE_PARTITION_CPX]
|
||||
# will return string 'CPX'
|
||||
compute_partition_type_l = ['CPX', 'SPX', 'DPX', 'TPX', 'QPX']
|
||||
compute_partition_type_l = ['CPX', 'SPX', 'DPX', 'TPX', 'QPX']
|
||||
|
||||
class rsmi_nps_mode_type_t(c_int):
    """ Integer codes for the NPS (memory partition) modes

    The values are consecutive codes, not partition counts: NPS4 is 3 and
    NPS8 is 4.
    """
    RSMI_MEMORY_PARTITION_UNKNOWN = 0x0
    RSMI_MEMORY_PARTITION_NPS1 = 0x1
    RSMI_MEMORY_PARTITION_NPS2 = 0x2
    RSMI_MEMORY_PARTITION_NPS4 = 0x3
    RSMI_MEMORY_PARTITION_NPS8 = 0x4
|
||||
|
||||
# Maps an upper-case NPS mode name to its integer mode code
rsmi_nps_mode_type_dict = dict(NPS1=1, NPS2=2, NPS4=3, NPS8=4)
|
||||
|
||||
rsmi_nps_mode_type = rsmi_nps_mode_type_t
|
||||
|
||||
# nps_mode_type_l includes string names for the rsmi_nps_mode_type_t
|
||||
# Usage example to get corresponding names:
|
||||
# nps_mode_type_l[rsmi_nps_mode_type_t.RSMI_MEMORY_PARTITION_NPS2 - 1]  (the list has no UNKNOWN entry, so indices are offset by one)
|
||||
# will return string 'NPS2'
|
||||
nps_mode_type_l = ['NPS1', 'NPS2', 'NPS4', 'NPS8']
|
||||
Yeni konuda referans
Bir kullanıcı engelle