[SWDEV-392571] Fix concise info when missing VRAM info
Updates:
* [rocm-smi] Added larger app width size, which helps
display missing device info
* [rocm-smi] Added better context when rsmi_ret_ok
does not return with RSMI_STATUS_SUCCESS
* [rocm-smi] Removed all references to an
undefined function (printLogNoDev())
* [rocm-smi] Fixed not detecting non-int
values when setting the voltage curve
* [rocm-smi] Added better context on missing
sysfs file when setting clock overdrive
values
* [rocm-smi] Fixed getMemInfo() calls not
referencing tuple values (making it easier
to read)
* [rocm-smi] Silenced concise info spitting
out errors for missing VRAM files, instead
display which metric is "unsupported" if
the files are missing
* [rocm-smi] Updated function descriptions for
rsmi_ret_ok & getMemInfo
* [rocm-smi] Updated getMemInfo to provide a
quiet call, to silence for concise info calls.
This provides a way to keep the output clean.
* [rocm-smi-lib] Added when using debug sysfs
files, to state, which enums are enabled
for debug
Change-Id: I0e9e0c97ccf71467ced0e1a1f71803327a8be2b7
Signed-off-by: Charis Poag <Charis.Poag@amd.com>
[ROCm/amdsmi commit: 6be92b9e26]
Этот коммит содержится в:
@@ -47,7 +47,7 @@ headerString = ' ROCm System Management Interface '
|
||||
footerString = ' End of ROCm SMI Log '
|
||||
|
||||
# Output formatting
|
||||
appWidth = 80
|
||||
appWidth = 84
|
||||
deviceList = []
|
||||
|
||||
# Enable or disable serialized format
|
||||
@@ -197,7 +197,7 @@ def getBus(device):
|
||||
function = bdfid.value & 0x7
|
||||
|
||||
pic_id = '{:04X}:{:02X}:{:02X}.{:0X}'.format(domain, bus, device, function)
|
||||
if rsmi_ret_ok(ret, device):
|
||||
if rsmi_ret_ok(ret, device, 'get_pci_id'):
|
||||
return pic_id
|
||||
|
||||
|
||||
@@ -215,10 +215,10 @@ def getFanSpeed(device):
|
||||
fm = 0
|
||||
|
||||
ret = rocmsmi.rsmi_dev_fan_speed_get(device, sensor_ind, byref(fanLevel))
|
||||
if rsmi_ret_ok(ret, device, None, True):
|
||||
if rsmi_ret_ok(ret, device, 'get_fan_speed', True):
|
||||
fl = fanLevel.value
|
||||
ret = rocmsmi.rsmi_dev_fan_speed_max_get(device, sensor_ind, byref(fanMax))
|
||||
if rsmi_ret_ok(ret, device, None, True):
|
||||
if rsmi_ret_ok(ret, device, 'get_fan_max_speed', True):
|
||||
fm = fanMax.value
|
||||
if fl == 0 or fm == 0:
|
||||
return (fl, 0) # to prevent division by zero crash
|
||||
@@ -245,7 +245,7 @@ def getId(device):
|
||||
"""
|
||||
dv_id = c_short()
|
||||
ret = rocmsmi.rsmi_dev_id_get(device, byref(dv_id))
|
||||
if rsmi_ret_ok(ret, device):
|
||||
if rsmi_ret_ok(ret, device, 'get_device_id'):
|
||||
return hex(dv_id.value)
|
||||
|
||||
|
||||
@@ -256,16 +256,21 @@ def getMaxPower(device):
|
||||
"""
|
||||
power_cap = c_uint64()
|
||||
ret = rocmsmi.rsmi_dev_power_cap_get(device, 0, byref(power_cap))
|
||||
if rsmi_ret_ok(ret, device):
|
||||
if rsmi_ret_ok(ret, device, 'get_power_cap'):
|
||||
return power_cap.value / 1000000
|
||||
return -1
|
||||
|
||||
|
||||
def getMemInfo(device, memType):
|
||||
""" Return the specified memory usage for the specified device
|
||||
def getMemInfo(device, memType, quiet=False):
|
||||
""" Returns a tuple of (memory_used, memory_total) of
|
||||
the requested memory type usage for the device specified
|
||||
|
||||
@param device: DRM device identifier
|
||||
@param type: [vram|vis_vram|gtt] Memory type to return
|
||||
@param quiet=Turn on to silience error output
|
||||
(you plan to handle manually). Default is off,
|
||||
which exposes any issue accessing the different
|
||||
memory types.
|
||||
"""
|
||||
memType = memType.upper()
|
||||
if memType not in memory_type_l:
|
||||
@@ -278,11 +283,11 @@ def getMemInfo(device, memType):
|
||||
memTotal = None
|
||||
|
||||
ret = rocmsmi.rsmi_dev_memory_usage_get(device, memory_type_l.index(memType), byref(memoryUse))
|
||||
if rsmi_ret_ok(ret, device, memType):
|
||||
if rsmi_ret_ok(ret, device, 'get_memory_usage_' + str(memType), quiet):
|
||||
memUsed = memoryUse.value
|
||||
|
||||
ret = rocmsmi.rsmi_dev_memory_total_get(device, memory_type_l.index(memType), byref(memoryTot))
|
||||
if rsmi_ret_ok(ret, device, memType + ' total'):
|
||||
if rsmi_ret_ok(ret, device, 'get_memory_total_' + str(memType), quiet):
|
||||
memTotal = memoryTot.value
|
||||
return (memUsed, memTotal)
|
||||
|
||||
@@ -319,7 +324,7 @@ def getPerfLevel(device):
|
||||
"""
|
||||
perf = rsmi_dev_perf_level_t()
|
||||
ret = rocmsmi.rsmi_dev_perf_level_get(device, byref(perf))
|
||||
if rsmi_ret_ok(ret, device):
|
||||
if rsmi_ret_ok(ret, device, 'get_perf_level'):
|
||||
return perf_level_string(perf.value)
|
||||
return -1
|
||||
|
||||
@@ -336,7 +341,7 @@ def getPidList():
|
||||
""" Return a list of KFD process IDs """
|
||||
num_items = c_uint32()
|
||||
ret = rocmsmi.rsmi_compute_process_info_get(None, byref(num_items))
|
||||
if rsmi_ret_ok(ret):
|
||||
if rsmi_ret_ok(ret, metric='get_compute_process_info'):
|
||||
buff_sz = num_items.value + 10
|
||||
procs = (rsmi_process_info_t * buff_sz)()
|
||||
procList = []
|
||||
@@ -354,7 +359,7 @@ def getPower(device):
|
||||
"""
|
||||
power = c_uint32()
|
||||
ret = rocmsmi.rsmi_dev_power_ave_get(device, 0, byref(power))
|
||||
if rsmi_ret_ok(ret, device, 'power'):
|
||||
if rsmi_ret_ok(ret, device, 'get_power_avg'):
|
||||
return power.value / 1000000
|
||||
return 'N/A'
|
||||
|
||||
@@ -368,7 +373,7 @@ def getRasEnablement(device, block):
|
||||
state = rsmi_ras_err_state_t()
|
||||
ret = rocmsmi.rsmi_dev_ecc_status_get(device, rsmi_gpu_block_d[block], byref(state))
|
||||
|
||||
if rsmi_ret_ok(ret, device, block, True):
|
||||
if rsmi_ret_ok(ret, device, 'get_ecc_status_' + str(block), True):
|
||||
return rsmi_ras_err_stale_machine[state.value].upper()
|
||||
return 'N/A'
|
||||
|
||||
@@ -382,7 +387,7 @@ def getTemp(device, sensor):
|
||||
temp = c_int64(0)
|
||||
metric = rsmi_temperature_metric_t.RSMI_TEMP_CURRENT
|
||||
ret = rocmsmi.rsmi_dev_temp_metric_get(c_uint32(device), temp_type_lst.index(sensor), metric, byref(temp))
|
||||
if rsmi_ret_ok(ret, device, sensor, True):
|
||||
if rsmi_ret_ok(ret, device, 'get_temp_metric' + str(sensor), True):
|
||||
return temp.value / 1000
|
||||
return 'N/A'
|
||||
|
||||
@@ -406,7 +411,7 @@ def getVersion(deviceList, component):
|
||||
"""
|
||||
ver_str = create_string_buffer(256)
|
||||
ret = rocmsmi.rsmi_version_str_get(component, ver_str, 256)
|
||||
if rsmi_ret_ok(ret, None, component):
|
||||
if rsmi_ret_ok(ret, None, 'get_version_str_' + str(component)):
|
||||
return ver_str.value.decode()
|
||||
return None
|
||||
|
||||
@@ -418,7 +423,7 @@ def getComputePartition(device):
|
||||
"""
|
||||
currentComputePartition = create_string_buffer(256)
|
||||
ret = rocmsmi.rsmi_dev_compute_partition_get(device, currentComputePartition, 256)
|
||||
if rsmi_ret_ok(ret, device, silent=True) and currentComputePartition.value.decode():
|
||||
if rsmi_ret_ok(ret, device, 'get_compute_partition', silent=True) and currentComputePartition.value.decode():
|
||||
return str(currentComputePartition.value.decode())
|
||||
return "UNKNOWN"
|
||||
|
||||
@@ -430,7 +435,7 @@ def getMemoryPartition(device):
|
||||
"""
|
||||
currentNPSMode = create_string_buffer(256)
|
||||
ret = rocmsmi.rsmi_dev_nps_mode_get(device, currentNPSMode, 256)
|
||||
if rsmi_ret_ok(ret, device, silent=True) and currentNPSMode.value.decode():
|
||||
if rsmi_ret_ok(ret, device, 'get_NPS_mode', silent=True) and currentNPSMode.value.decode():
|
||||
return str(currentNPSMode.value.decode())
|
||||
return "UNKNOWN"
|
||||
|
||||
@@ -518,13 +523,13 @@ def printEventList(device, delay, eventList):
|
||||
"""
|
||||
mask = 0
|
||||
ret = rocmsmi.rsmi_event_notification_init(device)
|
||||
if not rsmi_ret_ok(ret, device):
|
||||
if not rsmi_ret_ok(ret, device, 'event_notification_init'):
|
||||
printErrLog(device, 'Unable to initialize event notifications.')
|
||||
return
|
||||
for eventType in eventList:
|
||||
mask |= 2 ** notification_type_names.index(eventType.upper())
|
||||
ret = rocmsmi.rsmi_event_notification_mask_set(device, mask)
|
||||
if not rsmi_ret_ok(ret, device):
|
||||
if not rsmi_ret_ok(ret, device, 'set_event_notification_mask'):
|
||||
printErrLog(device, 'Unable to set event notification mask.')
|
||||
return
|
||||
while 1: # Exit condition from user keyboard input of 'q' or 'ctrl + c'
|
||||
@@ -555,7 +560,7 @@ def printLog(device, metricName, value, extraSpace=False):
|
||||
else:
|
||||
logstr = 'GPU[%s]\t\t: %s' % (device, metricName)
|
||||
if device is None:
|
||||
logstr = logstr[13:]
|
||||
logstr = logstr.split(':')[1][1:]
|
||||
# Force thread safe printing
|
||||
lock = multiprocessing.Lock()
|
||||
lock.acquire()
|
||||
@@ -687,13 +692,13 @@ def checkIfSecondaryDie(device):
|
||||
power_cap = c_uint64()
|
||||
# secondary die can currently be determined by checking if all power1_* (power cap) values are equal to zero.
|
||||
ret = rocmsmi.rsmi_dev_power_cap_get(device, 0, byref(power_cap))
|
||||
if not (rsmi_ret_ok(ret, None, None, False) and power_cap.value == 0):
|
||||
if not (rsmi_ret_ok(ret, None, 'get_power_cap', False) and power_cap.value == 0):
|
||||
return False
|
||||
ret = rocmsmi.rsmi_dev_power_cap_default_get(device, byref(power_cap))
|
||||
if not (rsmi_ret_ok(ret, None, None, False) and power_cap.value == 0):
|
||||
if not (rsmi_ret_ok(ret, None, 'get_power_cap_default', False) and power_cap.value == 0):
|
||||
return False
|
||||
ret = rocmsmi.rsmi_dev_power_ave_get(device, 0, byref(power_cap))
|
||||
if not (rsmi_ret_ok(ret, None, None, False) and power_cap.value == 0):
|
||||
if not (rsmi_ret_ok(ret, None, 'get_power_avg', False) and power_cap.value == 0):
|
||||
return False
|
||||
return True
|
||||
|
||||
@@ -709,17 +714,17 @@ def resetClocks(deviceList):
|
||||
printLogSpacer(' Reset Clocks ')
|
||||
for device in deviceList:
|
||||
ret = rocmsmi.rsmi_dev_overdrive_level_set(device, rsmi_dev_perf_level_t(0))
|
||||
if rsmi_ret_ok(ret, device):
|
||||
if rsmi_ret_ok(ret, device, 'set_overdrive_level'):
|
||||
printLog(device, 'OverDrive set to 0', None)
|
||||
else:
|
||||
printLog(device, 'Unable to reset OverDrive', None)
|
||||
ret = rocmsmi.rsmi_dev_perf_level_set(device, rsmi_dev_perf_level_t(0))
|
||||
if rsmi_ret_ok(ret, device):
|
||||
if rsmi_ret_ok(ret, device, 'set_perf_level'):
|
||||
printLog(device, 'Successfully reset clocks', None)
|
||||
else:
|
||||
printLog(device, 'Unable to reset clocks', None)
|
||||
ret = rocmsmi.rsmi_dev_perf_level_set(device, rsmi_dev_perf_level_t(0))
|
||||
if rsmi_ret_ok(ret, device):
|
||||
if rsmi_ret_ok(ret, device, 'set_perf_level'):
|
||||
printLog(device, 'Performance level reset to auto', None)
|
||||
else:
|
||||
printLog(device, 'Unable to reset performance level to auto', None)
|
||||
@@ -734,7 +739,7 @@ def resetFans(deviceList):
|
||||
for device in deviceList:
|
||||
sensor_ind = c_uint32(0)
|
||||
ret = rocmsmi.rsmi_dev_fan_reset(device, sensor_ind)
|
||||
if rsmi_ret_ok(ret, device):
|
||||
if rsmi_ret_ok(ret, device, 'reset_fan'):
|
||||
printLog(device, 'Successfully reset fan speed to driver control', None)
|
||||
printLogSpacer()
|
||||
|
||||
@@ -755,12 +760,12 @@ def resetProfile(deviceList):
|
||||
printLogSpacer(' Reset Profile ')
|
||||
for device in deviceList:
|
||||
ret = rocmsmi.rsmi_dev_power_profile_set(device, 0, profileString('BOOTUP DEFAULT'))
|
||||
if rsmi_ret_ok(ret, device):
|
||||
if rsmi_ret_ok(ret, device, 'set_power_profile'):
|
||||
printLog(device, 'Successfully reset Power Profile', None)
|
||||
else:
|
||||
printErrLog(device, 'Unable to reset Power Profile')
|
||||
ret = rocmsmi.rsmi_dev_perf_level_set(device, rsmi_dev_perf_level_t(0))
|
||||
if rsmi_ret_ok(ret, device):
|
||||
if rsmi_ret_ok(ret, device, 'set_perf_level'):
|
||||
printLog(device, 'Successfully reset Performance Level', None)
|
||||
else:
|
||||
printErrLog(device, 'Unable to reset Performance Level')
|
||||
@@ -806,7 +811,7 @@ def resetComputePartition(deviceList):
|
||||
for device in deviceList:
|
||||
originalPartition = getComputePartition(device)
|
||||
ret = rocmsmi.rsmi_dev_compute_partition_reset(device)
|
||||
if rsmi_ret_ok(ret, device, silent=True):
|
||||
if rsmi_ret_ok(ret, device, 'reset_compute_partition', silent=True):
|
||||
resetBootState = getComputePartition(device)
|
||||
printLog(device, "Successfully reset compute partition (" +
|
||||
originalPartition + ") to boot state (" + resetBootState +
|
||||
@@ -816,7 +821,7 @@ def resetComputePartition(deviceList):
|
||||
elif ret == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED:
|
||||
printLog(device, 'Not supported on the given system', None)
|
||||
else:
|
||||
rsmi_ret_ok(ret, device)
|
||||
rsmi_ret_ok(ret, device, 'reset_compute_partition')
|
||||
printErrLog(device, 'Failed to reset the compute partition to boot state')
|
||||
printLogSpacer()
|
||||
|
||||
@@ -842,7 +847,7 @@ def resetNpsMode(deviceList):
|
||||
t1.join()
|
||||
if duration < float(0.1): # For longer runs, add extra line before output
|
||||
addExtraLine=False # This is to prevent overriding progress bar
|
||||
if rsmi_ret_ok(ret, device, silent=True):
|
||||
if rsmi_ret_ok(ret, device, 'reset_NPS_mode', silent=True):
|
||||
resetBootState = getMemoryPartition(device)
|
||||
printLog(device, "Successfully reset nps mode (" +
|
||||
originalPartition + ") to boot state (" +
|
||||
@@ -852,7 +857,7 @@ def resetNpsMode(deviceList):
|
||||
elif ret == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED:
|
||||
printLog(device, 'Not supported on the given system', None, addExtraLine)
|
||||
else:
|
||||
rsmi_ret_ok(ret, device)
|
||||
rsmi_ret_ok(ret, device, 'reset_NPS_mode')
|
||||
printErrLog(device, 'Failed to reset nps mode to boot state')
|
||||
printLogSpacer()
|
||||
|
||||
@@ -906,16 +911,16 @@ def setVoltageCurve(deviceList, point, clk, volt, autoRespond):
|
||||
global RETCODE
|
||||
value = '%s %s %s' % (point, clk, volt)
|
||||
try:
|
||||
any(int(item) for item in value)
|
||||
any(int(item) for item in value.split())
|
||||
except ValueError:
|
||||
printLogNoDev('Unable to set Voltage curve')
|
||||
logging.error('Non-integer characters are present in %s', value)
|
||||
printErrLog(None, 'Unable to set Voltage curve')
|
||||
printErrLog(None, 'Non-integer characters are present in %s' %value)
|
||||
RETCODE = 1
|
||||
return
|
||||
confirmOutOfSpecWarning(autoRespond)
|
||||
for device in deviceList:
|
||||
ret = rocmsmi.rsmi_dev_od_volt_info_set(device, int(point), int(clk), int(volt))
|
||||
if rsmi_ret_ok(ret, device):
|
||||
if rsmi_ret_ok(ret, device, 'set_voltage_curve'):
|
||||
printLog(device, 'Successfully set voltage point %s to %s(MHz) %s(mV)' % (point, clk, volt), None)
|
||||
else:
|
||||
printErrLog(device, 'Unable to set voltage point %s to %s(MHz) %s(mV)' % (point, clk, volt))
|
||||
@@ -935,11 +940,12 @@ def setPowerPlayTableLevel(deviceList, clkType, point, clk, volt, autoRespond):
|
||||
"""
|
||||
global RETCODE
|
||||
value = '%s %s %s' % (point, clk, volt)
|
||||
listOfValues = value.split(' ')
|
||||
try:
|
||||
any(int(item) for item in value.split())
|
||||
except ValueError:
|
||||
printLogNoDev('Unable to set PowerPlay table level')
|
||||
logging.error('Non-integer characters are present in %s', value)
|
||||
printErrLog(None, 'Unable to set PowerPlay table level')
|
||||
printErrLog(None, 'Non-integer characters are present in %s' %value)
|
||||
RETCODE = 1
|
||||
return
|
||||
confirmOutOfSpecWarning(autoRespond)
|
||||
@@ -947,7 +953,7 @@ def setPowerPlayTableLevel(deviceList, clkType, point, clk, volt, autoRespond):
|
||||
if clkType == 'sclk':
|
||||
ret = rocmsmi.rsmi_dev_od_clk_info_set(device, rsmi_freq_ind_t(int(point)), int(clk),
|
||||
rsmi_clk_names_dict[clkType])
|
||||
if rsmi_ret_ok(ret, device):
|
||||
if rsmi_ret_ok(ret, device, 'set_power_play_table_level_' + str(clkType)):
|
||||
printLog(device, 'Successfully set voltage point %s to %s(MHz) %s(mV)' % (point, clk, volt), None)
|
||||
else:
|
||||
printErrLog(device, 'Unable to set voltage point %s to %s(MHz) %s(mV)' % (point, clk, volt))
|
||||
@@ -955,7 +961,7 @@ def setPowerPlayTableLevel(deviceList, clkType, point, clk, volt, autoRespond):
|
||||
elif clkType == 'mclk':
|
||||
ret = rocmsmi.rsmi_dev_od_clk_info_set(device, rsmi_freq_ind_t(int(point)), int(clk),
|
||||
rsmi_clk_names_dict[clkType])
|
||||
if rsmi_ret_ok(ret, device):
|
||||
if rsmi_ret_ok(ret, device, 'set_power_play_table_level_' + str(clkType)):
|
||||
printLog(device, 'Successfully set voltage point %s to %s(MHz) %s(mV)' % (point, clk, volt), None)
|
||||
else:
|
||||
printErrLog(device, 'Unable to set voltage point %s to %s(MHz) %s(mV)' % (point, clk, volt))
|
||||
@@ -996,14 +1002,15 @@ def setClockOverDrive(deviceList, clktype, value, autoRespond):
|
||||
value = '20'
|
||||
if getPerfLevel(device) != 'MANUAL':
|
||||
ret = rocmsmi.rsmi_dev_perf_level_set(device, rsmi_dev_perf_level_t(3))
|
||||
if rsmi_ret_ok(ret, device):
|
||||
if rsmi_ret_ok(ret, device, 'set_perf_level_manual_' + str(clktype)):
|
||||
printLog(device, 'Performance level set to manual', None)
|
||||
else:
|
||||
printErrLog(device, 'Unable to set performance level to manual')
|
||||
if clktype == 'mclk':
|
||||
fsFile = os.path.join('/sys/class/drm', 'card%d' % (device), 'device', 'pp_mclk_od')
|
||||
if not os.path.isfile(fsFile):
|
||||
printLog(None, 'Unable to write to sysfs file', None)
|
||||
printLog(None, 'Unable to write to sysfs file (' + fsFile +
|
||||
'), file does not exist', None)
|
||||
logging.debug('%s does not exist', fsFile)
|
||||
continue
|
||||
try:
|
||||
@@ -1011,14 +1018,14 @@ def setClockOverDrive(deviceList, clktype, value, autoRespond):
|
||||
with open(fsFile, 'w') as fs:
|
||||
fs.write(value + '\n')
|
||||
except (IOError, OSError):
|
||||
printLog(None, 'Unable to write to sysfs file %s' % fsFile, None)
|
||||
printLog(None, 'Unable to write to sysfs file %s' %fsFile, None)
|
||||
logging.warning('IO or OS error')
|
||||
RETCODE = 1
|
||||
continue
|
||||
printLog(device, 'Successfully set %s OverDrive to %s%%' % (clktype, value), None)
|
||||
elif clktype == 'sclk':
|
||||
ret = rocmsmi.rsmi_dev_overdrive_level_set(device, rsmi_dev_perf_level_t(int(value)))
|
||||
if rsmi_ret_ok(ret, device):
|
||||
if rsmi_ret_ok(ret, device, 'set_overdrive_level_' + str(clktype)):
|
||||
printLog(device, 'Successfully set %s OverDrive to %s%%' % (clktype, value), None)
|
||||
else:
|
||||
printLog(device, 'Unable to set %s OverDrive to %s%%' % (clktype, value), None)
|
||||
@@ -1069,7 +1076,7 @@ def setClocks(deviceList, clktype, clk):
|
||||
# Check if the performance level is manual, if not then set it to manual
|
||||
if getPerfLevel(device).lower() != 'manual':
|
||||
ret = rocmsmi.rsmi_dev_perf_level_set(device, rsmi_dev_perf_level_t(3))
|
||||
if rsmi_ret_ok(ret, device):
|
||||
if rsmi_ret_ok(ret, device, 'set_perf_level_manual'):
|
||||
printLog(device, 'Performance level was set to manual', None)
|
||||
else:
|
||||
printErrLog(device, 'Unable to set performance level to manual')
|
||||
@@ -1079,7 +1086,7 @@ def setClocks(deviceList, clktype, clk):
|
||||
# Validate frequency bitmask
|
||||
freq = rsmi_frequencies_t()
|
||||
ret = rocmsmi.rsmi_dev_gpu_clk_freq_get(device, rsmi_clk_names_dict[clktype], byref(freq))
|
||||
if rsmi_ret_ok(ret, device, clktype) == False:
|
||||
if rsmi_ret_ok(ret, device, 'get_gpu_clk_freq_' + str(clktype)) == False:
|
||||
RETCODE = 1
|
||||
return
|
||||
# The freq_bitmask should be less than 2^(freqs.num_supported)
|
||||
@@ -1090,7 +1097,7 @@ def setClocks(deviceList, clktype, clk):
|
||||
return
|
||||
|
||||
ret = rocmsmi.rsmi_dev_gpu_clk_freq_set(device, rsmi_clk_names_dict[clktype], freq_bitmask)
|
||||
if rsmi_ret_ok(ret, device):
|
||||
if rsmi_ret_ok(ret, device, 'set_gpu_clk_freq_' + str(clktype)):
|
||||
printLog(device, 'Successfully set %s bitmask to' % (clktype), hex(freq_bitmask))
|
||||
else:
|
||||
printErrLog(device, 'Unable to set %s bitmask to: %s' % (clktype, hex(freq_bitmask)))
|
||||
@@ -1099,7 +1106,7 @@ def setClocks(deviceList, clktype, clk):
|
||||
# Validate the bandwidth bitmask
|
||||
bw = rsmi_pcie_bandwidth_t()
|
||||
ret = rocmsmi.rsmi_dev_pci_bandwidth_get(device, byref(bw))
|
||||
if rsmi_ret_ok(ret, device, 'PCIe') == False:
|
||||
if rsmi_ret_ok(ret, device, 'get_PCIe_bandwidth') == False:
|
||||
RETCODE = 1
|
||||
return
|
||||
# The freq_bitmask should be less than 2^(bw.transfer_rate.num_supported)
|
||||
@@ -1110,7 +1117,7 @@ def setClocks(deviceList, clktype, clk):
|
||||
return
|
||||
|
||||
ret = rocmsmi.rsmi_dev_pci_bandwidth_set(device, freq_bitmask)
|
||||
if rsmi_ret_ok(ret, device):
|
||||
if rsmi_ret_ok(ret, device, 'set_PCIe_bandwidth'):
|
||||
printLog(device, 'Successfully set %s to level bitmask' % (clktype), hex(freq_bitmask))
|
||||
else:
|
||||
printErrLog(device, 'Unable to set %s bitmask to: %s' % (clktype, hex(freq_bitmask)))
|
||||
@@ -1135,7 +1142,7 @@ def setPerfDeterminism(deviceList, clkvalue):
|
||||
return
|
||||
for device in deviceList:
|
||||
ret = rocmsmi.rsmi_perf_determinism_mode_set(device, int(clkvalue))
|
||||
if rsmi_ret_ok(ret, device):
|
||||
if rsmi_ret_ok(ret, device, 'set_perf_determinism'):
|
||||
printLog(device, 'Successfully enabled performance determinism and set GFX clock frequency', str(clkvalue))
|
||||
else:
|
||||
printErrLog(device, 'Unable to set performance determinism and clock frequency to %s' % (str(clkvalue)))
|
||||
@@ -1159,7 +1166,7 @@ def resetGpu(device):
|
||||
RETCODE = 1
|
||||
return
|
||||
ret = rocmsmi.rsmi_dev_gpu_reset(resetDev)
|
||||
if rsmi_ret_ok(ret, resetDev):
|
||||
if rsmi_ret_ok(ret, resetDev, 'reset_gpu'):
|
||||
printLog(resetDev, 'Successfully reset GPU %d' % (resetDev), None)
|
||||
else:
|
||||
printErrLog(resetDev, 'Unable to reset GPU %d' % (resetDev))
|
||||
@@ -1258,7 +1265,7 @@ def setFanSpeed(deviceList, fan):
|
||||
else:
|
||||
fanLevel = int(str(fan))
|
||||
ret = rocmsmi.rsmi_dev_fan_speed_set(device, 0, int(fanLevel))
|
||||
if rsmi_ret_ok(ret, device):
|
||||
if rsmi_ret_ok(ret, device, 'set_fan_speed'):
|
||||
printLog(device, 'Successfully set fan speed to level %s' % (str(int(fanLevel))), None)
|
||||
printLogSpacer()
|
||||
|
||||
@@ -1277,7 +1284,7 @@ def setPerformanceLevel(deviceList, level):
|
||||
logging.error('Invalid Performance level: %s', level)
|
||||
else:
|
||||
ret = rocmsmi.rsmi_dev_perf_level_set(device, rsmi_dev_perf_level_t(validLevels.index(level)))
|
||||
if rsmi_ret_ok(ret, device):
|
||||
if rsmi_ret_ok(ret, device, 'set_perf_level'):
|
||||
printLog(device, 'Performance level set to %s' % (str(level)), None)
|
||||
printLogSpacer()
|
||||
|
||||
@@ -1336,7 +1343,7 @@ def setPowerOverDrive(deviceList, value, autoRespond):
|
||||
new_power_cap.value = int(value) * 1000000
|
||||
|
||||
ret = rocmsmi.rsmi_dev_power_cap_range_get(device, 0, byref(power_cap_max), byref(power_cap_min))
|
||||
if rsmi_ret_ok(ret, device) == False:
|
||||
if rsmi_ret_ok(ret, device, 'get_power_cap_range') == False:
|
||||
printErrLog(device, 'Unable to parse Power OverDrive range')
|
||||
RETCODE = 1
|
||||
continue
|
||||
@@ -1360,11 +1367,11 @@ def setPowerOverDrive(deviceList, value, autoRespond):
|
||||
specWarningConfirmed = True
|
||||
|
||||
ret = rocmsmi.rsmi_dev_power_cap_set(device, 0, new_power_cap)
|
||||
if rsmi_ret_ok(ret, device):
|
||||
if rsmi_ret_ok(ret, device, 'set_power_cap'):
|
||||
if int(value) == 0:
|
||||
power_cap = c_uint64()
|
||||
ret = rocmsmi.rsmi_dev_power_cap_get(device, 0, byref(power_cap))
|
||||
if rsmi_ret_ok(ret, device):
|
||||
if rsmi_ret_ok(ret, device, 'get_power_cap'):
|
||||
if not PRINT_JSON:
|
||||
printLog(device,
|
||||
'Successfully reset Power OverDrive to: %sW' % (int(power_cap.value / 1000000)), None)
|
||||
@@ -1395,7 +1402,7 @@ def setProfile(deviceList, profile):
|
||||
for device in deviceList:
|
||||
# Get previous profile
|
||||
ret = rocmsmi.rsmi_dev_power_profile_presets_get(device, 0, byref(status))
|
||||
if rsmi_ret_ok(ret, device, 'previous profile'):
|
||||
if rsmi_ret_ok(ret, device, 'get_power_profile'):
|
||||
previousProfile = profileString(status.current)
|
||||
# Get desired profile
|
||||
desiredProfile = 'UNKNOWN'
|
||||
@@ -1412,10 +1419,10 @@ def setProfile(deviceList, profile):
|
||||
return
|
||||
else:
|
||||
ret = rocmsmi.rsmi_dev_power_profile_set(device, 0, profileString(desiredProfile))
|
||||
if rsmi_ret_ok(ret, device, 'set profile'):
|
||||
if rsmi_ret_ok(ret, device, 'set_power_profile'):
|
||||
# Get current profile
|
||||
ret = rocmsmi.rsmi_dev_power_profile_presets_get(device, 0, byref(status))
|
||||
if rsmi_ret_ok(ret, device, 'current profile'):
|
||||
if rsmi_ret_ok(ret, device, 'get_power_profile_presets'):
|
||||
currentProfile = profileString(status.current)
|
||||
if currentProfile == desiredProfile:
|
||||
printLog(device, 'Successfully set profile to', desiredProfile)
|
||||
@@ -1441,7 +1448,7 @@ def setComputePartition(deviceList, computePartitionType):
|
||||
return (None, None)
|
||||
ret = rocmsmi.rsmi_dev_compute_partition_set(device,
|
||||
rsmi_compute_partition_type_dict[computePartitionType])
|
||||
if rsmi_ret_ok(ret, device, silent=True):
|
||||
if rsmi_ret_ok(ret, device, 'set_compute_partition', silent=True):
|
||||
printLog(device,
|
||||
'Successfully set compute partition to %s' % (computePartitionType),
|
||||
None)
|
||||
@@ -1453,7 +1460,7 @@ def setComputePartition(deviceList, computePartitionType):
|
||||
elif ret == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED:
|
||||
printLog(device, 'Not supported on the given system', None)
|
||||
else:
|
||||
rsmi_ret_ok(ret, device)
|
||||
rsmi_ret_ok(ret, device, 'set_compute_partition')
|
||||
printErrLog(device, 'Failed to retrieve compute partition, even though device supports it.')
|
||||
printLogSpacer()
|
||||
|
||||
@@ -1514,7 +1521,7 @@ def setNPSMode(deviceList, npsMode):
|
||||
if duration < float(0.1): # For longer runs, add extra line before output
|
||||
addExtraLine=False # This is to prevent overriding progress bar
|
||||
|
||||
if rsmi_ret_ok(ret, device, silent=True):
|
||||
if rsmi_ret_ok(ret, device, 'set_NPS_mode', silent=True):
|
||||
printLog(device,
|
||||
'Successfully set nps mode to %s' % (npsMode),
|
||||
None, addExtraLine)
|
||||
@@ -1523,7 +1530,7 @@ def setNPSMode(deviceList, npsMode):
|
||||
elif ret == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED:
|
||||
printLog(device, 'Not supported on the given system', None, addExtraLine)
|
||||
else:
|
||||
rsmi_ret_ok(ret, device)
|
||||
rsmi_ret_ok(ret, device, 'set_NPS_mode')
|
||||
printErrLog(device, 'Failed to retrieve NPS mode, even though device supports it.')
|
||||
printLogSpacer()
|
||||
|
||||
@@ -1567,10 +1574,12 @@ def showAllConcise(deviceList):
|
||||
gpu_busy = str(getGpuUse(device)) + '%'
|
||||
else:
|
||||
gpu_busy = 'Unsupported'
|
||||
memInfo = getMemInfo(device, 'vram')
|
||||
vram_used, vram_total = getMemInfo(device, 'vram', True)
|
||||
mem_use_pct = 0
|
||||
if memInfo[0] != None and memInfo[1] != None and float(memInfo[1]) != 0:
|
||||
mem_use_pct = '% 3.0f%%' % (100 * (float(memInfo[0]) / float(memInfo[1])))
|
||||
if vram_used is None:
|
||||
mem_use_pct='Unsupported'
|
||||
if vram_used != None and vram_total != None and float(vram_total) != 0:
|
||||
mem_use_pct = '% 3.0f%%' % (100 * (float(vram_used) / float(vram_total)))
|
||||
values['card%s' % (str(device))] = [device, temp, avgPwr, sclk, mclk, fan, str(perf).lower(), pwrCap,
|
||||
mem_use_pct, gpu_busy]
|
||||
val_widths = {}
|
||||
@@ -1650,7 +1659,7 @@ def showClocks(deviceList):
|
||||
freq_list = []
|
||||
if rocmsmi.rsmi_dev_gpu_clk_freq_get(device, rsmi_clk_names_dict[clk_type], None) == 1:
|
||||
ret = rocmsmi.rsmi_dev_gpu_clk_freq_get(device, rsmi_clk_names_dict[clk_type], byref(freq))
|
||||
if rsmi_ret_ok(ret, device, clk_type, True):
|
||||
if rsmi_ret_ok(ret, device, 'get_clk_freq_' + clk_type, True):
|
||||
printLog(device, 'Supported %s frequencies on GPU%s' % (clk_type, str(device)), None)
|
||||
for x in range(freq.num_supported):
|
||||
fr = '{:>.0f}Mhz'.format(freq.frequency[x] / 1000000)
|
||||
@@ -1664,7 +1673,7 @@ def showClocks(deviceList):
|
||||
printLog(device, '', None)
|
||||
if rocmsmi.rsmi_dev_pci_bandwidth_get(device, None) == 1:
|
||||
ret = rocmsmi.rsmi_dev_pci_bandwidth_get(device, byref(bw))
|
||||
if rsmi_ret_ok(ret, device, 'PCIe', True):
|
||||
if rsmi_ret_ok(ret, device, 'get_PCIe_bandwidth', True):
|
||||
printLog(device, 'Supported %s frequencies on GPU%s' % ('PCIe', str(device)), None)
|
||||
freq_list = []
|
||||
for x in range(bw.transfer_rate.num_supported):
|
||||
@@ -1698,7 +1707,7 @@ def showCurrentClocks(deviceList, clk_defined=None, concise=False):
|
||||
if clk_defined:
|
||||
if rocmsmi.rsmi_dev_gpu_clk_freq_get(device, rsmi_clk_names_dict[clk_defined], None) == 1:
|
||||
ret = rocmsmi.rsmi_dev_gpu_clk_freq_get(device, rsmi_clk_names_dict[clk_defined], byref(freq))
|
||||
if rsmi_ret_ok(ret, device, clk_defined, True):
|
||||
if rsmi_ret_ok(ret, device, 'get_gpu_clk_freq_' + str(clk_defined), silent=True):
|
||||
levl = freq.current
|
||||
if levl >= freq.num_supported:
|
||||
printLog(device, '%s current clock frequency not found' % (clk_defined), None)
|
||||
@@ -1714,7 +1723,7 @@ def showCurrentClocks(deviceList, clk_defined=None, concise=False):
|
||||
for clk_type in sorted(rsmi_clk_names_dict):
|
||||
if rocmsmi.rsmi_dev_gpu_clk_freq_get(device, rsmi_clk_names_dict[clk_type], None) == 1:
|
||||
ret = rocmsmi.rsmi_dev_gpu_clk_freq_get(device, rsmi_clk_names_dict[clk_type], byref(freq))
|
||||
if rsmi_ret_ok(ret, device, clk_type, True):
|
||||
if rsmi_ret_ok(ret, device, 'get_clk_freq_' + str(clk_type), True):
|
||||
levl = freq.current
|
||||
if levl >= freq.num_supported:
|
||||
printLog(device, '%s current clock frequency not found' % (clk_type), None)
|
||||
@@ -1730,7 +1739,7 @@ def showCurrentClocks(deviceList, clk_defined=None, concise=False):
|
||||
# pcie clocks
|
||||
if rocmsmi.rsmi_dev_pci_bandwidth_get(device, None) == 1:
|
||||
ret = rocmsmi.rsmi_dev_pci_bandwidth_get(device, byref(bw))
|
||||
if rsmi_ret_ok(ret, device, 'PCIe', True):
|
||||
if rsmi_ret_ok(ret, device, 'get_PCIe_bandwidth', True):
|
||||
current_f = bw.transfer_rate.current
|
||||
if current_f >= bw.transfer_rate.num_supported:
|
||||
printLog(device, 'PCIe current clock frequency not found', None )
|
||||
@@ -1768,7 +1777,7 @@ def showCurrentFans(deviceList):
|
||||
else:
|
||||
printLog(device, 'Fan Level', str(fanLevel) + ' (%s%%)' % (str(fanSpeed)))
|
||||
ret = rocmsmi.rsmi_dev_fan_rpms_get(device, sensor_ind, byref(rpmSpeed))
|
||||
if rsmi_ret_ok(ret, device):
|
||||
if rsmi_ret_ok(ret, device, 'get_fan_rpms'):
|
||||
printLog(device, 'Fan RPM', rpmSpeed.value)
|
||||
printLogSpacer()
|
||||
|
||||
@@ -1809,7 +1818,7 @@ def showFwInfo(deviceList, fwType):
|
||||
for fw_name in firmware_blocks:
|
||||
fw_name = fw_name.upper()
|
||||
ret = rocmsmi.rsmi_dev_firmware_version_get(device, fw_block_names_l.index(fw_name), byref(fw_ver))
|
||||
if rsmi_ret_ok(ret, device, fw_name):
|
||||
if rsmi_ret_ok(ret, device, 'get_firmware_version_' + str(fw_name)):
|
||||
# The VCN, VCE, UVD, SOS and ASD firmware's value needs to be in hexadecimal
|
||||
if fw_name in ['VCN', 'VCE', 'UVD', 'SOS', 'ASD']:
|
||||
printLog(device, '%s firmware version' % (fw_name),
|
||||
@@ -1853,12 +1862,12 @@ def showGpusByPid(pidList):
|
||||
return
|
||||
for pid in pidList:
|
||||
ret = rocmsmi.rsmi_compute_process_gpus_get(int(pid), None, byref(num_devices))
|
||||
if rsmi_ret_ok(ret, 'PID ' + pid):
|
||||
if rsmi_ret_ok(ret, metric=('PID ' + pid)):
|
||||
|
||||
dv_indices = (c_uint32 * num_devices.value)()
|
||||
ret = rocmsmi.rsmi_compute_process_gpus_get(int(pid), dv_indices, byref(num_devices))
|
||||
|
||||
if rsmi_ret_ok(ret):
|
||||
if rsmi_ret_ok(ret, metric='get_gpu_compute_process'):
|
||||
metricName = 'PID %s is using %s DRM device(s)' % (pid, str(num_devices.value))
|
||||
if (num_devices.value):
|
||||
printListLog(metricName, list(dv_indices))
|
||||
@@ -1900,7 +1909,7 @@ def getCoarseGrainUtil(device, typeName=None):
|
||||
utilization_counters[i].type = c_int(i)
|
||||
|
||||
ret = rocmsmi.rsmi_utilization_count_get(device, utilization_counters, length, byref(timestamp))
|
||||
if rsmi_ret_ok(ret, device, typeName, True):
|
||||
if rsmi_ret_ok(ret, device, 'get_utilization_count_'+ str(typeName), True):
|
||||
return utilization_counters
|
||||
return -1
|
||||
|
||||
@@ -2026,7 +2035,7 @@ def showMemVendor(deviceList):
|
||||
printLogSpacer(' Memory Vendor ')
|
||||
for device in deviceList:
|
||||
ret = rocmsmi.rsmi_dev_vram_vendor_get(device, vendor, 256)
|
||||
if rsmi_ret_ok(ret, device) and vendor.value.decode():
|
||||
if rsmi_ret_ok(ret, device, 'get_vram_vendor') and vendor.value.decode():
|
||||
printLog(device, 'GPU memory vendor', vendor.value.decode())
|
||||
else:
|
||||
logging.debug('GPU memory vendor missing or not supported')
|
||||
@@ -2046,13 +2055,13 @@ def showOverDrive(deviceList, odtype):
|
||||
odStr = 'GPU'
|
||||
ret = rocmsmi.rsmi_dev_overdrive_level_get(device, byref(rsmi_od))
|
||||
od = rsmi_od.value
|
||||
if not rsmi_ret_ok(ret, device):
|
||||
if not rsmi_ret_ok(ret, device, 'get_overdrive_level_' + str(odtype)):
|
||||
continue
|
||||
elif odtype == 'mclk':
|
||||
odStr = 'GPU Memory'
|
||||
ret = rocmsmi.rsmi_dev_mem_overdrive_level_get(device, byref(rsmi_od))
|
||||
od = rsmi_od.value
|
||||
if not rsmi_ret_ok(ret, device):
|
||||
if not rsmi_ret_ok(ret, device, 'get_mem_overdrive_level_' + str(odtype)):
|
||||
continue
|
||||
else:
|
||||
printErrLog(device, 'Unable to retrieve OverDrive')
|
||||
@@ -2073,7 +2082,7 @@ def showPcieBw(deviceList):
|
||||
printLogSpacer(' Measured PCIe Bandwidth ')
|
||||
for device in deviceList:
|
||||
ret = rocmsmi.rsmi_dev_pci_throughput_get(device, byref(sent), byref(received), byref(max_pkt_sz))
|
||||
if rsmi_ret_ok(ret, device):
|
||||
if rsmi_ret_ok(ret, device, 'get_PCIe_bandwidth'):
|
||||
# Use 1024.0 to ensure that the result is a float and not integer division
|
||||
bw = ((received.value + sent.value) * max_pkt_sz.value) / 1024.0 / 1024.0
|
||||
# Use the bwstr below to control precision on the string
|
||||
@@ -2131,15 +2140,15 @@ def showPids():
|
||||
sdmaUsage = 'UNKNOWN'
|
||||
cuOccupancy = 'UNKNOWN'
|
||||
ret = rocmsmi.rsmi_compute_process_gpus_get(int(pid), None, byref(num_devices))
|
||||
if rsmi_ret_ok(ret):
|
||||
if rsmi_ret_ok(ret, metric='get_gpu_compute_process'):
|
||||
dv_indices = (c_uint32 * num_devices.value)()
|
||||
ret = rocmsmi.rsmi_compute_process_gpus_get(int(pid), dv_indices, byref(num_devices))
|
||||
if rsmi_ret_ok(ret):
|
||||
if rsmi_ret_ok(ret, metric='get_gpu_compute_process'):
|
||||
gpuNumber = str(num_devices.value)
|
||||
else:
|
||||
logging.debug('Unable to fetch GPU number by PID')
|
||||
ret = rocmsmi.rsmi_compute_process_info_by_pid_get(int(pid), byref(proc))
|
||||
if rsmi_ret_ok(ret):
|
||||
if rsmi_ret_ok(ret, metric='get_compute_process_info_by_pid'):
|
||||
vramUsage = proc.vram_usage
|
||||
sdmaUsage = proc.sdma_usage
|
||||
cuOccupancy = proc.cu_occupancy
|
||||
@@ -2184,7 +2193,7 @@ def showPowerPlayTable(deviceList):
|
||||
odvf = rsmi_od_volt_freq_data_t()
|
||||
for device in deviceList:
|
||||
ret = rocmsmi.rsmi_dev_od_volt_info_get(device, byref(odvf))
|
||||
if rsmi_ret_ok(ret, device, 'od volt'):
|
||||
if rsmi_ret_ok(ret, device, 'get_od_volt'):
|
||||
# TODO: Make this more dynamic and less hard-coded if possible
|
||||
printLog(device, 'OD_SCLK:', None)
|
||||
printLog(device, '0: %sMhz' % (int(odvf.curr_sclk_range.lower_bound / 1000000)), None)
|
||||
@@ -2224,7 +2233,7 @@ def showProductName(deviceList):
|
||||
# Retrieve card vendor
|
||||
ret = rocmsmi.rsmi_dev_vendor_name_get(device, vendor, 256)
|
||||
# Only continue if GPU vendor is AMD
|
||||
if rsmi_ret_ok(ret, device) and isAmdDevice(device):
|
||||
if rsmi_ret_ok(ret, device, 'get_vendor_name') and isAmdDevice(device):
|
||||
try:
|
||||
device_vendor = vendor.value.decode()
|
||||
except UnicodeDecodeError:
|
||||
@@ -2232,7 +2241,7 @@ def showProductName(deviceList):
|
||||
device_vendor = "N/A"
|
||||
# Retrieve the device series
|
||||
ret = rocmsmi.rsmi_dev_name_get(device, series, 256)
|
||||
if rsmi_ret_ok(ret, device):
|
||||
if rsmi_ret_ok(ret, device, 'get_name'):
|
||||
try:
|
||||
device_series = series.value.decode()
|
||||
printLog(device, 'Card series', '\t\t' + device_series)
|
||||
@@ -2240,7 +2249,7 @@ def showProductName(deviceList):
|
||||
printErrLog(device, "Unable to read card series")
|
||||
# Retrieve the device model
|
||||
ret = rocmsmi.rsmi_dev_subsystem_name_get(device, model, 256)
|
||||
if rsmi_ret_ok(ret, device):
|
||||
if rsmi_ret_ok(ret, device, 'get_subsystem_name'):
|
||||
try:
|
||||
device_model = model.value.decode()
|
||||
# padHexValue is used for applications that expect 4-digit card models
|
||||
@@ -2254,7 +2263,7 @@ def showProductName(deviceList):
|
||||
# device_sku = sku.value.decode()
|
||||
# Retrieve the device SKU as a substring from VBIOS
|
||||
ret = rocmsmi.rsmi_dev_vbios_version_get(device, vbios, 256)
|
||||
if rsmi_ret_ok(ret, device) and vbios.value.decode():
|
||||
if rsmi_ret_ok(ret, device, 'get_vbios_version') and vbios.value.decode():
|
||||
# Device SKU is just the characters in between the two '-' in vbios_version
|
||||
if vbios.value.decode().count('-') == 2 and len(str(vbios.value.decode().split('-')[1])) > 1:
|
||||
device_sku = vbios.value.decode().split('-')[1]
|
||||
@@ -2282,7 +2291,7 @@ def showProfile(deviceList):
|
||||
status = rsmi_power_profile_status_t()
|
||||
for device in deviceList:
|
||||
ret = rocmsmi.rsmi_dev_power_profile_presets_get(device, 0, byref(status))
|
||||
if rsmi_ret_ok(ret, device, 'profiles', silent=False):
|
||||
if rsmi_ret_ok(ret, device, 'get_power_profiles', silent=False):
|
||||
binaryMaskString = str(format(status.available_profiles, '07b'))[::-1]
|
||||
bitMaskPosition = 0
|
||||
profileNumber = 0
|
||||
@@ -2314,7 +2323,7 @@ def showRange(deviceList, rangeType):
|
||||
odvf = rsmi_od_volt_freq_data_t()
|
||||
for device in deviceList:
|
||||
ret = rocmsmi.rsmi_dev_od_volt_info_get(device, byref(odvf))
|
||||
if rsmi_ret_ok(ret, device, 'od volt', silent=False):
|
||||
if rsmi_ret_ok(ret, device, 'get_od_volt', silent=False):
|
||||
if rangeType == 'sclk':
|
||||
printLog(device, 'Valid sclk range: %sMhz - %sMhz' % (
|
||||
int(odvf.curr_sclk_range.lower_bound / 1000000), int(odvf.curr_sclk_range.upper_bound / 1000000)), None)
|
||||
@@ -2362,7 +2371,7 @@ def showRasInfo(deviceList, rasType):
|
||||
for block in rasBlocks:
|
||||
row = []
|
||||
ret = rocmsmi.rsmi_dev_ecc_status_get(device, rsmi_gpu_block_d[block], byref(state))
|
||||
if rsmi_ret_ok(ret, device, block, True):
|
||||
if rsmi_ret_ok(ret, device, 'get_ecc_status_' + str(block), True):
|
||||
row.append(block)
|
||||
row.append(rsmi_ras_err_stale_machine[state.value].upper())
|
||||
# Now add the error count
|
||||
@@ -2424,7 +2433,7 @@ def showSerialNumber(deviceList):
|
||||
printErrLog(device, "FRU Serial Number contains non-alphanumeric characters. FRU is likely corrupted")
|
||||
continue
|
||||
|
||||
if rsmi_ret_ok(ret, device) and sn.value.decode():
|
||||
if rsmi_ret_ok(ret, device, 'get_serial_number') and sn.value.decode():
|
||||
printLog(device, 'Serial Number', sn.value.decode())
|
||||
else:
|
||||
printLog(device, 'Serial Number', 'N/A')
|
||||
@@ -2440,7 +2449,7 @@ def showUId(deviceList):
|
||||
for device in deviceList:
|
||||
dv_uid = c_uint64()
|
||||
ret = rocmsmi.rsmi_dev_unique_id_get(device, byref(dv_uid))
|
||||
if rsmi_ret_ok(ret, device, None, True) and str(hex(dv_uid.value)):
|
||||
if rsmi_ret_ok(ret, device, 'get_unique_id', True) and str(hex(dv_uid.value)):
|
||||
printLog(device, 'Unique ID', hex(dv_uid.value))
|
||||
else:
|
||||
printLog(device, 'Unique ID', 'N/A')
|
||||
@@ -2510,7 +2519,7 @@ def showEvents(deviceList, eventTypes):
|
||||
if user_input == 'q' or user_input == '\x03':
|
||||
for device in deviceList:
|
||||
ret = rocmsmi.rsmi_event_notification_stop(device)
|
||||
if not rsmi_ret_ok(ret, device):
|
||||
if not rsmi_ret_ok(ret, device, 'stop_event_notification'):
|
||||
printErrLog(device, 'Unable to end event notifications.')
|
||||
print('\r')
|
||||
break
|
||||
@@ -2633,7 +2642,7 @@ def showVoltage(deviceList):
|
||||
met = rsmi_voltage_metric_t(0)
|
||||
voltage = c_uint64()
|
||||
ret = rocmsmi.rsmi_dev_volt_metric_get(device, vtype, met, byref(voltage))
|
||||
if rsmi_ret_ok(ret, device) and str(voltage.value):
|
||||
if rsmi_ret_ok(ret, device, 'get_volt_metric') and str(voltage.value):
|
||||
printLog(device, 'Voltage (mV)', str(voltage.value))
|
||||
else:
|
||||
logging.debug('GPU voltage not supported')
|
||||
@@ -2649,7 +2658,7 @@ def showVoltageCurve(deviceList):
|
||||
odvf = rsmi_od_volt_freq_data_t()
|
||||
for device in deviceList:
|
||||
ret = rocmsmi.rsmi_dev_od_volt_info_get(device, byref(odvf))
|
||||
if rsmi_ret_ok(ret, device, 'od volt', silent=False):
|
||||
if rsmi_ret_ok(ret, device, 'get_od_volt_info', silent=False):
|
||||
for position in range(3):
|
||||
printLog(device, 'Voltage point %d: %sMhz %smV' % (
|
||||
position, int(list(odvf.curve.vc_points)[position].frequency / 1000000),
|
||||
@@ -2704,7 +2713,7 @@ def showAccessibleTopology(deviceList):
|
||||
for srcdevice in deviceList:
|
||||
for destdevice in deviceList:
|
||||
ret = rocmsmi.rsmi_is_P2P_accessible(srcdevice, destdevice, byref(accessible))
|
||||
if rsmi_ret_ok(ret):
|
||||
if rsmi_ret_ok(ret, metric='is_P2P_accessible'):
|
||||
gpu_links_type[srcdevice][destdevice] = accessible.value
|
||||
else:
|
||||
printErrLog(srcdevice, 'Cannot read link accessibility: Unsupported on this machine')
|
||||
@@ -2743,7 +2752,7 @@ def showWeightTopology(deviceList):
|
||||
continue
|
||||
weight = c_uint64()
|
||||
ret = rocmsmi.rsmi_topo_get_link_weight(srcdevice, destdevice, byref(weight))
|
||||
if rsmi_ret_ok(ret):
|
||||
if rsmi_ret_ok(ret, metric='get_link_weight_topology'):
|
||||
gpu_links_weight[srcdevice][destdevice] = weight
|
||||
else:
|
||||
printErrLog(srcdevice, 'Cannot read Link Weight: Not supported on this machine')
|
||||
@@ -2790,7 +2799,7 @@ def showHopsTopology(deviceList):
|
||||
continue
|
||||
hops = c_uint64()
|
||||
ret = rocmsmi.rsmi_topo_get_link_type(srcdevice, destdevice, byref(hops), byref(linktype))
|
||||
if rsmi_ret_ok(ret):
|
||||
if rsmi_ret_ok(ret, metric='get_link_type_topology'):
|
||||
gpu_links_hops[srcdevice][destdevice] = hops
|
||||
else:
|
||||
printErrLog(srcdevice, 'Cannot read Link Hops: Not supported on this machine')
|
||||
@@ -2836,7 +2845,7 @@ def showTypeTopology(deviceList):
|
||||
gpu_links_type[srcdevice][destdevice] = '0'
|
||||
continue
|
||||
ret = rocmsmi.rsmi_topo_get_link_type(srcdevice, destdevice, byref(hops), byref(linktype))
|
||||
if rsmi_ret_ok(ret):
|
||||
if rsmi_ret_ok(ret, metric='get_link_topology_type'):
|
||||
if (linktype.value == 1):
|
||||
gpu_links_type[srcdevice][destdevice] = "PCIE"
|
||||
elif (linktype.value == 2):
|
||||
@@ -2878,13 +2887,13 @@ def showNumaTopology(deviceList):
|
||||
numa_numbers = c_uint32()
|
||||
for device in deviceList:
|
||||
ret = rocmsmi.rsmi_topo_get_numa_node_number(device, byref(numa_numbers))
|
||||
if rsmi_ret_ok(ret, device):
|
||||
if rsmi_ret_ok(ret, device, 'get_numa_node_number'):
|
||||
printLog(device, "(Topology) Numa Node", numa_numbers.value)
|
||||
else:
|
||||
printErrLog(device, "Cannot read Numa Node")
|
||||
|
||||
ret = rocmsmi.rsmi_topo_numa_affinity_get(device, byref(numa_numbers))
|
||||
if rsmi_ret_ok(ret):
|
||||
if rsmi_ret_ok(ret, metric='get_numa_affinity_topology'):
|
||||
printLog(device, "(Topology) Numa Affinity", numa_numbers.value)
|
||||
else:
|
||||
printErrLog(device, 'Cannot read Numa Affinity')
|
||||
@@ -2927,13 +2936,13 @@ def showNodesBw(deviceList):
|
||||
ret = rocmsmi.rsmi_minmax_bandwidth_get(srcdevice, destdevice, byref(minBW), byref(maxBW))
|
||||
#verify that link type is xgmi
|
||||
ret2 = rocmsmi.rsmi_topo_get_link_type(srcdevice, destdevice, byref(hops), byref(linktype))
|
||||
if rsmi_ret_ok(ret2," {} to {}".format(srcdevice, destdevice), None, True):
|
||||
if rsmi_ret_ok(ret2," {} to {}".format(srcdevice, destdevice), 'get_link_topology_type', True):
|
||||
if linktype.value != 2:
|
||||
nonXgmi = True
|
||||
silent= True
|
||||
gpu_links_type[srcdevice][destdevice] = "N/A"
|
||||
|
||||
if rsmi_ret_ok(ret, " {} to {}".format(srcdevice, destdevice),None,silent):
|
||||
if rsmi_ret_ok(ret, " {} to {}".format(srcdevice, destdevice), 'get_link_topology_type',silent):
|
||||
gpu_links_type[srcdevice][destdevice] = "{}-{}".format(minBW.value, maxBW.value)
|
||||
else:
|
||||
gpu_links_type[srcdevice][destdevice] = "N/A"
|
||||
@@ -2965,12 +2974,12 @@ def showComputePartition(deviceList):
|
||||
printLogSpacer(' Current Compute Partition ')
|
||||
for device in deviceList:
|
||||
ret = rocmsmi.rsmi_dev_compute_partition_get(device, currentComputePartition, 256)
|
||||
if rsmi_ret_ok(ret, device, silent=True) and currentComputePartition.value.decode():
|
||||
if rsmi_ret_ok(ret, device, 'get_compute_partition', silent=True) and currentComputePartition.value.decode():
|
||||
printLog(device, 'Compute Partition', currentComputePartition.value.decode())
|
||||
elif ret == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED:
|
||||
printLog(device, 'Not supported on the given system', None)
|
||||
else:
|
||||
rsmi_ret_ok(ret, device)
|
||||
rsmi_ret_ok(ret, device, 'get_compute_partition')
|
||||
printErrLog(device, 'Failed to retrieve compute partition, even though device supports it.')
|
||||
printLogSpacer()
|
||||
|
||||
@@ -2983,12 +2992,12 @@ def showNPSMode(deviceList):
|
||||
printLogSpacer(' Current NPS Mode ')
|
||||
for device in deviceList:
|
||||
ret = rocmsmi.rsmi_dev_nps_mode_get(device, npsMode, 256)
|
||||
if rsmi_ret_ok(ret, device, silent=True) and npsMode.value.decode():
|
||||
if rsmi_ret_ok(ret, device, 'get_NPS_mode',silent=True) and npsMode.value.decode():
|
||||
printLog(device, 'NPS Mode', npsMode.value.decode())
|
||||
elif ret == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED:
|
||||
printLog(device, 'Not supported on the given system', None)
|
||||
else:
|
||||
rsmi_ret_ok(ret, device)
|
||||
rsmi_ret_ok(ret, device, 'get_NPS_mode')
|
||||
printErrLog(device, 'Failed to retrieve NPS mode, even though device supports it.')
|
||||
printLogSpacer()
|
||||
|
||||
@@ -3088,7 +3097,7 @@ def listDevices():
|
||||
""" Returns a list of GPU devices """
|
||||
numberOfDevices = c_uint32(0)
|
||||
ret = rocmsmi.rsmi_num_monitor_devices(byref(numberOfDevices))
|
||||
if rsmi_ret_ok(ret):
|
||||
if rsmi_ret_ok(ret, metric='get_num_monitor_devices'):
|
||||
deviceList = list(range(numberOfDevices.value))
|
||||
return deviceList
|
||||
else:
|
||||
@@ -3178,6 +3187,8 @@ def rsmi_ret_ok(my_ret, device=None, metric=None, silent=False):
|
||||
@param device: DRM device identifier
|
||||
@param my_ret: Return of RSMI call (rocm_smi_lib API)
|
||||
@param metric: Parameter of GPU currently being analyzed
|
||||
@param silent: Echo verbose error reponse.
|
||||
True siliences err output, False does not silience err output (default).
|
||||
"""
|
||||
global RETCODE
|
||||
global PRINT_JSON
|
||||
@@ -3194,7 +3205,7 @@ def rsmi_ret_ok(my_ret, device=None, metric=None, silent=False):
|
||||
logging.debug('%s', returnString)
|
||||
if not silent:
|
||||
if my_ret in rsmi_status_verbose_err_out:
|
||||
printLog(device, rsmi_status_verbose_err_out[my_ret], None)
|
||||
printLog(device, metric + ", " + rsmi_status_verbose_err_out[my_ret], None)
|
||||
RETCODE = my_ret
|
||||
return False
|
||||
return True
|
||||
@@ -3226,14 +3237,14 @@ def save(deviceList, savefilepath):
|
||||
for clk_type in sorted(rsmi_clk_names_dict):
|
||||
clocks[device] = clocks.get(device, {})
|
||||
ret = rocmsmi.rsmi_dev_gpu_clk_freq_get(device, rsmi_clk_names_dict[clk_type], byref(freq))
|
||||
if rsmi_ret_ok(ret, device, clk_type, True):
|
||||
if rsmi_ret_ok(ret, device, 'get_gpu_clk_freq_' + str(clk_type), True):
|
||||
clocks[device][clk_type] = str(freq.current)
|
||||
else:
|
||||
clocks[device][clk_type] = '0'
|
||||
fanSpeeds[device] = getFanSpeed(device)[0]
|
||||
od = c_uint32()
|
||||
ret = rocmsmi.rsmi_dev_overdrive_level_get(device, byref(od))
|
||||
if rsmi_ret_ok(ret, device):
|
||||
if rsmi_ret_ok(ret, device, 'get_overdrive_level'):
|
||||
overDriveGpu[device] = str(od.value)
|
||||
else:
|
||||
overDriveGpu[device] = '0'
|
||||
@@ -3241,7 +3252,7 @@ def save(deviceList, savefilepath):
|
||||
overDriveGpuMem[device] = '0'
|
||||
status = rsmi_power_profile_status_t()
|
||||
ret = rocmsmi.rsmi_dev_power_profile_presets_get(device, 0, byref(status))
|
||||
if rsmi_ret_ok(ret, device, 'profile'):
|
||||
if rsmi_ret_ok(ret, device, 'get_profile_presets'):
|
||||
profiles[device] = str(str(bin(status.current))[2:][::-1].index('1') + 1)
|
||||
else:
|
||||
profiles[device] = str('UNKNOWN')
|
||||
|
||||
@@ -520,10 +520,12 @@ void RocmSMI::printEnvVarInfo(void) {
|
||||
}
|
||||
for (auto it=env_vars_.enum_overrides.begin();
|
||||
it != env_vars_.enum_overrides.end(); ++it) {
|
||||
std::cout << *it;
|
||||
DevInfoTypes type = static_cast<DevInfoTypes>(*it);
|
||||
std::cout << (std::to_string(*it) + " (" + devInfoTypesStrings.at(type)
|
||||
+ ")");
|
||||
auto temp_it = it;
|
||||
if(++temp_it != env_vars_.enum_overrides.end()) {
|
||||
std::cout << ",";
|
||||
std::cout << ", ";
|
||||
}
|
||||
}
|
||||
std::cout << "}" << std::endl;
|
||||
|
||||
Ссылка в новой задаче
Block a user