[SWDEV-392571] Fix concise info when missing VRAM info

Updates:
    * [rocm-smi] Added larger app width size, which helps
      display missing device info
    * [rocm-smi] Added better context when rsmi_ret_ok
      does not return with RSMI_STATUS_SUCCESS
    * [rocm-smi] Removed all references to an
      undefined function (printLogNoDev())
    * [rocm-smi] Fixed non-integer values not being
      detected when setting the voltage curve
    * [rocm-smi] Added better context on missing
      sysfs file when setting clock overdrive
      values
    * [rocm-smi] Fixed getMemInfo() calls not
      referencing tuple values (making it easier
      to read)
    * [rocm-smi] Silenced concise info spitting
      out errors for missing VRAM files; instead,
      it displays which metric is "unsupported" if
      the files are missing
    * [rocm-smi] Updated function descriptions for
      rsmi_ret_ok & getMemInfo
    * [rocm-smi] Updated getMemInfo to accept a
      quiet option that silences error output for
      concise info calls. This provides a way to
      keep the output clean.
    * [rocm-smi-lib] When using debug sysfs
      files, now states which enums are enabled
      for debug

Change-Id: I0e9e0c97ccf71467ced0e1a1f71803327a8be2b7
Signed-off-by: Charis Poag <Charis.Poag@amd.com>


[ROCm/amdsmi commit: 6be92b9e26]
Этот коммит содержится в:
Charis Poag
2023-04-13 10:43:52 -05:00
родитель 759d14709d
Коммит fc18ccd37a
2 изменённых файлов: 130 добавлений и 117 удалений
+126 -115
Просмотреть файл
@@ -47,7 +47,7 @@ headerString = ' ROCm System Management Interface '
footerString = ' End of ROCm SMI Log '
# Output formatting
appWidth = 80
appWidth = 84
deviceList = []
# Enable or disable serialized format
@@ -197,7 +197,7 @@ def getBus(device):
function = bdfid.value & 0x7
pic_id = '{:04X}:{:02X}:{:02X}.{:0X}'.format(domain, bus, device, function)
if rsmi_ret_ok(ret, device):
if rsmi_ret_ok(ret, device, 'get_pci_id'):
return pic_id
@@ -215,10 +215,10 @@ def getFanSpeed(device):
fm = 0
ret = rocmsmi.rsmi_dev_fan_speed_get(device, sensor_ind, byref(fanLevel))
if rsmi_ret_ok(ret, device, None, True):
if rsmi_ret_ok(ret, device, 'get_fan_speed', True):
fl = fanLevel.value
ret = rocmsmi.rsmi_dev_fan_speed_max_get(device, sensor_ind, byref(fanMax))
if rsmi_ret_ok(ret, device, None, True):
if rsmi_ret_ok(ret, device, 'get_fan_max_speed', True):
fm = fanMax.value
if fl == 0 or fm == 0:
return (fl, 0) # to prevent division by zero crash
@@ -245,7 +245,7 @@ def getId(device):
"""
dv_id = c_short()
ret = rocmsmi.rsmi_dev_id_get(device, byref(dv_id))
if rsmi_ret_ok(ret, device):
if rsmi_ret_ok(ret, device, 'get_device_id'):
return hex(dv_id.value)
@@ -256,16 +256,21 @@ def getMaxPower(device):
"""
power_cap = c_uint64()
ret = rocmsmi.rsmi_dev_power_cap_get(device, 0, byref(power_cap))
if rsmi_ret_ok(ret, device):
if rsmi_ret_ok(ret, device, 'get_power_cap'):
return power_cap.value / 1000000
return -1
def getMemInfo(device, memType):
""" Return the specified memory usage for the specified device
def getMemInfo(device, memType, quiet=False):
""" Returns a tuple of (memory_used, memory_total) of
the requested memory type usage for the device specified
@param device: DRM device identifier
@param type: [vram|vis_vram|gtt] Memory type to return
@param quiet: Turn on to silence error output
(when you plan to handle errors manually). Default is off,
which exposes any issue accessing the different
memory types.
"""
memType = memType.upper()
if memType not in memory_type_l:
@@ -278,11 +283,11 @@ def getMemInfo(device, memType):
memTotal = None
ret = rocmsmi.rsmi_dev_memory_usage_get(device, memory_type_l.index(memType), byref(memoryUse))
if rsmi_ret_ok(ret, device, memType):
if rsmi_ret_ok(ret, device, 'get_memory_usage_' + str(memType), quiet):
memUsed = memoryUse.value
ret = rocmsmi.rsmi_dev_memory_total_get(device, memory_type_l.index(memType), byref(memoryTot))
if rsmi_ret_ok(ret, device, memType + ' total'):
if rsmi_ret_ok(ret, device, 'get_memory_total_' + str(memType), quiet):
memTotal = memoryTot.value
return (memUsed, memTotal)
@@ -319,7 +324,7 @@ def getPerfLevel(device):
"""
perf = rsmi_dev_perf_level_t()
ret = rocmsmi.rsmi_dev_perf_level_get(device, byref(perf))
if rsmi_ret_ok(ret, device):
if rsmi_ret_ok(ret, device, 'get_perf_level'):
return perf_level_string(perf.value)
return -1
@@ -336,7 +341,7 @@ def getPidList():
""" Return a list of KFD process IDs """
num_items = c_uint32()
ret = rocmsmi.rsmi_compute_process_info_get(None, byref(num_items))
if rsmi_ret_ok(ret):
if rsmi_ret_ok(ret, metric='get_compute_process_info'):
buff_sz = num_items.value + 10
procs = (rsmi_process_info_t * buff_sz)()
procList = []
@@ -354,7 +359,7 @@ def getPower(device):
"""
power = c_uint32()
ret = rocmsmi.rsmi_dev_power_ave_get(device, 0, byref(power))
if rsmi_ret_ok(ret, device, 'power'):
if rsmi_ret_ok(ret, device, 'get_power_avg'):
return power.value / 1000000
return 'N/A'
@@ -368,7 +373,7 @@ def getRasEnablement(device, block):
state = rsmi_ras_err_state_t()
ret = rocmsmi.rsmi_dev_ecc_status_get(device, rsmi_gpu_block_d[block], byref(state))
if rsmi_ret_ok(ret, device, block, True):
if rsmi_ret_ok(ret, device, 'get_ecc_status_' + str(block), True):
return rsmi_ras_err_stale_machine[state.value].upper()
return 'N/A'
@@ -382,7 +387,7 @@ def getTemp(device, sensor):
temp = c_int64(0)
metric = rsmi_temperature_metric_t.RSMI_TEMP_CURRENT
ret = rocmsmi.rsmi_dev_temp_metric_get(c_uint32(device), temp_type_lst.index(sensor), metric, byref(temp))
if rsmi_ret_ok(ret, device, sensor, True):
if rsmi_ret_ok(ret, device, 'get_temp_metric' + str(sensor), True):
return temp.value / 1000
return 'N/A'
@@ -406,7 +411,7 @@ def getVersion(deviceList, component):
"""
ver_str = create_string_buffer(256)
ret = rocmsmi.rsmi_version_str_get(component, ver_str, 256)
if rsmi_ret_ok(ret, None, component):
if rsmi_ret_ok(ret, None, 'get_version_str_' + str(component)):
return ver_str.value.decode()
return None
@@ -418,7 +423,7 @@ def getComputePartition(device):
"""
currentComputePartition = create_string_buffer(256)
ret = rocmsmi.rsmi_dev_compute_partition_get(device, currentComputePartition, 256)
if rsmi_ret_ok(ret, device, silent=True) and currentComputePartition.value.decode():
if rsmi_ret_ok(ret, device, 'get_compute_partition', silent=True) and currentComputePartition.value.decode():
return str(currentComputePartition.value.decode())
return "UNKNOWN"
@@ -430,7 +435,7 @@ def getMemoryPartition(device):
"""
currentNPSMode = create_string_buffer(256)
ret = rocmsmi.rsmi_dev_nps_mode_get(device, currentNPSMode, 256)
if rsmi_ret_ok(ret, device, silent=True) and currentNPSMode.value.decode():
if rsmi_ret_ok(ret, device, 'get_NPS_mode', silent=True) and currentNPSMode.value.decode():
return str(currentNPSMode.value.decode())
return "UNKNOWN"
@@ -518,13 +523,13 @@ def printEventList(device, delay, eventList):
"""
mask = 0
ret = rocmsmi.rsmi_event_notification_init(device)
if not rsmi_ret_ok(ret, device):
if not rsmi_ret_ok(ret, device, 'event_notification_init'):
printErrLog(device, 'Unable to initialize event notifications.')
return
for eventType in eventList:
mask |= 2 ** notification_type_names.index(eventType.upper())
ret = rocmsmi.rsmi_event_notification_mask_set(device, mask)
if not rsmi_ret_ok(ret, device):
if not rsmi_ret_ok(ret, device, 'set_event_notification_mask'):
printErrLog(device, 'Unable to set event notification mask.')
return
while 1: # Exit condition from user keyboard input of 'q' or 'ctrl + c'
@@ -555,7 +560,7 @@ def printLog(device, metricName, value, extraSpace=False):
else:
logstr = 'GPU[%s]\t\t: %s' % (device, metricName)
if device is None:
logstr = logstr[13:]
logstr = logstr.split(':')[1][1:]
# Force thread safe printing
lock = multiprocessing.Lock()
lock.acquire()
@@ -687,13 +692,13 @@ def checkIfSecondaryDie(device):
power_cap = c_uint64()
# secondary die can currently be determined by checking if all power1_* (power cap) values are equal to zero.
ret = rocmsmi.rsmi_dev_power_cap_get(device, 0, byref(power_cap))
if not (rsmi_ret_ok(ret, None, None, False) and power_cap.value == 0):
if not (rsmi_ret_ok(ret, None, 'get_power_cap', False) and power_cap.value == 0):
return False
ret = rocmsmi.rsmi_dev_power_cap_default_get(device, byref(power_cap))
if not (rsmi_ret_ok(ret, None, None, False) and power_cap.value == 0):
if not (rsmi_ret_ok(ret, None, 'get_power_cap_default', False) and power_cap.value == 0):
return False
ret = rocmsmi.rsmi_dev_power_ave_get(device, 0, byref(power_cap))
if not (rsmi_ret_ok(ret, None, None, False) and power_cap.value == 0):
if not (rsmi_ret_ok(ret, None, 'get_power_avg', False) and power_cap.value == 0):
return False
return True
@@ -709,17 +714,17 @@ def resetClocks(deviceList):
printLogSpacer(' Reset Clocks ')
for device in deviceList:
ret = rocmsmi.rsmi_dev_overdrive_level_set(device, rsmi_dev_perf_level_t(0))
if rsmi_ret_ok(ret, device):
if rsmi_ret_ok(ret, device, 'set_overdrive_level'):
printLog(device, 'OverDrive set to 0', None)
else:
printLog(device, 'Unable to reset OverDrive', None)
ret = rocmsmi.rsmi_dev_perf_level_set(device, rsmi_dev_perf_level_t(0))
if rsmi_ret_ok(ret, device):
if rsmi_ret_ok(ret, device, 'set_perf_level'):
printLog(device, 'Successfully reset clocks', None)
else:
printLog(device, 'Unable to reset clocks', None)
ret = rocmsmi.rsmi_dev_perf_level_set(device, rsmi_dev_perf_level_t(0))
if rsmi_ret_ok(ret, device):
if rsmi_ret_ok(ret, device, 'set_perf_level'):
printLog(device, 'Performance level reset to auto', None)
else:
printLog(device, 'Unable to reset performance level to auto', None)
@@ -734,7 +739,7 @@ def resetFans(deviceList):
for device in deviceList:
sensor_ind = c_uint32(0)
ret = rocmsmi.rsmi_dev_fan_reset(device, sensor_ind)
if rsmi_ret_ok(ret, device):
if rsmi_ret_ok(ret, device, 'reset_fan'):
printLog(device, 'Successfully reset fan speed to driver control', None)
printLogSpacer()
@@ -755,12 +760,12 @@ def resetProfile(deviceList):
printLogSpacer(' Reset Profile ')
for device in deviceList:
ret = rocmsmi.rsmi_dev_power_profile_set(device, 0, profileString('BOOTUP DEFAULT'))
if rsmi_ret_ok(ret, device):
if rsmi_ret_ok(ret, device, 'set_power_profile'):
printLog(device, 'Successfully reset Power Profile', None)
else:
printErrLog(device, 'Unable to reset Power Profile')
ret = rocmsmi.rsmi_dev_perf_level_set(device, rsmi_dev_perf_level_t(0))
if rsmi_ret_ok(ret, device):
if rsmi_ret_ok(ret, device, 'set_perf_level'):
printLog(device, 'Successfully reset Performance Level', None)
else:
printErrLog(device, 'Unable to reset Performance Level')
@@ -806,7 +811,7 @@ def resetComputePartition(deviceList):
for device in deviceList:
originalPartition = getComputePartition(device)
ret = rocmsmi.rsmi_dev_compute_partition_reset(device)
if rsmi_ret_ok(ret, device, silent=True):
if rsmi_ret_ok(ret, device, 'reset_compute_partition', silent=True):
resetBootState = getComputePartition(device)
printLog(device, "Successfully reset compute partition (" +
originalPartition + ") to boot state (" + resetBootState +
@@ -816,7 +821,7 @@ def resetComputePartition(deviceList):
elif ret == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED:
printLog(device, 'Not supported on the given system', None)
else:
rsmi_ret_ok(ret, device)
rsmi_ret_ok(ret, device, 'reset_compute_partition')
printErrLog(device, 'Failed to reset the compute partition to boot state')
printLogSpacer()
@@ -842,7 +847,7 @@ def resetNpsMode(deviceList):
t1.join()
if duration < float(0.1): # For longer runs, add extra line before output
addExtraLine=False # This is to prevent overriding progress bar
if rsmi_ret_ok(ret, device, silent=True):
if rsmi_ret_ok(ret, device, 'reset_NPS_mode', silent=True):
resetBootState = getMemoryPartition(device)
printLog(device, "Successfully reset nps mode (" +
originalPartition + ") to boot state (" +
@@ -852,7 +857,7 @@ def resetNpsMode(deviceList):
elif ret == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED:
printLog(device, 'Not supported on the given system', None, addExtraLine)
else:
rsmi_ret_ok(ret, device)
rsmi_ret_ok(ret, device, 'reset_NPS_mode')
printErrLog(device, 'Failed to reset nps mode to boot state')
printLogSpacer()
@@ -906,16 +911,16 @@ def setVoltageCurve(deviceList, point, clk, volt, autoRespond):
global RETCODE
value = '%s %s %s' % (point, clk, volt)
try:
any(int(item) for item in value)
any(int(item) for item in value.split())
except ValueError:
printLogNoDev('Unable to set Voltage curve')
logging.error('Non-integer characters are present in %s', value)
printErrLog(None, 'Unable to set Voltage curve')
printErrLog(None, 'Non-integer characters are present in %s' %value)
RETCODE = 1
return
confirmOutOfSpecWarning(autoRespond)
for device in deviceList:
ret = rocmsmi.rsmi_dev_od_volt_info_set(device, int(point), int(clk), int(volt))
if rsmi_ret_ok(ret, device):
if rsmi_ret_ok(ret, device, 'set_voltage_curve'):
printLog(device, 'Successfully set voltage point %s to %s(MHz) %s(mV)' % (point, clk, volt), None)
else:
printErrLog(device, 'Unable to set voltage point %s to %s(MHz) %s(mV)' % (point, clk, volt))
@@ -935,11 +940,12 @@ def setPowerPlayTableLevel(deviceList, clkType, point, clk, volt, autoRespond):
"""
global RETCODE
value = '%s %s %s' % (point, clk, volt)
listOfValues = value.split(' ')
try:
any(int(item) for item in value.split())
except ValueError:
printLogNoDev('Unable to set PowerPlay table level')
logging.error('Non-integer characters are present in %s', value)
printErrLog(None, 'Unable to set PowerPlay table level')
printErrLog(None, 'Non-integer characters are present in %s' %value)
RETCODE = 1
return
confirmOutOfSpecWarning(autoRespond)
@@ -947,7 +953,7 @@ def setPowerPlayTableLevel(deviceList, clkType, point, clk, volt, autoRespond):
if clkType == 'sclk':
ret = rocmsmi.rsmi_dev_od_clk_info_set(device, rsmi_freq_ind_t(int(point)), int(clk),
rsmi_clk_names_dict[clkType])
if rsmi_ret_ok(ret, device):
if rsmi_ret_ok(ret, device, 'set_power_play_table_level_' + str(clkType)):
printLog(device, 'Successfully set voltage point %s to %s(MHz) %s(mV)' % (point, clk, volt), None)
else:
printErrLog(device, 'Unable to set voltage point %s to %s(MHz) %s(mV)' % (point, clk, volt))
@@ -955,7 +961,7 @@ def setPowerPlayTableLevel(deviceList, clkType, point, clk, volt, autoRespond):
elif clkType == 'mclk':
ret = rocmsmi.rsmi_dev_od_clk_info_set(device, rsmi_freq_ind_t(int(point)), int(clk),
rsmi_clk_names_dict[clkType])
if rsmi_ret_ok(ret, device):
if rsmi_ret_ok(ret, device, 'set_power_play_table_level_' + str(clkType)):
printLog(device, 'Successfully set voltage point %s to %s(MHz) %s(mV)' % (point, clk, volt), None)
else:
printErrLog(device, 'Unable to set voltage point %s to %s(MHz) %s(mV)' % (point, clk, volt))
@@ -996,14 +1002,15 @@ def setClockOverDrive(deviceList, clktype, value, autoRespond):
value = '20'
if getPerfLevel(device) != 'MANUAL':
ret = rocmsmi.rsmi_dev_perf_level_set(device, rsmi_dev_perf_level_t(3))
if rsmi_ret_ok(ret, device):
if rsmi_ret_ok(ret, device, 'set_perf_level_manual_' + str(clktype)):
printLog(device, 'Performance level set to manual', None)
else:
printErrLog(device, 'Unable to set performance level to manual')
if clktype == 'mclk':
fsFile = os.path.join('/sys/class/drm', 'card%d' % (device), 'device', 'pp_mclk_od')
if not os.path.isfile(fsFile):
printLog(None, 'Unable to write to sysfs file', None)
printLog(None, 'Unable to write to sysfs file (' + fsFile +
'), file does not exist', None)
logging.debug('%s does not exist', fsFile)
continue
try:
@@ -1011,14 +1018,14 @@ def setClockOverDrive(deviceList, clktype, value, autoRespond):
with open(fsFile, 'w') as fs:
fs.write(value + '\n')
except (IOError, OSError):
printLog(None, 'Unable to write to sysfs file %s' % fsFile, None)
printLog(None, 'Unable to write to sysfs file %s' %fsFile, None)
logging.warning('IO or OS error')
RETCODE = 1
continue
printLog(device, 'Successfully set %s OverDrive to %s%%' % (clktype, value), None)
elif clktype == 'sclk':
ret = rocmsmi.rsmi_dev_overdrive_level_set(device, rsmi_dev_perf_level_t(int(value)))
if rsmi_ret_ok(ret, device):
if rsmi_ret_ok(ret, device, 'set_overdrive_level_' + str(clktype)):
printLog(device, 'Successfully set %s OverDrive to %s%%' % (clktype, value), None)
else:
printLog(device, 'Unable to set %s OverDrive to %s%%' % (clktype, value), None)
@@ -1069,7 +1076,7 @@ def setClocks(deviceList, clktype, clk):
# Check if the performance level is manual, if not then set it to manual
if getPerfLevel(device).lower() != 'manual':
ret = rocmsmi.rsmi_dev_perf_level_set(device, rsmi_dev_perf_level_t(3))
if rsmi_ret_ok(ret, device):
if rsmi_ret_ok(ret, device, 'set_perf_level_manual'):
printLog(device, 'Performance level was set to manual', None)
else:
printErrLog(device, 'Unable to set performance level to manual')
@@ -1079,7 +1086,7 @@ def setClocks(deviceList, clktype, clk):
# Validate frequency bitmask
freq = rsmi_frequencies_t()
ret = rocmsmi.rsmi_dev_gpu_clk_freq_get(device, rsmi_clk_names_dict[clktype], byref(freq))
if rsmi_ret_ok(ret, device, clktype) == False:
if rsmi_ret_ok(ret, device, 'get_gpu_clk_freq_' + str(clktype)) == False:
RETCODE = 1
return
# The freq_bitmask should be less than 2^(freqs.num_supported)
@@ -1090,7 +1097,7 @@ def setClocks(deviceList, clktype, clk):
return
ret = rocmsmi.rsmi_dev_gpu_clk_freq_set(device, rsmi_clk_names_dict[clktype], freq_bitmask)
if rsmi_ret_ok(ret, device):
if rsmi_ret_ok(ret, device, 'set_gpu_clk_freq_' + str(clktype)):
printLog(device, 'Successfully set %s bitmask to' % (clktype), hex(freq_bitmask))
else:
printErrLog(device, 'Unable to set %s bitmask to: %s' % (clktype, hex(freq_bitmask)))
@@ -1099,7 +1106,7 @@ def setClocks(deviceList, clktype, clk):
# Validate the bandwidth bitmask
bw = rsmi_pcie_bandwidth_t()
ret = rocmsmi.rsmi_dev_pci_bandwidth_get(device, byref(bw))
if rsmi_ret_ok(ret, device, 'PCIe') == False:
if rsmi_ret_ok(ret, device, 'get_PCIe_bandwidth') == False:
RETCODE = 1
return
# The freq_bitmask should be less than 2^(bw.transfer_rate.num_supported)
@@ -1110,7 +1117,7 @@ def setClocks(deviceList, clktype, clk):
return
ret = rocmsmi.rsmi_dev_pci_bandwidth_set(device, freq_bitmask)
if rsmi_ret_ok(ret, device):
if rsmi_ret_ok(ret, device, 'set_PCIe_bandwidth'):
printLog(device, 'Successfully set %s to level bitmask' % (clktype), hex(freq_bitmask))
else:
printErrLog(device, 'Unable to set %s bitmask to: %s' % (clktype, hex(freq_bitmask)))
@@ -1135,7 +1142,7 @@ def setPerfDeterminism(deviceList, clkvalue):
return
for device in deviceList:
ret = rocmsmi.rsmi_perf_determinism_mode_set(device, int(clkvalue))
if rsmi_ret_ok(ret, device):
if rsmi_ret_ok(ret, device, 'set_perf_determinism'):
printLog(device, 'Successfully enabled performance determinism and set GFX clock frequency', str(clkvalue))
else:
printErrLog(device, 'Unable to set performance determinism and clock frequency to %s' % (str(clkvalue)))
@@ -1159,7 +1166,7 @@ def resetGpu(device):
RETCODE = 1
return
ret = rocmsmi.rsmi_dev_gpu_reset(resetDev)
if rsmi_ret_ok(ret, resetDev):
if rsmi_ret_ok(ret, resetDev, 'reset_gpu'):
printLog(resetDev, 'Successfully reset GPU %d' % (resetDev), None)
else:
printErrLog(resetDev, 'Unable to reset GPU %d' % (resetDev))
@@ -1258,7 +1265,7 @@ def setFanSpeed(deviceList, fan):
else:
fanLevel = int(str(fan))
ret = rocmsmi.rsmi_dev_fan_speed_set(device, 0, int(fanLevel))
if rsmi_ret_ok(ret, device):
if rsmi_ret_ok(ret, device, 'set_fan_speed'):
printLog(device, 'Successfully set fan speed to level %s' % (str(int(fanLevel))), None)
printLogSpacer()
@@ -1277,7 +1284,7 @@ def setPerformanceLevel(deviceList, level):
logging.error('Invalid Performance level: %s', level)
else:
ret = rocmsmi.rsmi_dev_perf_level_set(device, rsmi_dev_perf_level_t(validLevels.index(level)))
if rsmi_ret_ok(ret, device):
if rsmi_ret_ok(ret, device, 'set_perf_level'):
printLog(device, 'Performance level set to %s' % (str(level)), None)
printLogSpacer()
@@ -1336,7 +1343,7 @@ def setPowerOverDrive(deviceList, value, autoRespond):
new_power_cap.value = int(value) * 1000000
ret = rocmsmi.rsmi_dev_power_cap_range_get(device, 0, byref(power_cap_max), byref(power_cap_min))
if rsmi_ret_ok(ret, device) == False:
if rsmi_ret_ok(ret, device, 'get_power_cap_range') == False:
printErrLog(device, 'Unable to parse Power OverDrive range')
RETCODE = 1
continue
@@ -1360,11 +1367,11 @@ def setPowerOverDrive(deviceList, value, autoRespond):
specWarningConfirmed = True
ret = rocmsmi.rsmi_dev_power_cap_set(device, 0, new_power_cap)
if rsmi_ret_ok(ret, device):
if rsmi_ret_ok(ret, device, 'set_power_cap'):
if int(value) == 0:
power_cap = c_uint64()
ret = rocmsmi.rsmi_dev_power_cap_get(device, 0, byref(power_cap))
if rsmi_ret_ok(ret, device):
if rsmi_ret_ok(ret, device, 'get_power_cap'):
if not PRINT_JSON:
printLog(device,
'Successfully reset Power OverDrive to: %sW' % (int(power_cap.value / 1000000)), None)
@@ -1395,7 +1402,7 @@ def setProfile(deviceList, profile):
for device in deviceList:
# Get previous profile
ret = rocmsmi.rsmi_dev_power_profile_presets_get(device, 0, byref(status))
if rsmi_ret_ok(ret, device, 'previous profile'):
if rsmi_ret_ok(ret, device, 'get_power_profile'):
previousProfile = profileString(status.current)
# Get desired profile
desiredProfile = 'UNKNOWN'
@@ -1412,10 +1419,10 @@ def setProfile(deviceList, profile):
return
else:
ret = rocmsmi.rsmi_dev_power_profile_set(device, 0, profileString(desiredProfile))
if rsmi_ret_ok(ret, device, 'set profile'):
if rsmi_ret_ok(ret, device, 'set_power_profile'):
# Get current profile
ret = rocmsmi.rsmi_dev_power_profile_presets_get(device, 0, byref(status))
if rsmi_ret_ok(ret, device, 'current profile'):
if rsmi_ret_ok(ret, device, 'get_power_profile_presets'):
currentProfile = profileString(status.current)
if currentProfile == desiredProfile:
printLog(device, 'Successfully set profile to', desiredProfile)
@@ -1441,7 +1448,7 @@ def setComputePartition(deviceList, computePartitionType):
return (None, None)
ret = rocmsmi.rsmi_dev_compute_partition_set(device,
rsmi_compute_partition_type_dict[computePartitionType])
if rsmi_ret_ok(ret, device, silent=True):
if rsmi_ret_ok(ret, device, 'set_compute_partition', silent=True):
printLog(device,
'Successfully set compute partition to %s' % (computePartitionType),
None)
@@ -1453,7 +1460,7 @@ def setComputePartition(deviceList, computePartitionType):
elif ret == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED:
printLog(device, 'Not supported on the given system', None)
else:
rsmi_ret_ok(ret, device)
rsmi_ret_ok(ret, device, 'set_compute_partition')
printErrLog(device, 'Failed to retrieve compute partition, even though device supports it.')
printLogSpacer()
@@ -1514,7 +1521,7 @@ def setNPSMode(deviceList, npsMode):
if duration < float(0.1): # For longer runs, add extra line before output
addExtraLine=False # This is to prevent overriding progress bar
if rsmi_ret_ok(ret, device, silent=True):
if rsmi_ret_ok(ret, device, 'set_NPS_mode', silent=True):
printLog(device,
'Successfully set nps mode to %s' % (npsMode),
None, addExtraLine)
@@ -1523,7 +1530,7 @@ def setNPSMode(deviceList, npsMode):
elif ret == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED:
printLog(device, 'Not supported on the given system', None, addExtraLine)
else:
rsmi_ret_ok(ret, device)
rsmi_ret_ok(ret, device, 'set_NPS_mode')
printErrLog(device, 'Failed to retrieve NPS mode, even though device supports it.')
printLogSpacer()
@@ -1567,10 +1574,12 @@ def showAllConcise(deviceList):
gpu_busy = str(getGpuUse(device)) + '%'
else:
gpu_busy = 'Unsupported'
memInfo = getMemInfo(device, 'vram')
vram_used, vram_total = getMemInfo(device, 'vram', True)
mem_use_pct = 0
if memInfo[0] != None and memInfo[1] != None and float(memInfo[1]) != 0:
mem_use_pct = '% 3.0f%%' % (100 * (float(memInfo[0]) / float(memInfo[1])))
if vram_used is None:
mem_use_pct='Unsupported'
if vram_used != None and vram_total != None and float(vram_total) != 0:
mem_use_pct = '% 3.0f%%' % (100 * (float(vram_used) / float(vram_total)))
values['card%s' % (str(device))] = [device, temp, avgPwr, sclk, mclk, fan, str(perf).lower(), pwrCap,
mem_use_pct, gpu_busy]
val_widths = {}
@@ -1650,7 +1659,7 @@ def showClocks(deviceList):
freq_list = []
if rocmsmi.rsmi_dev_gpu_clk_freq_get(device, rsmi_clk_names_dict[clk_type], None) == 1:
ret = rocmsmi.rsmi_dev_gpu_clk_freq_get(device, rsmi_clk_names_dict[clk_type], byref(freq))
if rsmi_ret_ok(ret, device, clk_type, True):
if rsmi_ret_ok(ret, device, 'get_clk_freq_' + clk_type, True):
printLog(device, 'Supported %s frequencies on GPU%s' % (clk_type, str(device)), None)
for x in range(freq.num_supported):
fr = '{:>.0f}Mhz'.format(freq.frequency[x] / 1000000)
@@ -1664,7 +1673,7 @@ def showClocks(deviceList):
printLog(device, '', None)
if rocmsmi.rsmi_dev_pci_bandwidth_get(device, None) == 1:
ret = rocmsmi.rsmi_dev_pci_bandwidth_get(device, byref(bw))
if rsmi_ret_ok(ret, device, 'PCIe', True):
if rsmi_ret_ok(ret, device, 'get_PCIe_bandwidth', True):
printLog(device, 'Supported %s frequencies on GPU%s' % ('PCIe', str(device)), None)
freq_list = []
for x in range(bw.transfer_rate.num_supported):
@@ -1698,7 +1707,7 @@ def showCurrentClocks(deviceList, clk_defined=None, concise=False):
if clk_defined:
if rocmsmi.rsmi_dev_gpu_clk_freq_get(device, rsmi_clk_names_dict[clk_defined], None) == 1:
ret = rocmsmi.rsmi_dev_gpu_clk_freq_get(device, rsmi_clk_names_dict[clk_defined], byref(freq))
if rsmi_ret_ok(ret, device, clk_defined, True):
if rsmi_ret_ok(ret, device, 'get_gpu_clk_freq_' + str(clk_defined), silent=True):
levl = freq.current
if levl >= freq.num_supported:
printLog(device, '%s current clock frequency not found' % (clk_defined), None)
@@ -1714,7 +1723,7 @@ def showCurrentClocks(deviceList, clk_defined=None, concise=False):
for clk_type in sorted(rsmi_clk_names_dict):
if rocmsmi.rsmi_dev_gpu_clk_freq_get(device, rsmi_clk_names_dict[clk_type], None) == 1:
ret = rocmsmi.rsmi_dev_gpu_clk_freq_get(device, rsmi_clk_names_dict[clk_type], byref(freq))
if rsmi_ret_ok(ret, device, clk_type, True):
if rsmi_ret_ok(ret, device, 'get_clk_freq_' + str(clk_type), True):
levl = freq.current
if levl >= freq.num_supported:
printLog(device, '%s current clock frequency not found' % (clk_type), None)
@@ -1730,7 +1739,7 @@ def showCurrentClocks(deviceList, clk_defined=None, concise=False):
# pcie clocks
if rocmsmi.rsmi_dev_pci_bandwidth_get(device, None) == 1:
ret = rocmsmi.rsmi_dev_pci_bandwidth_get(device, byref(bw))
if rsmi_ret_ok(ret, device, 'PCIe', True):
if rsmi_ret_ok(ret, device, 'get_PCIe_bandwidth', True):
current_f = bw.transfer_rate.current
if current_f >= bw.transfer_rate.num_supported:
printLog(device, 'PCIe current clock frequency not found', None )
@@ -1768,7 +1777,7 @@ def showCurrentFans(deviceList):
else:
printLog(device, 'Fan Level', str(fanLevel) + ' (%s%%)' % (str(fanSpeed)))
ret = rocmsmi.rsmi_dev_fan_rpms_get(device, sensor_ind, byref(rpmSpeed))
if rsmi_ret_ok(ret, device):
if rsmi_ret_ok(ret, device, 'get_fan_rpms'):
printLog(device, 'Fan RPM', rpmSpeed.value)
printLogSpacer()
@@ -1809,7 +1818,7 @@ def showFwInfo(deviceList, fwType):
for fw_name in firmware_blocks:
fw_name = fw_name.upper()
ret = rocmsmi.rsmi_dev_firmware_version_get(device, fw_block_names_l.index(fw_name), byref(fw_ver))
if rsmi_ret_ok(ret, device, fw_name):
if rsmi_ret_ok(ret, device, 'get_firmware_version_' + str(fw_name)):
# The VCN, VCE, UVD, SOS and ASD firmware's value needs to be in hexadecimal
if fw_name in ['VCN', 'VCE', 'UVD', 'SOS', 'ASD']:
printLog(device, '%s firmware version' % (fw_name),
@@ -1853,12 +1862,12 @@ def showGpusByPid(pidList):
return
for pid in pidList:
ret = rocmsmi.rsmi_compute_process_gpus_get(int(pid), None, byref(num_devices))
if rsmi_ret_ok(ret, 'PID ' + pid):
if rsmi_ret_ok(ret, metric=('PID ' + pid)):
dv_indices = (c_uint32 * num_devices.value)()
ret = rocmsmi.rsmi_compute_process_gpus_get(int(pid), dv_indices, byref(num_devices))
if rsmi_ret_ok(ret):
if rsmi_ret_ok(ret, metric='get_gpu_compute_process'):
metricName = 'PID %s is using %s DRM device(s)' % (pid, str(num_devices.value))
if (num_devices.value):
printListLog(metricName, list(dv_indices))
@@ -1900,7 +1909,7 @@ def getCoarseGrainUtil(device, typeName=None):
utilization_counters[i].type = c_int(i)
ret = rocmsmi.rsmi_utilization_count_get(device, utilization_counters, length, byref(timestamp))
if rsmi_ret_ok(ret, device, typeName, True):
if rsmi_ret_ok(ret, device, 'get_utilization_count_'+ str(typeName), True):
return utilization_counters
return -1
@@ -2026,7 +2035,7 @@ def showMemVendor(deviceList):
printLogSpacer(' Memory Vendor ')
for device in deviceList:
ret = rocmsmi.rsmi_dev_vram_vendor_get(device, vendor, 256)
if rsmi_ret_ok(ret, device) and vendor.value.decode():
if rsmi_ret_ok(ret, device, 'get_vram_vendor') and vendor.value.decode():
printLog(device, 'GPU memory vendor', vendor.value.decode())
else:
logging.debug('GPU memory vendor missing or not supported')
@@ -2046,13 +2055,13 @@ def showOverDrive(deviceList, odtype):
odStr = 'GPU'
ret = rocmsmi.rsmi_dev_overdrive_level_get(device, byref(rsmi_od))
od = rsmi_od.value
if not rsmi_ret_ok(ret, device):
if not rsmi_ret_ok(ret, device, 'get_overdrive_level_' + str(odtype)):
continue
elif odtype == 'mclk':
odStr = 'GPU Memory'
ret = rocmsmi.rsmi_dev_mem_overdrive_level_get(device, byref(rsmi_od))
od = rsmi_od.value
if not rsmi_ret_ok(ret, device):
if not rsmi_ret_ok(ret, device, 'get_mem_overdrive_level_' + str(odtype)):
continue
else:
printErrLog(device, 'Unable to retrieve OverDrive')
@@ -2073,7 +2082,7 @@ def showPcieBw(deviceList):
printLogSpacer(' Measured PCIe Bandwidth ')
for device in deviceList:
ret = rocmsmi.rsmi_dev_pci_throughput_get(device, byref(sent), byref(received), byref(max_pkt_sz))
if rsmi_ret_ok(ret, device):
if rsmi_ret_ok(ret, device, 'get_PCIe_bandwidth'):
# Use 1024.0 to ensure that the result is a float and not integer division
bw = ((received.value + sent.value) * max_pkt_sz.value) / 1024.0 / 1024.0
# Use the bwstr below to control precision on the string
@@ -2131,15 +2140,15 @@ def showPids():
sdmaUsage = 'UNKNOWN'
cuOccupancy = 'UNKNOWN'
ret = rocmsmi.rsmi_compute_process_gpus_get(int(pid), None, byref(num_devices))
if rsmi_ret_ok(ret):
if rsmi_ret_ok(ret, metric='get_gpu_compute_process'):
dv_indices = (c_uint32 * num_devices.value)()
ret = rocmsmi.rsmi_compute_process_gpus_get(int(pid), dv_indices, byref(num_devices))
if rsmi_ret_ok(ret):
if rsmi_ret_ok(ret, metric='get_gpu_compute_process'):
gpuNumber = str(num_devices.value)
else:
logging.debug('Unable to fetch GPU number by PID')
ret = rocmsmi.rsmi_compute_process_info_by_pid_get(int(pid), byref(proc))
if rsmi_ret_ok(ret):
if rsmi_ret_ok(ret, metric='get_compute_process_info_by_pid'):
vramUsage = proc.vram_usage
sdmaUsage = proc.sdma_usage
cuOccupancy = proc.cu_occupancy
@@ -2184,7 +2193,7 @@ def showPowerPlayTable(deviceList):
odvf = rsmi_od_volt_freq_data_t()
for device in deviceList:
ret = rocmsmi.rsmi_dev_od_volt_info_get(device, byref(odvf))
if rsmi_ret_ok(ret, device, 'od volt'):
if rsmi_ret_ok(ret, device, 'get_od_volt'):
# TODO: Make this more dynamic and less hard-coded if possible
printLog(device, 'OD_SCLK:', None)
printLog(device, '0: %sMhz' % (int(odvf.curr_sclk_range.lower_bound / 1000000)), None)
@@ -2224,7 +2233,7 @@ def showProductName(deviceList):
# Retrieve card vendor
ret = rocmsmi.rsmi_dev_vendor_name_get(device, vendor, 256)
# Only continue if GPU vendor is AMD
if rsmi_ret_ok(ret, device) and isAmdDevice(device):
if rsmi_ret_ok(ret, device, 'get_vendor_name') and isAmdDevice(device):
try:
device_vendor = vendor.value.decode()
except UnicodeDecodeError:
@@ -2232,7 +2241,7 @@ def showProductName(deviceList):
device_vendor = "N/A"
# Retrieve the device series
ret = rocmsmi.rsmi_dev_name_get(device, series, 256)
if rsmi_ret_ok(ret, device):
if rsmi_ret_ok(ret, device, 'get_name'):
try:
device_series = series.value.decode()
printLog(device, 'Card series', '\t\t' + device_series)
@@ -2240,7 +2249,7 @@ def showProductName(deviceList):
printErrLog(device, "Unable to read card series")
# Retrieve the device model
ret = rocmsmi.rsmi_dev_subsystem_name_get(device, model, 256)
if rsmi_ret_ok(ret, device):
if rsmi_ret_ok(ret, device, 'get_subsystem_name'):
try:
device_model = model.value.decode()
# padHexValue is used for applications that expect 4-digit card models
@@ -2254,7 +2263,7 @@ def showProductName(deviceList):
# device_sku = sku.value.decode()
# Retrieve the device SKU as a substring from VBIOS
ret = rocmsmi.rsmi_dev_vbios_version_get(device, vbios, 256)
if rsmi_ret_ok(ret, device) and vbios.value.decode():
if rsmi_ret_ok(ret, device, 'get_vbios_version') and vbios.value.decode():
# Device SKU is just the characters in between the two '-' in vbios_version
if vbios.value.decode().count('-') == 2 and len(str(vbios.value.decode().split('-')[1])) > 1:
device_sku = vbios.value.decode().split('-')[1]
@@ -2282,7 +2291,7 @@ def showProfile(deviceList):
status = rsmi_power_profile_status_t()
for device in deviceList:
ret = rocmsmi.rsmi_dev_power_profile_presets_get(device, 0, byref(status))
if rsmi_ret_ok(ret, device, 'profiles', silent=False):
if rsmi_ret_ok(ret, device, 'get_power_profiles', silent=False):
binaryMaskString = str(format(status.available_profiles, '07b'))[::-1]
bitMaskPosition = 0
profileNumber = 0
@@ -2314,7 +2323,7 @@ def showRange(deviceList, rangeType):
odvf = rsmi_od_volt_freq_data_t()
for device in deviceList:
ret = rocmsmi.rsmi_dev_od_volt_info_get(device, byref(odvf))
if rsmi_ret_ok(ret, device, 'od volt', silent=False):
if rsmi_ret_ok(ret, device, 'get_od_volt', silent=False):
if rangeType == 'sclk':
printLog(device, 'Valid sclk range: %sMhz - %sMhz' % (
int(odvf.curr_sclk_range.lower_bound / 1000000), int(odvf.curr_sclk_range.upper_bound / 1000000)), None)
@@ -2362,7 +2371,7 @@ def showRasInfo(deviceList, rasType):
for block in rasBlocks:
row = []
ret = rocmsmi.rsmi_dev_ecc_status_get(device, rsmi_gpu_block_d[block], byref(state))
if rsmi_ret_ok(ret, device, block, True):
if rsmi_ret_ok(ret, device, 'get_ecc_status_' + str(block), True):
row.append(block)
row.append(rsmi_ras_err_stale_machine[state.value].upper())
# Now add the error count
@@ -2424,7 +2433,7 @@ def showSerialNumber(deviceList):
printErrLog(device, "FRU Serial Number contains non-alphanumeric characters. FRU is likely corrupted")
continue
if rsmi_ret_ok(ret, device) and sn.value.decode():
if rsmi_ret_ok(ret, device, 'get_serial_number') and sn.value.decode():
printLog(device, 'Serial Number', sn.value.decode())
else:
printLog(device, 'Serial Number', 'N/A')
@@ -2440,7 +2449,7 @@ def showUId(deviceList):
for device in deviceList:
dv_uid = c_uint64()
ret = rocmsmi.rsmi_dev_unique_id_get(device, byref(dv_uid))
if rsmi_ret_ok(ret, device, None, True) and str(hex(dv_uid.value)):
if rsmi_ret_ok(ret, device, 'get_unique_id', True) and str(hex(dv_uid.value)):
printLog(device, 'Unique ID', hex(dv_uid.value))
else:
printLog(device, 'Unique ID', 'N/A')
@@ -2510,7 +2519,7 @@ def showEvents(deviceList, eventTypes):
if user_input == 'q' or user_input == '\x03':
for device in deviceList:
ret = rocmsmi.rsmi_event_notification_stop(device)
if not rsmi_ret_ok(ret, device):
if not rsmi_ret_ok(ret, device, 'stop_event_notification'):
printErrLog(device, 'Unable to end event notifications.')
print('\r')
break
@@ -2633,7 +2642,7 @@ def showVoltage(deviceList):
met = rsmi_voltage_metric_t(0)
voltage = c_uint64()
ret = rocmsmi.rsmi_dev_volt_metric_get(device, vtype, met, byref(voltage))
if rsmi_ret_ok(ret, device) and str(voltage.value):
if rsmi_ret_ok(ret, device, 'get_volt_metric') and str(voltage.value):
printLog(device, 'Voltage (mV)', str(voltage.value))
else:
logging.debug('GPU voltage not supported')
@@ -2649,7 +2658,7 @@ def showVoltageCurve(deviceList):
odvf = rsmi_od_volt_freq_data_t()
for device in deviceList:
ret = rocmsmi.rsmi_dev_od_volt_info_get(device, byref(odvf))
if rsmi_ret_ok(ret, device, 'od volt', silent=False):
if rsmi_ret_ok(ret, device, 'get_od_volt_info', silent=False):
for position in range(3):
printLog(device, 'Voltage point %d: %sMhz %smV' % (
position, int(list(odvf.curve.vc_points)[position].frequency / 1000000),
@@ -2704,7 +2713,7 @@ def showAccessibleTopology(deviceList):
for srcdevice in deviceList:
for destdevice in deviceList:
ret = rocmsmi.rsmi_is_P2P_accessible(srcdevice, destdevice, byref(accessible))
if rsmi_ret_ok(ret):
if rsmi_ret_ok(ret, metric='is_P2P_accessible'):
gpu_links_type[srcdevice][destdevice] = accessible.value
else:
printErrLog(srcdevice, 'Cannot read link accessibility: Unsupported on this machine')
@@ -2743,7 +2752,7 @@ def showWeightTopology(deviceList):
continue
weight = c_uint64()
ret = rocmsmi.rsmi_topo_get_link_weight(srcdevice, destdevice, byref(weight))
if rsmi_ret_ok(ret):
if rsmi_ret_ok(ret, metric='get_link_weight_topology'):
gpu_links_weight[srcdevice][destdevice] = weight
else:
printErrLog(srcdevice, 'Cannot read Link Weight: Not supported on this machine')
@@ -2790,7 +2799,7 @@ def showHopsTopology(deviceList):
continue
hops = c_uint64()
ret = rocmsmi.rsmi_topo_get_link_type(srcdevice, destdevice, byref(hops), byref(linktype))
if rsmi_ret_ok(ret):
if rsmi_ret_ok(ret, metric='get_link_type_topology'):
gpu_links_hops[srcdevice][destdevice] = hops
else:
printErrLog(srcdevice, 'Cannot read Link Hops: Not supported on this machine')
@@ -2836,7 +2845,7 @@ def showTypeTopology(deviceList):
gpu_links_type[srcdevice][destdevice] = '0'
continue
ret = rocmsmi.rsmi_topo_get_link_type(srcdevice, destdevice, byref(hops), byref(linktype))
if rsmi_ret_ok(ret):
if rsmi_ret_ok(ret, metric='get_link_topology_type'):
if (linktype.value == 1):
gpu_links_type[srcdevice][destdevice] = "PCIE"
elif (linktype.value == 2):
@@ -2878,13 +2887,13 @@ def showNumaTopology(deviceList):
numa_numbers = c_uint32()
for device in deviceList:
ret = rocmsmi.rsmi_topo_get_numa_node_number(device, byref(numa_numbers))
if rsmi_ret_ok(ret, device):
if rsmi_ret_ok(ret, device, 'get_numa_node_number'):
printLog(device, "(Topology) Numa Node", numa_numbers.value)
else:
printErrLog(device, "Cannot read Numa Node")
ret = rocmsmi.rsmi_topo_numa_affinity_get(device, byref(numa_numbers))
if rsmi_ret_ok(ret):
if rsmi_ret_ok(ret, metric='get_numa_affinity_topology'):
printLog(device, "(Topology) Numa Affinity", numa_numbers.value)
else:
printErrLog(device, 'Cannot read Numa Affinity')
@@ -2927,13 +2936,13 @@ def showNodesBw(deviceList):
ret = rocmsmi.rsmi_minmax_bandwidth_get(srcdevice, destdevice, byref(minBW), byref(maxBW))
#verify that link type is xgmi
ret2 = rocmsmi.rsmi_topo_get_link_type(srcdevice, destdevice, byref(hops), byref(linktype))
if rsmi_ret_ok(ret2," {} to {}".format(srcdevice, destdevice), None, True):
if rsmi_ret_ok(ret2," {} to {}".format(srcdevice, destdevice), 'get_link_topology_type', True):
if linktype.value != 2:
nonXgmi = True
silent= True
gpu_links_type[srcdevice][destdevice] = "N/A"
if rsmi_ret_ok(ret, " {} to {}".format(srcdevice, destdevice),None,silent):
if rsmi_ret_ok(ret, " {} to {}".format(srcdevice, destdevice), 'get_link_topology_type',silent):
gpu_links_type[srcdevice][destdevice] = "{}-{}".format(minBW.value, maxBW.value)
else:
gpu_links_type[srcdevice][destdevice] = "N/A"
@@ -2965,12 +2974,12 @@ def showComputePartition(deviceList):
printLogSpacer(' Current Compute Partition ')
for device in deviceList:
ret = rocmsmi.rsmi_dev_compute_partition_get(device, currentComputePartition, 256)
if rsmi_ret_ok(ret, device, silent=True) and currentComputePartition.value.decode():
if rsmi_ret_ok(ret, device, 'get_compute_partition', silent=True) and currentComputePartition.value.decode():
printLog(device, 'Compute Partition', currentComputePartition.value.decode())
elif ret == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED:
printLog(device, 'Not supported on the given system', None)
else:
rsmi_ret_ok(ret, device)
rsmi_ret_ok(ret, device, 'get_compute_partition')
printErrLog(device, 'Failed to retrieve compute partition, even though device supports it.')
printLogSpacer()
@@ -2983,12 +2992,12 @@ def showNPSMode(deviceList):
printLogSpacer(' Current NPS Mode ')
for device in deviceList:
ret = rocmsmi.rsmi_dev_nps_mode_get(device, npsMode, 256)
if rsmi_ret_ok(ret, device, silent=True) and npsMode.value.decode():
if rsmi_ret_ok(ret, device, 'get_NPS_mode',silent=True) and npsMode.value.decode():
printLog(device, 'NPS Mode', npsMode.value.decode())
elif ret == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED:
printLog(device, 'Not supported on the given system', None)
else:
rsmi_ret_ok(ret, device)
rsmi_ret_ok(ret, device, 'get_NPS_mode')
printErrLog(device, 'Failed to retrieve NPS mode, even though device supports it.')
printLogSpacer()
@@ -3088,7 +3097,7 @@ def listDevices():
""" Returns a list of GPU devices """
numberOfDevices = c_uint32(0)
ret = rocmsmi.rsmi_num_monitor_devices(byref(numberOfDevices))
if rsmi_ret_ok(ret):
if rsmi_ret_ok(ret, metric='get_num_monitor_devices'):
deviceList = list(range(numberOfDevices.value))
return deviceList
else:
@@ -3178,6 +3187,8 @@ def rsmi_ret_ok(my_ret, device=None, metric=None, silent=False):
@param device: DRM device identifier
@param my_ret: Return of RSMI call (rocm_smi_lib API)
@param metric: Parameter of GPU currently being analyzed
@param silent: Echo verbose error response.
True silences err output, False does not silence err output (default).
"""
global RETCODE
global PRINT_JSON
@@ -3194,7 +3205,7 @@ def rsmi_ret_ok(my_ret, device=None, metric=None, silent=False):
logging.debug('%s', returnString)
if not silent:
if my_ret in rsmi_status_verbose_err_out:
printLog(device, rsmi_status_verbose_err_out[my_ret], None)
printLog(device, metric + ", " + rsmi_status_verbose_err_out[my_ret], None)
RETCODE = my_ret
return False
return True
@@ -3226,14 +3237,14 @@ def save(deviceList, savefilepath):
for clk_type in sorted(rsmi_clk_names_dict):
clocks[device] = clocks.get(device, {})
ret = rocmsmi.rsmi_dev_gpu_clk_freq_get(device, rsmi_clk_names_dict[clk_type], byref(freq))
if rsmi_ret_ok(ret, device, clk_type, True):
if rsmi_ret_ok(ret, device, 'get_gpu_clk_freq_' + str(clk_type), True):
clocks[device][clk_type] = str(freq.current)
else:
clocks[device][clk_type] = '0'
fanSpeeds[device] = getFanSpeed(device)[0]
od = c_uint32()
ret = rocmsmi.rsmi_dev_overdrive_level_get(device, byref(od))
if rsmi_ret_ok(ret, device):
if rsmi_ret_ok(ret, device, 'get_overdrive_level'):
overDriveGpu[device] = str(od.value)
else:
overDriveGpu[device] = '0'
@@ -3241,7 +3252,7 @@ def save(deviceList, savefilepath):
overDriveGpuMem[device] = '0'
status = rsmi_power_profile_status_t()
ret = rocmsmi.rsmi_dev_power_profile_presets_get(device, 0, byref(status))
if rsmi_ret_ok(ret, device, 'profile'):
if rsmi_ret_ok(ret, device, 'get_profile_presets'):
profiles[device] = str(str(bin(status.current))[2:][::-1].index('1') + 1)
else:
profiles[device] = str('UNKNOWN')
+4 -2
Просмотреть файл
@@ -520,10 +520,12 @@ void RocmSMI::printEnvVarInfo(void) {
}
for (auto it=env_vars_.enum_overrides.begin();
it != env_vars_.enum_overrides.end(); ++it) {
std::cout << *it;
DevInfoTypes type = static_cast<DevInfoTypes>(*it);
std::cout << (std::to_string(*it) + " (" + devInfoTypesStrings.at(type)
+ ")");
auto temp_it = it;
if(++temp_it != env_vars_.enum_overrides.end()) {
std::cout << ",";
std::cout << ", ";
}
}
std::cout << "}" << std::endl;