2020-07-15 06:01:40 -04:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
|
"""ROCm_SMI_LIB CLI Tool
|
|
|
|
|
|
|
|
|
|
This tool acts as a command line interface for manipulating
|
|
|
|
|
and monitoring the amdgpu kernel, and is intended to replace
|
2024-05-07 21:00:50 -05:00
|
|
|
and deprecate the existing rocm_smi.py CLI tool located at
|
|
|
|
|
https://github.com/ROCm/ROC-smi.
|
|
|
|
|
This tool uses Ctypes to call the rocm_smi_lib API.
|
2020-07-15 06:01:40 -04:00
|
|
|
Recommended: At least one AMD GPU with ROCm driver installed
|
|
|
|
|
Required: ROCm SMI library installed (librocm_smi64)
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
from __future__ import print_function
|
|
|
|
|
import argparse
|
|
|
|
|
import json
|
|
|
|
|
import logging
|
|
|
|
|
import os
|
|
|
|
|
import sys
|
|
|
|
|
import subprocess
|
2025-07-07 18:42:53 -05:00
|
|
|
import threading
|
2021-03-17 00:24:29 -04:00
|
|
|
import time
|
2023-01-30 15:58:03 -06:00
|
|
|
import multiprocessing
|
|
|
|
|
import trace
|
2025-04-14 13:05:22 -05:00
|
|
|
from os.path import exists
|
2023-01-30 15:58:03 -06:00
|
|
|
from io import StringIO
|
2021-03-17 00:24:29 -04:00
|
|
|
from time import ctime
|
2020-07-15 06:01:40 -04:00
|
|
|
from subprocess import check_output
|
2024-06-21 15:13:15 -05:00
|
|
|
from enum import IntEnum
|
2024-07-24 23:10:00 -07:00
|
|
|
from typing import TYPE_CHECKING
|
|
|
|
|
|
|
|
|
|
# only used for type checking
|
|
|
|
|
# pyright trips up and cannot find rsmiBindings without it
|
|
|
|
|
if TYPE_CHECKING:
|
|
|
|
|
from rsmiBindings import *
|
|
|
|
|
|
|
|
|
|
try:
|
|
|
|
|
from rsmiBindings import *
|
|
|
|
|
except ImportError:
|
|
|
|
|
current_path = os.path.dirname(os.path.abspath(__file__))
|
|
|
|
|
additional_path = f"{current_path}/../libexec/rocm_smi"
|
|
|
|
|
sys.path.append(additional_path)
|
|
|
|
|
try:
|
|
|
|
|
from rsmiBindings import *
|
|
|
|
|
except ImportError:
|
|
|
|
|
print(f"Still couldn't import 'rsmiBindings'. Make sure it's installed in {additional_path}")
|
|
|
|
|
sys.exit(1)
|
2020-07-15 06:01:40 -04:00
|
|
|
|
|
|
|
|
# rocmSmiLib_cli version. Increment this as needed.
# Major version - Increment when backwards-compatibility breaks
# Minor version - Increment when adding a new feature, set to 0 when major is incremented
# Patch version - Increment when adding a fix, set to 0 when minor is incremented
# Hash version - Shortened commit hash. Print here and not with lib for consistency with amd-smi
SMI_MAJ = 4
SMI_MIN = 0
SMI_PAT = 0
# SMI_HASH is provided by rsmiBindings
# The assembled version string has the form MAJOR.MINOR.PATCH+HASH
__version__ = '%s.%s.%s+%s' % (SMI_MAJ, SMI_MIN, SMI_PAT, SMI_HASH)
|
2020-07-15 06:01:40 -04:00
|
|
|
|
|
|
|
|
# Set to 1 if an error occurs
RETCODE = 0

# If we want JSON format output instead
PRINT_JSON = False
# Collected output; formatJson() stores entries under 'card<N>' and 'system'
JSON_DATA = {}
# Version of the JSON output used to save clocks
CLOCK_JSON_VERSION = 1

# Apply max buffer to all data allocation
# (used as the size of the ctypes string buffers handed to the library)
MAX_BUFF_SIZE = 256

headerString = ' ROCm System Management Interface '
footerString = ' End of ROCm SMI Log '
# Output formatting
appWidth = 90
deviceList = []

# Enable or disable serialized format
OUTPUT_SERIALIZATION = False

# These are the valid clock types that can be returned/modified:
# TODO: "clk_type_names" from rsmiBindings.py should fetch valid clocks from
# the same location as rocm_smi_device.cc instead of hardcoding the values
validClockNames = clk_type_names[1:-2]
# The purpose of the [1:-2] here ^^^^ is to remove the duplicate elements at the
# beginning and end of the clk_type_names list (specifically sclk and mclk)
# Also the "invalid" clock in the list is removed since it isn't a valid clock type
# 'pcie' is not taken from clk_type_names, so it is added explicitly here
validClockNames.append('pcie')
validClockNames.sort()

# Thread stop condition
stop_threads = False
|
|
|
|
|
|
2020-09-23 16:33:01 -04:00
|
|
|
def driverInitialized():
    """ Report whether the amdgpu kernel driver is initialized

    Returns True when /sys/module/amdgpu exists and either has no
    'initstate' entry (driver built into the kernel) or its 'initstate'
    contains 'live' (loadable module that finished loading).
    Returns False otherwise.
    """
    if not os.path.exists("/sys/module/amdgpu"):
        return False
    if not os.path.exists("/sys/module/amdgpu/initstate"):
        # No initstate entry: amdgpu is built into the kernel
        return True
    # amdgpu is a loadable module; it is ready once its state reads 'live'
    with open("/sys/module/amdgpu/initstate") as stateFile:
        return 'live' in stateFile.read()
|
2020-09-23 16:33:01 -04:00
|
|
|
|
2020-07-15 06:01:40 -04:00
|
|
|
|
|
|
|
|
def formatJson(device, log):
    """ Store 'name: value' lines of a log string in JSON_DATA

    Each line of the form '<name>: <value>' is recorded as
    JSON_DATA['card<device>'][<name>] = <value> (or under
    JSON_DATA['system'] when device is 'system'). The value is stripped
    of surrounding whitespace.

    Only the first ': ' separates name from value, so values that contain
    ': ' themselves are kept intact. Lines without a ': ' separator are
    dropped as improperly formatted.

    :param device: DRM device identifier, or 'system'
    :param log: String to parse and output into JSON format
    """
    global JSON_DATA
    section = 'system' if str(device) == 'system' else 'card' + str(device)
    for line in log.splitlines():
        name, separator, value = line.partition(': ')
        # Drop any invalid or improperly-formatted data
        if not separator:
            continue
        JSON_DATA[section][name] = value.strip()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def formatCsv(deviceList):
    """ Render JSON_DATA as CSV text and return it

    Two layouts are produced, chosen by the first key stored in JSON_DATA:

    * first key is 'system': a two-column 'name, value' listing of the
      system entries, each field wrapped in double quotes;
    * otherwise: one header row ('device' followed by every metric name
      seen for the listed devices, in first-seen order) and one row per
      entry of deviceList. Metrics a device does not have are written as
      'N/A'. Commas inside values are removed so they cannot break columns.

    An empty string is returned when no metric exists for the listed devices.

    :param deviceList: List of DRM devices (may also contain 'system')
    """
    global JSON_DATA
    firstKey = next(iter(JSON_DATA), None)
    if firstKey == 'system':
        rows = ['"%s", "%s"\n' % (name, value)
                for name, value in JSON_DATA['system'].items()]
        return 'name, value\n' + ''.join(rows)

    # Map every requested device to the JSON_DATA section that holds its data
    sections = ['system' if str(dev) == 'system' else 'card' + str(dev)
                for dev in deviceList]

    # Union of all metric names, keeping the order in which they first appear
    orderedKeys = {}
    for section in sections:
        orderedKeys.update(dict.fromkeys(JSON_DATA[section]))
    headerkeys = list(orderedKeys)
    if not headerkeys:
        return ''

    lines = [','.join(['device'] + headerkeys)]
    for section in sections:
        data = JSON_DATA[section]
        cells = [section]
        for key in headerkeys:
            if key in data:
                # Remove commas like the ones in PCIe speed; str() keeps
                # non-string values from crashing the export
                cells.append(str(data[key]).replace(',', ''))
            else:
                # If the key doesn't exist (like dcefclock on Fiji, or unsupported functionality)
                cells.append('N/A')
        lines.append(','.join(cells))
    return '\n'.join(lines) + '\n'
|
|
|
|
|
|
2021-07-29 12:43:54 -04:00
|
|
|
|
2020-09-09 17:34:44 -04:00
|
|
|
def formatMatrixToJSON(deviceList, matrix, metricName):
    """ Log every unordered GPU pair of a symmetric matrix through printSysLog.

    For two devices the matrix [[0, 40], [40, 0]] holds the value 40 for the
    pair (GPU0, GPU1); only that pair is logged, once.

    :param deviceList: List of DRM devices (can be a single-item list)
    :param matrix: symmetric matrix indexed as matrix[deviceA][deviceB];
        cells may be plain values or objects exposing a '.value' attribute
    :param metricName: Title of the item to print to the log; formatted with
        the two device identifiers of the pair
    """
    for position, rowDevice in enumerate(deviceList):
        # Only look right of the diagonal: ( GPU1 x GPU2 is the same as GPU2 x GPU1 )
        for colDevice in deviceList[position + 1:]:
            cell = matrix[rowDevice][colDevice]
            # Unwrap '.value' when the cell has one, else use the cell as-is
            reading = getattr(cell, 'value', cell)
            printSysLog(metricName.format(rowDevice, colDevice), reading)
|
|
|
|
|
|
2020-07-15 06:01:40 -04:00
|
|
|
|
2023-09-07 16:20:30 -05:00
|
|
|
def getBus(device, silent=False):
    """ Return the bus identifier of a given device

    The identifier is formatted as 'DDDD:BB:DD.F' (domain, bus, device,
    function in upper-case hexadecimal). None is returned when the
    library call fails.

    :param device: DRM device identifier
    :param silent: Turn on to silence error output
        (you plan to handle manually). Default is off.
    """
    bdfid = c_uint64(0)
    ret = rocmsmi.rsmi_dev_pci_id_get(device, byref(bdfid))
    # Check the status with the DRM device identifier the caller passed in.
    # The PCI device number decoded below is kept in its own variable so it
    # can no longer replace 'device' in the error report.
    if not rsmi_ret_ok(ret, device, 'get_pci_id', silent):
        return None

    # BDFID = ((DOMAIN & 0xFFFFFFFF) << 32) | ((PARTITION_ID & 0xF) << 28) | ((BUS & 0xFF) << 8) |
    #         ((DEVICE & 0x1F) <<3 ) | (FUNCTION & 0x7)
    # bits [63:32] = domain
    # bits [31:28] or bits [2:0] = partition id
    # bits [27:16] = reserved
    # bits [15:8]  = Bus
    # bits [7:3] = Device
    # bits [2:0] = Function (partition id maybe in bits [2:0]) <-- Fallback for non SPX modes
    raw = bdfid.value
    domain = (raw >> 32) & 0xffffffff
    bus = (raw >> 8) & 0xff
    pciDevice = (raw >> 3) & 0x1f
    function = raw & 0x7
    return '{:04X}:{:02X}:{:02X}.{:0X}'.format(domain, bus, pciDevice, function)
|
|
|
|
|
|
2024-02-26 20:58:17 -06:00
|
|
|
def getPartitionId(device, silent=False):
    """ Return the partition identifier of a given device

    The identifier is returned as a decimal string; None is returned when
    the library call fails.

    :param device: DRM device identifier
    :param silent: Turn on to silence error output
        (you plan to handle manually). Default is off.
    """
    rawPartition = c_uint32(0)
    status = rocmsmi.rsmi_dev_partition_id_get(device, byref(rawPartition))

    # For reference, the BDFID layout that carries the partition id:
    # BDFID = ((DOMAIN & 0xFFFFFFFF) << 32) | ((PARTITION_ID & 0xF) << 28) | ((BUS & 0xFF) << 8) |
    #         ((DEVICE & 0x1F) <<3 ) | (FUNCTION & 0x7)
    # bits [31:28] or bits [2:0] = partition id
    # (bits [2:0] are the fallback for non SPX modes)
    formatted = '{:d}'.format(rawPartition.value)
    if not rsmi_ret_ok(status, device, 'rsmi_dev_partition_id_get', silent):
        return None
    return formatted
|
|
|
|
|
|
2020-07-15 06:01:40 -04:00
|
|
|
|
2023-09-07 16:20:30 -05:00
|
|
|
def getFanSpeed(device, silent=True):
    """ Return a tuple (status, fan_level, fan_percent) for a specified device

    status is the status of the current-speed query if that query failed,
    otherwise the status of the max-speed query. fan_level is the current
    fan speed value (0 if it could not be read). fan_percent is fan_level
    relative to the maximum, rounded to two decimals, or 0 when either
    value is 0.

    :param device: DRM device identifier
    :param silent: Turn on to silence error output
        (you plan to handle manually). Default is on.
    """
    levelRaw = c_int64()
    maxRaw = c_int64()
    sensor_ind = c_uint32(0)

    # If the status is 2 (No such file or directory), this file is missing:
    # /sys/class/drm/cardX/device/hwmon/hwmonX/pwmX
    levelStatus = rocmsmi.rsmi_dev_fan_speed_get(device, sensor_ind, byref(levelRaw))
    level = levelRaw.value if rsmi_ret_ok(levelStatus, device, 'get_fan_speed', silent) else 0

    # Same note as above applies to the maximum speed query
    maxStatus = rocmsmi.rsmi_dev_fan_speed_max_get(device, sensor_ind, byref(maxRaw))
    maximum = maxRaw.value if rsmi_ret_ok(maxStatus, device, 'get_fan_max_speed', silent) else 0

    # An earlier error is never overwritten by a later success;
    # otherwise the second status is the one reported.
    if levelStatus == rsmi_status_t.RSMI_STATUS_SUCCESS:
        status = maxStatus
    else:
        status = levelStatus

    if level == 0 or maximum == 0:
        percent = 0  # to prevent division by zero crash
    else:
        percent = round((float(level) / float(maximum)) * 100, 2)
    return (status, level, percent)
|
2020-07-15 06:01:40 -04:00
|
|
|
|
|
|
|
|
|
2023-09-07 16:20:30 -05:00
|
|
|
def getGpuUse(device, silent=False):
    """ Return the current GPU usage as a percentage, or -1 on failure

    :param device: DRM device identifier
    :param silent: Turn on to silence error output
        (you plan to handle manually). Default is off.
    """
    busy = c_uint32()
    status = rocmsmi.rsmi_dev_busy_percent_get(device, byref(busy))
    if not rsmi_ret_ok(status, device, 'GPU Utilization ', silent):
        return -1
    return busy.value
|
|
|
|
|
|
|
|
|
|
|
2024-01-31 21:03:33 -06:00
|
|
|
def getDRMDeviceId(device, silent=False):
    """ Return the hexadecimal value of a device's ID, or 'N/A' on failure

    :param device: DRM device identifier
    :param silent: Accepted for interface compatibility. Errors from this
        query are always silenced and reported as 'N/A'.
    """
    dv_id = c_short()
    ret = rocmsmi.rsmi_dev_id_get(device, byref(dv_id))
    device_id_ret = "N/A"
    if rsmi_ret_ok(ret, device, 'get_device_id', silent=True):
        # The library writes an unsigned 16-bit ID into a signed short, so
        # IDs >= 0x8000 read back negative. Mask to recover the unsigned
        # value instead of printing something like '-0x678c'.
        device_id_ret = hex(dv_id.value & 0xFFFF)
    return device_id_ret
|
2020-07-15 06:01:40 -04:00
|
|
|
|
|
|
|
|
|
2023-09-07 16:20:30 -05:00
|
|
|
def getRev(device, silent=False):
    """ Return the hexadecimal value of a device's Revision, or 'N/A' on failure

    :param device: DRM device identifier
    :param silent: Accepted for interface compatibility. Errors from this
        query are always silenced and reported as 'N/A'.
    """
    dv_rev = c_short()
    ret = rocmsmi.rsmi_dev_revision_get(device, byref(dv_rev))
    revision_ret = "N/A"
    if rsmi_ret_ok(ret, device, 'get_device_rev', silent=True):
        # The value is read through a signed short; mask to 16 bits so a
        # set top bit cannot produce a negative hex string.
        revision_ret = padHexValue(hex(dv_rev.value & 0xFFFF), 2)
    return revision_ret
|
2023-07-17 22:39:08 -05:00
|
|
|
|
2024-01-31 21:03:33 -06:00
|
|
|
def getSubsystemId(device, silent=False):
    """ Return a device's subsystem id as padded hexadecimal, or 'N/A' on failure

    :param device: DRM device identifier
    :param silent: Accepted for interface compatibility. Errors from this
        query are always silenced and reported as 'N/A'.
    """
    model = c_short()
    ret = rocmsmi.rsmi_dev_subsystem_id_get(device, byref(model))
    device_model = "N/A"
    if rsmi_ret_ok(ret, device, 'get_subsystem_name', silent=True):
        # The id is read through a signed short; mask to 16 bits so ids
        # >= 0x8000 are not turned into a negative hex string.
        # padHexValue is used for applications that expect 4-digit card models
        device_model = padHexValue(hex(model.value & 0xFFFF), 4)
    return device_model
|
|
|
|
|
|
|
|
|
|
def getVendor(device, silent=False):
    """ Return a device's vendor name, or 'N/A'

    'N/A' is returned when the query fails or when isAmdDevice() does not
    recognise the device.

    :param device: DRM device identifier
    :param silent: Turn on to silence error output
        (you plan to handle manually). Default is off.
    """
    nameBuffer = create_string_buffer(MAX_BUFF_SIZE)
    # Retrieve card vendor
    status = rocmsmi.rsmi_dev_vendor_name_get(device, nameBuffer, MAX_BUFF_SIZE)
    if not rsmi_ret_ok(status, device, 'get_vendor_name', silent):
        return "N/A"
    # Only continue if GPU vendor is AMD
    if not isAmdDevice(device):
        return "N/A"
    return nameBuffer.value.decode()
|
|
|
|
|
|
|
|
|
|
def getGUID(device, silent=False):
    """ Return the value of device's GUID (also referred to as GPU ID,
    reported by KFD), or 'N/A' on failure

    :param device: DRM device identifier
    :param silent: Turn on to silence error output
        (you plan to handle manually). Default is off.
    """
    rawGuid = c_uint64()
    status = rocmsmi.rsmi_dev_guid_get(device, byref(rawGuid))
    if not rsmi_ret_ok(status, device, 'get_gpu_id_kfd', silent=silent):
        return "N/A"
    return rawGuid.value
|
|
|
|
|
|
|
|
|
|
def getTargetGfxVersion(device, silent=False):
    """ Return the device's target graphics version as a string

    The value read from the library is rendered in hexadecimal and prefixed
    with 'gfx'. 'N/A' is returned on failure.

    :param device: DRM device identifier
    :param silent: Turn on to silence error output
        (you plan to handle manually). Default is off.
    """
    rawVersion = c_uint64()
    status = rocmsmi.rsmi_dev_target_graphics_version_get(device, byref(rawVersion))
    if not rsmi_ret_ok(status, device, 'get_target_gfx_version', silent=silent):
        return "N/A"
    # hex() yields '0x...'; drop the prefix before adding 'gfx'
    hexDigits = hex(rawVersion.value)[2:]
    return "gfx" + str(hexDigits)
|
|
|
|
|
|
|
|
|
|
def getNodeId(device, silent=False):
    """ Return the value of device's node id reported by KFD,
    or 'N/A' on failure

    :param device: DRM device identifier
    :param silent: Turn on to silence error output
        (you plan to handle manually). Default is off.
    """
    rawNode = c_uint32()
    status = rocmsmi.rsmi_dev_node_id_get(device, byref(rawNode))
    if not rsmi_ret_ok(status, device, 'get_node_id_kfd', silent=silent):
        return "N/A"
    return rawNode.value
|
|
|
|
|
|
|
|
|
|
def getDeviceName(device, silent=False):
    """ Return the device's market name as a string, or 'N/A' on failure

    :param device: DRM device identifier
    :param silent: Turn on to silence error output
        (you plan to handle manually). Default is off.
    """
    # Retrieve the device series
    nameBuffer = create_string_buffer(MAX_BUFF_SIZE)
    status = rocmsmi.rsmi_dev_market_name_get(device, nameBuffer, MAX_BUFF_SIZE)
    if not rsmi_ret_ok(status, device, 'get_name', silent=silent):
        return "N/A"
    return nameBuffer.value.decode()
|
2023-07-17 22:39:08 -05:00
|
|
|
|
2023-09-07 16:20:30 -05:00
|
|
|
def getMaxPower(device, silent=False):
    """ Return the maximum power cap of a given device, or -1 on failure

    The library value is divided by 1000000 and floored; the result is
    returned as a float.

    :param device: DRM device identifier
    :param silent: Turn on to silence error output
        (you plan to handle manually). Default is off.
    """
    capRaw = c_uint64()
    status = rocmsmi.rsmi_dev_power_cap_get(device, 0, byref(capRaw))
    if not rsmi_ret_ok(status, device, 'get_power_cap', silent):
        return -1
    scaled = float(capRaw.value / 1000000)
    # take floor of result (round down to nearest integer)
    return scaled // 1
|
|
|
|
|
|
2024-03-14 23:44:57 -05:00
|
|
|
def getAllocatedMemoryPercent(device):
    """ Return dictionary of allocated memory (VRAM) of a given device

    Response dictionary:

    .. code-block:: python

        {
            'value': float allocated vram memory (floor of %) or 'N/A',
            'unit': %,
            'combined': string (eg. '30%') or 'N/A',
            'ret': rsmi_status_t.RSMI_STATUS_SUCCESS or
                   rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED
        }

    The 'N/A' / RSMI_STATUS_NOT_SUPPORTED form is returned when the used
    amount is unavailable, or the total is unavailable or zero.

    :param device: DRM device identifier
    """
    report = {
        'value': "N/A",
        'unit': '%',
        'combined': "N/A",
        'ret': rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED
    }
    used, total = getMemInfo(device, 'vram', silent=True)
    if used is None:
        return report
    # A missing or zero total cannot yield a percentage
    if total is None or float(total) == 0:
        return report

    # take floor of result (round down to nearest integer)
    percent = (100 * (float(used) / float(total))) // 1
    report['value'] = percent
    # left aligned value with no precision
    report['combined'] = '{:<.0f}%'.format(percent)
    report['ret'] = rsmi_status_t.RSMI_STATUS_SUCCESS
    return report
|
|
|
|
|
|
2020-07-15 06:01:40 -04:00
|
|
|
|
2023-09-07 16:20:30 -05:00
|
|
|
def getMemInfo(device, memType, silent=False):
    """ Returns a tuple of (memory_used, memory_total) of
    the requested memory type usage for the device specified

    Either element is None when its query fails; (None, None) is returned
    for a memory type that is not listed in memory_type_l.

    :param device: DRM device identifier
    :param memType: [vram|vis_vram|gtt] Memory type to return
        (case-insensitive)
    :param silent: Turn on to silence error output
        (you plan to handle manually). Default is off,
        which exposes any issue accessing the different
        memory types.
    """
    memType = memType.upper()
    if memType not in memory_type_l:
        printErrLog(device, 'Invalid memory type %s' % (memType))
        return (None, None)

    # The library identifies the memory type by its position in memory_type_l
    typeIndex = memory_type_l.index(memType)
    usedRaw = c_uint64()
    totalRaw = c_uint64()

    status = rocmsmi.rsmi_dev_memory_usage_get(device, typeIndex, byref(usedRaw))
    used = usedRaw.value if rsmi_ret_ok(status, device, 'get_memory_usage_' + str(memType), silent) else None

    status = rocmsmi.rsmi_dev_memory_total_get(device, typeIndex, byref(totalRaw))
    total = totalRaw.value if rsmi_ret_ok(status, device, 'get_memory_total_' + str(memType), silent) else None

    return (used, total)
|
|
|
|
|
|
|
|
|
|
|
2020-12-10 17:42:41 -06:00
|
|
|
def getProcessName(pid):
    """ Get the process name of a specific pid

    The name is obtained from 'ps -p <pid> -o comm='. 'UNKNOWN' is returned
    when pid is smaller than 1, when ps fails (for example because the
    process does not exist or ps is unavailable), or when ps prints nothing.

    :param pid: Process ID of a program to be parsed
    """
    pid = int(pid)
    if pid < 1:
        logging.debug('PID must be greater than 0')
        return 'UNKNOWN'
    try:
        # Argument list instead of a shell string: no shell is spawned and
        # nothing has to be quoted.
        rawName = subprocess.check_output(['ps', '-p', str(pid), '-o', 'comm='])
    except (subprocess.CalledProcessError, OSError):
        return 'UNKNOWN'
    # Decode the bytes properly rather than trimming the repr() of a bytes
    # object, which broke on names with quotes or non-ASCII characters.
    pName = rawName.decode(errors='replace').rstrip('\n')
    return pName if pName else 'UNKNOWN'
|
|
|
|
|
|
|
|
|
|
|
2023-09-07 16:20:30 -05:00
|
|
|
def getPerfLevel(device, silent=False):
    """ Return the current performance level of a given device,
    or 'N/A' on failure

    :param device: DRM device identifier
    :param silent: Turn on to silence error output
        (you plan to handle manually). Default is off.
    """
    level = rsmi_dev_perf_level_t()
    status = rocmsmi.rsmi_dev_perf_level_get(device, byref(level))
    if not rsmi_ret_ok(status, device, 'get_perf_level', silent):
        return 'N/A'
    return perf_level_string(level.value)
|
2020-07-15 06:01:40 -04:00
|
|
|
|
|
|
|
|
|
|
|
|
|
def getPid(name):
    """ Get the process id of a specific application

    Returns the raw output of 'pidof <name>' as produced by check_output.

    :param name: Process name of a program to be parsed
    """
    command = ['pidof', name]
    pidOutput = check_output(command)
    return pidOutput
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def getPidList():
    """ Return a list of KFD process IDs (as strings)

    None is returned when either library query fails; the failure is
    reported through rsmi_ret_ok.
    """
    num_items = c_uint32()
    # First call with no buffer only asks how many processes there are
    ret = rocmsmi.rsmi_compute_process_info_get(None, byref(num_items))
    if not rsmi_ret_ok(ret, metric='get_compute_process_info'):
        return None

    # Leave some head room for processes that start between the two calls,
    # and tell the library the real capacity so the head room can be used.
    buff_sz = num_items.value + 10
    procs = (rsmi_process_info_t * buff_sz)()
    num_items.value = buff_sz
    ret = rocmsmi.rsmi_compute_process_info_get(byref(procs), byref(num_items))
    # Do not build a list from a buffer the library failed to fill
    if not rsmi_ret_ok(ret, metric='get_compute_process_info'):
        return None

    # Never index past the buffer, whatever count is reported back
    count = min(num_items.value, buff_sz)
    return ['%s' % (procs[i].process_id) for i in range(count)]
|
|
|
|
|
|
|
|
|
|
|
2024-02-02 00:00:38 -06:00
|
|
|
def getPower(device):
    """ Return dictionary of power responses.

    Response power dictionary:

    .. code-block:: python

        {
            'power': string wattage response or 'N/A' (for not RSMI_STATUS_SUCCESS),
            'power_type': power type string looked up in rsmi_power_type_dict, or 'N/A',
            'unit': W (Watt)
            'ret': response of rsmi_dev_power_get(device, byref(power), byref(power_type))
        }

    :param device: DRM device identifier
    """
    reading = c_int64(0)
    readingType = rsmi_power_type_t()
    status = rocmsmi.rsmi_dev_power_get(device, byref(reading), byref(readingType))

    if status != rsmi_status_t.RSMI_STATUS_SUCCESS:
        return {
            'power': "N/A",
            'power_type': "N/A",
            'unit': 'W',
            'ret': status
        }
    # The library value is divided by 1000000 to express it in the unit 'W'
    return {
        'power': str(reading.value / 1000000),
        'power_type': rsmi_power_type_dict[readingType.value],
        'unit': 'W',
        'ret': status
    }
|
2020-07-15 06:01:40 -04:00
|
|
|
|
|
|
|
|
|
2023-09-07 16:20:30 -05:00
|
|
|
def getRasEnablement(device, block, silent=True):
    """ Return RAS enablement state for a given device, or 'N/A' on failure

    :param device: DRM device identifier
    :param block: RAS block identifier (a key of rsmi_gpu_block_d)
    :param silent: Turn on to silence error output
        (you plan to handle manually). Default is on.
    """
    eccState = rsmi_ras_err_state_t()
    status = rocmsmi.rsmi_dev_ecc_status_get(device, rsmi_gpu_block_d[block], byref(eccState))
    if not rsmi_ret_ok(status, device, 'get_ecc_status_' + str(block), silent):
        return 'N/A'
    stateName = rsmi_ras_err_stale_machine[eccState.value]
    return stateName.upper()
|
|
|
|
|
|
|
|
|
|
|
2023-09-07 16:20:30 -05:00
|
|
|
def getTemp(device, sensor, silent=True):
    """ Return the current temperature from a given device's sensor,
    or 'N/A' on failure

    The value read from the library is divided by 1000 before it is returned.

    :param device: DRM device identifier
    :param sensor: Temperature sensor identifier (an entry of temp_type_lst)
    :param silent: Turn on to silence error output
        (you plan to handle manually). Default is on.
    """
    reading = c_int64(0)
    currentMetric = rsmi_temperature_metric_t.RSMI_TEMP_CURRENT
    # The library identifies the sensor by its position in temp_type_lst
    sensorIndex = temp_type_lst.index(sensor)
    status = rocmsmi.rsmi_dev_temp_metric_get(c_uint32(device), sensorIndex, currentMetric, byref(reading))
    if not rsmi_ret_ok(status, device, 'get_temp_metric' + str(sensor), silent):
        return 'N/A'
    return reading.value / 1000
|
|
|
|
|
|
2023-08-10 18:25:02 -05:00
|
|
|
def findFirstAvailableTemp(device):
    """ Probe the temperature sensors in order and report the first that answers

    Returns a tuple of (temp_type, temp_value) for the device specified.
    On success temp_type is the capitalized sensor name wrapped in
    parentheses and temp_value is the reading divided by 1000. If no
    sensor answers, the tuple is (temp_type_lst[0], "N/A").

    :param device: DRM device identifier
    """
    reading = c_int64(0)
    currentMetric = rsmi_temperature_metric_t.RSMI_TEMP_CURRENT
    for sensorIndex, sensorName in enumerate(temp_type_lst):
        status = rocmsmi.rsmi_dev_temp_metric_get(c_uint32(device), sensorIndex, currentMetric, byref(reading))
        # Errors are silenced: an unavailable sensor just means "try the next one"
        if not rsmi_ret_ok(status, device, 'get_temp_metric_' + sensorName, silent=True):
            continue
        return ('(' + sensorName.capitalize() + ')', reading.value / 1000)
    # Nothing responded: fall back to the first sensor's raw name
    return (temp_type_lst[0], "N/A")
|
2020-07-15 06:01:40 -04:00
|
|
|
|
2023-09-24 02:29:07 -05:00
|
|
|
def getTemperatureLabel(deviceList):
    """ Discover the temperature label to use, based on the first device

    Returns a lower-case string label with any parentheses removed.
    When the list is empty the label is the first entry of temp_type_lst.

    :param deviceList: List of DRM devices (only the first one is probed)
    """
    # Default label is the first known sensor type
    fallbackLabel = temp_type_lst[0].lower()
    if len(deviceList) == 0:
        return fallbackLabel
    sensorType, _ = findFirstAvailableTemp(deviceList[0])
    # findFirstAvailableTemp may wrap the name in '(' ')'; drop those characters
    return ''.join(ch for ch in sensorType.lower() if ch not in '()')
|
|
|
|
|
|
|
|
|
|
def getPowerLabel(deviceList):
    """ Discover the power label to use, based on the first device

    Returns a rsmi_power_label value: AVG_POWER by default, or
    CURRENT_SOCKET_POWER when getPower() succeeds for the first device and
    reports the 'CURRENT SOCKET' power type.

    :param deviceList: List of DRM devices (only the first one is probed)
    """
    # Default label is AvgPower
    powerLabel = rsmi_power_label.AVG_POWER
    if len(deviceList) < 1:
        return powerLabel
    device = deviceList[0]
    power_dict = getPower(device)
    if (power_dict['ret'] == rsmi_status_t.RSMI_STATUS_SUCCESS and
            power_dict['power_type'] == 'CURRENT SOCKET'):
        powerLabel = rsmi_power_label.CURRENT_SOCKET_POWER
    return powerLabel
|
|
|
|
|
|
2023-09-07 16:20:30 -05:00
|
|
|
def getVbiosVersion(device, silent=False):
    """ Fetch the VBIOS version string of a device

    :param device: DRM device identifier
    :param silent: Turn on to silence error output
        (you plan to handle manually). Default is off.
    :return: The decoded version string, or "N/A" when the call fails or
        the library hands back an empty string
    """
    bufferLen = 256
    versionBuf = create_string_buffer(bufferLen)
    status = rocmsmi.rsmi_dev_vbios_version_get(device, versionBuf, bufferLen)
    if not rsmi_ret_ok(status, device, silent=silent):
        return "N/A"
    decoded = versionBuf.value.decode()
    return decoded if decoded != "" else "N/A"
|
2020-07-15 06:01:40 -04:00
|
|
|
|
|
|
|
|
|
2023-09-07 16:20:30 -05:00
|
|
|
def getVersion(deviceList, component, silent=False):
    """ Fetch the software version string of the specified component

    :param deviceList: List of DRM devices (can be a single-item list);
        not consulted by this function
    :param component: Component (currently only driver)
    :param silent: Turn on to silence error output
        (you plan to handle manually). Default is off.
    :return: The decoded version string, or None when the call fails
    """
    bufferLen = 256
    versionBuf = create_string_buffer(bufferLen)
    status = rocmsmi.rsmi_version_str_get(component, versionBuf, bufferLen)
    # The query is system-wide, so no device is attached to the error report
    if not rsmi_ret_ok(status, None, 'get_version_str_' + str(component), silent):
        return None
    return versionBuf.value.decode()
|
|
|
|
|
|
|
|
|
|
|
2023-09-07 16:20:30 -05:00
|
|
|
def getComputePartition(device, silent=True):
    """ Fetch the current compute partition of a given device

    :param device: DRM device identifier
    :param silent: Turn on to silence error output
        (you plan to handle manually). Default is on.
    :return: The partition string, or "N/A" when the call fails or the
        library returns an empty string
    """
    partitionBuf = create_string_buffer(MAX_BUFF_SIZE)
    status = rocmsmi.rsmi_dev_compute_partition_get(device, partitionBuf, MAX_BUFF_SIZE)
    if not rsmi_ret_ok(status, device, 'get_compute_partition', silent):
        return "N/A"
    partitionName = partitionBuf.value.decode()
    if not partitionName:
        return "N/A"
    return str(partitionName)
|
2023-02-14 17:06:03 -06:00
|
|
|
|
|
|
|
|
|
2023-09-07 16:20:30 -05:00
|
|
|
def getMemoryPartition(device, silent=True):
    """ Fetch the current memory partition of a given device

    :param device: DRM device identifier
    :param silent: Turn on to silence error output
        (you plan to handle manually). Default is on.
    :return: The partition string, or "N/A" when the call fails or the
        library returns an empty string
    """
    partitionBuf = create_string_buffer(MAX_BUFF_SIZE)
    status = rocmsmi.rsmi_dev_memory_partition_get(device, partitionBuf, MAX_BUFF_SIZE)
    callSucceeded = rsmi_ret_ok(status, device, 'get_memory_partition', silent)
    # Only decode once the call is known to have succeeded
    partitionName = partitionBuf.value.decode() if callSucceeded else ''
    return str(partitionName) if partitionName else "N/A"
|
2023-02-14 17:06:03 -06:00
|
|
|
|
2024-11-06 15:13:32 -06:00
|
|
|
def getMemoryPartitionCapabilities(device, silent=True):
    """ Return the memory partition capabilities of a given device

    :param device: DRM device identifier
    :param silent: Turn on to silence error output
        (you plan to handle manually). Default is on.
    :return: The capabilities string reported by the library, or "N/A"
        when the call fails or the string is empty
    """
    memoryPartitionCapabilities = create_string_buffer(MAX_BUFF_SIZE)
    ret = rocmsmi.rsmi_dev_memory_partition_capabilities_get(device, memoryPartitionCapabilities, MAX_BUFF_SIZE)
    # Label failures with this query's own name so error output is not
    # attributed to the compute-partition getter
    if not rsmi_ret_ok(ret, device, 'get_memory_partition_capabilities', silent):
        return "N/A"
    capabilities = memoryPartitionCapabilities.value.decode()
    return str(capabilities) if capabilities else "N/A"
|
|
|
|
|
|
2023-02-14 17:06:03 -06:00
|
|
|
|
2020-07-15 06:01:40 -04:00
|
|
|
def print2DArray(dataArray):
    """ Print 2D Array with uniform spacing

    Every cell is rendered with str(), left-justified to the widest cell of
    its column and followed by a tab. In normal mode each row is sent to
    printLog. In JSON mode the header row (row 0) is skipped and every other
    row is sent to printSysLog, keyed by its first whitespace-separated
    token (prefixed with 'PID' when the table's first header cell is 'PID').

    :param dataArray: List of rows; row 0 determines how many columns are
        printed. An empty table prints nothing.
    """
    global PRINT_JSON
    # An empty table (or an empty first row) has nothing to measure or print
    if not dataArray or not dataArray[0]:
        return
    columnCount = len(dataArray[0])
    isPid = str(dataArray[0][0]) == 'PID'
    # Measure the rendered text, so non-string cells (e.g. ints) are sized
    # exactly the way they are printed below
    columnWidths = [max(len(str(row[col])) for row in dataArray)
                    for col in range(columnCount)]
    for position, row in enumerate(dataArray):
        printString = ''.join(str(row[col]).ljust(columnWidths[col], ' ') + '\t'
                              for col in range(columnCount))
        if not PRINT_JSON:
            printLog(None, printString, None)
            continue
        # JSON mode: collapse the padding, lower-case, then split key from values
        printString = ' '.join(printString.split()).lower()
        # partition() copes with a single-token row, which has no remainder
        firstElement, _, printString = printString.partition(' ')
        printString = printString.replace(' ', ', ')
        if position > 0:
            if isPid:
                printSysLog('PID%s' % (firstElement), printString)
            else:
                printSysLog(firstElement, printString)
|
2020-07-15 06:01:40 -04:00
|
|
|
|
|
|
|
|
|
|
|
|
|
def printEmptyLine():
    """ Emit one blank line on stdout, unless JSON output is active """
    global PRINT_JSON
    if PRINT_JSON:
        return
    print()
|
|
|
|
|
|
|
|
|
|
|
2024-09-10 11:25:55 -05:00
|
|
|
def printErrLog(device, err, is_warning=False):
    """ Send an error (or warning) message to the SMI log, one record per line

    In JSON mode every line is logged at debug level instead, regardless of
    is_warning.

    :param device: DRM device identifier
    :param err: Error string to print; split on newlines
    :param is_warning: Log at warning level instead of error level
    """
    global PRINT_JSON
    # Choose the logging call once; it is the same for every line of err
    if PRINT_JSON:
        emit = logging.debug
    elif is_warning:
        emit = logging.warning
    else:
        emit = logging.error
    for textLine in err.split('\n'):
        emit('GPU[%s]\t: %s' % (device, textLine))
|
|
|
|
|
|
|
|
|
|
|
2022-06-09 19:52:38 -04:00
|
|
|
def printInfoLog(device, metricName, value):
    """ Print out an info line to the SMI log

    Nothing is logged in JSON mode. When device is None the 'GPU[..]'
    prefix is left out entirely.

    :param device: DRM device identifier (None for a device-less message)
    :param metricName: Title of the item to print to the log
    :param value: The item's value to print to the log (None to log the
        title alone)
    """
    global PRINT_JSON
    if PRINT_JSON:
        return
    if value is not None:
        message = '%s: %s' % (metricName, value)
    else:
        message = '%s' % (metricName,)
    # Add the prefix only when there is a device, rather than slicing a
    # fixed number of characters back off a 'GPU[None]' prefix
    if device is not None:
        message = 'GPU[%s]\t: %s' % (device, message)
    logging.info(message)
|
|
|
|
|
|
|
|
|
|
|
2021-03-17 00:24:29 -04:00
|
|
|
def printEventList(device, delay, eventList):
    """ Listen for notification events on a device and print them as they arrive

    Initializes event notification for the device, applies a mask built
    from eventList, then polls until the module-level stop_threads flag is
    set, and finally stops event notification for the device.

    :param device: DRM device identifier
    :param delay: Notification delay in ms
    :param eventList: List of event type names (can be a single-item list)
    """
    status = rocmsmi.rsmi_event_notification_init(device)
    if not rsmi_ret_ok(status, device, 'event_notification_init'):
        printErrLog(device, 'Unable to initialize event notifications.')
        return

    # One bit per requested event, positioned by its index in notification_type_names
    eventMask = 0
    for eventName in eventList:
        eventMask |= 1 << notification_type_names.index(eventName.upper())
    status = rocmsmi.rsmi_event_notification_mask_set(device, eventMask)
    if not rsmi_ret_ok(status, device, 'set_event_notification_mask'):
        printErrLog(device, 'Unable to set event notification mask.')
        return

    # Exit condition from user keyboard input of 'q' or 'ctrl + c'
    while not stop_threads:
        eventCount = c_uint32(1)
        eventData = rsmi_evt_notification_data_t(1)
        rocmsmi.rsmi_event_notification_get(delay, byref(eventCount), byref(eventData))
        if len(eventData.message) <= 0:
            # Nothing was delivered this round; poll again
            continue
        eventRow = [
            '\rGPU[%d]:\t' % (eventData.dv_ind),
            ctime().split()[3],  # the HH:MM:SS field of ctime()
            notification_type_names[eventData.event.value - 1],
            eventData.message.decode('utf8') + '\r',
        ]
        print2DArray([eventRow])

    status = rocmsmi.rsmi_event_notification_stop(device)
    if not rsmi_ret_ok(status, device, 'stop_event_notification'):
        printErrLog(device, 'Unable to end event notifications.')
|
2021-03-17 00:24:29 -04:00
|
|
|
|
2024-06-21 15:13:15 -05:00
|
|
|
def printLog(device, metricName, value=None, extraSpace=False, useItalics=False, xcp=None):
    """ Print out to the SMI log

    In JSON mode nothing is printed; the entry is handed to formatJson
    when a device is given, and dropped otherwise.

    :param device: DRM device identifier (None prints the message without
        the 'GPU[..]' prefix)
    :param metricName: Title of the item to print to the log
    :param value: The item's value to print to the log
    :param extraSpace: Print an empty line before the message
    :param useItalics: Wrap the message in ANSI italics escape codes
    :param xcp: Partition id shown as 'XCP[..]'; only used when value is
        not None
    """
    global PRINT_JSON
    italics = '\033[3m'
    end = '\033[0m'
    if PRINT_JSON:
        if value is not None and device is not None:
            formatJson(device, str(metricName) + ': ' + str(value))
        elif device is not None:
            formatJson(device, str(metricName))
        return
    if value is not None:
        if xcp is None:
            logstr = 'GPU[%s]\t\t: %s: %s' % (device, metricName, value)
        else:
            logstr = 'GPU[%s] XCP[%s]\t: %s: %s' % (device, xcp, metricName, value)
    else:
        logstr = 'GPU[%s]\t\t: %s' % (device, metricName)
    if device is None:
        # Drop everything up to and including the first ': ' (the GPU prefix)
        logstr = logstr.split(':', 1)[1][1:]
    if useItalics:
        logstr = italics + logstr + end

    # Force thread safe printing
    # NOTE(review): this lock is created inside the call, so it is never
    # shared with another caller and cannot serialize output across threads
    # as written. A shared lock would be needed for that - confirm intent.
    lock = multiprocessing.Lock()
    with lock:
        try:
            if extraSpace:
                print('\n', end='')
            # Handle non UTF-8 locale
            try:
                print(logstr.encode('utf-8', 'ignore').decode('utf-8'))
            except UnicodeError:
                print(logstr.encode('ascii', 'ignore').decode('ascii'))
            sys.stdout.flush()
        # when piped into programs like 'head' - print throws an error.
        # silently ignore instead
        except (BrokenPipeError, IOError):
            # https://docs.python.org/3/library/signal.html#note-on-sigpipe
            # Python flushes standard streams on exit; redirect remaining output
            # to devnull to avoid another BrokenPipeError at shutdown
            devnull = os.open(os.devnull, os.O_WRONLY)
            os.dup2(devnull, sys.stdout.fileno())
            sys.exit(1)  # Python exits with error code 1 on EPIPE
|
2020-07-15 06:01:40 -04:00
|
|
|
|
|
|
|
|
|
|
|
|
|
def printListLog(metricName, valuesList):
    """ Print a list of values under a title, wrapped to the application width

    Values are separated by single spaces; a new output line is started
    whenever adding the next value would reach appWidth. An empty list
    prints just 'None'. Nothing is printed in JSON mode.

    :param metricName: Title of the item to print to the log
    :param valuesList: The item's list of values to print to the log
    """
    global PRINT_JSON
    finishedLines = []
    currentLine = metricName + ':\n'
    if valuesList:
        for item in valuesList:
            token = str(item) + ' '
            if (len(currentLine) + len(token)) < appWidth:
                currentLine += token
                continue
            # Line is full: bank it and start the next one with this value
            finishedLines.append(currentLine + '\n')
            currentLine = token
    else:
        currentLine = 'None'
    if not PRINT_JSON:
        print(''.join(finishedLines) + currentLine)
|
|
|
|
|
|
|
|
|
|
|
2023-09-24 02:29:07 -05:00
|
|
|
def printLogSpacer(displayString=None, fill='=', contentSizeToFit=0):
    """ Print a separator line, optionally with a title centered inside it

    With no arguments the line is the fill string repeated across the
    application width.

    :param displayString: name of item to be displayed inside of the log spacer
    :param fill: padding string which surrounds the given display string
    :param contentSizeToFit: providing an integer > 0 allows
        ability to dynamically change output padding/fill based on this value
        instead of appWidth. Handy for concise info output.
    """
    global appWidth, PRINT_JSON
    totalWidth = appWidth
    if contentSizeToFit != 0:
        totalWidth = contentSizeToFit
    # Work with an even width so both sides of the title get equal padding
    if totalWidth % 2:
        totalWidth += 1

    if PRINT_JSON:
        return
    if not displayString:
        print(fill * totalWidth)
        return
    # Make the title even-length too, by appending one fill
    if len(displayString) % 2:
        displayString += fill
    sidePadding = fill * int((totalWidth - (len(displayString))) / 2)
    print(sidePadding + displayString + sidePadding)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def printSysLog(SysComponentName, value):
    """ Print a system-level (not per-GPU) entry to the SMI log

    In JSON mode the entry is stored under the 'system' key through
    formatJson; otherwise it is logged at debug level and printed.

    :param SysComponentName: Title of the item to print to the log
    :param value: The item's value to print to the log
    """
    global PRINT_JSON, JSON_DATA
    if not PRINT_JSON:
        entry = f'{SysComponentName}: {value}'
        logging.debug(entry)
        print(entry)
        return

    # Make sure the 'system' bucket exists before formatJson writes to it
    if 'system' not in JSON_DATA:
        JSON_DATA['system'] = {}
    formatJson('system', str(SysComponentName) + ': ' + str(value))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def printTableLog(column_headers, data_matrix, device=None, tableName=None, anchor='>', v_delim=' '):
    """ Print a table (header row plus data rows) to stdout

    Nothing is printed when OUTPUT_SERIALIZATION or PRINT_JSON is set.

    :param column_headers: Header names for each column
    :param data_matrix: Matrix of values
    :param device: DRM device identifier
    :param tableName: Title of the table to print to the log
    :param anchor: Format-spec alignment character applied to the cells
        ('>' right-aligns, '<' left-aligns)
    :param v_delim: Boundary String delimiter for the print output
    """
    # Usage: each column is as wide as its header text.
    # If additional space is needed, pad the corresponding header with spaces.
    global OUTPUT_SERIALIZATION, PRINT_JSON
    if OUTPUT_SERIALIZATION or PRINT_JSON:
        return

    # Optional title line: "GPU[n]: <tab>tableName"
    hasDevice = device is not None
    if hasDevice or tableName:
        if hasDevice:
            print('\nGPU[%s]: ' % (device), end='\t')
        if tableName:
            print(tableName, end='')
        printEmptyLine()

    # Header row, printed as-is
    for headerText in column_headers:
        print('{:>}'.format(headerText), end=v_delim)
    printEmptyLine()

    # Data rows: every cell is padded to the width of its column's header
    for dataRow in data_matrix:
        for columnIndex, cellValue in enumerate(dataRow):
            shown = 'None' if cellValue is None else cellValue
            columnWidth = len(column_headers[columnIndex])
            print('{:{anc}{width}}'.format(shown, anc=anchor, width=columnWidth), end=v_delim)
        printEmptyLine()
|
|
|
|
|
|
2021-07-29 12:43:54 -04:00
|
|
|
|
2020-09-09 17:34:44 -04:00
|
|
|
def printTableRow(space, displayString, v_delim=" "):
    """ Print one cell of a matrix table, without a trailing newline

    :param space: %-style format string used to lay out the item; when
        empty or None the item is printed as-is
    :param displayString: The item's value to print
    :param v_delim: Boundary String delimiter for the print output
    """
    cellText = space % (displayString) if space else displayString
    print(cellText, end=v_delim)
|
|
|
|
|
|
2020-08-05 16:30:22 -04:00
|
|
|
|
2021-07-29 11:39:34 -04:00
|
|
|
def checkIfSecondaryDie(device):
    """ Report whether a GCD (die) is the secondary die in a MCM.

    MI200 device specific feature check.
    The secondary dies lacks power management features.

    :param device: The device to check
    :return: True when the energy counter query succeeds and reads 0,
        False otherwise
    """
    energyCounter = c_uint64()
    counterResolution = c_float()
    sampleTime = c_uint64()

    # secondary die can be determined by checking if energy counter == 0
    status = rocmsmi.rsmi_dev_energy_count_get(device, byref(energyCounter), byref(counterResolution), byref(sampleTime))
    querySucceeded = rsmi_ret_ok(status, None, 'energy_count_secondary_die_check', silent=False)
    return bool(querySucceeded and energyCounter.value == 0)
|
2020-08-05 16:30:22 -04:00
|
|
|
|
2020-07-15 06:01:40 -04:00
|
|
|
def resetClocks(deviceList):
    """ Reset clocks to default

    Reset clocks to default values by setting performance level to auto, as well
    as setting OverDrive back to 0

    :param deviceList: List of DRM devices (can be a single-item list)
    """
    printLogSpacer(' Reset Clocks ')
    for device in deviceList:
        ret = rocmsmi.rsmi_dev_overdrive_level_set(device, rsmi_dev_perf_level_t(0))
        if rsmi_ret_ok(ret, device, 'set_overdrive_level'):
            printLog(device, 'OverDrive set to 0', None)
        # A single perf-level call covers both the clock reset and the return
        # to auto, so it is issued once and both confirmations are reported.
        # presumably level 0 is "auto", as the message below says - confirm
        ret = rocmsmi.rsmi_dev_perf_level_set(device, rsmi_dev_perf_level_t(0))
        if rsmi_ret_ok(ret, device, 'set_perf_level'):
            printLog(device, 'Successfully reset clocks', None)
            printLog(device, 'Performance level reset to auto', None)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def resetFans(deviceList):
    """ Hand fan control back to the driver for a list of devices.

    A confirmation is printed for each device whose reset succeeds;
    failures are silent.

    :param deviceList: List of DRM devices (can be a single-item list)
    """
    printLogSpacer(' Reset GPU Fan Speed ')
    for device in deviceList:
        fanSensor = c_uint32(0)
        status = rocmsmi.rsmi_dev_fan_reset(device, fanSensor)
        if not rsmi_ret_ok(status, device, silent=True):
            continue
        printLog(device, 'Successfully reset fan speed to driver control', None)
    printLogSpacer()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def resetPowerOverDrive(deviceList, autoRespond):
    """ Put Power OverDrive back to its default by setting it to 0

    :param deviceList: List of DRM devices (can be a single-item list)
    :param autoRespond: Response to automatically provide for all prompts;
        passed through to setPowerOverDrive
    """
    defaultOverDrive = 0
    setPowerOverDrive(deviceList, defaultOverDrive, autoRespond)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def resetProfile(deviceList):
    """ Switch each device in the list back to the 'BOOTUP DEFAULT' power profile.

    :param deviceList: List of DRM devices (can be a single-item list)
    """
    printLogSpacer(' Reset Profile ')
    for device in deviceList:
        status = rocmsmi.rsmi_dev_power_profile_set(device, 0, profileString('BOOTUP DEFAULT'))
        if not rsmi_ret_ok(status, device, 'set_power_profile'):
            continue
        printLog(device, 'Successfully reset Power Profile', None)
    printLogSpacer()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def resetXgmiErr(deviceList):
    """ Clear the XGMI error status on each device in the list

    :param deviceList: Reset XGMI error count for these devices
    """
    printLogSpacer('Reset XGMI Error Status ')
    for device in deviceList:
        status = rocmsmi.rsmi_dev_xgmi_error_reset(device)
        resetOk = rsmi_ret_ok(status, device, 'reset xgmi')
        if not resetOk:
            continue
        printLog(device, 'Successfully reset XGMI Error count', None)
    printLogSpacer()
|
|
|
|
|
|
|
|
|
|
|
2020-10-30 13:40:26 -04:00
|
|
|
def resetPerfDeterminism(deviceList):
    """ Turn Performance Determinism off by setting the performance level to 0

    :param deviceList: Disable Performance Determinism for these devices
    """
    printLogSpacer('Disable Performance Determinism')
    for device in deviceList:
        status = rocmsmi.rsmi_dev_perf_level_set(device, rsmi_dev_perf_level_t(0))
        disabledOk = rsmi_ret_ok(status, device, 'disable performance determinism')
        if not disabledOk:
            continue
        printLog(device, 'Successfully disabled performance determinism', None)
    printLogSpacer()
|
|
|
|
|
|
|
|
|
|
|
2021-04-20 10:31:36 -04:00
|
|
|
def setClockRange(deviceList, clkType, minvalue, maxvalue, autoRespond):
    """ Set the range for the specified clktype in the PowerPlay table for a list of devices.

    Sets RETCODE to 1 and returns early when clkType is not recognised or
    either bound is not an integer. RETCODE is also set to 1 when a device
    rejects the range for any reason other than "not supported".

    :param deviceList: List of DRM devices (can be a single-item list)
    :param clkType: [sclk|mclk] Which clock type to apply the range to
    :param minvalue: Minimum value to apply to the clock range
    :param maxvalue: Maximum value to apply to the clock range
    :param autoRespond: Response to automatically provide for all prompts
    """
    global RETCODE
    if clkType not in {'sclk', 'mclk'}:
        printLog(None, 'Invalid range identifier %s' % (clkType), None)
        logging.error('Unsupported range type %s', clkType)
        RETCODE = 1
        return
    try:
        # Convert once, up front: this validates both bounds and gives the
        # per-device loop ready-made integers
        minMHz = int(minvalue)
        maxMHz = int(maxvalue)
    except (TypeError, ValueError):
        printErrLog(None, 'Unable to set %s range' % (clkType))
        logging.error('%s or %s is not an integer', minvalue, maxvalue)
        RETCODE = 1
        return
    confirmOutOfSpecWarning(autoRespond)
    printLogSpacer(' Set Valid %s Range ' % (clkType))
    for device in deviceList:
        ret = rocmsmi.rsmi_dev_clk_range_set(device, minMHz, maxMHz, rsmi_clk_names_dict[clkType])
        if rsmi_ret_ok(ret, device, silent=True):
            printLog(device, 'Successfully set %s from %s(MHz) to %s(MHz)' % (clkType, minvalue, maxvalue), None)
        elif ret == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED:
            # Unsupported hardware is reported but not forced to exit code 1 here
            printLog(device, 'Setting %s range is not supported for this device.' % (clkType), None)
        else:
            RETCODE = 1
|
2020-08-27 15:00:53 -04:00
|
|
|
|
2024-02-09 09:22:51 -06:00
|
|
|
def setClockExtremum(deviceList, level, clkType, clkValue, autoRespond):
    """ Set the minimum or maximum of the specified clktype for a list of devices.

    Sets RETCODE to 1 and returns early when level, clkType or clkValue is
    invalid. RETCODE is also set to 1 when a device rejects the value for
    any reason other than "not supported".

    :param deviceList: List of DRM devices (can be a single-item list)
    :param level: [min|max] Minimum value or Maximum value
    :param clkType: [sclk|mclk] Which clock type to apply the value to
    :param clkValue: clock value to apply to the level
    :param autoRespond: Response to automatically provide for all prompts
    """
    global RETCODE
    knownLevels = {'min', 'max'}
    knownClocks = {'sclk', 'mclk'}

    if level not in knownLevels:
        printLog(None, 'Invalid extremum identifier %s, use min or max' % (level), None)
        logging.error('Unsupported clock extremum %s', level)
        RETCODE = 1
        return

    if clkType not in knownClocks:
        printLog(None, 'Invalid clock type identifier %s, use sclk or mclk ' % (clkType), None)
        logging.error('Unsupported clock type %s', clkType)
        RETCODE = 1
        return

    # Frequency index handed to the library: 0 selects min, 1 selects max
    point = 1 if level == "max" else 0

    # Reject a non-integer clock value before prompting the user
    try:
        int(clkValue)
    except ValueError:
        printErrLog(None, 'Unable to set %s' % (clkValue))
        logging.error('%s is not an integer', clkValue)
        RETCODE = 1
        return

    confirmOutOfSpecWarning(autoRespond)
    printLogSpacer(' Set Valid %s Extremum ' % (clkType))
    for device in deviceList:
        status = rocmsmi.rsmi_dev_clk_extremum_set(device, rsmi_freq_ind_t(int(point)), int(clkValue), rsmi_clk_names_dict[clkType])
        if rsmi_ret_ok(status, device, silent=True):
            printLog(device, 'Successfully set %s %s to %s(MHz)' % (level, clkType, clkValue), None)
            continue
        if status == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED:
            printLog(device, 'Setting %s %s clock is not supported for this device.' % (level, clkType), None)
            continue
        RETCODE = 1
|
2024-02-09 09:22:51 -06:00
|
|
|
|
2020-08-27 15:00:53 -04:00
|
|
|
|
2024-05-03 02:58:31 -05:00
|
|
|
def setVoltageCurve(deviceList, point, clk, volt, autoRespond):
    """ Set voltage curve for a point in the PowerPlay table for a list of devices.

    Sets RETCODE to 1 and returns without touching any device when point,
    clk or volt is not an integer.

    :param deviceList: List of DRM devices (can be a single-item list)
    :param point: Point on the voltage curve to modify
    :param clk: Clock speed specified for this curve point
    :param volt: Voltage specified for this curve point
    :param autoRespond: Response to automatically provide for all prompts
    """
    global RETCODE
    value = '%s %s %s' % (point, clk, volt)
    try:
        # Convert all three fields eagerly so that a bad clk or volt is
        # rejected here instead of raising inside the per-device loop
        pointIndex, clkMHz, voltMv = [int(item) for item in (point, clk, volt)]
    except (TypeError, ValueError):
        printErrLog(None, 'Unable to set Voltage curve')
        printErrLog(None, 'Non-integer characters are present in %s' % value)
        RETCODE = 1
        return
    confirmOutOfSpecWarning(autoRespond)
    for device in deviceList:
        ret = rocmsmi.rsmi_dev_od_volt_info_set(device, pointIndex, clkMHz, voltMv)
        if rsmi_ret_ok(ret, device, 'set_voltage_curve'):
            printLog(device, 'Successfully set voltage point %s to %s(MHz) %s(mV)' % (point, clk, volt), None)
|
|
|
|
|
|
|
|
|
|
|
2020-08-27 15:00:53 -04:00
|
|
|
def setPowerPlayTableLevel(deviceList, clkType, point, clk, volt, autoRespond):
    """ Set clock frequency and voltage for a level in the PowerPlay table for a list of devices.

    point, clk and volt must all be integers; otherwise nothing is sent to
    any device and RETCODE is set to 1. Note that volt is validated and
    echoed in the success message but is not passed to
    rsmi_dev_od_clk_info_set.

    :param deviceList: List of DRM devices (can be a single-item list)
    :param clkType: [sclk|mclk] Which clock type to apply the range to
    :param point: Point on the voltage curve to modify
    :param clk: Clock speed specified for this curve point
    :param volt: Voltage specified for this curve point
    :param autoRespond: Response to automatically provide for all prompts
    """
    global RETCODE
    value = '%s %s %s' % (point, clk, volt)
    try:
        # Convert every field up front. The former any(int(item) ...) check
        # stopped at the first non-zero value, leaving later fields unchecked.
        levelIndex, clkValue, _ = [int(str(item)) for item in (point, clk, volt)]
    except ValueError:
        printErrLog(None, 'Unable to set PowerPlay table level')
        printErrLog(None, 'Non-integer characters are present in %s' %value)
        RETCODE = 1
        return
    confirmOutOfSpecWarning(autoRespond)
    for device in deviceList:
        if clkType not in ('sclk', 'mclk'):
            printErrLog(device, 'Unable to set %s range' % (clkType))
            logging.error('Unsupported range type %s', clkType)
            RETCODE = 1
            continue
        # sclk and mclk take the same call; only the clock-name lookup differs.
        ret = rocmsmi.rsmi_dev_od_clk_info_set(device, rsmi_freq_ind_t(levelIndex), clkValue,
                                               rsmi_clk_names_dict[clkType])
        if rsmi_ret_ok(ret, device, 'set_power_play_table_level_' + str(clkType)):
            printLog(device, 'Successfully set voltage point %s to %s(MHz) %s(mV)' % (point, clk, volt), None)
        else:
            RETCODE = 1
|
|
|
|
|
|
|
|
|
|
|
2020-07-15 06:01:40 -04:00
|
|
|
def setClockOverDrive(deviceList, clktype, value, autoRespond):
    """ Set clock speed to OverDrive for a list of devices

    A percentage above 20 is clamped to 20; a negative percentage aborts the
    whole operation. Devices not reporting the MANUAL performance level are
    switched to it first. For 'mclk' the percentage is written to the
    device's pp_mclk_od sysfs file; for 'sclk' it is applied through
    rsmi_dev_overdrive_level_set.

    :param deviceList: List of DRM devices (can be a single-item list)
    :param clktype: [sclk|mclk] Clock type to set
    :param value: [0-20] OverDrive percentage
    :param autoRespond: Response to automatically provide for all prompts
    """
    global RETCODE
    printLogSpacer(' Set Clock OverDrive (Range: 0% to 20%) ')
    try:
        # Parse once; TypeError covers a missing (None) value.
        percent = int(value)
    except (TypeError, ValueError):
        printLog(None, 'Unable to set OverDrive level', None)
        logging.error('%s it is not an integer', value)
        RETCODE = 1
        return
    confirmOutOfSpecWarning(autoRespond)
    for device in deviceList:
        if percent < 0:
            printErrLog(device, 'Unable to set OverDrive')
            logging.debug('Overdrive cannot be less than 0%')
            RETCODE = 1
            return
        if percent > 20:
            printLog(device, 'Setting OverDrive to 20%', None)
            logging.debug('OverDrive cannot be set to a value greater than 20%')
            percent = 20
            value = '20'
        if getPerfLevel(device) != 'MANUAL':
            ret = rocmsmi.rsmi_dev_perf_level_set(device, rsmi_dev_perf_level_t(3))
            if rsmi_ret_ok(ret, device, 'set_perf_level_manual_' + str(clktype)):
                printLog(device, 'Performance level set to manual', None)
        if clktype == 'mclk':
            fsFile = os.path.join('/sys/class/drm', 'card%d' % (device), 'device', 'pp_mclk_od')
            if not os.path.isfile(fsFile):
                printLog(None, 'Unable to write to sysfs file (' + fsFile +
                         '), file does not exist', None)
                logging.debug('%s does not exist', fsFile)
                continue
            try:
                logging.debug('Writing value \'%s\' to file \'%s\'', value, fsFile)
                with open(fsFile, 'w') as fs:
                    # str() so an integer percentage does not raise TypeError
                    fs.write(str(value) + '\n')
            except (IOError, OSError):
                printLog(None, 'Unable to write to sysfs file %s' %fsFile, None)
                logging.warning('IO or OS error')
                RETCODE = 1
                continue
            printLog(device, 'Successfully set %s OverDrive to %s%%' % (clktype, value), None)
        elif clktype == 'sclk':
            ret = rocmsmi.rsmi_dev_overdrive_level_set(device, rsmi_dev_perf_level_t(percent))
            if rsmi_ret_ok(ret, device, 'set_overdrive_level_' + str(clktype)):
                printLog(device, 'Successfully set %s OverDrive to %s%%' % (clktype, value), None)
            else:
                printLog(device, 'Unable to set %s OverDrive to %s%%' % (clktype, value), None)
        else:
            printErrLog(device, 'Unable to set OverDrive')
            logging.error('Unsupported clock type %s', clktype)
            RETCODE = 1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def setClocks(deviceList, clktype, clk):
    """ Set clock frequency levels for a list of devices.

    The requested levels are folded into a bitmask (bit N set means level N
    is enabled). Each device is switched to the manual performance level if
    it is not already there, and the bitmask is checked against the number
    of levels the device reports before it is applied.

    :param deviceList: List of DRM devices (can be a single-item list)
    :param clktype: [validClockNames] Clock type to set
    :param clk: Clock frequency level to set
    """
    global RETCODE
    if not clk:
        printLog(None, 'Invalid clock frequency', None)
        RETCODE = 1
        return
    if clktype not in validClockNames:
        printErrLog(None, 'Unable to set clock level')
        logging.error('Invalid clock type %s', clktype)
        RETCODE = 1
        return
    check_value = ''.join(map(str, clk))
    try:
        # Validate each level on its own; checking the concatenation let a
        # lone negative level (or string levels) slip through to the shift.
        levels = [int(str(level)) for level in clk]
    except ValueError:
        printLog(None, 'Unable to set clock level', None)
        logging.error('Non-integer characters are present in value %s', check_value)
        RETCODE = 1
        return
    # Generate a frequency bitmask from user input value
    freq_bitmask = 0
    for bit in levels:
        # 1 << bit raises for a negative bit; bits above 63 are rejected too
        if not 0 <= bit <= 63:
            printErrLog(None, 'Invalid clock frequency')
            logging.error('Invalid frequency: %s', bit)
            RETCODE = 1
            return
        freq_bitmask |= (1 << bit)

    printLogSpacer(' Set %s Frequency ' % (str(clktype)))
    for device in deviceList:
        # Check if the performance level is manual, if not then set it to manual
        if getPerfLevel(device).lower() != 'manual':
            ret = rocmsmi.rsmi_dev_perf_level_set(device, rsmi_dev_perf_level_t(3))
            if rsmi_ret_ok(ret, device, 'set_perf_level_manual'):
                printLog(device, 'Performance level was set to manual', None)
            else:
                RETCODE = 1
                continue
        if clktype != 'pcie':
            # Validate frequency bitmask
            freq = rsmi_frequencies_t()
            ret = rocmsmi.rsmi_dev_gpu_clk_freq_get(device, rsmi_clk_names_dict[clktype], byref(freq))
            if not rsmi_ret_ok(ret, device, 'get_gpu_clk_freq_' + str(clktype)):
                RETCODE = 1
                continue
            # The freq_bitmask should be less than 2^(freqs.num_supported)
            # For example, num_supported == 3, the max bitmask is 0111
            if freq_bitmask >= (1 << freq.num_supported):
                printErrLog(device, 'Invalid clock frequency %s' % hex(freq_bitmask))
                RETCODE = 1
                continue

            ret = rocmsmi.rsmi_dev_gpu_clk_freq_set(device, rsmi_clk_names_dict[clktype], freq_bitmask)
            if rsmi_ret_ok(ret, device, 'set_gpu_clk_freq_' + str(clktype)):
                printLog(device, 'Successfully set %s bitmask to' % (clktype), hex(freq_bitmask))
            else:
                RETCODE = 1
        else:
            # Validate the bandwidth bitmask
            bw = rsmi_pcie_bandwidth_t()
            ret = rocmsmi.rsmi_dev_pci_bandwidth_get(device, byref(bw))
            if not rsmi_ret_ok(ret, device, 'get_PCIe_bandwidth'):
                RETCODE = 1
                continue
            # The freq_bitmask should be less than 2^(bw.transfer_rate.num_supported)
            # For example, num_supported == 3, the max bitmask is 0111
            if freq_bitmask >= (1 << bw.transfer_rate.num_supported):
                printErrLog(device, 'Invalid PCIe frequency %s' % hex(freq_bitmask))
                RETCODE = 1
                continue

            ret = rocmsmi.rsmi_dev_pci_bandwidth_set(device, freq_bitmask)
            if rsmi_ret_ok(ret, device, 'set_PCIe_bandwidth'):
                printLog(device, 'Successfully set %s to level bitmask' % (clktype), hex(freq_bitmask))
            else:
                RETCODE = 1
    printLogSpacer()
|
|
|
|
|
|
|
|
|
|
|
2021-03-24 14:06:28 -04:00
|
|
|
def setPerfDeterminism(deviceList, clkvalue):
    """ Enable performance determinism on a list of devices by pinning the
    clock frequency level.

    A non-integer clkvalue is reported, RETCODE is set to 1 and no device is
    touched.

    :param deviceList: List of DRM devices (can be a single-item list)
    :param clkvalue: Clock frequency level to set
    """
    global RETCODE
    try:
        gfxClock = int(clkvalue)
    except ValueError:
        printErrLog(None, 'Unable to set Performance Determinism')
        logging.error('%s is not an integer', clkvalue)
        RETCODE = 1
        return
    for gpu in deviceList:
        status = rocmsmi.rsmi_perf_determinism_mode_set(gpu, gfxClock)
        if not rsmi_ret_ok(status, gpu, 'set_perf_determinism'):
            RETCODE = 1
            continue
        printLog(gpu, 'Successfully enabled performance determinism and set GFX clock frequency', str(clkvalue))
|
|
|
|
|
|
|
|
|
|
|
2020-08-18 14:05:41 -04:00
|
|
|
def resetGpu(device):
    """ Perform a GPU reset on the specified device

    Exactly one device must be supplied and it must be an AMD GPU; otherwise
    an error is logged, RETCODE is set to 1 and no reset is attempted.

    :param device: List holding the single DRM device identifier to reset
    """
    global RETCODE
    printLogSpacer(' Reset GPU ')
    # != 1 also rejects an empty list, which used to fail with IndexError below
    if len(device) != 1:
        logging.error('GPU Reset can only be performed on one GPU per call')
        RETCODE = 1
        return
    resetDev = int(device[0])
    if not isAmdDevice(resetDev):
        logging.error('GPU Reset can only be performed on an AMD GPU')
        RETCODE = 1
        return
    ret = rocmsmi.rsmi_dev_gpu_reset(resetDev)
    if rsmi_ret_ok(ret, resetDev, 'reset_gpu'):
        printLog(resetDev, 'Successfully reset GPU %d' % (resetDev), None)
    printLogSpacer()
|
|
|
|
|
|
|
|
|
|
|
2020-10-22 17:12:32 -04:00
|
|
|
def isRasControlAvailable(device):
    """ Check if RAS control is available for a specified device.

    :param device: DRM device identifier
    :return: True when the device exists and its ras_ctrl debugfs file is
        present, False otherwise (a warning is logged in that case)
    """
    path = os.path.join('/sys/kernel/debug/dri', 'card%d' % device, 'device', 'ras_ctrl')
    if not doesDeviceExist(device) or not os.path.isfile(path):
        # The device id was missing here, so the literal '%s' was logged
        logging.warning('GPU[%s]\t: RAS control is not available', device)
        return False
    return True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def setRas(deviceList, rasAction, rasBlock, rasType):
    """ Perform a RAS action on the devices

    The action, block and type are checked against validRasActions,
    validRasBlocks and validRasTypes. For every device with RAS control
    available, the command '<action> <block> <type>' is written to the
    device's ras_ctrl file.

    :param deviceList: List of DRM devices (can be a single-item list)
    :param rasAction: [enable|disable|inject] RAS Action to perform
    :param rasBlock: [$validRasBlocks] RAS block
    :param rasType: [ce|ue] Error type to enable/disable
    :return: False if a ras_ctrl file disappears before it can be written,
        None otherwise
    """
    global RETCODE
    printLog(None, "This is experimental feature, use 'amdgpuras' tool for ras error manipulations for newer vbios")

    if rasAction not in validRasActions:
        printLog(None, 'Unable to perform RAS command %s on block %s for type %s' % (rasAction, rasBlock, rasType))
        logging.debug('Action %s is not a valid RAS command' % rasAction)
        return
    if rasBlock not in validRasBlocks:
        printLog(None, 'Unable to perform RAS command %s on block %s for type %s' % (rasAction, rasBlock, rasType))
        printLog(None, 'Block %s is not a valid RAS block' % rasBlock)
        return

    if rasType not in validRasTypes:
        printLog(None, 'Unable to perform RAS command %s on block %s for type %s' % (rasAction, rasBlock, rasType))
        # Report the offending type (this used to print the action instead)
        printLog(None, 'Memory error type %s is not a valid RAS memory type' % rasType)
        return

    printLogSpacer()
    # NOTE PSP FW doesn't support enabling disabled counters yet
    for device in deviceList:
        if isRasControlAvailable(device):
            rasFilePath = os.path.join('/sys/kernel/debug/dri', 'card%d' % device, 'device', 'ras_ctrl')
            rasCmd = '%s %s %s' % (rasAction, rasBlock, rasType)

            # writeToSysfs analog to old cli
            if not os.path.isfile(rasFilePath):
                printLog(None, 'Unable to write to sysfs file', None)
                logging.debug('%s does not exist', rasFilePath)
                return False
            try:
                logging.debug('Writing value \'%s\' to file \'%s\'', rasCmd, rasFilePath)
                with open(rasFilePath, 'w') as fs:
                    # Write the command, not the file's own path.
                    fs.write(rasCmd + '\n')  # Certain sysfs files require \n at the end
            except (IOError, OSError):
                printLog(None, 'Unable to write to sysfs file %s' % rasFilePath, None)
                logging.warning('IO or OS error')
                RETCODE = 1

    printLogSpacer()

    return
|
|
|
|
|
|
|
|
|
|
|
2020-07-15 06:01:40 -04:00
|
|
|
def setFanSpeed(deviceList, fan):
    """ Set fan speed for a list of devices.

    The requested speed is parsed once, before any device is touched. A
    value ending in '%' is scaled from a percentage to the 0-255 level range
    and truncated to an integer. Input that is not an integer (or integer
    percentage) is reported, RETCODE is set to 1 and no device is changed.

    :param deviceList: List of DRM devices (can be a single-item list)
    :param fan: [0-255] Fan speed level, or a percentage written as 'N%'
    """
    global RETCODE
    printLogSpacer(' Set GPU Fan Speed ')
    fanText = str(fan)
    fanLevel = None
    if fanText:
        try:
            if fanText[-1] == '%':
                fanLevel = int(int(fanText[:-1]) / 100 * 255)
            else:
                fanLevel = int(fanText)
        except ValueError:
            # Previously this surfaced as an uncaught traceback
            printErrLog(None, 'Unable to set fan speed')
            logging.error('%s is not a valid fan speed level or percentage', fan)
            RETCODE = 1
            printLogSpacer()
            return
    if fanLevel is not None:
        for device in deviceList:
            ret = rocmsmi.rsmi_dev_fan_speed_set(device, 0, fanLevel)
            if rsmi_ret_ok(ret, device, silent=True):
                printLog(device, 'Successfully set fan speed to level %s' % (str(fanLevel)), None)
            else:
                printLog(device, 'Not supported on the given system', None)
    printLogSpacer()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def setPerformanceLevel(deviceList, level):
    """ Apply a performance level to every device in the list.

    The level name is translated to its position in
    ('auto', 'low', 'high', 'manual') and passed to
    rsmi_dev_perf_level_set. An unrecognized name is reported once per
    device and nothing is changed.

    :param deviceList: List of DRM devices (can be a single-item list)
    :param level: Performance Level to set
    """
    printLogSpacer(' Set Performance Level ')
    supported = ('auto', 'low', 'high', 'manual')
    recognized = level in supported
    for gpu in deviceList:
        if not recognized:
            printErrLog(gpu, 'Unable to set Performance Level')
            logging.error('Invalid Performance level: %s', level)
            continue
        requested = rsmi_dev_perf_level_t(supported.index(level))
        status = rocmsmi.rsmi_dev_perf_level_set(gpu, requested)
        if rsmi_ret_ok(status, gpu, 'set_perf_level'):
            printLog(gpu, 'Performance level set to %s' % (str(level)), None)
    printLogSpacer()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def setPowerOverDrive(deviceList, value, autoRespond):
    """ Use Power OverDrive to change the maximum power available to the GPU
    in Watts. May be limited by the maximum power the VBIOS is configured to
    allow this card to use in OverDrive mode.

    A value of 0 resets each device to its default power cap. Secondary dies
    are skipped. The out-of-spec confirmation is requested at most once, and
    only when the new cap exceeds the larger of the current and default cap.

    :param deviceList: List of DRM devices (can be a single-item list)
    :param value: New maximum power to assign to the target device, in Watts
    :param autoRespond: Response to automatically provide for all prompts
    """
    global RETCODE, PRINT_JSON
    try:
        watts = int(value)
    except ValueError:
        printLog(None, 'Unable to set Power OverDrive', None)
        logging.error('%s is not an integer', value)
        RETCODE = 1
        return

    isReset = watts == 0
    if isReset:
        printLogSpacer(' Reset GPU Power OverDrive ')
    else:
        printLogSpacer(' Set GPU Power OverDrive ')

    # Value in Watts as text for messages; str() keeps the concatenation
    # below from raising TypeError when an integer was passed in
    strValue = str(value)
    specWarningConfirmed = False
    for device in deviceList:
        # Continue to next device in deviceList loop if the device is a secondary die
        if checkIfSecondaryDie(device):
            logging.debug("Unavailable for secondary die.")
            continue
        power_cap_min = c_uint64()
        power_cap_max = c_uint64()
        current_power_cap = c_uint64()
        default_power_cap = c_uint64()
        new_power_cap = c_uint64()

        ret = rocmsmi.rsmi_dev_power_cap_get(device, 0, byref(current_power_cap))
        if ret != 0:
            logging.debug("Unable to retireive current power cap.")
        ret = rocmsmi.rsmi_dev_power_cap_default_get(device, byref(default_power_cap))
        # If rsmi_dev_power_cap_default_get fails, use manual workaround to fetch default power cap
        if ret != 0:
            logging.debug("Unable to retrieve default power cap; retrieving via reset.")
            ret = rocmsmi.rsmi_dev_power_cap_set(device, 0, 0)
            ret = rocmsmi.rsmi_dev_power_cap_get(device, 0, byref(default_power_cap))

        # Wattage input value converted to microWatt for ROCm SMI Lib
        if isReset:
            new_power_cap = default_power_cap
        else:
            new_power_cap.value = watts * 1000000

        ret = rocmsmi.rsmi_dev_power_cap_range_get(device, 0, byref(power_cap_max), byref(power_cap_min))
        if not rsmi_ret_ok(ret, device, 'get_power_cap_range'):
            printErrLog(device, 'Unable to parse Power OverDrive range')
            RETCODE = 1
            continue
        # Range-check only a user-supplied wattage. For a reset the literal 0
        # used to be compared with the minimum cap, so the reset was refused
        # on every device whose minimum cap is above 0 W.
        if not isReset:
            if watts > (power_cap_max.value / 1000000):
                printErrLog(device, 'Unable to set Power OverDrive')
                logging.error('GPU[%s]\t\t: Value cannot be greater than: %dW ', device, power_cap_max.value / 1000000)
                RETCODE = 1
                continue
            if watts < (power_cap_min.value / 1000000):
                printErrLog(device, 'Unable to set Power OverDrive')
                logging.error('GPU[%s]\t\t: Value cannot be less than: %dW ', device, power_cap_min.value / 1000000)
                RETCODE = 1
                continue
        if new_power_cap.value == current_power_cap.value:
            printLog(device,'Max power was already at: {}W'.format(new_power_cap.value / 1000000))

        if current_power_cap.value < default_power_cap.value:
            current_power_cap.value = default_power_cap.value
        if not specWarningConfirmed and new_power_cap.value > current_power_cap.value:
            confirmOutOfSpecWarning(autoRespond)
            specWarningConfirmed = True

        ret = rocmsmi.rsmi_dev_power_cap_set(device, 0, new_power_cap)
        if rsmi_ret_ok(ret, device, 'set_power_cap'):
            if isReset:
                power_cap = c_uint64()
                ret = rocmsmi.rsmi_dev_power_cap_get(device, 0, byref(power_cap))
                if rsmi_ret_ok(ret, device, 'get_power_cap'):
                    if not PRINT_JSON:
                        printLog(device,
                                 'Successfully reset Power OverDrive to: %sW' % (int(power_cap.value / 1000000)), None)
            else:
                if not PRINT_JSON:
                    # Read the cap back to confirm the device accepted it
                    ret = rocmsmi.rsmi_dev_power_cap_get(device, 0, byref(current_power_cap))
                    if current_power_cap.value == new_power_cap.value:
                        printLog(device, 'Successfully set power to: %sW' % (strValue), None)
                    else:
                        printErrLog(device, 'Unable set power to: %sW, current value is %sW' %
                                    (strValue, int(current_power_cap.value / 1000000)))
        else:
            if isReset:
                printErrLog(device, 'Unable to reset Power OverDrive to default')
            else:
                printErrLog(device, 'Unable to set Power OverDrive to ' + strValue + 'W')
    printLogSpacer()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def setProfile(deviceList, profile):
    """ Set Power Profile, or set CUSTOM Power Profile values for a list of devices.

    profile may be a number from 1 to 7 or a profile name (underscores are
    treated as spaces, case is ignored). Devices whose current profile
    cannot be read are skipped. A device already using the requested
    profile is reported and left alone, and processing moves on to the next
    device. After setting, the profile is read back to confirm the change.

    :param deviceList: List of DRM devices (can be a single-item list)
    :param profile: Profile to set
    """
    printLogSpacer(' Set Power Profile ')
    status = rsmi_power_profile_status_t()
    for device in deviceList:
        # Get previous profile
        ret = rocmsmi.rsmi_dev_power_profile_presets_get(device, 0, byref(status))
        if not rsmi_ret_ok(ret, device, 'get_power_profile'):
            continue
        previousProfile = profileString(status.current)
        # Get desired profile
        profileName = str(profile).replace('_', ' ').upper()
        if str(profile).isnumeric() and int(profile) > 0 and int(profile) < 8:
            desiredProfile = profileString(2 ** (int(profile) - 1))
        elif str(profileString(profileName)).isnumeric():
            desiredProfile = profileName
        else:
            printErrLog(device, 'Unable to set profile to: %s (UNKNOWN profile)' % (str(profile)))
            return
        # Set profile to desired profile
        if previousProfile == desiredProfile:
            printLog(device, 'Profile was already set to', previousProfile)
            # Move on to the next device; returning here used to leave the
            # remaining devices unprocessed
            continue
        ret = rocmsmi.rsmi_dev_power_profile_set(device, 0, profileString(desiredProfile))
        if rsmi_ret_ok(ret, device, 'set_power_profile'):
            # Get current profile
            ret = rocmsmi.rsmi_dev_power_profile_presets_get(device, 0, byref(status))
            if rsmi_ret_ok(ret, device, 'get_power_profile_presets'):
                currentProfile = profileString(status.current)
                if currentProfile == desiredProfile:
                    printLog(device, 'Successfully set profile to', desiredProfile)
                else:
                    printErrLog(device, 'Failed to set profile to: %s' % (desiredProfile))
    printLogSpacer()
|
|
|
|
|
|
|
|
|
|
|
2023-01-06 11:01:18 -06:00
|
|
|
def setComputePartition(deviceList, computePartitionType):
    """ Sets compute partitioning for a list of device

    The partition type is matched case-insensitively against
    compute_partition_type_l. Known failure statuses (permission, setting
    unavailable, not supported, busy) get a dedicated message; any other
    failure is reported through rsmi_ret_ok plus a generic error.

    :param deviceList: List of DRM devices (can be a single-item list)
    :param computePartitionType: Compute Partition type to set as
    :return: (None, None) if the partition type is invalid, None otherwise
    """
    printLogSpacer(' Set compute partition to %s ' % (str(computePartitionType).upper()))
    # Normalize once; the value does not depend on the device
    computePartitionType = str(computePartitionType).upper()
    for device in deviceList:
        if computePartitionType not in compute_partition_type_l:
            printErrLog(device, 'Invalid compute partition type %s'
                        '\nValid compute partition types are %s'
                        % (computePartitionType,
                           (', '.join(map(str, compute_partition_type_l)))))
            return (None, None)
        ret = rocmsmi.rsmi_dev_compute_partition_set(device,
                                                     rsmi_compute_partition_type_dict[computePartitionType])
        if rsmi_ret_ok(ret, device, 'set_compute_partition', silent=True):
            printLog(device,
                     'Successfully set compute partition to %s' % (computePartitionType),
                     None)
        elif ret == rsmi_status_t.RSMI_STATUS_PERMISSION:
            printLog(device, 'Permission denied', None)
        elif ret == rsmi_status_t.RSMI_STATUS_SETTING_UNAVAILABLE:
            printLog(device, 'Requested setting (%s) is unavailable for current device'
                     %computePartitionType, None)
        elif ret == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED:
            printLog(device, 'Not supported on the given system', None)
        elif ret == rsmi_status_t.RSMI_STATUS_BUSY:
            printLog(device, 'Device is currently busy, try again later',
                     None)
        else:
            # Non-silent call so the underlying status is reported as well
            rsmi_ret_ok(ret, device, 'set_compute_partition')
            # This is a set operation; the message used to say "retrieve"
            printErrLog(device, 'Failed to set compute partition, even though device supports it.')
    printLogSpacer()
|
|
|
|
|
|
|
|
|
|
|
2023-01-30 15:58:03 -06:00
|
|
|
def progressbar(it, prefix="", size=60, out=sys.stdout):
    """ Yield the items of a sized iterable while drawing a text progress bar

    The bar is redrawn in place (carriage return, no newline) before the
    first item and after each item is consumed; a final blank line is
    printed once the iterable is exhausted.

    :param it: Sized iterable to walk (len() must be supported)
    :param prefix: Text printed in front of the bar
    :param size: Width of the bar in characters
    :param out: Stream the bar is written to
    """
    count = len(it)

    def show(done):
        # An empty iterable is trivially complete; this also avoids the
        # division by zero the first redraw used to hit.
        filled = int(size * done / count) if count else size
        # No lock is taken: the locks here were created fresh for every
        # print, so they could never exclude another writer.
        print("{}[{}{}] {}/{} secs remain".format(prefix, u"█" * filled, "." * (size - filled), done, count),
              end='\r', file=out, flush=True)

    show(0)
    for i, item in enumerate(it):
        yield item
        show(i + 1)
    print("\n", flush=True, file=out)
|
|
|
|
|
|
|
|
|
|
def showProgressbar(title="", timeInSeconds=13):
    """ Draw a 40-character progress bar that advances once per second

    :param title: Optional label; when non-empty it is shown as 'title: '
    :param timeInSeconds: Number of one-second steps to count through
    """
    label = title + ": " if title != "" else title
    for _ in progressbar(range(timeInSeconds), label, 40):
        time.sleep(1)
|
|
|
|
|
|
|
|
|
|
|
2024-11-06 15:13:32 -06:00
|
|
|
def setMemoryPartition(deviceList, memoryPartition, autoRespond):
    """ Sets memory partition (memory partition) for a list of devices

    The user is asked to confirm first. While the library call runs, a
    progress bar is shown from a separate process; that process is always
    stopped and reaped before the result is reported.

    :param deviceList: List of DRM devices (can be a single-item list)
    :param memoryPartition: Memory Partition type to set as
    :param autoRespond: Response to automatically provide for all prompts
    :return: (None, None) if the partition type is invalid, None otherwise
    """
    addExtraLine = False
    printLogSpacer(' Set memory partition to %s ' % (str(memoryPartition).upper()))
    confirmChangingMemoryPartitionAndReloadingAMDGPU(autoRespond)
    for device in deviceList:
        current_memory_partition = getMemoryPartition(device, silent=True)
        if current_memory_partition == 'N/A':
            printLog(device, 'Not supported on the given system', None, addExtraLine)
            continue
        memoryPartition = memoryPartition.upper()
        if memoryPartition not in memory_partition_type_l:
            printErrLog(device, 'Invalid memory partition type %s'
                        '\nValid memory partition types are %s'
                        % (memoryPartition.upper(),
                           (', '.join(map(str, memory_partition_type_l)))))
            return (None, None)

        kTimeWait = 140
        t1 = multiprocessing.Process(target=showProgressbar,
                                     args=("Updating memory partition", kTimeWait,))
        t1.start()
        start = time.time()
        try:
            ret = rocmsmi.rsmi_dev_memory_partition_set(device,
                                                        rsmi_memory_partition_type_dict[memoryPartition])
        finally:
            duration = time.time() - start
            # Stop and reap the progress-bar process even if the call raised
            # or the bar had already finished by itself.
            if t1.is_alive():
                t1.terminate()
            t1.join()
        # For longer runs, add extra line before output; this is to prevent
        # overriding progress bar
        addExtraLine = not (duration < float(0.1))

        if rsmi_ret_ok(ret, device, 'set_memory_partition', silent=True):
            printLog(device,
                     'Successfully set memory partition to %s' % (memoryPartition),
                     None, addExtraLine)
        elif ret == rsmi_status_t.RSMI_STATUS_PERMISSION:
            printLog(device, 'Permission denied', None, addExtraLine)
        elif ret == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED:
            printLog(device, 'Not supported on the given system', None, addExtraLine)
        elif ret == rsmi_status_t.RSMI_STATUS_INVALID_ARGS:
            printLog(device, 'Device does not support setting to ' + str(memoryPartition).upper(), None, addExtraLine)
            memory_partition_caps = getMemoryPartitionCapabilities(device, silent=True)
            printLog(device, 'Available memory partition modes: ' + str(memory_partition_caps).upper(), None, addExtraLine)
        elif ret == rsmi_status_t.RSMI_STATUS_BUSY:
            printLog(device, 'Device is currently busy, try again later',
                     None, addExtraLine)
        elif ret == rsmi_status_t.RSMI_STATUS_AMDGPU_RESTART_ERR:
            printLog(device, 'Issue reloading driver, please check dmsg for errors',
                     None, addExtraLine)
        else:
            printErrLog(device, 'Failed to set memory partition, even though device supports it.')
    printLogSpacer()
|
|
|
|
|
|
2023-10-02 17:57:02 -05:00
|
|
|
def showVersion(isCSV=False):
    """ Display the ROCm SMI tool version and, when available, the library version

    The library version is only reported when rsmi_version_get() returns
    status 0; otherwise only the tool version is shown.

    :param isCSV: Print the versions as 'name, value' CSV rows instead of
        plain text (checked before the JSON output flag)
    """
    values = {'ROCM-SMI version': __version__}

    version = rsmi_version_t()
    status = rocmsmi.rsmi_version_get(byref(version))
    if status == 0:
        values['ROCM-SMI-LIB version'] = "%u.%u.%u" % (version.major, version.minor, version.patch)

    if isCSV:
        print('name, value')
        for name, value in values.items():
            print('%s, %s' % (name, value))
        return
    if PRINT_JSON:
        # Join the entries rather than trimming a trailing ',\n' afterwards, so
        # the object is always closed correctly, including when the library
        # version could not be read and only one entry exists.
        entries = [' "%s": "%s"' % (name, value) for name, value in values.items()]
        print('{\n' + ',\n'.join(entries) + '\n}')
        return
    for name, value in values.items():
        print('%s: %s' % (name, value))
|
2023-01-30 15:58:03 -06:00
|
|
|
|
2020-07-15 06:01:40 -04:00
|
|
|
def showAllConcise(deviceList):
    """ Display critical info for all devices in a concise format

    Every per-device metric is queried once per device, so the value that is
    compared against the "unavailable" marker (-1) is the same value that is
    shown in the table.

    Exits with status 1 when JSON/CSV output was requested, since the concise
    table has no machine-readable form.

    :param deviceList: List of DRM devices (can be a single-item list);
        the list is sorted in place
    """
    global PRINT_JSON
    if PRINT_JSON:
        print('NOT_SUPPORTED: Cannot print JSON/CSV output for concise output')
        sys.exit(1)

    silent = True

    deviceList.sort()
    available_temp_type = getTemperatureLabel(deviceList)
    temp_type = "(" + available_temp_type.capitalize() + ")"
    header = ['Device', 'Node', 'IDs', '', 'Temp', 'Power', 'Partitions',
              'SCLK', 'MCLK', 'Fan', 'Perf', 'PwrCap', 'VRAM%', 'GPU%']
    subheader = ['', '', '(DID,', 'GUID)', temp_type, getPowerLabel(deviceList),
                 '(Mem, Compute, ID)',
                 '', '', '', '', '', '', '']
    # Pad each header/subheader pair to the width of the longer one so the
    # two title rows stay aligned column by column.
    for idx, (head, sub) in enumerate(zip(header, subheader)):
        width = max(len(head), len(sub))
        header[idx] = head.ljust(width)
        subheader[idx] = sub.ljust(width)

    values = {}
    degree_sign = u'\N{DEGREE SIGN}'
    for device in deviceList:
        temp_val = str(getTemp(device, available_temp_type, silent))
        if temp_val != 'N/A':
            temp_val += degree_sign + 'C'

        power_dict = getPower(device)
        powerVal = 'N/A'
        if (power_dict['ret'] == rsmi_status_t.RSMI_STATUS_SUCCESS
                and power_dict['power_type'] != 'INVALID_POWER_TYPE'
                and power_dict['power'] != 0):
            powerVal = power_dict['power'] + power_dict['unit']

        combined_partition_data = (getMemoryPartition(device, silent) + ", "
                                   + getComputePartition(device, silent)
                                   + ", " + getPartitionId(device, silent))

        # showCurrentClocks returns a falsy value when the clock is unavailable
        sclk = showCurrentClocks([device], 'sclk', concise=silent) or 'N/A'
        mclk = showCurrentClocks([device], 'mclk', concise=silent) or 'N/A'

        (retCode, fanLevel, fanSpeed) = getFanSpeed(device, silent)
        fan = str(fanSpeed) + '%'

        # Query once, then test and format the same reading
        perf_level = getPerfLevel(device, silent)
        perf = str(perf_level).lower() if perf_level != -1 else 'N/A'

        max_power = getMaxPower(device, silent)
        pwrCap = (str(max_power) + 'W') if max_power != -1 else 'N/A'

        gpu_use = getGpuUse(device, silent)
        gpu_busy = (str(gpu_use) + '%') if gpu_use != -1 else 'N/A'

        allocated_mem_percent = getAllocatedMemoryPercent(device)
        if allocated_mem_percent['ret'] != rsmi_status_t.RSMI_STATUS_SUCCESS:
            allocated_mem_percent['combined'] = 'N/A'

        # Top Row - per device data
        values['card%s' % (str(device))] = [device, getNodeId(device),
                                            str(getDRMDeviceId(device)) + ", ",
                                            str(getGUID(device)),
                                            temp_val, powerVal,
                                            combined_partition_data,
                                            sclk, mclk, fan, perf,
                                            str(pwrCap),
                                            allocated_mem_percent['combined'],
                                            str(gpu_busy)]

    # A column is as wide as its widest cell (title or value) plus 2 spaces
    max_widths = [len(head) + 2 for head in header]
    for device in deviceList:
        for col, val in enumerate(values['card%s' % (str(device))]):
            max_widths[col] = max(max_widths[col], len(str(val)) + 2)

    def renderRow(cells):
        """ Left-justify every cell to its column width and join them """
        return "".join(str(cell).ljust(width) for width, cell in zip(max_widths, cells))

    ########################
    # Display concise info #
    ########################
    header_output = renderRow(header)
    subheader_output = renderRow(subheader)
    printLogSpacer(headerString, contentSizeToFit=len(header_output))
    printLogSpacer(' Concise Info ', contentSizeToFit=len(header_output))
    printLog(None, header_output, None)
    printLog(None, subheader_output, None, useItalics=True)
    printLogSpacer(fill='=', contentSizeToFit=len(header_output))

    for device in deviceList:
        printLog(None, renderRow(values['card%s' % (str(device))]), None)

    printLogSpacer(contentSizeToFit=len(header_output))
    printLogSpacer(footerString, contentSizeToFit=len(header_output))
|
2020-07-15 06:01:40 -04:00
|
|
|
|
|
|
|
|
|
|
|
|
|
def showAllConciseHw(deviceList):
    """ Display critical Hardware info

    Exits with status 1 when JSON/CSV output was requested, since the concise
    table has no machine-readable form.

    :param deviceList: List of DRM devices (can be a single-item list)
    """
    global PRINT_JSON
    if PRINT_JSON:
        print('NOT_SUPPORTED: Cannot print JSON/CSV output for concise hardware output')
        sys.exit(1)
    header = ['GPU', 'NODE', 'DID', 'GUID', 'GFX VER', 'GFX RAS', 'SDMA RAS', 'UMC RAS', 'VBIOS', 'BUS',
              'PARTITION ID']
    values = {}
    silent = True
    for device in deviceList:
        did = getDRMDeviceId(device, silent)
        nodeid = getNodeId(device, silent)
        guid = getGUID(device, silent)
        partition_id = getPartitionId(device, silent)
        gfxVer = getTargetGfxVersion(device, silent)
        gfxRas = getRasEnablement(device, 'GFX', silent)
        sdmaRas = getRasEnablement(device, 'SDMA', silent)
        umcRas = getRasEnablement(device, 'UMC', silent)
        vbios = getVbiosVersion(device, silent)
        bus = getBus(device, silent)
        values['card%s' % (str(device))] = [device, nodeid, did, guid, gfxVer, gfxRas, sdmaRas,
                                            umcRas, vbios, bus, partition_id]

    # A column is as wide as its widest cell (title or value) plus 2 spaces
    max_widths = [len(head) + 2 for head in header]
    for device in deviceList:
        for col, val in enumerate(values['card%s' % (str(device))]):
            max_widths[col] = max(max_widths[col], len(str(val)) + 2)

    def renderRow(cells):
        """ Left-justify every cell to its column width and join them """
        return "".join(str(cell).ljust(width) for width, cell in zip(max_widths, cells))

    # Separate rows by position in the list, not by device number: comparing
    # "device + 1" with the list length only works when the list is exactly
    # [0, 1, ..., n-1], and mis-places newlines for any other selection.
    device_output = "\n".join(renderRow(values['card%s' % (str(device))]) for device in deviceList)

    #################################
    # Display concise hardware info #
    #################################
    header_output = renderRow(header)
    printLogSpacer(headerString, contentSizeToFit=len(header_output))
    printLogSpacer(' Concise Hardware Info ', contentSizeToFit=len(header_output))
    printLog(None, header_output, None)
    printLog(None, device_output, None)
    printLogSpacer(fill='=', contentSizeToFit=len(header_output))
    printLogSpacer(footerString, contentSizeToFit=len(header_output))
|
2020-07-15 06:01:40 -04:00
|
|
|
|
|
|
|
|
|
|
|
|
|
def showBus(deviceList):
    """ Print the PCI bus identifier of every requested device

    :param deviceList: List of DRM devices (can be a single-item list)
    """
    printLogSpacer(' PCI Bus ID ')
    for gpu in deviceList:
        bus_id = getBus(gpu)
        printLog(gpu, 'PCI Bus', bus_id)
    printLogSpacer()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def showClocks(deviceList):
    """ Display all available clocks for a list of devices

    Current clocks marked with a '*' symbol

    For every device, each clock type in rsmi_clk_names_dict is listed in
    sorted order, followed by the supported PCIe transfer rates. A divider
    line is printed after each device.

    :param deviceList: List of DRM devices (can be a single-item list)
    """
    freq = rsmi_frequencies_t()
    bw = rsmi_pcie_bandwidth_t()
    printLogSpacer(' Supported clock frequencies ')
    for device in deviceList:
        for clk_type in sorted(rsmi_clk_names_dict):
            # Probe with a None output pointer first; a return value of 1 is
            # treated as "this clock type is supported on the device"
            if rocmsmi.rsmi_dev_gpu_clk_freq_get(device, rsmi_clk_names_dict[clk_type], None) == 1:
                ret = rocmsmi.rsmi_dev_gpu_clk_freq_get(device, rsmi_clk_names_dict[clk_type], byref(freq))
                if ret == rsmi_status_t.RSMI_STATUS_UNEXPECTED_DATA:
                    printLog(device, 'Clock [%s] on device [%s] exists but EMPTY! Likely driver error!' % (clk_type, str(device)))
                    continue
                # Any other failure: skip this clock type (rsmi_ret_ok is called silently)
                if not rsmi_ret_ok(ret, device, 'get_clk_freq_' + clk_type, True):
                    continue
                printLog(device, 'Supported %s frequencies on GPU%s' % (clk_type, str(device)), None)
                for i in range(freq.num_supported):
                    freq_string = '{:>.0f}Mhz'.format(freq.frequency[i] / 1000000)
                    if i == freq.current:
                        freq_string += ' *'
                    freq_index = i
                    # Deep Sleep frequency is only supported by some GPUs
                    # It is indicated by letter 'S' instead of the index number
                    if freq.has_deep_sleep:
                        # sleep state
                        if i == 0:
                            freq_index = 'S'
                        # all indices are offset by 1 because Deep Sleep occupies index 0
                        else:
                            freq_index = i - 1
                    printLog(device, str(freq_index), freq_string)
                # blank entry after the list of one clock type
                printLog(device, '', None)
            else:
                logging.debug('{} frequency is unsupported on device[{}]'.format(clk_type, device))
                printLog(device, '', None)
        # PCIe transfer rates use the same probe-then-read sequence as the clocks above
        if rocmsmi.rsmi_dev_pci_bandwidth_get(device, None) == 1:
            ret = rocmsmi.rsmi_dev_pci_bandwidth_get(device, byref(bw))
            if rsmi_ret_ok(ret, device, 'get_PCIe_bandwidth', True):
                printLog(device, 'Supported %s frequencies on GPU%s' % ('PCIe', str(device)), None)
                for i in range(bw.transfer_rate.num_supported):
                    freq_string = '{:>.1f}GT/s x{}'.format(bw.transfer_rate.frequency[i] / 1000000000, bw.lanes[i])
                    if i == bw.transfer_rate.current:
                        freq_string += ' *'
                    printLog(device, str(i), str(freq_string))
                printLog(device, '', None)
        else:
            logging.debug('PCIe frequency is unsupported on device [{}]'.format(device))
            printLog(device, '', None)
        printLogSpacer(None, '-') # divider between devices for better visibility
    printLogSpacer()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def showCurrentClocks(deviceList, clk_defined=None, concise=False):
    """ Display all clocks for a list of devices

    When both clk_defined and concise are set, nothing is printed for a
    successful read: the function returns the current frequency of the first
    device that reports one, formatted like '1500Mhz'. In every other case
    the result is printed and the function returns None.

    :param deviceList: List of DRM devices (can be a single-item list)
    :param clk_defined: Clock type to display (a key of rsmi_clk_names_dict);
        all clock types plus the PCIe clock are displayed when left empty
    :param concise: Suppress the section spacers and the "unsupported" debug
        messages, and return the value instead of printing it (see above)
    """
    global PRINT_JSON
    freq = rsmi_frequencies_t()
    bw = rsmi_pcie_bandwidth_t()
    if not concise:
        printLogSpacer(' Current clock frequencies ')
    for device in deviceList:
        if clk_defined:
            # Probe with a None output pointer first; a return value of 1 is
            # treated as "this clock type is supported on the device"
            if rocmsmi.rsmi_dev_gpu_clk_freq_get(device, rsmi_clk_names_dict[clk_defined], None) == 1:
                ret = rocmsmi.rsmi_dev_gpu_clk_freq_get(device, rsmi_clk_names_dict[clk_defined], byref(freq))
                if rsmi_ret_ok(ret, device, 'get_gpu_clk_freq_' + str(clk_defined), silent=True):
                    levl = freq.current
                    # The reported current level must index into the supported list
                    if levl >= freq.num_supported:
                        printLog(device, '%s current clock frequency not found' % (clk_defined), None)
                        continue
                    fr = freq.frequency[levl] / 1000000
                    freq_index = levl
                    if freq.has_deep_sleep:
                        # sleep state
                        if levl == 0:
                            freq_index = 'S'
                        # all indices are offset by 1 because Deep Sleep occupies index 0
                        else:
                            freq_index = levl - 1
                    if concise: # in case function is used for concise output, no need to print.
                        return '{:.0f}Mhz'.format(fr)
                    printLog(device, '{} clock level'.format(clk_defined), '{} ({:.0f}Mhz)'.format(freq_index, fr))
            elif not concise:
                logging.debug('{} clock is unsupported on device[{}]'.format(clk_defined, device))

        else: # if clk is not defined, will display all current clk
            for clk_type in sorted(rsmi_clk_names_dict):
                if rocmsmi.rsmi_dev_gpu_clk_freq_get(device, rsmi_clk_names_dict[clk_type], None) == 1:
                    ret = rocmsmi.rsmi_dev_gpu_clk_freq_get(device, rsmi_clk_names_dict[clk_type], byref(freq))
                    if rsmi_ret_ok(ret, device, 'get_clk_freq_' + str(clk_type), True):
                        levl = freq.current
                        if levl >= freq.num_supported:
                            printLog(device, '%s current clock frequency not found' % (clk_type), None)
                            continue
                        freq_index = levl
                        if freq.has_deep_sleep:
                            # sleep state
                            if levl == 0:
                                freq_index = 'S'
                            # all indices are offset by 1 because Deep Sleep occupies index 0
                            else:
                                freq_index = levl - 1
                        fr = freq.frequency[levl] / 1000000
                        # str(fr)[:-2] drops the last two characters of the float's text form
                        if PRINT_JSON:
                            printLog(device, '%s clock speed:' % (clk_type), '(%sMhz)' % (str(fr)[:-2]))
                            printLog(device, '%s clock level:' % (clk_type), freq_index)
                        else:
                            printLog(device, '%s clock level: %s' % (clk_type, freq_index), '(%sMhz)' % (str(fr)[:-2]))
                elif not concise:
                    logging.debug('{} clock is unsupported on device[{}]'.format(clk_type, device))
            # pcie clocks
            if rocmsmi.rsmi_dev_pci_bandwidth_get(device, None) == 1:
                ret = rocmsmi.rsmi_dev_pci_bandwidth_get(device, byref(bw))
                if rsmi_ret_ok(ret, device, 'get_PCIe_bandwidth', True):
                    current_f = bw.transfer_rate.current
                    if current_f >= bw.transfer_rate.num_supported:
                        printLog(device, 'PCIe current clock frequency not found', None )
                        continue
                    fr = '{:.1f}GT/s x{}'.format(bw.transfer_rate.frequency[current_f] / 1000000000,
                                                 bw.lanes[current_f])
                    printLog(device, 'pcie clock level', '{} ({})'.format(current_f, fr))
            elif not concise:
                logging.debug('{} clock is unsupported on device[{}]'.format('PCIe', device))
    if not concise:
        printLogSpacer()
|
2020-07-15 06:01:40 -04:00
|
|
|
|
|
|
|
|
|
|
|
|
|
def showCurrentFans(deviceList):
    """ Print the fan level, fan percentage and fan RPM of each device

    Devices that report the fan metric as unsupported, or whose level or
    rounded speed is 0, only get a short notice.

    :param deviceList: List of DRM devices (can be a single-item list)
    """
    global PRINT_JSON
    printLogSpacer(' Current Fan Metric ')
    fan_rpm = c_int64()
    sensor_index = c_uint32(0)

    for gpu in deviceList:
        (status, level, speed) = getFanSpeed(gpu)
        if status == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED:
            printLog(gpu, 'Not supported', None)
            continue

        speed = round(speed)
        if level == 0 or speed == 0:
            printLog(gpu, 'Unable to detect fan speed for GPU %d' % (gpu), None)
            debug_message = ('Current fan speed is: %d\n' % (speed)
                             + ' Current fan level is: %d\n' % (level)
                             + ' (GPU might be cooled with a non-PWM fan)')
            logging.debug(debug_message)
            continue

        if not PRINT_JSON:
            printLog(gpu, 'Fan Level', str(level) + ' (%s%%)' % (str(speed)))
        else:
            printLog(gpu, 'Fan speed (level)', str(level))
            printLog(gpu, 'Fan speed (%)', str(speed))

        status = rocmsmi.rsmi_dev_fan_rpms_get(gpu, sensor_index, byref(fan_rpm))
        if rsmi_ret_ok(status, gpu, 'get_fan_rpms'):
            printLog(gpu, 'Fan RPM', fan_rpm.value)
    printLogSpacer()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def showCurrentTemps(deviceList):
    """ Print the reading of every temperature sensor for each device

    Readings equal to 'N/A' are reported through printInfoLog, all other
    readings through printLog.

    :param deviceList: List of DRM devices (can be a single-item list)
    """
    printLogSpacer(' Temperature ')
    for gpu in deviceList:
        for sensor_name in temp_type_lst:
            reading = getTemp(gpu, sensor_name)
            label = 'Temperature (Sensor %s) (C)' % (sensor_name)
            report = printInfoLog if reading == 'N/A' else printLog
            report(gpu, label, reading)
    printLogSpacer()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def showFwInfo(deviceList, fwType):
    """ Show the requested FW information for a list of devices

    Names in fwType that do not match a known firmware block (compared
    case-insensitively) are ignored. The fwType list itself is not modified.

    :param deviceList: List of DRM devices (can be a single-item list)
    :param fwType: [$validFwBlocks] FW block version to display (all if left empty)
    """
    if not fwType or 'all' in fwType:
        firmware_blocks = fw_block_names_l
    else:
        # Filter into a new list. Removing entries from fwType while iterating
        # over it skips the element that follows each removal, which let
        # invalid names through to the index() lookup below (ValueError).
        firmware_blocks = [name for name in fwType if name.upper() in fw_block_names_l]
    printLogSpacer(' Firmware Information ')
    for device in deviceList:
        fw_ver = c_uint64()
        for fw_name in firmware_blocks:
            fw_name = fw_name.upper()
            ret = rocmsmi.rsmi_dev_firmware_version_get(device, fw_block_names_l.index(fw_name), byref(fw_ver))
            if not rsmi_ret_ok(ret, device, 'get_firmware_version_' + str(fw_name), silent=True):
                continue
            label = '%s firmware version' % (fw_name)
            # Version as lowercase hex digits, zero-padded to at least 8 digits
            hex_digits = '%08x' % fw_ver.value
            # These firmware blocks report their version in hexadecimal
            if fw_name in ['VCN', 'VCE', 'UVD', 'SOS', 'ASD', 'MES', 'MES KIQ']:
                printLog(device, label, '\t0x%s' % (hex_digits))
            # The TA XGMI, TA RAS, and SMC firmware's hex value looks like 0x12345678
            # However, they are parsed as: int(0x12).int(0x34).int(0x56).int(0x78)
            # i.e. each of the first four hex byte pairs is shown as a decimal field
            elif fw_name in ['TA XGMI', 'TA RAS', 'SMC']:
                fields = ['%02d' % int(hex_digits[pos:pos + 2], 16) for pos in range(0, 8, 2)]
                printLog(device, label, '\t%s' % ('.'.join(fields)))
            # The ME, MC, and CE firmware names are only 2 characters, so they need an additional tab
            elif fw_name in ['ME', 'MC', 'CE']:
                printLog(device, label, '\t\t%s' % (str(fw_ver.value)))
            else:
                printLog(device, label, '\t%s' % (str(fw_ver.value)))
    printLogSpacer()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def showGpusByPid(pidList):
    """ Show GPUs used by a specific Process ID (pid)

    Print out the GPU(s) used by a specific KFD process
    If pidList is empty, print all used GPUs for all KFD processes

    :param pidList: List of PIDs to check
    """
    printLogSpacer(' GPUs Indexed by PID ')
    num_devices = c_uint32()

    # If pidList is empty then we were given 0 arguments, so they want all PIDs
    if not pidList:
        pidList = getPidList()
        if not pidList:
            printLog(None, 'No KFD PIDs currently running', None)
            printLogSpacer()
            return
    for pid in pidList:
        # First call with no output buffer: only fills in the device count
        ret = rocmsmi.rsmi_compute_process_gpus_get(int(pid), None, byref(num_devices))
        if rsmi_ret_ok(ret, metric=('PID ' + str(pid))):
            # Second call with a buffer sized from that count: fetches the indices
            dv_indices = (c_uint32 * num_devices.value)()
            ret = rocmsmi.rsmi_compute_process_gpus_get(int(pid), dv_indices, byref(num_devices))

            if rsmi_ret_ok(ret, metric='get_gpu_compute_process'):
                metricName = 'PID %s is using %s DRM device(s)' % (pid, str(num_devices.value))
                if num_devices.value:
                    printListLog(metricName, list(dv_indices))
                else:
                    printLog(None, metricName, None)
        else:
            # Report through printLog like every other message in this
            # function; a bare print() of these three arguments emitted the
            # literal text "None" before and after the message.
            printLog(None, 'Unable to get list of KFD PIDs. A kernel update may be needed', None)
    printLogSpacer()
|
|
|
|
|
|
2021-07-29 12:43:54 -04:00
|
|
|
|
|
|
|
|
def getCoarseGrainUtil(device, typeName=None):
    """ Read the coarse grain utilization counters of a device

    With typeName given, a one-element counter array for that type is
    returned. Without it, the array covers every counter type, and each
    element can be consumed like this:

    .. code-block:: python

        for ut_counter in utilization_counters:
            printLog(device, utilization_counter_name[ut_counter.type], ut_counter.val)

    Returns -1 when typeName is not a known counter name or when the library
    call does not succeed.

    :param device: DRM device identifier
    :param typeName: 'GFX Activity', 'Memory Activity'
    """
    timestamp = c_uint64(0)

    if typeName is None:
        # No type requested: ask for every counter type the library defines
        length = rsmi_utilization_counter_type.RSMI_UTILIZATION_COUNTER_LAST + 1
        utilization_counters = (rsmi_utilization_counter_t * length)()
        for counter_type in range(0, length):
            utilization_counters[counter_type].type = c_int(counter_type)
    else:
        try:
            counter_type = utilization_counter_name.index(typeName)
        except ValueError:
            printLog(None, "No such coarse grain counter type")
            return -1
        length = 1
        utilization_counters = (rsmi_utilization_counter_t * length)()
        utilization_counters[0].type = c_int(counter_type)

    ret = rocmsmi.rsmi_utilization_count_get(device, utilization_counters, length, byref(timestamp))
    if not rsmi_ret_ok(ret, device, 'get_utilization_count_'+ str(typeName), True):
        return -1
    return utilization_counters
|
|
|
|
|
|
2020-07-15 06:01:40 -04:00
|
|
|
|
|
|
|
|
def showGpuUse(deviceList):
    """ Display GPU use for a list of devices

    Prints the busy percentage followed by the 'GFX Activity' coarse grain
    utilization counter of each device.

    :param deviceList: List of DRM devices (can be a single-item list)
    """
    printLogSpacer(' % time GPU is busy ')
    for device in deviceList:
        # Query once so the value checked against -1 is the value printed
        gpu_use = getGpuUse(device)
        if gpu_use != -1:
            printLog(device, 'GPU use (%)', gpu_use)
        else:
            printLog(device, 'GPU use Unsupported', None)
        util_counters = getCoarseGrainUtil(device, "GFX Activity")
        if util_counters != -1:
            for ut_counter in util_counters:
                printLog(device, utilization_counter_name[ut_counter.type], ut_counter.val)
        else:
            printInfoLog(device, 'GFX Activity', 'N/A')

    printLogSpacer()
|
|
|
|
|
|
2021-07-29 12:43:54 -04:00
|
|
|
|
2021-04-17 01:37:19 -04:00
|
|
|
def showEnergy(deviceList):
    """ Print the energy counter and the accumulated energy of each device

    The accumulated energy is the raw counter multiplied by the counter
    resolution returned by the same library call, rounded to 2 decimals.
    Devices for which the call does not succeed are left out.

    :param deviceList: List of DRM devices (can be a single-item list)
    """
    energy_count = c_uint64()
    resolution = c_float()
    sample_time = c_uint64()
    printLogSpacer(" Consumed Energy ")
    for gpu in deviceList:
        status = rocmsmi.rsmi_dev_energy_count_get(gpu, byref(energy_count), byref(resolution), byref(sample_time))
        if not rsmi_ret_ok(status, gpu, "% Energy Counter"):
            continue
        printLog(gpu, "Energy counter", energy_count.value)
        accumulated = round(energy_count.value * resolution.value, 2)
        printLog(gpu, "Accumulated Energy (uJ)", accumulated)
    printLogSpacer()
|
|
|
|
|
|
2020-07-15 06:01:40 -04:00
|
|
|
|
|
|
|
|
def showId(deviceList):
    """ Print the identifying fields of each device

    Shown per device, in order: device name, device ID, device revision,
    subsystem ID and GUID.

    :param deviceList: List of DRM devices (can be a single-item list)
    """
    printLogSpacer(' ID ')
    for gpu in deviceList:
        name_text = '\t\t' + str(getDeviceName(gpu))
        printLog(gpu, 'Device Name', name_text)
        id_text = '\t\t' + str(getDRMDeviceId(gpu))
        printLog(gpu, 'Device ID', id_text)
        rev_text = '\t\t' + str(getRev(gpu))
        printLog(gpu, 'Device Rev', rev_text)
        # the longer label takes a single tab
        subsystem_text = '\t' + str(getSubsystemId(gpu))
        printLog(gpu, 'Subsystem ID', subsystem_text)
        guid_text = '\t\t' + str(getGUID(gpu))
        printLog(gpu, 'GUID', guid_text)
    printLogSpacer()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def showMaxPower(deviceList):
    """ Display the maximum Graphics Package Power that this GPU will attempt to consume
    before it begins throttling performance

    :param deviceList: List of DRM devices (can be a single-item list)
    """
    printLogSpacer(' Power Cap ')
    for device in deviceList:
        # Query once so the value checked against -1 is the value printed
        max_power = getMaxPower(device)
        if max_power != -1:
            printLog(device, 'Max Graphics Package Power (W)', max_power)
        else:
            printLog(device, 'Max Graphics Package Power Unsupported', None)
    printLogSpacer()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def showMemInfo(deviceList, memType):
    """ Display Memory information for a list of devices

    :param deviceList: List of DRM devices (can be a single-item list)
    :param memType: [$validMemTypes] Type of memory information to display
    """
    # memType arrives as a list of type names. If 'all' is among them every
    # supported memory type is reported, otherwise only the requested ones.
    returnTypes = memory_type_l if 'all' in memType else memType

    printLogSpacer(' Memory Usage (Bytes) ')
    for device in deviceList:
        for requested in returnTypes:
            memName = requested.upper()
            # getMemInfo result is indexed as [0] = used, [1] = total below
            memInfo = getMemInfo(device, memName)
            printLog(device, '%s Total Memory (B)' % (memName), memInfo[1])
            printLog(device, '%s Total Used Memory (B)' % (memName), memInfo[0])
    printLogSpacer()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def showMemUse(deviceList):
    """ Display GPU memory usage for a list of devices

    Per device this logs the allocated VRAM percentage, the memory busy
    percentage (when available), the coarse-grain "Memory Activity" counters
    and the average memory bandwidth.

    :param deviceList: List of DRM devices (can be a single-item list)
    """
    busyPercent = c_uint64()
    avgBandwidth = c_uint16()
    printLogSpacer(' Current Memory Use ')
    for device in deviceList:
        allocated = getAllocatedMemoryPercent(device)
        printLog(device, 'GPU Memory Allocated (VRAM%)', int(allocated['value']))

        # Busy percentage is optional: failures are silenced and simply skipped
        ret = rocmsmi.rsmi_dev_memory_busy_percent_get(device, byref(busyPercent))
        if rsmi_ret_ok(ret, device, '% memory use', silent=True):
            printLog(device, 'GPU Memory Read/Write Activity (%)', busyPercent.value)

        # getCoarseGrainUtil signals "no data" with -1
        counters = getCoarseGrainUtil(device, "Memory Activity")
        if counters == -1:
            printLog(device, 'Memory Activity', 'N/A')
        else:
            for counter in counters:
                printLog(device, utilization_counter_name[counter.type], counter.val)

        ret = rocmsmi.rsmi_dev_activity_avg_mm_get(device, byref(avgBandwidth))
        if not rsmi_ret_ok(ret, device, silent=True):
            printLog(device, 'Not supported on the given system', None)
        else:
            printLog(device, 'Avg. Memory Bandwidth', avgBandwidth.value)
    printLogSpacer()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def showMemVendor(deviceList):
    """ Display GPU memory vendor for a list of devices

    :param deviceList: List of DRM devices (can be a single-item list)
    """
    bufferLen = 256
    vendor = create_string_buffer(bufferLen)
    printLogSpacer(' Memory Vendor ')
    for device in deviceList:
        ret = rocmsmi.rsmi_dev_vram_vendor_get(device, vendor, bufferLen)
        try:
            # The return code is checked first; the buffer is only decoded
            # when the library call succeeded.
            vendorName = ''
            if rsmi_ret_ok(ret, device, 'get_vram_vendor'):
                vendorName = vendor.value.decode()
            if vendorName:
                printLog(device, 'GPU memory vendor', vendorName)
            else:
                logging.debug('GPU memory vendor missing or not supported')
        except UnicodeDecodeError:
            printErrLog(device, 'Unable to read GPU memory vendor')
    printLogSpacer()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def showOverDrive(deviceList, odtype):
    """ Display current OverDrive level for a list of devices

    :param deviceList: List of DRM devices (can be a single-item list)
    :param odtype: [sclk|mclk] OverDrive type
    """
    rsmi_od = c_uint32()
    printLogSpacer(' OverDrive Level ')
    for device in deviceList:
        if odtype == 'sclk':
            odStr = 'GPU'
            ret = rocmsmi.rsmi_dev_overdrive_level_get(device, byref(rsmi_od))
            metric = 'get_overdrive_level_' + str(odtype)
        elif odtype == 'mclk':
            odStr = 'GPU Memory'
            ret = rocmsmi.rsmi_dev_mem_overdrive_level_get(device, byref(rsmi_od))
            metric = 'get_mem_overdrive_level_' + str(odtype)
        else:
            printErrLog(device, 'Unable to retrieve OverDrive')
            logging.error('Unsupported clock type %s', odtype)
            # Nothing was read for this device; do not print an empty
            # " OverDrive value (%)" line after reporting the error.
            continue
        if not rsmi_ret_ok(ret, device, metric):
            continue
        printLog(device, odStr + ' OverDrive value (%)', rsmi_od.value)
    printLogSpacer()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def showPcieBw(deviceList):
    """ Display estimated PCIe bandwidth usage for a list of devices

    The instantaneous bandwidth from the GPU metrics table is preferred; when
    it is unavailable or invalid the legacy throughput counters are used.

    :param deviceList: List of DRM devices (can be a single-item list)
    """
    sent = c_uint64()
    received = c_uint64()
    max_pkt_sz = c_uint64()
    printLogSpacer(' Measured PCIe Bandwidth ')
    for device in deviceList:
        # Get BW from GPU metrics from version >= 1.5
        header = metrics_table_header_t()
        ret_version = rocmsmi.rsmi_dev_metrics_header_info_get(device, byref(header))
        metricsHaveBw = (rsmi_ret_ok(ret_version, device, 'get_metrics_header', True)
                         and header.format_revision >= 1
                         and header.content_revision >= 5)
        if metricsHaveBw:
            gpu_metrics = rsmi_gpu_metrics_t()
            ret = rocmsmi.rsmi_dev_gpu_metrics_info_get(device, byref(gpu_metrics))
            if not rsmi_ret_ok(ret, device, "get_gpu_metrics", True):
                printLog(device, "Failed to get GPU metrics info", None)
            else:
                metric_bw = gpu_metrics.pcie_bandwidth_inst
                # An all-ones uint64 or a non-positive value is treated as invalid
                if metric_bw == ctypes.c_uint64(-1).value or metric_bw <= 0:
                    printLog(device, "GPU metrics pcie_bandwidth_inst is invalid", None)
                else:
                    # Convert megabits to megabytes
                    printLog(device, "Current PCIe bandwidth (MB/s)", f"{metric_bw / 8.0:.3f}")
                    continue

        # Use legacy API (For GPU metric version < 1.5 or failed)
        ret = rocmsmi.rsmi_dev_pci_throughput_get(device, byref(sent), byref(received), byref(max_pkt_sz))
        if not rsmi_ret_ok(ret, device, 'get_PCIe_bandwidth'):
            logging.debug('GPU PCIe bandwidth usage not supported')
            continue
        # Dividing by 1024.0 keeps the result a float rather than an integer
        packets = received.value + sent.value
        bw = (packets * max_pkt_sz.value) / 1024.0 / 1024.0
        # Three decimal places of precision in the printed string
        printLog(device, 'Estimated maximum PCIe bandwidth over the last second (MB/s)', '%.3f' % bw)
    printLogSpacer()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def showPcieReplayCount(deviceList):
    """ Display number of PCIe replays for a list of devices

    :param deviceList: List of DRM devices (can be a single-item list)
    """
    replays = c_uint64()
    printLogSpacer(' PCIe Replay Counter ')
    for device in deviceList:
        ret = rocmsmi.rsmi_dev_pci_replay_counter_get(device, byref(replays))
        if not rsmi_ret_ok(ret, device, 'PCIe Replay Count'):
            # Nothing is printed for a device whose counter could not be read
            continue
        printLog(device, 'PCIe Replay Count', replays.value)
    printLogSpacer()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def showPerformanceLevel(deviceList):
    """ Display current Performance Level for a list of devices

    :param deviceList: List of DRM devices (can be a single-item list)
    """
    printLogSpacer(' Show Performance Level ')
    for device in deviceList:
        # Query once per device so the level that is printed is the same
        # value that was checked against the -1 "unsupported" sentinel.
        perfLevel = getPerfLevel(device)
        if perfLevel != -1:
            printLog(device, 'Performance Level', str(perfLevel).lower())
        else:
            printLog(device, 'Performance Level Unsupported', None)
    printLogSpacer()
|
|
|
|
|
|
|
|
|
|
|
2023-06-13 11:20:27 -05:00
|
|
|
def showPids(verbose):
    """ Show Information for PIDs created in a KFD (Compute) context

    :param verbose: "details" adds one table row per GPU index returned for
        each process; any other value adds a single row per process
    """
    printLogSpacer(' KFD Processes ')
    detailed = verbose == "details"
    gpuHeading = 'GPU' if detailed else 'GPU(s)'
    dataArray = [['PID', 'PROCESS NAME', gpuHeading, 'VRAM USED', 'SDMA USED', 'CU OCCUPANCY']]

    pidList = getPidList()
    if not pidList:
        printLog(None, 'No KFD PIDs currently running', None)
        printLogSpacer()
        return

    num_devices = c_uint32()
    proc = rsmi_process_info_t()
    # A cu_occupancy equal to this value is not reported
    cuOccupancyInvalid = 0xFFFFFFFF
    for pid in pidList:
        gpuNumber = 'UNKNOWN'
        vramUsage = 'UNKNOWN'
        sdmaUsage = 'UNKNOWN'
        cuOccupancy = 'UNKNOWN'
        # Sized from the current num_devices value so the name is always an
        # iterable array, even when the queries below fail
        dv_indices = (c_uint32 * num_devices.value)()

        # First call (NULL buffer) obtains the count, second fills the indices
        ret = rocmsmi.rsmi_compute_process_gpus_get(int(pid), None, byref(num_devices))
        if rsmi_ret_ok(ret, metric='get_gpu_compute_process'):
            dv_indices = (c_uint32 * num_devices.value)()
            ret = rocmsmi.rsmi_compute_process_gpus_get(int(pid), dv_indices, byref(num_devices))
            if rsmi_ret_ok(ret, metric='get_gpu_compute_process'):
                gpuNumber = str(num_devices.value)
            else:
                logging.debug('Unable to fetch GPU number by PID')

        if detailed:
            for dv_ind in dv_indices:
                ret = rocmsmi.rsmi_compute_process_info_by_device_get(int(pid), dv_ind, byref(proc))
                if not rsmi_ret_ok(ret, metric='get_compute_process_info_by_pid'):
                    logging.debug('Unable to fetch process info by PID')
                else:
                    vramUsage = proc.vram_usage
                    sdmaUsage = proc.sdma_usage
                    if proc.cu_occupancy != cuOccupancyInvalid:
                        cuOccupancy = proc.cu_occupancy
                dataArray.append([pid, getProcessName(pid), str(gpuNumber), str(vramUsage),
                                  str(sdmaUsage), str(cuOccupancy)])
        else:
            ret = rocmsmi.rsmi_compute_process_info_by_pid_get(int(pid), byref(proc))
            if not rsmi_ret_ok(ret, metric='get_compute_process_info_by_pid'):
                logging.debug('Unable to fetch process info by PID')
            else:
                vramUsage = proc.vram_usage
                sdmaUsage = proc.sdma_usage
                if proc.cu_occupancy != cuOccupancyInvalid:
                    cuOccupancy = proc.cu_occupancy
            dataArray.append([pid, getProcessName(pid), str(gpuNumber), str(vramUsage),
                              str(sdmaUsage), str(cuOccupancy)])

    printLog(None, 'KFD process information:', None)
    print2DArray(dataArray)
    printLogSpacer()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def showPower(deviceList):
    """ Display Current (also known as instant) Socket or Average
    Graphics Package Power Consumption for a list of devices

    :param deviceList: List of DRM devices (can be a single-item list)
    """
    sawSecondaryDie = False
    printLogSpacer(' Power Consumption ')
    for device in deviceList:
        power_dict = getPower(device)
        readingValid = (power_dict['ret'] == rsmi_status_t.RSMI_STATUS_SUCCESS and
                        power_dict['power_type'] != 'INVALID_POWER_TYPE')
        if readingValid:
            heading = power_dict['power_type'].title() + ' Graphics Package Power ('
            heading = heading + power_dict['unit'] + ')'
            printLog(device, heading, power_dict['power'])
        elif checkIfSecondaryDie(device):
            printLog(device, 'Average Graphics Package Power (W)', "N/A (Secondary die)")
            sawSecondaryDie = True
        # Devices that are neither readable nor a secondary die print nothing

    if sawSecondaryDie:
        printLog(None, "\n\t\tPrimary die (usually one above or below the secondary) shows total (primary + secondary) socket power information", None)
    printLogSpacer()
|
|
|
|
|
|
|
|
|
|
|
2024-05-03 02:58:31 -05:00
|
|
|
def showPowerPlayTable(deviceList):
    """ Display current GPU Memory clock frequencies and voltages for a list of devices

    Nothing is printed when JSON output is enabled.

    :param deviceList: List of DRM devices (can be a single-item list)
    """
    global PRINT_JSON
    if PRINT_JSON:
        return

    def toMhz(hertz):
        # Frequencies are divided by 1e6 and truncated for display as Mhz
        return int(hertz / 1000000)

    printLogSpacer(' GPU Memory clock frequencies and voltages ')
    odvf = rsmi_od_volt_freq_data_t()
    for device in deviceList:
        ret = rocmsmi.rsmi_dev_od_volt_info_get(device, byref(odvf))
        if not rsmi_ret_ok(ret, device, 'get_od_volt'):
            continue
        # TODO: Make this more dynamic and less hard-coded if possible
        printLog(device, 'OD_SCLK:', None)
        printLog(device, '0: %sMhz' % (toMhz(odvf.curr_sclk_range.lower_bound)), None)
        printLog(device, '1: %sMhz' % (toMhz(odvf.curr_sclk_range.upper_bound)), None)
        printLog(device, 'OD_MCLK:', None)
        printLog(device, '0: %sMhz' % (toMhz(odvf.curr_mclk_range.lower_bound)), None)
        printLog(device, '1: %sMhz' % (toMhz(odvf.curr_mclk_range.upper_bound)), None)

        sclkLimits = odvf.sclk_freq_limits
        mclkLimits = odvf.mclk_freq_limits
        hasSclkLimits = sclkLimits.lower_bound > 0 or sclkLimits.upper_bound > 0
        hasMclkLimits = mclkLimits.lower_bound > 0 or mclkLimits.upper_bound > 0
        # The OD_RANGE heading appears when at least one limit pair is set
        if hasSclkLimits or hasMclkLimits:
            printLog(device, 'OD_RANGE:', None)
        if hasSclkLimits:
            printLog(device, 'SCLK: %sMhz %sMhz' % (
                toMhz(sclkLimits.lower_bound), toMhz(sclkLimits.upper_bound)), None)
        if hasMclkLimits:
            printLog(device, 'MCLK: %sMhz %sMhz' % (
                toMhz(mclkLimits.lower_bound), toMhz(mclkLimits.upper_bound)), None)
    printLogSpacer()
|
|
|
|
|
|
|
|
|
|
|
2024-01-31 21:03:33 -06:00
|
|
|
def showProduct(deviceList):
    """ Show the requested product information for a list of devices

    Devices that are not AMD devices get an "Incompatible device" message
    instead of the product fields.

    :param deviceList: List of DRM devices (can be a single-item list)
    """
    printLogSpacer(' Product Info ')
    for device in deviceList:
        # Only continue if GPU vendor is AMD
        if not isAmdDevice(device):
            vendor = getVendor(device)
            printLog(device, 'Incompatible device.\n' \
                             'GPU[%s]\t\t: Expected vendor name: Advanced Micro Devices, Inc. [AMD/ATI]\n' \
                             'GPU[%s]\t\t: Actual vendor name' % (device, device), vendor)
            continue

        # TODO: Retrieve the SKU using 'rsmi_dev_sku_get' from the LIB
        # Device SKU is just the characters in between the two '-' in vbios_version
        vbios = getVbiosVersion(device, True)
        device_sku = "N/A"
        if vbios.count('-') == 2:
            middle = vbios.split('-')[1]
            if len(str(middle)) > 1:
                device_sku = middle

        printLog(device, 'Card Series', '\t\t' + str(getDeviceName(device)))
        # Retrieve device ID from DRM and KFD
        printLog(device, 'Card Model', '\t\t' + getDRMDeviceId(device))
        printLog(device, 'Card Vendor', '\t\t' + getVendor(device))
        printLog(device, 'Card SKU', '\t\t' + device_sku)
        printLog(device, 'Subsystem ID', '\t' + getSubsystemId(device))
        printLog(device, 'Device Rev', '\t\t' + getRev(device))
        printLog(device, 'Node ID', '\t\t' + str(getNodeId(device)))
        printLog(device, 'GUID', '\t\t' + str(getGUID(device)))
        printLog(device, 'GFX Version', '\t\t' + getTargetGfxVersion(device))
    printLogSpacer()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def showProfile(deviceList):
    """ Display available Power Profiles for a list of devices.

    Nothing is printed when JSON output is enabled. The profile that is
    currently active is marked with a trailing '*'.

    :param deviceList: List of DRM devices (can be a single-item list)
    """
    global PRINT_JSON
    if PRINT_JSON:
        return
    printLogSpacer(' Show Power Profiles ')
    status = rsmi_power_profile_status_t()
    for device in deviceList:
        ret = rocmsmi.rsmi_dev_power_profile_presets_get(device, 0, byref(status))
        if not rsmi_ret_ok(ret, device, 'get_power_profiles', silent=False):
            continue
        # Reversed so that string index i corresponds to bit i of the mask
        binaryMaskString = str(format(status.available_profiles, '07b'))[::-1]
        profileNumber = 0
        for bitMaskPosition in range(7):
            if binaryMaskString[bitMaskPosition] != '1':
                continue
            profileNumber += 1
            profileMask = 2 ** bitMaskPosition
            label = '%d. Available power profile (#%d of 7)' % (profileNumber, bitMaskPosition + 1)
            if profileMask == status.current:
                printLog(device, label, profileString(profileMask) + '*')
            else:
                printLog(device, label, profileString(profileMask))
    printLogSpacer()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def showRange(deviceList, rangeType):
    """ Show the valid sclk, mclk or voltage range for the specified devices

    An unrecognised rangeType is reported and sets RETCODE to 1.

    :param deviceList: List of DRM devices (can be a single-item list)
    :param rangeType: [sclk|mclk|voltage] Type of range to return
    """
    global RETCODE
    if rangeType not in {'sclk', 'mclk', 'voltage'}:
        printLog(None, 'Invalid range identifier %s' % (rangeType), None)
        RETCODE = 1
        return
    printLogSpacer(' Show Valid %s Range ' % (rangeType))
    odvf = rsmi_od_volt_freq_data_t()
    uint64_max = UIntegerTypes.UINT64_T
    for device in deviceList:
        ret = rocmsmi.rsmi_dev_od_volt_info_get(device, byref(odvf))
        if not rsmi_ret_ok(ret, device, 'get_od_volt', silent=False):
            printLog(device, 'Unable to display %s range' % (rangeType), None)
            continue

        if rangeType == 'voltage':
            if odvf.num_regions == 0:
                printErrLog(device, 'Voltage curve regions unsupported.', is_warning=True)
                continue
            num_regions = c_uint32(odvf.num_regions)
            regions = (rsmi_freq_volt_region_t * odvf.num_regions)()
            ret = rocmsmi.rsmi_dev_od_volt_curve_regions_get(device, byref(num_regions), byref(regions))
            if rsmi_ret_ok(ret, device, 'volt'):
                for i in range(num_regions.value):
                    voltRange = regions[i].volt_range
                    printLog(device,
                             'Region %d: Valid voltage range: %smV - %smV' % (
                                 i, voltRange.lower_bound, voltRange.upper_bound),
                             None)
            continue

        # rangeType is 'sclk' or 'mclk' from here on
        clkRange = odvf.curr_sclk_range if rangeType == 'sclk' else odvf.curr_mclk_range
        # A bound equal to uint64_max is treated as "no data"
        if clkRange.lower_bound == uint64_max or clkRange.upper_bound == uint64_max:
            printLog(device, 'Unable to display %s range' % (rangeType), None)
            continue
        printLog(device, 'Valid %s range: %sMhz - %sMhz' % (
            rangeType, int(clkRange.lower_bound / 1000000), int(clkRange.upper_bound / 1000000)), None)
    printLogSpacer()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def showRasInfo(deviceList, rasType):
    """ Show the requested RAS information for a list of devices

    Names in rasType that are not known RAS blocks are reported and ignored.
    The caller's rasType list is not modified.

    :param deviceList: List of DRM devices (can be a single-item list)
    :param rasType: [$validRasBlocks] RAS counter to display (all if left empty)
    """
    state = rsmi_ras_err_state_t()
    if not rasType or 'all' in rasType:
        rasBlocks = rsmi_gpu_block_d.keys()
    else:
        # Build a filtered list instead of removing from rasType while
        # iterating it: in-place removal skips the element that follows each
        # removed one, letting an invalid name through to the lookup below.
        rasBlocks = []
        for name in rasType:
            if name.upper() in rsmi_gpu_block_d:
                rasBlocks.append(name.upper())
            else:
                printErrLog(None, '%s is not a RAS block' % (name))

    printLogSpacer(' RAS Info ')
    for device in deviceList:
        data = []
        for block in rasBlocks:
            ret = rocmsmi.rsmi_dev_ecc_status_get(device, rsmi_gpu_block_d[block], byref(state))
            if not rsmi_ret_ok(ret, device, 'get_ecc_status_' + str(block), True):
                continue
            row = [block, rsmi_ras_err_stale_machine[state.value].upper()]
            # Now add the error count
            # NOTE(review): the previous guard here was
            #   state != 'disabled' or 'none' or 'unknown error'
            # which is always true, so counts have always been queried for
            # every state. That behaviour is kept; confirm whether those
            # states were meant to be skipped before tightening it.
            ec = rsmi_error_count_t()
            ret = rocmsmi.rsmi_dev_ecc_count_get(device, rsmi_gpu_block_d[block], byref(ec))
            if rsmi_ret_ok(ret, device, 'ecc err count', True):
                row.append(ec.correctable_err)
                row.append(ec.uncorrectable_err)
                # Only complete four-column rows are added to the table
                data.append(row)
        printTableLog([' Block', ' Status ', 'Correctable Error', 'Uncorrectable Error'], data, device,
                      'RAS INFO')
        # TODO: Use dynamic spacing for column widths
        printLogSpacer(None, '_')
    printLogSpacer()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def showRetiredPages(deviceList, retiredType='all'):
    """ Show retired pages of a specified type for a list of devices

    A device whose page information cannot be read is skipped; the remaining
    devices are still processed and the closing spacer is always printed.

    :param deviceList: List of DRM devices (can be a single-item list)
    :param retiredType: Type of retired pages to show (default = all)
    """
    printLogSpacer(' Pages Info ')
    num_pages = c_uint32()

    for device in deviceList:
        # First call (NULL buffer) only obtains the number of records
        ret = rocmsmi.rsmi_dev_memory_reserved_pages_get(device, byref(num_pages), None)
        if not rsmi_ret_ok(ret, device, 'ras'):
            logging.debug('Unable to retrieve reserved page info')
            continue
        if num_pages.value == 0:
            # No records to fetch, so there is nothing to print
            continue

        records = (rsmi_retired_page_record_t * num_pages.value)()
        ret = rocmsmi.rsmi_dev_memory_reserved_pages_get(device, byref(num_pages), byref(records))
        if not rsmi_ret_ok(ret, device, 'ras'):
            # Without this check a failed fetch would print zero-filled records
            logging.debug('Unable to retrieve reserved page info')
            continue

        data = [(hex(rec.page_address), hex(rec.page_size), memory_page_status_l[rec.status])
                for rec in records
                if retiredType == 'all' or memory_page_status_l[rec.status] == retiredType]
        if data:
            printTableLog([' Page address', ' Page size', ' Status'], data, device,
                          retiredType.upper() + ' PAGES INFO')
    printLogSpacer()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def showSerialNumber(deviceList):
    """ Display the serial number for a list of devices

    A serial number that cannot be decoded is reported as an error; a failed
    query or an empty serial number is shown as 'N/A'.

    :param deviceList: List of DRM devices (can be a single-item list)
    """
    bufferLen = 256
    printLogSpacer(' Serial Number ')
    for device in deviceList:
        sn = create_string_buffer(bufferLen)
        ret = rocmsmi.rsmi_dev_serial_number_get(device, sn, bufferLen)
        # Decoding is attempted before the return code is inspected
        try:
            serial = sn.value.decode()
        except UnicodeDecodeError:
            printErrLog(device, "FRU Serial Number contains non-alphanumeric characters. FRU is likely corrupted")
            continue

        if not (rsmi_ret_ok(ret, device, 'get_serial_number') and serial):
            serial = 'N/A'
        printLog(device, 'Serial Number', serial)
    printLogSpacer()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def showUId(deviceList):
    """ Display the unique device ID for a list of devices

    The ID is printed in hexadecimal, or as 'N/A' when it cannot be read.

    :param deviceList: List of DRM devices (can be a single-item list)
    """
    printLogSpacer(' Unique ID ')
    for device in deviceList:
        dv_uid = c_uint64()
        ret = rocmsmi.rsmi_dev_unique_id_get(device, byref(dv_uid))
        # hex() never yields an empty string, so only the return code decides
        if rsmi_ret_ok(ret, device, 'get_unique_id', True):
            uniqueId = hex(dv_uid.value)
        else:
            uniqueId = 'N/A'
        printLog(device, 'Unique ID', uniqueId)
    printLogSpacer()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def showVbiosVersion(deviceList):
    """ Display the VBIOS version for a list of devices

    :param deviceList: List of DRM devices (can be a single-item list)
    """
    printLogSpacer(' VBIOS ')
    for device in deviceList:
        # Lookup errors are silenced; whatever getVbiosVersion returns is printed
        vbiosVersion = getVbiosVersion(device, silent=True)
        printLog(device, 'VBIOS version', vbiosVersion)
    printLogSpacer()
|
|
|
|
|
|
|
|
|
|
|
2021-03-17 00:24:29 -04:00
|
|
|
class _Getch:
|
|
|
|
|
"""
|
|
|
|
|
Get a single character from standard input
|
|
|
|
|
"""
|
2021-07-29 12:43:54 -04:00
|
|
|
|
2021-03-17 00:24:29 -04:00
|
|
|
def __init__(self):
|
|
|
|
|
import sys, tty
|
2021-07-29 12:43:54 -04:00
|
|
|
|
2021-03-17 00:24:29 -04:00
|
|
|
def __call__(self):
|
|
|
|
|
import sys, termios, tty
|
|
|
|
|
fd = sys.stdin.fileno()
|
|
|
|
|
old_settings = termios.tcgetattr(fd)
|
|
|
|
|
try:
|
|
|
|
|
tty.setraw(sys.stdin.fileno())
|
|
|
|
|
ch = sys.stdin.read(1)
|
|
|
|
|
finally:
|
|
|
|
|
termios.tcsetattr(fd, termios.TCSADRAIN, old_settings)
|
|
|
|
|
return ch
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def showEvents(deviceList, eventTypes):
    """ Display a blocking list of events for a list of devices

    One listener thread is started per device. The call blocks until the
    user enters 'q', stdin is closed, or an interrupt is received; it then
    sets the global stop_threads flag and waits for the listeners to finish.

    :param deviceList: List of DRM devices (can be a single-item list)
    :param eventTypes: List of event type names (can be a single-item list)
    """
    global stop_threads
    printLogSpacer(' Show Events ')
    printLog(None, 'press \'q\' or \'ctrl + c\' and then \'Enter\' to quit', None)
    eventTypeList = []
    thread_list = []
    for event in eventTypes:  # Cleaning list from wrong values
        cleaned = event.replace(',', '')
        if cleaned.upper() in notification_type_names:
            eventTypeList.append(cleaned.upper())
        else:
            printErrLog(None, 'Ignoring unrecognized event type %s' % (cleaned))
    if len(eventTypeList) == 0:
        # No valid type requested: listen for every known notification type
        eventTypeList = notification_type_names
    print2DArray([['DEVICE\t', 'TIME\t', 'TYPE\t', 'DESCRIPTION']])
    # Create a separate thread for each GPU
    for device in deviceList:
        try:
            thread = threading.Thread(target=printEventList, args=(device, 1000, eventTypeList))
            thread.start()
            # Recorded only once started, so every entry can be joined
            thread_list.append(thread)
            time.sleep(0.25)
        except Exception as e:
            printErrLog(device, 'Unable to start new thread. %s' % (e))
            # Do not leave already-started listeners running unattended
            stop_threads = True
            for started in thread_list:
                started.join()
            return
    while True:  # Exit condition from user keyboard input of 'q' or 'ctrl + c' and then 'Enter'
        try:
            user_input = input()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupt would otherwise propagate with the
            # listener threads still running; treat both as a quit request.
            user_input = 'q'
        # Catch user input for q or Ctrl + c
        if user_input == 'q' or user_input == '\x03':
            stop_threads = True
            print('\r')
            break
    for thread in thread_list:
        thread.join()
|
2021-03-17 00:24:29 -04:00
|
|
|
|
|
|
|
|
|
2023-08-10 18:25:02 -05:00
|
|
|
def printTempGraph(deviceList, delay, temp_type):
    """ Continuously redraw one coloured temperature bar per device

    Loops until the module-level stop_threads flag becomes true, or returns
    early when the terminal is narrower than on the previous reading.

    :param deviceList: List of DRM devices (can be a single-item list); sorted in place
    :param delay: Pause after each device reading, in milliseconds
    :param temp_type: Temperature sensor label passed through to getTemp()
    """
    # deviceList must be in ascending order
    deviceList.sort()
    # Reserve one line per device; the redraw below moves the cursor back up over them
    for _ in range(len(deviceList)):
        printEmptyLine()
    previousWidth = os.get_terminal_size()[0]
    while not stop_threads:  # Exit condition from user keyboard input of 'q' or 'ctrl + c'
        currentWidth = os.get_terminal_size()[0]
        graphLines = []
        for device in deviceList:
            temp = getTemp(device, temp_type)
            # An unavailable reading is drawn as an empty bar
            percentage = 0 if temp == 'N/A' else temp
            if percentage >= 100:
                percentage = 100
            if percentage < 0:
                percentage = 0
            # Get available space based on terminal width (20 columns are kept for the label)
            availableSpace = currentWidth - 20 if currentWidth >= 20 else 0
            # Get color based on percentage, with a non-linear scaling
            color = getGraphColor(3.16*(percentage**1.5)**(1/2))
            # Get graph length based on percentage and available space
            padding = min((percentage / float(100)) * availableSpace, availableSpace)
            # The last character of the colour string is the fill character
            bar = color[-1] * int(padding)
            remainder = availableSpace + 1 - padding if availableSpace >= padding else 0
            remainderSpace = ' ' * int(remainder)
            # TODO: Allow terminal size to be decreased
            if currentWidth < previousWidth:
                print('Terminal size cannot be decreased.\n\r')
                return
            tempString = temp if type(temp) is str else str(int(temp))
            # Two spare Spaces
            tempString = (tempString + '°C').ljust(5)
            graphLines.append('\033[2;30;47mGPU[%d] Temp %s|%s%s\x1b[0m%s' % (device, tempString, color, bar, remainderSpace))
            previousWidth = currentWidth
            time.sleep((delay / 1000))

        if currentWidth >= 20:
            # go up and prepare to rewrite the lines
            for _ in graphLines:
                print('\033[A', end='\r')
            # print all strings
            for line in graphLines:
                print(line, end='\r\n')
|
2022-09-07 13:49:43 -04:00
|
|
|
|
|
|
|
|
|
|
|
|
|
def getGraphColor(percentage):
    """ Return the ANSI colour escape plus fill character for a graph bar

    :param percentage: Position on the gradient; clamped to 0..99 and rounded
    :return: One colour escape sequence immediately followed by one fill character
    """
    # Text / Background color mixing (Tested on PuTTY)
    palette = ('\033[2;35;45m', '\033[2;34;45m', '\033[2;35;44m', '\033[2;34;44m',
               '\033[2;36;44m', '\033[2;34;46m', '\033[2;36;46m', '\033[2;32;46m',
               '\033[2;36;42m', '\033[2;32;42m', '\033[2;33;42m', '\033[2;32;43m',
               '\033[2;33;43m', '\033[2;31;43m', '\033[2;33;41m', '\033[2;31;41m')
    fillChars = (' ', '░', '░', '▒', '▒', '░')
    # Ensure percentage is in range and rounded
    level = round(min(max(percentage, 0), 99), 0)
    # Each colour covers two half-steps, so with 16 colours a half-step is
    # 100/32 = 3.125 percent
    halfStep = (100 / len(palette)) / 2
    # The fill character advances every half-step and cycles through fillChars
    charIndex = int((level % (len(fillChars) * halfStep)) / halfStep)
    colorIndex = int(level / (halfStep * 2))
    return palette[colorIndex] + fillChars[charIndex]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def showTempGraph(deviceList):
    """ Draw a live temperature graph until the user presses 'q' or 'ctrl + c'

    Starts printTempGraph in a background thread, waits for the quit key,
    then signals the thread through the module-level stop_threads flag and
    joins it.

    :param deviceList: List of DRM devices (can be a single-item list); sorted in place
    """
    global stop_threads
    deviceList.sort()
    temp_type = getTemperatureLabel(deviceList)
    printLogSpacer(' Temperature Graph ' + temp_type.capitalize() + ' ')
    thread_list = []
    # Start a thread for constantly printing
    try:
        # Create a thread (call print function, devices, delay in ms)
        thread = threading.Thread(target=printTempGraph, args=(deviceList, 150, temp_type))
        thread.start()
        thread_list.append(thread)
    except Exception as e:
        # There is no per-device context here, so log without a device id
        printErrLog(None, 'Unable to start new thread. %s' % (e))
    # Catch user input for program termination
    getch = _Getch()
    while 1:  # Exit condition from user keyboard input of 'q' or 'ctrl + c'
        user_input = getch()
        # Catch user input for q or Ctrl + c
        if user_input == 'q' or user_input == '\x03':
            # Only a quit key may stop the graph; other keys are ignored
            stop_threads = True
            break
    for thread in thread_list:
        thread.join()
    # Reset color to default before exit
    print('\033[A\x1b[0m\r')
    printLogSpacer()
|
|
|
|
|
|
|
|
|
|
|
2023-10-02 17:57:02 -05:00
|
|
|
def showDriverVersion(deviceList, component):
    """ Display the software version for the specified component

    :param deviceList: List of DRM devices (can be a single-item list)
    :param component: Component (currently only driver)
    """
    printLogSpacer(' Version of System Component ')
    label = component_str(component) + ' version'
    version = getVersion(deviceList, component)
    printSysLog(label, version)
    printLogSpacer()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def showVoltage(deviceList):
    """ Display the current voltage (in millivolts) for a list of devices

    :param deviceList: List of DRM devices (can be a single-item list)
    """
    printLogSpacer(' Current voltage ')
    for device in deviceList:
        reading = c_uint64()
        # Voltage type 0 and metric 0 are the only combination queried here
        ret = rocmsmi.rsmi_dev_volt_metric_get(
            device, rsmi_voltage_type_t(0), rsmi_voltage_metric_t(0), byref(reading))
        if not (rsmi_ret_ok(ret, device, 'get_volt_metric') and str(reading.value)):
            logging.debug('GPU voltage not supported')
            continue
        printLog(device, 'Voltage (mV)', str(reading.value))
    printLogSpacer()
|
|
|
|
|
|
|
|
|
|
|
2024-05-03 02:58:31 -05:00
|
|
|
def showVoltageCurve(deviceList):
    """ Show the voltage curve points for the specified devices

    Nothing is printed for a device whose query fails or reports no regions.

    :param deviceList: List of DRM devices (can be a single-item list)
    """
    printLogSpacer(' Voltage Curve Points ')
    odvf = rsmi_od_volt_freq_data_t()
    for device in deviceList:
        ret = rocmsmi.rsmi_dev_od_volt_info_get(device, byref(odvf))
        if not (rsmi_ret_ok(ret, device, 'get_od_volt_info', silent=False) and odvf.num_regions > 0):
            continue
        # Exactly three curve points are reported
        for position in range(3):
            point = list(odvf.curve.vc_points)[position]
            megahertz = int(point.frequency / 1000000)
            millivolts = int(point.voltage)
            printLog(device, 'Voltage point %d: %sMhz %smV' % (position, megahertz, millivolts), None)
    printLogSpacer()
|
|
|
|
|
|
|
|
|
|
|
2020-07-15 06:01:40 -04:00
|
|
|
def showXgmiErr(deviceList):
    """ Display the XGMI Error status

    Queries the XGMI error status of each device and prints the numeric
    status; outside JSON mode a human-readable description is appended.

    :param deviceList: Show XGMI error state for these devices
    """
    descriptions = {
        0: 'No errors detected since last read',
        1: 'Single error detected since last read',
        2: 'Multiple errors detected since last read',
    }
    printLogSpacer('XGMI Error status')
    xe = rsmi_xgmi_status_t()
    for device in deviceList:
        ret = rocmsmi.rsmi_dev_xgmi_error_status(device, byref(xe))
        if not rsmi_ret_ok(ret, device, 'xgmi status'):
            continue
        if xe.value is None:
            continue
        err = int(xe.value)
        if err not in descriptions:
            printErrLog(device, 'Invalid return value from xgmi_error')
            continue
        if PRINT_JSON is True:
            printLog(device, 'XGMI Error count', err)
        else:
            printLog(device, 'XGMI Error count', '%s (%s)' % (err, descriptions[err]))
    printLogSpacer()
|
|
|
|
|
|
|
|
|
|
|
2021-08-17 14:53:32 -04:00
|
|
|
def showAccessibleTopology(deviceList):
    """ Display the HW Topology Information based on link accessibility

    This reads the HW Topology file and displays the matrix for the nodes

    :param deviceList: List of DRM devices (can be a single-item list)
    """
    # The matrix is indexed by device id, so it must span the largest id and
    # not just the number of selected devices (e.g. deviceList == [2, 3])
    matrix_size = max(deviceList, default=-1) + 1
    accessible = c_bool()
    gpu_links_type = [[0] * matrix_size for _ in range(matrix_size)]
    printLogSpacer(' Link accessibility between two GPUs ')
    for srcdevice in deviceList:
        for destdevice in deviceList:
            ret = rocmsmi.rsmi_is_P2P_accessible(srcdevice, destdevice, byref(accessible))
            if rsmi_ret_ok(ret, metric='is_P2P_accessible'):
                gpu_links_type[srcdevice][destdevice] = accessible.value
    if PRINT_JSON:
        formatMatrixToJSON(deviceList, gpu_links_type, "(Topology) Link accessibility between DRM devices {} and {}")
        return

    # Header row: pad by the width of the '%-6s' row-label column
    printTableRow(None, ' ' * 6)
    for row in deviceList:
        printTableRow('%-12s', 'GPU%d' % row)
    printEmptyLine()
    for gpu1 in deviceList:
        printTableRow('%-6s', 'GPU%d' % gpu1)
        for gpu2 in deviceList:
            printTableRow('%-12s', gpu_links_type[gpu1][gpu2])
        printEmptyLine()
|
|
|
|
|
|
|
|
|
|
|
2020-08-05 16:30:22 -04:00
|
|
|
def showWeightTopology(deviceList):
    """ Display the HW Topology Information based on weights

    This reads the HW Topology file and displays the matrix for the nodes

    :param deviceList: List of DRM devices (can be a single-item list)
    """
    # The matrix is indexed by device id, so it must span the largest id and
    # not just the number of selected devices (e.g. deviceList == [2, 3])
    matrix_size = max(deviceList, default=-1) + 1
    gpu_links_weight = [[0] * matrix_size for _ in range(matrix_size)]
    printLogSpacer(' Weight between two GPUs ')
    for srcdevice in deviceList:
        for destdevice in deviceList:
            if srcdevice == destdevice:
                gpu_links_weight[srcdevice][destdevice] = 0
                continue
            weight = c_uint64()
            ret = rocmsmi.rsmi_topo_get_link_weight(srcdevice, destdevice, byref(weight))
            if rsmi_ret_ok(ret, metric='get_link_weight_topology'):
                # Store the ctypes object itself; .value is read when printing
                gpu_links_weight[srcdevice][destdevice] = weight
            else:
                # None marks a failed query and is shown as 'N/A' in the table
                gpu_links_weight[srcdevice][destdevice] = None

    if PRINT_JSON:
        formatMatrixToJSON(deviceList, gpu_links_weight, "(Topology) Weight between DRM devices {} and {}")
        return

    # Header row: pad by the width of the '%-6s' row-label column
    printTableRow(None, ' ' * 6)
    for row in deviceList:
        printTableRow('%-12s', 'GPU%d' % row)
    printEmptyLine()
    for gpu1 in deviceList:
        printTableRow('%-6s', 'GPU%d' % gpu1)
        for gpu2 in deviceList:
            if gpu1 == gpu2:
                printTableRow('%-12s', '0')
            elif gpu_links_weight[gpu1][gpu2] is None:
                printTableRow('%-12s', 'N/A')
            else:
                printTableRow('%-12s', gpu_links_weight[gpu1][gpu2].value)
        printEmptyLine()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def showHopsTopology(deviceList):
    """ Display the HW Topology Information based on number of hops

    This reads the HW Topology file and displays the matrix for the nodes

    :param deviceList: List of DRM devices (can be a single-item list)
    """
    # Output buffer for the link type; it is never read here. An integer
    # buffer is used, matching the other rsmi_topo_get_link_type callers.
    linktype = c_uint64()
    # The matrix is indexed by device id, so it must span the largest id and
    # not just the number of selected devices (e.g. deviceList == [2, 3])
    matrix_size = max(deviceList, default=-1) + 1
    gpu_links_hops = [[0] * matrix_size for _ in range(matrix_size)]
    printLogSpacer(' Hops between two GPUs ')
    for srcdevice in deviceList:
        for destdevice in deviceList:
            if srcdevice == destdevice:
                gpu_links_hops[srcdevice][destdevice] = '0'
                continue
            hops = c_uint64()
            ret = rocmsmi.rsmi_topo_get_link_type(srcdevice, destdevice, byref(hops), byref(linktype))
            if rsmi_ret_ok(ret, metric='get_link_type_topology'):
                # Store the ctypes object itself; .value is read when printing
                gpu_links_hops[srcdevice][destdevice] = hops
            else:
                # None marks a failed query and is shown as 'N/A' in the table
                gpu_links_hops[srcdevice][destdevice] = None

    if PRINT_JSON:
        formatMatrixToJSON(deviceList, gpu_links_hops, "(Topology) Hops between DRM devices {} and {}")
        return

    # Header row: pad by the width of the '%-6s' row-label column
    printTableRow(None, ' ' * 6)
    for row in deviceList:
        printTableRow('%-12s', 'GPU%d' % row)
    printEmptyLine()
    for gpu1 in deviceList:
        printTableRow('%-6s', 'GPU%d' % gpu1)
        for gpu2 in deviceList:
            if gpu1 == gpu2:
                printTableRow('%-12s', '0')
            elif gpu_links_hops[gpu1][gpu2] is None:
                printTableRow('%-12s', 'N/A')
            else:
                printTableRow('%-12s', gpu_links_hops[gpu1][gpu2].value)
        printEmptyLine()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def showTypeTopology(deviceList):
    """ Display the HW Topology Information based on link type

    This reads the HW Topology file and displays the matrix for the nodes

    :param deviceList: List of DRM devices (can be a single-item list)
    """
    # The matrix is indexed by device id, so it must span the largest id and
    # not just the number of selected devices (e.g. deviceList == [2, 3])
    matrix_size = max(deviceList, default=-1) + 1
    hops = c_uint64()
    linktype = c_uint64()
    gpu_links_type = [[0] * matrix_size for _ in range(matrix_size)]
    # Link type codes this function knows how to name
    link_names = {1: "PCIE", 2: "XGMI"}
    printLogSpacer(' Link Type between two GPUs ')
    for srcdevice in deviceList:
        for destdevice in deviceList:
            if srcdevice == destdevice:
                gpu_links_type[srcdevice][destdevice] = '0'
                continue
            ret = rocmsmi.rsmi_topo_get_link_type(srcdevice, destdevice, byref(hops), byref(linktype))
            if rsmi_ret_ok(ret, metric='get_link_topology_type'):
                # Any other link type code is shown as "XXXX"
                gpu_links_type[srcdevice][destdevice] = link_names.get(linktype.value, "XXXX")
            else:
                # A failed query is shown as "XXXX" as well
                gpu_links_type[srcdevice][destdevice] = "XXXX"

    if PRINT_JSON:
        formatMatrixToJSON(deviceList, gpu_links_type, "(Topology) Link type between DRM devices {} and {}")
        return

    # Header row: pad by the width of the '%-6s' row-label column
    printTableRow(None, ' ' * 6)
    for row in deviceList:
        printTableRow('%-12s', 'GPU%d' % row)
    printEmptyLine()
    for gpu1 in deviceList:
        printTableRow('%-6s', 'GPU%d' % gpu1)
        for gpu2 in deviceList:
            if gpu1 == gpu2:
                printTableRow('%-12s', '0')
            else:
                printTableRow('%-12s', gpu_links_type[gpu1][gpu2])
        printEmptyLine()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def showNumaTopology(deviceList):
    """ Display the HW Topology Information for numa nodes

    Prints the NUMA node number and the NUMA affinity of each device; a
    value whose query fails is simply not printed.

    :param deviceList: List of DRM devices (can be a single-item list)
    """
    printLogSpacer(' Numa Nodes ')
    # One output buffer is reused for both queries
    numa_value = c_int32()
    for device in deviceList:
        node_ret = rocmsmi.rsmi_topo_get_numa_node_number(device, byref(numa_value))
        if rsmi_ret_ok(node_ret, device, 'get_numa_node_number'):
            printLog(device, "(Topology) Numa Node", numa_value.value)

        affinity_ret = rocmsmi.rsmi_topo_numa_affinity_get(device, byref(numa_value))
        if rsmi_ret_ok(affinity_ret, metric='get_numa_affinity_topology'):
            printLog(device, "(Topology) Numa Affinity", numa_value.value)
|
2020-09-09 17:34:44 -04:00
|
|
|
|
|
|
|
|
|
2020-08-05 16:30:22 -04:00
|
|
|
def showHwTopology(deviceList):
    """ Display the HW Topology Information based on weight/hops/type

    Prints the weight, hops and link type matrices, each followed by an
    empty line, and then the numa node information.

    :param deviceList: List of DRM devices (can be a single-item list)
    """
    for showMatrix in (showWeightTopology, showHopsTopology, showTypeTopology):
        showMatrix(deviceList)
        printEmptyLine()
    showNumaTopology(deviceList)
|
|
|
|
|
|
|
|
|
|
|
2021-10-26 18:39:23 -04:00
|
|
|
def showNodesBw(deviceList):
    """ Display max and min bandwidth between nodes.
    Currently supports XGMI only.
    This reads the HW Topology file and displays the matrix for the nodes

    :param deviceList: List of DRM devices (can be a single-item list)
    """
    # The matrix is indexed by device id, so it must span the largest id and
    # not just the number of selected devices (e.g. deviceList == [2, 3])
    matrix_size = max(deviceList, default=-1) + 1
    minBW = c_uint32()
    maxBW = c_uint32()
    hops = c_uint64()
    linktype = c_uint64()
    silent = False
    nonXgmi = False
    gpu_links_type = [[0] * matrix_size for _ in range(matrix_size)]
    printLogSpacer(' Bandwidth ')
    for srcdevice in deviceList:
        for destdevice in deviceList:
            if srcdevice == destdevice:
                gpu_links_type[srcdevice][destdevice] = "N/A"
                continue
            link_label = " {} to {}".format(srcdevice, destdevice)
            ret = rocmsmi.rsmi_minmax_bandwidth_get(srcdevice, destdevice, byref(minBW), byref(maxBW))
            # verify that link type is xgmi
            ret2 = rocmsmi.rsmi_topo_get_link_type(srcdevice, destdevice, byref(hops), byref(linktype))
            if rsmi_ret_ok(ret2, link_label, 'get_link_topology_type', True):
                if linktype.value != 2:
                    nonXgmi = True
                    # NOTE(review): once set, silent stays True for every
                    # later pair as well - confirm this is intended
                    silent = True
                    gpu_links_type[srcdevice][destdevice] = "N/A"

            if rsmi_ret_ok(ret, link_label, 'get_link_topology_type', silent):
                gpu_links_type[srcdevice][destdevice] = "{}-{}".format(minBW.value, maxBW.value)
    if PRINT_JSON:
        # TODO
        return
    # Header row: pad by the width of the '%-6s' row-label column
    printTableRow(None, ' ' * 6)
    for row in deviceList:
        printTableRow('%-12s', 'GPU%d' % row)
    printEmptyLine()
    for gpu1 in deviceList:
        printTableRow('%-6s', 'GPU%d' % gpu1)
        for gpu2 in deviceList:
            printTableRow('%-12s', gpu_links_type[gpu1][gpu2])
        printEmptyLine()
    printLog(None,"Format: min-max; Units: mps", None)
    printLog(None,'"0-0" min-max bandwidth indicates devices are not connected directly', None)
    if nonXgmi:
        printLog(None,"Non-xGMI links detected and is currently not supported", None)
|
2021-10-26 18:39:23 -04:00
|
|
|
|
2023-01-06 11:01:18 -06:00
|
|
|
def showComputePartition(deviceList):
    """ Returns the current compute partitioning for a list of devices

    Prints the partition name per device, or a message when the query is
    not supported or fails.

    :param deviceList: List of DRM devices (can be a single-item list)
    """
    bufferLength = 256
    partitionBuffer = create_string_buffer(bufferLength)
    printLogSpacer(' Current Compute Partition ')
    for device in deviceList:
        ret = rocmsmi.rsmi_dev_compute_partition_get(device, partitionBuffer, bufferLength)
        # The buffer is only decoded when the call succeeded
        partition = ''
        if rsmi_ret_ok(ret, device, 'get_compute_partition', silent=True):
            partition = partitionBuffer.value.decode()
        if partition:
            printLog(device, 'Compute Partition', partition)
        elif ret == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED:
            printLog(device, 'Not supported on the given system', None)
        else:
            printLog(device, 'Failed to retrieve compute partition, even though device supports it.')
    printLogSpacer()
|
|
|
|
|
|
2023-09-21 14:53:35 -05:00
|
|
|
def showMemoryPartition(deviceList):
    """ Returns the current memory partition for a list of devices

    Prints the partition name per device, or a message when the query is
    not supported or fails.

    :param deviceList: List of DRM devices (can be a single-item list)
    """
    bufferLength = 256
    partitionBuffer = create_string_buffer(bufferLength)
    printLogSpacer(' Current Memory Partition ')
    for device in deviceList:
        ret = rocmsmi.rsmi_dev_memory_partition_get(device, partitionBuffer, bufferLength)
        # The buffer is only decoded when the call succeeded
        partition = ''
        if rsmi_ret_ok(ret, device, 'get_memory_partition',silent=True):
            partition = partitionBuffer.value.decode()
        if partition:
            printLog(device, 'Memory Partition', partition)
        elif ret == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED:
            printLog(device, 'Not supported on the given system', None)
        else:
            printLog(device, 'Failed to retrieve current memory partition, even though device supports it.')
    printLogSpacer()
|
|
|
|
|
|
2024-06-21 15:13:15 -05:00
|
|
|
class UIntegerTypes(IntEnum):
    """ Largest value representable by each fixed-width unsigned integer

    validateIfMaxUint() compares readings against these values to decide
    whether to report "N/A".
    """
    UINT8_T = (1 << 8) - 1
    UINT16_T = (1 << 16) - 1
    UINT32_T = (1 << 32) - 1
    UINT64_T = (1 << 64) - 1
|
|
|
|
|
|
|
|
|
|
def validateIfMaxUint(valToCheck, uintType: UIntegerTypes):
    """ Replace values equal to the given unsigned maximum with "N/A"

    :param valToCheck: A single value, or a list of values checked element by element
    :param uintType: Maximum value that marks a reading as unavailable
    :return: "N/A" or the unchanged value for a single value; for a list, a
        new list with matching elements replaced by "N/A"
    """
    if isinstance(valToCheck, list):
        # Build a new list so the caller's list is not modified
        return ["N/A" if v == uintType else v for v in valToCheck]
    return "N/A" if valToCheck == uintType else valToCheck
|
|
|
|
|
|
|
|
|
|
def showGPUMetrics(deviceList):
|
|
|
|
|
""" Returns the gpu metrics for a list of devices
|
|
|
|
|
|
|
|
|
|
:param deviceList: List of DRM devices (can be a single-item list)
|
|
|
|
|
"""
|
|
|
|
|
printLogSpacer(' GPU Metrics ')
|
|
|
|
|
gpu_metrics = rsmi_gpu_metrics_t()
|
|
|
|
|
temp_unit="C"
|
|
|
|
|
power_unit="W"
|
|
|
|
|
energy_unit="15.259uJ (2^-16)"
|
|
|
|
|
volt_unit="mV"
|
|
|
|
|
clock_unit="MHz"
|
|
|
|
|
fan_speed="rpm"
|
|
|
|
|
percent_unit="%"
|
2024-12-12 17:56:09 -06:00
|
|
|
vram_max_bw="GB/s"
|
2024-06-21 15:13:15 -05:00
|
|
|
pcie_acc_unit="GB/s"
|
|
|
|
|
pcie_lanes_unit="Lanes"
|
|
|
|
|
pcie_speed_unit="0.1 GT/s"
|
|
|
|
|
xgmi_speed="Gbps"
|
|
|
|
|
xgmi_data_sz="kB"
|
|
|
|
|
time_unit="ns"
|
|
|
|
|
time_unit_10="10ns resolution"
|
|
|
|
|
count="Count"
|
2024-12-12 17:56:09 -06:00
|
|
|
link_status="Up/Down"
|
2024-06-21 15:13:15 -05:00
|
|
|
no_unit = None
|
|
|
|
|
|
|
|
|
|
for device in deviceList:
|
|
|
|
|
ret = rocmsmi.rsmi_dev_gpu_metrics_info_get(device, byref(gpu_metrics))
|
|
|
|
|
metrics = {
|
|
|
|
|
"common_header": "N/A"
|
|
|
|
|
}
|
|
|
|
|
if rsmi_ret_ok(ret, device, 'rsmi_dev_gpu_metrics_info_get',silent=True):
|
|
|
|
|
metrics = {
|
|
|
|
|
"common_header": {
|
|
|
|
|
"version": float(str(gpu_metrics.common_header.format_revision) + "."
|
|
|
|
|
+ str(gpu_metrics.common_header.content_revision)),
|
|
|
|
|
"size": gpu_metrics.common_header.structure_size
|
|
|
|
|
}, "temperature_edge": {
|
|
|
|
|
"value": validateIfMaxUint(gpu_metrics.temperature_edge, UIntegerTypes.UINT16_T),
|
|
|
|
|
"unit": temp_unit,
|
|
|
|
|
}, "temperature_hotspot": {
|
|
|
|
|
"value": validateIfMaxUint(gpu_metrics.temperature_hotspot, UIntegerTypes.UINT16_T),
|
|
|
|
|
"unit": temp_unit,
|
|
|
|
|
}, "temperature_mem": {
|
|
|
|
|
"value": validateIfMaxUint(gpu_metrics.temperature_mem, UIntegerTypes.UINT16_T),
|
|
|
|
|
"unit": temp_unit,
|
|
|
|
|
}, "temperature_vrgfx": {
|
|
|
|
|
"value": validateIfMaxUint(gpu_metrics.temperature_vrgfx, UIntegerTypes.UINT16_T),
|
|
|
|
|
"unit": temp_unit,
|
|
|
|
|
}, "temperature_vrsoc": {
|
|
|
|
|
"value": validateIfMaxUint(gpu_metrics.temperature_vrsoc, UIntegerTypes.UINT16_T),
|
|
|
|
|
"unit": temp_unit,
|
|
|
|
|
}, "temperature_vrmem": {
|
|
|
|
|
"value": validateIfMaxUint(gpu_metrics.temperature_vrmem, UIntegerTypes.UINT16_T),
|
|
|
|
|
"unit": temp_unit,
|
|
|
|
|
}, "average_gfx_activity": {
|
|
|
|
|
"value": validateIfMaxUint(gpu_metrics.average_gfx_activity, UIntegerTypes.UINT16_T),
|
|
|
|
|
"unit": percent_unit,
|
|
|
|
|
}, "average_umc_activity": {
|
|
|
|
|
"value": validateIfMaxUint(gpu_metrics.average_umc_activity, UIntegerTypes.UINT16_T),
|
|
|
|
|
"unit": percent_unit,
|
|
|
|
|
}, "average_mm_activity": {
|
|
|
|
|
"value": validateIfMaxUint(gpu_metrics.average_mm_activity, UIntegerTypes.UINT16_T),
|
|
|
|
|
"unit": percent_unit,
|
|
|
|
|
}, "average_socket_power": {
|
|
|
|
|
"value": validateIfMaxUint(gpu_metrics.average_socket_power, UIntegerTypes.UINT16_T),
|
|
|
|
|
"unit": power_unit,
|
|
|
|
|
}, "energy_accumulator": {
|
|
|
|
|
"value": validateIfMaxUint(gpu_metrics.energy_accumulator, UIntegerTypes.UINT64_T),
|
|
|
|
|
"unit": energy_unit,
|
|
|
|
|
}, "system_clock_counter": {
|
|
|
|
|
"value": validateIfMaxUint(gpu_metrics.system_clock_counter, UIntegerTypes.UINT64_T),
|
|
|
|
|
"unit": time_unit,
|
|
|
|
|
}, "average_gfxclk_frequency": {
|
|
|
|
|
"value": validateIfMaxUint(gpu_metrics.average_gfxclk_frequency, UIntegerTypes.UINT16_T),
|
|
|
|
|
"unit": clock_unit,
|
|
|
|
|
}, "average_socclk_frequency": {
|
|
|
|
|
"value": validateIfMaxUint(gpu_metrics.average_socclk_frequency, UIntegerTypes.UINT16_T),
|
|
|
|
|
"unit": clock_unit,
|
|
|
|
|
}, "average_uclk_frequency": {
|
|
|
|
|
"value": validateIfMaxUint(gpu_metrics.average_uclk_frequency, UIntegerTypes.UINT16_T),
|
|
|
|
|
"unit": clock_unit,
|
|
|
|
|
}, "average_vclk0_frequency": {
|
|
|
|
|
"value": validateIfMaxUint(gpu_metrics.average_vclk0_frequency, UIntegerTypes.UINT16_T),
|
|
|
|
|
"unit": clock_unit,
|
|
|
|
|
}, "average_dclk0_frequency": {
|
|
|
|
|
"value": validateIfMaxUint(gpu_metrics.average_dclk0_frequency, UIntegerTypes.UINT16_T),
|
|
|
|
|
"unit": clock_unit,
|
|
|
|
|
}, "average_vclk1_frequency": {
|
|
|
|
|
"value": validateIfMaxUint(gpu_metrics.average_vclk1_frequency, UIntegerTypes.UINT16_T),
|
|
|
|
|
"unit": clock_unit,
|
|
|
|
|
}, "average_dclk1_frequency": {
|
|
|
|
|
"value": validateIfMaxUint(gpu_metrics.average_dclk1_frequency, UIntegerTypes.UINT16_T),
|
|
|
|
|
"unit": clock_unit,
|
|
|
|
|
}, "current_gfxclk": {
|
|
|
|
|
"value": validateIfMaxUint(gpu_metrics.current_gfxclk, UIntegerTypes.UINT16_T),
|
|
|
|
|
"unit": clock_unit,
|
|
|
|
|
}, "current_socclk": {
|
|
|
|
|
"value": validateIfMaxUint(gpu_metrics.current_socclk, UIntegerTypes.UINT16_T),
|
|
|
|
|
"unit": clock_unit,
|
|
|
|
|
}, "current_uclk": {
|
|
|
|
|
"value": validateIfMaxUint(gpu_metrics.current_uclk, UIntegerTypes.UINT16_T),
|
|
|
|
|
"unit": clock_unit,
|
|
|
|
|
}, "current_vclk0": {
|
|
|
|
|
"value": validateIfMaxUint(gpu_metrics.current_vclk0, UIntegerTypes.UINT16_T),
|
|
|
|
|
"unit": clock_unit,
|
|
|
|
|
}, "current_dclk0": {
|
|
|
|
|
"value": validateIfMaxUint(gpu_metrics.current_dclk0, UIntegerTypes.UINT16_T),
|
|
|
|
|
"unit": clock_unit,
|
|
|
|
|
}, "current_vclk1": {
|
|
|
|
|
"value": validateIfMaxUint(gpu_metrics.current_vclk1, UIntegerTypes.UINT16_T),
|
|
|
|
|
"unit": clock_unit,
|
|
|
|
|
}, "current_dclk1": {
|
|
|
|
|
"value": validateIfMaxUint(gpu_metrics.current_dclk1, UIntegerTypes.UINT16_T),
|
|
|
|
|
"unit": clock_unit,
|
|
|
|
|
}, "throttle_status": {
|
|
|
|
|
"value": validateIfMaxUint(gpu_metrics.throttle_status, UIntegerTypes.UINT32_T),
|
|
|
|
|
"unit": no_unit,
|
|
|
|
|
}, "current_fan_speed": {
|
|
|
|
|
"value": validateIfMaxUint(gpu_metrics.current_fan_speed, UIntegerTypes.UINT16_T),
|
|
|
|
|
"unit": fan_speed,
|
|
|
|
|
}, "pcie_link_width": {
|
|
|
|
|
"value": validateIfMaxUint(gpu_metrics.pcie_link_width, UIntegerTypes.UINT16_T),
|
|
|
|
|
"unit": pcie_lanes_unit,
|
|
|
|
|
}, "pcie_link_speed": {
|
|
|
|
|
"value": validateIfMaxUint(gpu_metrics.pcie_link_speed, UIntegerTypes.UINT16_T),
|
|
|
|
|
"unit": pcie_speed_unit,
|
|
|
|
|
}, "gfx_activity_acc": {
|
|
|
|
|
"value": validateIfMaxUint(gpu_metrics.gfx_activity_acc, UIntegerTypes.UINT32_T),
|
|
|
|
|
"unit": percent_unit,
|
|
|
|
|
}, "mem_activity_acc": {
|
|
|
|
|
"value": validateIfMaxUint(gpu_metrics.mem_activity_acc, UIntegerTypes.UINT32_T),
|
|
|
|
|
"unit": percent_unit,
|
|
|
|
|
}, "temperature_hbm": {
|
|
|
|
|
"value": validateIfMaxUint(list(gpu_metrics.temperature_hbm), UIntegerTypes.UINT16_T),
|
|
|
|
|
"unit": temp_unit,
|
|
|
|
|
}, "firmware_timestamp": {
|
|
|
|
|
"value": validateIfMaxUint(gpu_metrics.firmware_timestamp, UIntegerTypes.UINT64_T),
|
|
|
|
|
"unit": time_unit_10,
|
|
|
|
|
}, "voltage_soc": {
|
|
|
|
|
"value": validateIfMaxUint(gpu_metrics.voltage_soc, UIntegerTypes.UINT16_T),
|
|
|
|
|
"unit": volt_unit,
|
|
|
|
|
}, "voltage_gfx": {
|
|
|
|
|
"value": validateIfMaxUint(gpu_metrics.voltage_gfx, UIntegerTypes.UINT16_T),
|
|
|
|
|
"unit": volt_unit,
|
|
|
|
|
}, "voltage_mem": {
|
|
|
|
|
"value": validateIfMaxUint(gpu_metrics.voltage_mem, UIntegerTypes.UINT16_T),
|
|
|
|
|
"unit": volt_unit,
|
|
|
|
|
}, "indep_throttle_status": {
|
|
|
|
|
"value": validateIfMaxUint(gpu_metrics.indep_throttle_status, UIntegerTypes.UINT64_T),
|
|
|
|
|
"unit": no_unit,
|
|
|
|
|
}, "current_socket_power": {
|
|
|
|
|
"value": validateIfMaxUint(gpu_metrics.current_socket_power, UIntegerTypes.UINT16_T),
|
|
|
|
|
"unit": power_unit,
|
|
|
|
|
}, "vcn_activity": {
|
|
|
|
|
"value": validateIfMaxUint(list(gpu_metrics.vcn_activity), UIntegerTypes.UINT16_T),
|
|
|
|
|
"unit": percent_unit,
|
|
|
|
|
}, "gfxclk_lock_status": {
|
|
|
|
|
"value": validateIfMaxUint(gpu_metrics.gfxclk_lock_status, UIntegerTypes.UINT32_T),
|
|
|
|
|
"unit": no_unit,
|
|
|
|
|
}, "xgmi_link_width": {
|
|
|
|
|
"value": validateIfMaxUint(gpu_metrics.xgmi_link_width, UIntegerTypes.UINT16_T),
|
|
|
|
|
"unit": no_unit,
|
|
|
|
|
}, "xgmi_link_speed": {
|
|
|
|
|
"value": validateIfMaxUint(gpu_metrics.xgmi_link_speed, UIntegerTypes.UINT16_T),
|
|
|
|
|
"unit": xgmi_speed,
|
|
|
|
|
}, "pcie_bandwidth_acc": {
|
|
|
|
|
"value": validateIfMaxUint(gpu_metrics.pcie_bandwidth_acc, UIntegerTypes.UINT64_T),
|
|
|
|
|
"unit": pcie_acc_unit,
|
|
|
|
|
}, "pcie_bandwidth_inst": {
|
|
|
|
|
"value": validateIfMaxUint(gpu_metrics.pcie_bandwidth_inst, UIntegerTypes.UINT64_T),
|
|
|
|
|
"unit": pcie_acc_unit,
|
|
|
|
|
}, "pcie_l0_to_recov_count_acc": {
|
|
|
|
|
"value": validateIfMaxUint(gpu_metrics.pcie_l0_to_recov_count_acc, UIntegerTypes.UINT64_T),
|
|
|
|
|
"unit": count,
|
|
|
|
|
}, "pcie_replay_count_acc": {
|
|
|
|
|
"value": validateIfMaxUint(gpu_metrics.pcie_replay_count_acc, UIntegerTypes.UINT64_T),
|
|
|
|
|
"unit": count,
|
|
|
|
|
}, "pcie_replay_rover_count_acc": {
|
|
|
|
|
"value": validateIfMaxUint(gpu_metrics.pcie_replay_rover_count_acc, UIntegerTypes.UINT64_T),
|
|
|
|
|
"unit": count,
|
|
|
|
|
}, "xgmi_read_data_acc": {
|
|
|
|
|
"value": validateIfMaxUint(list(gpu_metrics.xgmi_read_data_acc), UIntegerTypes.UINT64_T),
|
|
|
|
|
"unit": xgmi_data_sz,
|
|
|
|
|
}, "xgmi_write_data_acc": {
|
|
|
|
|
"value": validateIfMaxUint(list(gpu_metrics.xgmi_write_data_acc), UIntegerTypes.UINT64_T),
|
|
|
|
|
"unit": xgmi_data_sz,
|
|
|
|
|
}, "current_gfxclks": {
|
|
|
|
|
"value": validateIfMaxUint(list(gpu_metrics.current_gfxclks), UIntegerTypes.UINT16_T),
|
|
|
|
|
"unit": clock_unit,
|
|
|
|
|
}, "current_socclks": {
|
|
|
|
|
"value": validateIfMaxUint(list(gpu_metrics.current_socclks), UIntegerTypes.UINT16_T),
|
|
|
|
|
"unit": clock_unit,
|
|
|
|
|
}, "current_vclk0s": {
|
|
|
|
|
"value": validateIfMaxUint(list(gpu_metrics.current_vclk0s), UIntegerTypes.UINT16_T),
|
|
|
|
|
"unit": clock_unit,
|
|
|
|
|
}, "current_dclk0s": {
|
|
|
|
|
"value": validateIfMaxUint(list(gpu_metrics.current_dclk0s), UIntegerTypes.UINT16_T),
|
|
|
|
|
"unit": clock_unit,
|
|
|
|
|
}, "jpeg_activity": {
|
|
|
|
|
"value": validateIfMaxUint(list(gpu_metrics.jpeg_activity), UIntegerTypes.UINT16_T),
|
|
|
|
|
"unit": percent_unit,
|
|
|
|
|
}, "pcie_nak_sent_count_acc": {
|
|
|
|
|
"value": validateIfMaxUint(gpu_metrics.pcie_nak_sent_count_acc, UIntegerTypes.UINT32_T),
|
|
|
|
|
"unit": count,
|
|
|
|
|
}, "pcie_nak_rcvd_count_acc": {
|
|
|
|
|
"value": validateIfMaxUint(gpu_metrics.pcie_nak_rcvd_count_acc, UIntegerTypes.UINT32_T),
|
|
|
|
|
"unit": count,
|
|
|
|
|
}, "accumulation_counter": {
|
|
|
|
|
"value": validateIfMaxUint(gpu_metrics.accumulation_counter, UIntegerTypes.UINT64_T),
|
|
|
|
|
"unit": count,
|
|
|
|
|
}, "prochot_residency_acc": {
|
|
|
|
|
"value": validateIfMaxUint(gpu_metrics.prochot_residency_acc, UIntegerTypes.UINT64_T),
|
|
|
|
|
"unit": count,
|
|
|
|
|
}, "ppt_residency_acc": {
|
|
|
|
|
"value": validateIfMaxUint(gpu_metrics.ppt_residency_acc, UIntegerTypes.UINT64_T),
|
|
|
|
|
"unit": count,
|
|
|
|
|
}, "socket_thm_residency_acc": {
|
|
|
|
|
"value": validateIfMaxUint(gpu_metrics.socket_thm_residency_acc, UIntegerTypes.UINT64_T),
|
|
|
|
|
"unit": count,
|
|
|
|
|
}, "vr_thm_residency_acc": {
|
|
|
|
|
"value": validateIfMaxUint(gpu_metrics.vr_thm_residency_acc, UIntegerTypes.UINT64_T),
|
|
|
|
|
"unit": count,
|
|
|
|
|
}, "hbm_thm_residency_acc": {
|
|
|
|
|
"value": validateIfMaxUint(gpu_metrics.hbm_thm_residency_acc, UIntegerTypes.UINT64_T),
|
|
|
|
|
"unit": count,
|
|
|
|
|
},
|
|
|
|
|
"pcie_lc_perf_other_end_recovery": {
|
|
|
|
|
"value": validateIfMaxUint(gpu_metrics.pcie_lc_perf_other_end_recovery, UIntegerTypes.UINT32_T),
|
|
|
|
|
"unit": count,
|
|
|
|
|
},
|
2024-12-12 17:56:09 -06:00
|
|
|
"vram_max_bandwidth": {
|
|
|
|
|
"value": validateIfMaxUint(gpu_metrics.vram_max_bandwidth, UIntegerTypes.UINT64_T),
|
|
|
|
|
"unit": vram_max_bw,
|
|
|
|
|
},
|
|
|
|
|
"xgmi_link_status": {
|
|
|
|
|
"value": validateIfMaxUint(list(gpu_metrics.xgmi_link_status), UIntegerTypes.UINT16_T),
|
|
|
|
|
"unit": link_status,
|
|
|
|
|
},
|
2024-06-21 15:13:15 -05:00
|
|
|
"num_partition": {
|
|
|
|
|
"value": validateIfMaxUint(gpu_metrics.num_partition, UIntegerTypes.UINT16_T),
|
|
|
|
|
"unit": no_unit,
|
|
|
|
|
},
|
|
|
|
|
"xcp_stats.gfx_busy_inst": {
|
|
|
|
|
"value": gpu_metrics.xcp_stats,
|
|
|
|
|
"unit": percent_unit,
|
|
|
|
|
},
|
|
|
|
|
"xcp_stats.jpeg_busy": {
|
|
|
|
|
"value": gpu_metrics.xcp_stats,
|
|
|
|
|
"unit": percent_unit,
|
|
|
|
|
},
|
|
|
|
|
"xcp_stats.vcn_busy": {
|
|
|
|
|
"value": gpu_metrics.xcp_stats,
|
|
|
|
|
"unit": percent_unit,
|
|
|
|
|
},
|
|
|
|
|
"xcp_stats.gfx_busy_acc": {
|
|
|
|
|
"value": gpu_metrics.xcp_stats,
|
2025-03-20 18:07:32 -05:00
|
|
|
"unit": count,
|
2024-06-21 15:13:15 -05:00
|
|
|
},
|
2024-12-12 17:56:09 -06:00
|
|
|
"xcp_stats.gfx_below_host_limit_acc": {
|
2025-03-20 18:07:32 -05:00
|
|
|
"value": gpu_metrics.xcp_stats,
|
|
|
|
|
"unit": count,
|
|
|
|
|
},
|
|
|
|
|
"xcp_stats.gfx_below_host_limit_ppt_acc": {
|
|
|
|
|
"value": gpu_metrics.xcp_stats,
|
|
|
|
|
"unit": count,
|
|
|
|
|
},
|
|
|
|
|
"xcp_stats.gfx_below_host_limit_thm_acc": {
|
|
|
|
|
"value": gpu_metrics.xcp_stats,
|
|
|
|
|
"unit": count,
|
|
|
|
|
},
|
|
|
|
|
"xcp_stats.gfx_low_utilization_acc": {
|
|
|
|
|
"value": gpu_metrics.xcp_stats,
|
|
|
|
|
"unit": count,
|
|
|
|
|
},
|
|
|
|
|
"xcp_stats.gfx_below_host_limit_total_acc": {
|
|
|
|
|
"value": gpu_metrics.xcp_stats,
|
|
|
|
|
"unit": count,
|
|
|
|
|
},
|
|
|
|
|
"xcp_stats.gfx_below_host_limit_ppt_acc": {
|
|
|
|
|
"value": gpu_metrics.xcp_stats,
|
|
|
|
|
"unit": percent_unit,
|
|
|
|
|
},
|
|
|
|
|
"xcp_stats.gfx_below_host_limit_thm_acc": {
|
|
|
|
|
"value": gpu_metrics.xcp_stats,
|
|
|
|
|
"unit": percent_unit,
|
|
|
|
|
},
|
|
|
|
|
"xcp_stats.gfx_low_utilization_acc": {
|
|
|
|
|
"value": gpu_metrics.xcp_stats,
|
|
|
|
|
"unit": percent_unit,
|
|
|
|
|
},
|
|
|
|
|
"xcp_stats.gfx_below_host_limit_total_acc": {
|
2024-12-12 17:56:09 -06:00
|
|
|
"value": gpu_metrics.xcp_stats,
|
|
|
|
|
"unit": percent_unit,
|
|
|
|
|
},
|
2024-06-21 15:13:15 -05:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
printLog(device, 'Metric Version and Size (Bytes)',
|
|
|
|
|
str(metrics["common_header"]["version"]) + " " + str(metrics["common_header"]["size"]))
|
|
|
|
|
for k,v in metrics.items():
|
|
|
|
|
if k != "common_header" and 'xcp_stats' not in k:
|
|
|
|
|
if v["unit"] != None:
|
|
|
|
|
printLog(device, k + " (" + str(v["unit"]) + ")", str(v["value"]))
|
|
|
|
|
elif v["unit"] == None:
|
|
|
|
|
printLog(device, k, str(v["value"]))
|
|
|
|
|
if 'xcp_stats.gfx_busy_inst' in k:
|
|
|
|
|
for curr_xcp, item in enumerate(v['value']):
|
|
|
|
|
print_xcp_detail = []
|
|
|
|
|
for _, val in enumerate(item.gfx_busy_inst):
|
|
|
|
|
print_xcp_detail.append(validateIfMaxUint(val, UIntegerTypes.UINT32_T))
|
|
|
|
|
printLog(device, k + " (" + str(v["unit"]) + ")", str(print_xcp_detail), xcp=str(curr_xcp))
|
|
|
|
|
if 'xcp_stats.jpeg_busy' in k:
|
|
|
|
|
for curr_xcp, item in enumerate(v['value']):
|
|
|
|
|
print_xcp_detail = []
|
|
|
|
|
for _, val in enumerate(item.jpeg_busy):
|
|
|
|
|
print_xcp_detail.append(validateIfMaxUint(val, UIntegerTypes.UINT16_T))
|
|
|
|
|
printLog(device, k + " (" + str(v["unit"]) + ")", str(print_xcp_detail), xcp=str(curr_xcp))
|
|
|
|
|
if 'xcp_stats.vcn_busy' in k:
|
|
|
|
|
for curr_xcp, item in enumerate(v['value']):
|
|
|
|
|
print_xcp_detail = []
|
|
|
|
|
for _, val in enumerate(item.vcn_busy):
|
|
|
|
|
print_xcp_detail.append(validateIfMaxUint(val, UIntegerTypes.UINT16_T))
|
|
|
|
|
printLog(device, k + " (" + str(v["unit"]) + ")", str(print_xcp_detail), xcp=str(curr_xcp))
|
|
|
|
|
if 'xcp_stats.gfx_busy_acc' in k:
|
|
|
|
|
for curr_xcp, item in enumerate(v['value']):
|
|
|
|
|
print_xcp_detail = []
|
|
|
|
|
for _, val in enumerate(item.gfx_busy_acc):
|
|
|
|
|
print_xcp_detail.append(validateIfMaxUint(val, UIntegerTypes.UINT64_T))
|
|
|
|
|
printLog(device, k + " (" + str(v["unit"]) + ")", str(print_xcp_detail), xcp=str(curr_xcp))
|
2024-12-12 17:56:09 -06:00
|
|
|
if 'xcp_stats.gfx_below_host_limit_acc' in k:
|
|
|
|
|
for curr_xcp, item in enumerate(v['value']):
|
|
|
|
|
print_xcp_detail = []
|
|
|
|
|
for _, val in enumerate(item.gfx_below_host_limit_acc):
|
|
|
|
|
print_xcp_detail.append(validateIfMaxUint(val, UIntegerTypes.UINT64_T))
|
|
|
|
|
printLog(device, k + " (" + str(v["unit"]) + ")", str(print_xcp_detail), xcp=str(curr_xcp))
|
2025-03-20 18:07:32 -05:00
|
|
|
if 'xcp_stats.gfx_below_host_limit_ppt_acc' in k:
|
|
|
|
|
for curr_xcp, item in enumerate(v['value']):
|
|
|
|
|
print_xcp_detail = []
|
|
|
|
|
for _, val in enumerate(item.gfx_below_host_limit_ppt_acc):
|
|
|
|
|
print_xcp_detail.append(validateIfMaxUint(val, UIntegerTypes.UINT64_T))
|
|
|
|
|
printLog(device, k + " (" + str(v["unit"]) + ")", str(print_xcp_detail), xcp=str(curr_xcp))
|
|
|
|
|
if 'xcp_stats.gfx_below_host_limit_thm_acc' in k:
|
|
|
|
|
for curr_xcp, item in enumerate(v['value']):
|
|
|
|
|
print_xcp_detail = []
|
|
|
|
|
for _, val in enumerate(item.gfx_below_host_limit_thm_acc):
|
|
|
|
|
print_xcp_detail.append(validateIfMaxUint(val, UIntegerTypes.UINT64_T))
|
|
|
|
|
printLog(device, k + " (" + str(v["unit"]) + ")", str(print_xcp_detail), xcp=str(curr_xcp))
|
|
|
|
|
if 'xcp_stats.gfx_low_utilization_acc' in k:
|
|
|
|
|
for curr_xcp, item in enumerate(v['value']):
|
|
|
|
|
print_xcp_detail = []
|
|
|
|
|
for _, val in enumerate(item.gfx_low_utilization_acc):
|
|
|
|
|
print_xcp_detail.append(validateIfMaxUint(val, UIntegerTypes.UINT64_T))
|
|
|
|
|
printLog(device, k + " (" + str(v["unit"]) + ")", str(print_xcp_detail), xcp=str(curr_xcp))
|
|
|
|
|
if 'xcp_stats.gfx_below_host_limit_total_acc' in k:
|
|
|
|
|
for curr_xcp, item in enumerate(v['value']):
|
|
|
|
|
print_xcp_detail = []
|
|
|
|
|
for _, val in enumerate(item.gfx_below_host_limit_total_acc):
|
|
|
|
|
print_xcp_detail.append(validateIfMaxUint(val, UIntegerTypes.UINT64_T))
|
|
|
|
|
printLog(device, k + " (" + str(v["unit"]) + ")", str(print_xcp_detail), xcp=str(curr_xcp))
|
2024-06-21 15:13:15 -05:00
|
|
|
|
|
|
|
|
if int(device) < (len(deviceList) - 1):
|
|
|
|
|
printLogSpacer()
|
|
|
|
|
elif ret == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED:
|
|
|
|
|
printLog(device, 'Not supported on the given system', None)
|
|
|
|
|
else:
|
2025-09-16 10:56:03 -05:00
|
|
|
printLog(device, 'Failed to retrieve GPU metrics, metric version may not be supported for this device.')
|
2024-06-21 15:13:15 -05:00
|
|
|
printLogSpacer()
|
2023-01-30 15:58:03 -06:00
|
|
|
|
2020-07-15 06:01:40 -04:00
|
|
|
def checkAmdGpus(deviceList):
    """ Report whether at least one of the queried devices is an AMD GPU

    :param deviceList: List of DRM devices (can be a single-item list)
    :return: True as soon as isAmdDevice accepts one of the devices,
        False when none is accepted (including an empty list)
    """
    return any(isAmdDevice(gpu) for gpu in deviceList)
|
|
|
|
|
|
|
|
|
|
|
2025-05-15 14:30:33 -05:00
|
|
|
def check_runtime_status(base_path: str = "/sys/class/drm") -> bool:
    """Check the runtime power-management status of the devices under *base_path*.

    Every sub-directory of *base_path* is inspected for a
    "power/runtime_status" file. A device whose file contains anything other
    than "active" makes the overall result False, and a later active device
    does not reset it. Entries whose status file is missing or unreadable are
    skipped, because not every device supports runtime power management; this
    is also why the result defaults to True.

    :param base_path: Directory holding one sub-directory per device
        (defaults to the DRM class directory in sysfs)
    :return: False if any readable device is not in the "active" state,
        True otherwise (including when *base_path* cannot be listed)
    """
    try:
        entries = os.listdir(base_path)
    except OSError as e:
        # No listing available (e.g. the directory does not exist): nothing
        # proves a device inactive, so keep the optimistic default.
        logging.debug(f"Unable to list {base_path} \nError: {e}")
        return True

    status = True  # Default to True, assuming active unless proven otherwise
    for device in entries:
        if not os.path.isdir(os.path.join(base_path, device)):
            continue
        runtime_status_path = os.path.join(base_path, device, "power", "runtime_status")
        try:
            with open(runtime_status_path, 'r') as file:
                current_status = file.read().strip()
        except FileNotFoundError:
            # File does not exist, skip this device
            continue
        except PermissionError as e:
            # Handle permission errors gracefully
            logging.debug(f"Permission denied while accessing {runtime_status_path} \nError: {e}")
            continue
        if current_status != "active":
            # Sticky: one inactive device is enough, later devices must not
            # overwrite this result.
            status = False
        else:
            logging.debug(f"Runtime status for {device}: {current_status}")
    return status
|
2025-04-14 13:05:22 -05:00
|
|
|
|
2020-07-15 06:01:40 -04:00
|
|
|
def component_str(component):
    """ Translate a component identifier into its display name

    :param component: Component (currently only driver)
    :return: 'Driver' for component 0, 'UNKNOWN' for anything else
    """
    names = {0: 'Driver'}
    try:
        return names[component]
    except KeyError:
        return 'UNKNOWN'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def confirmOutOfSpecWarning(autoRespond):
    """ Show the out-of-specification warning and require the user to accept it

    The warning is always printed. When no automatic response is supplied the
    user is prompted; any answer other than an accepted spelling of "yes"
    terminates the program.

    :param autoRespond: Response to automatically provide for all prompts
    """
    # NOTE(review): leading whitespace of the banner lines could not be
    # recovered from the reviewed listing - confirm against the intended layout.
    banner = '\n'.join((
        '',
        '******WARNING******\n',
        'Operating your AMD GPU outside of official AMD specifications or outside of',
        'factory settings, including but not limited to the conducting of overclocking,',
        'over-volting or under-volting (including use of this interface software,',
        'even if such software has been directly or indirectly provided by AMD or otherwise',
        'affiliated in any way with AMD), may cause damage to your AMD GPU, system components',
        'and/or result in system failure, as well as cause other problems.',
        'DAMAGES CAUSED BY USE OF YOUR AMD GPU OUTSIDE OF OFFICIAL AMD SPECIFICATIONS OR',
        'OUTSIDE OF FACTORY SETTINGS ARE NOT COVERED UNDER ANY AMD PRODUCT WARRANTY AND',
        "MAY NOT BE COVERED BY YOUR BOARD OR SYSTEM MANUFACTURER'S WARRANTY.",
        'Please use this utility with caution.',
        '',
    ))
    print(banner)
    answer = autoRespond if autoRespond else input('Do you accept these terms? [y/N] ')
    if answer not in ('Yes', 'yes', 'y', 'Y', 'YES'):
        sys.exit('Confirmation not given. Exiting without setting value')
|
|
|
|
|
|
2024-11-06 15:13:32 -06:00
|
|
|
def confirmChangingMemoryPartitionAndReloadingAMDGPU(autoRespond):
    """ Show the memory (NPS) partition change warning and require the user to accept it

    The warning is always printed. When no automatic response is supplied the
    user is prompted; any answer other than an accepted spelling of "yes"
    prints a notice and exits with status 1.

    :param autoRespond: Response to automatically provide for all prompts
    """
    # NOTE(review): leading whitespace of the banner lines could not be
    # recovered from the reviewed listing - confirm against the intended layout.
    banner = '\n'.join((
        '',
        '******WARNING******\n',
        'Setting Dynamic Memory (NPS) partition modes require users to quit all GPU workloads.',
        'ROCm SMI will then attempt to change memory (NPS) partition mode.',
        'Upon a successful set, ROCm SMI will then initiate an action to restart AMD GPU driver.',
        "This action will change all GPU's in the hive to the requested memory (NPS) partition mode.",
        '',
        'Please use this utility with caution.',
        '',
    ))
    print(banner)
    answer = autoRespond if autoRespond else input('Do you accept these terms? [Y/N] ')
    if answer not in ('Yes', 'yes', 'y', 'Y', 'YES'):
        print('Confirmation not given. Exiting without setting value')
        printLogSpacer()
        sys.exit(1)
    print('')
|
|
|
|
|
|
2020-07-15 06:01:40 -04:00
|
|
|
|
|
|
|
|
def doesDeviceExist(device):
    """ Tell whether the specified device is present on the system

    A device counts as existing when it is among the devices returned by
    listDevices or when its debugfs DRI directory exists.

    :param device: DRM device identifier
    """
    knownDevices = listDevices()
    debugDir = '/sys/kernel/debug/dri/%d/' % (int(device))
    return device in knownDevices or os.path.exists(debugDir)
|
|
|
|
|
|
|
|
|
|
|
2020-09-23 16:33:01 -04:00
|
|
|
def initializeRsmi():
    """ Initialize the rsmi bindings and, if the amdgpu driver is loaded, rocm_smi_lib

    The bindings handle is stored in the module-level ``rocmsmi`` variable.
    The process exits with the library's return code when ``rsmi_init`` fails,
    and exits with status 0 after logging an error when the driver is not
    initialized.
    """
    global rocmsmi
    # Initialize rsmiBindings
    rocmsmi = initRsmiBindings(silent=PRINT_JSON)
    # Check if amdgpu is initialized before initializing rsmi
    if driverInitialized() is not True:
        logging.error('Driver not initialized (amdgpu not found in modules)')
        # sys.exit rather than the site-provided exit(): the latter is meant
        # for interactive use and is absent when the site module is not loaded
        sys.exit(0)
    ret_init = rocmsmi.rsmi_init(0)
    if ret_init != 0:
        logging.error('ROCm SMI returned %s (the expected value is 0)', ret_init)
        sys.exit(ret_init)
|
|
|
|
|
|
|
|
|
|
|
2020-07-15 06:01:40 -04:00
|
|
|
def isAmdDevice(device):
    """ Tell whether the specified device is an AMD device

    :param device: DRM device identifier
    :return: True when the vendor ID query succeeds and reports 0x1002,
        False otherwise
    """
    amdVendorId = 0x1002
    vendor = c_uint16()
    # Retrieve card vendor
    status = rocmsmi.rsmi_dev_vendor_id_get(device, byref(vendor))
    if status != rsmi_status_t.RSMI_STATUS_SUCCESS:
        return False
    # Only AMD's vendor ID qualifies
    return vendor.value == amdVendorId
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def listDevices():
    """ Returns a list of GPU devices

    The list holds the indices 0..N-1, where N is the device count reported by
    ``rsmi_num_monitor_devices``. The process exits with the library's return
    code when that query fails.
    """
    numberOfDevices = c_uint32(0)
    ret = rocmsmi.rsmi_num_monitor_devices(byref(numberOfDevices))
    if not rsmi_ret_ok(ret, metric='get_num_monitor_devices'):
        # sys.exit rather than the site-provided exit(): the latter is meant
        # for interactive use and is absent when the site module is not loaded
        sys.exit(ret)
    return list(range(numberOfDevices.value))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def load(savefilepath, autoRespond):
    """ Apply the clock frequencies and fan speeds stored in a settings file.

    Exits when the file does not exist. Processing stops at the first entry
    whose JSON version differs from CLOCK_JSON_VERSION.

    :param savefilepath: Path to the save file
    :param autoRespond: Response to automatically provide for all prompts
    """
    printLogSpacer(' Load Settings ')
    if not os.path.isfile(savefilepath):
        printLog(None, 'No settings file found at %s' % (savefilepath), None)
        printLogSpacer()
        sys.exit()

    with open(savefilepath, 'r') as settingsFile:
        savedSettings = json.load(settingsFile)

    for cardName, values in savedSettings.items():
        savedVersion = values['vJson']
        if savedVersion != CLOCK_JSON_VERSION:
            printLog(None, 'Unable to load legacy clock file - file v%s != current v%s' %
                     (str(savedVersion), str(CLOCK_JSON_VERSION)), None)
            break
        # Keys look like 'card<N>'; drop the 4-character prefix to get the index
        device = int(cardName[4:])
        target = [device]
        if values['fan']:
            setFanSpeed(target, values['fan'])
        for clockName, key in (('sclk', 'overdrivesclk'), ('mclk', 'overdrivemclk')):
            if values[key]:
                setClockOverDrive(target, clockName, values[key], autoRespond)
        savedClocks = values['clocks']
        for clk in validClockNames:
            if clk in savedClocks:
                setClocks(target, clk, values['clocks'][clk])
        if values['profile']:
            setProfile(target, values['profile'])
        # Set Perf level last, since setting OverDrive sets the Performance level
        # to manual, and Profiles only work when the Performance level is auto
        if values['perflevel'] != -1:
            setPerformanceLevel(target, values['perflevel'])
        printLog(device, 'Successfully loaded values from ' + savefilepath, None)
    printLogSpacer()
|
|
|
|
|
|
|
|
|
|
|
2021-06-22 06:35:24 -04:00
|
|
|
def padHexValue(value, length):
    """ Zero-pad the digits of a '0x'-prefixed hexadecimal string

    The value is returned unchanged unless it is longer than two characters,
    length is greater than 1, it starts with '0x' (any case) and every
    remaining character is a hexadecimal digit.

    :param value: A hexadecimal value to be padded with zeros
    :param length: Minimum number of digits after the '0x' prefix
    """
    hexDigits = '0123456789abcdefABCDEF'
    if not len(value) > 2:
        return value
    if not length > 1:
        return value
    prefix, digits = value[:2], value[2:]
    if prefix.lower() != '0x':
        return value
    for character in digits:
        if character not in hexDigits:
            return value
    # Pad with zeros after '0x' prefix
    return '0x' + digits.zfill(length)
|
|
|
|
|
|
|
|
|
|
|
2020-07-15 06:01:40 -04:00
|
|
|
def profileString(profile):
    """ Convert between a power profile's numeric mask and its name

    :param profile: Either a profile mask (int or numeric string) or a
        profile name
    :return: The profile name for a known mask, the integer mask for a known
        name, or 'UNKNOWN' when the input matches neither
    """
    # TODO: We should dynamically generate this to avoid hardcoding
    dictionary = {1: 'CUSTOM', 2: 'VIDEO', 4: 'POWER SAVING', 8: 'COMPUTE', 16: 'VR', 32: '3D FULL SCREEN',
                  64: 'BOOTUP DEFAULT'}
    text = str(profile)
    if text.isnumeric():
        try:
            mask = int(profile)
        except ValueError:
            # isnumeric() accepts characters (e.g. superscripts, fractions)
            # that int() cannot parse; those are not valid masks
            return 'UNKNOWN'
        return dictionary.get(mask, 'UNKNOWN')
    for mask, name in dictionary.items():
        if name == text:
            return mask
    return 'UNKNOWN'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def relaunchAsSudo():
    """ Re-execute the SMI through sudo when not already running as root

    rocm_smi_lib functions that write to sysfs need root access, so the
    current process is replaced (execvp) by 'sudo' followed by the original
    command line. Nothing happens when the effective user is already root.
    """
    if os.geteuid() == 0:
        return
    sudoCommand = ['sudo'] + sys.argv
    os.execvp('sudo', sudoCommand)
    # NOTE: to run sudo with the user's environment variables instead, use
    # ['sudo', '-E'] + sys.argv
|
2020-07-15 06:01:40 -04:00
|
|
|
|
|
|
|
|
|
|
|
|
|
def rsmi_ret_ok(my_ret, device=None, metric=None, silent=False):
    """ Returns True if the RSMI call status is RSMI_STATUS_SUCCESS

    For any other status the library's error string is looked up, the global
    RETCODE is set to the status and False is returned. Unless JSON output is
    active or silent is set, the error is written to the debug log and, for
    statuses listed in rsmi_status_verbose_err_out, printed via printLog.

    :param device: DRM device identifier
    :param my_ret: Return of RSMI call (rocm_smi_lib API)
    :param metric: Parameter of GPU currently being analyzed
    :param silent: Echo verbose error response.
        True silences err output, False does not silence err output (default).
    """
    global RETCODE, PRINT_JSON
    if my_ret == rsmi_status_t.RSMI_STATUS_SUCCESS:
        return True

    err_str = c_char_p()
    rocmsmi.rsmi_status_string(my_ret, byref(err_str))
    # When debugging, str(err_str) and err_str.value.decode() are the values
    # worth printing here

    pieces = []
    if device is not None:
        pieces.append('%s GPU[%s]:' % (my_ret, device))
    if metric is None:
        # printLog below concatenates metric, so it must be a string
        metric = ''
    else:
        pieces.append(' %s: ' % (metric))
    if err_str.value is not None:
        pieces.append('%s\t' % (err_str.value.decode()))
    returnString = ''.join(pieces)

    if not PRINT_JSON and not silent:
        logging.debug('%s', returnString)
        if my_ret in rsmi_status_verbose_err_out:
            printLog(device, metric + ", " + rsmi_status_verbose_err_out[my_ret], None)
    RETCODE = my_ret
    return False
|
|
|
|
|
|
|
|
|
|
def save(deviceList, savefilepath):
    """ Save clock frequencies and fan speeds for a list of devices to a specified file path.

    Exits without writing when the target file already exists. For each
    device the performance level, current clock of every clock type, fan
    speed, overdrive level and current power profile are collected and stored
    as JSON under the key 'card<N>'.

    :param deviceList: List of DRM devices (can be a single-item list)
    :param savefilepath: Path to use to create the save file
    """
    jsonData = {}
    printLogSpacer(' Save Settings ')
    if os.path.isfile(savefilepath):
        printLog(None, '%s already exists. Settings not saved' % (savefilepath), None)
        printLogSpacer()
        sys.exit()
    for device in deviceList:
        # Query once: the value is both tested and stored
        perfLevel = getPerfLevel(device)
        perfLevelName = str(perfLevel).lower() if perfLevel != -1 else 'Unsupported'

        clocks = {}
        freq = rsmi_frequencies_t()
        for clk_type in sorted(rsmi_clk_names_dict):
            ret = rocmsmi.rsmi_dev_gpu_clk_freq_get(device, rsmi_clk_names_dict[clk_type], byref(freq))
            if rsmi_ret_ok(ret, device, 'get_gpu_clk_freq_' + str(clk_type), True):
                clocks[clk_type] = str(freq.current)
            else:
                clocks[clk_type] = '0'

        fanSpeed = getFanSpeed(device)[1]

        od = c_uint32()
        ret = rocmsmi.rsmi_dev_overdrive_level_get(device, byref(od))
        overDriveGpu = str(od.value) if rsmi_ret_ok(ret, device, 'get_overdrive_level') else '0'
        # GPU memory Overdrive is legacy
        overDriveGpuMem = '0'

        status = rsmi_power_profile_status_t()
        ret = rocmsmi.rsmi_dev_power_profile_presets_get(device, 0, byref(status))
        current = status.current
        if rsmi_ret_ok(ret, device, 'get_profile_presets') and current:
            # 1-based position of the lowest set bit of the current-profile mask
            profile = str((current & -current).bit_length())
        else:
            # Query failed or no profile bit is set
            profile = 'UNKNOWN'

        jsonData['card%d' % (device)] = {'vJson': CLOCK_JSON_VERSION, 'clocks': clocks,
                                         'fan': fanSpeed, 'overdrivesclk': overDriveGpu,
                                         'overdrivemclk': overDriveGpuMem, 'profile': profile,
                                         'perflevel': perfLevelName}
    with open(savefilepath, 'w') as savefile:
        json.dump(jsonData, savefile, ensure_ascii=True)
    # Report success only once the file has actually been written
    printLog(None, 'Current settings successfully saved to', savefilepath)
    printLogSpacer()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# The code below is for when this script is run as an executable instead of when imported as a module
|
2023-09-24 02:29:07 -05:00
|
|
|
def isConciseInfoRequested(args):
    """Decide whether the invocation should produce the concise summary.

    The decision is driven by the raw length of ``sys.argv`` (program name
    included) combined with a few parsed flags:

    * no arguments at all                       -> concise
    * exactly one argument, and it is one of
      --alldevices / --json / --csv             -> concise
    * exactly two arguments, --alldevices plus
      one of --json / --csv                     -> concise
    * anything else                             -> not concise

    :param args: parsed argparse namespace; ``alldevices``, ``json`` and
                 ``csv`` are read from it only when the argument count
                 requires it
    :return: truthy when the concise view is requested, falsy otherwise
    """
    argCount = len(sys.argv)

    # Bare invocation: nothing else needs to be inspected.
    if argCount == 1:
        return True

    # A single extra token only qualifies if it was one of the neutral flags.
    if argCount == 2:
        return (args.alldevices or (args.json or args.csv)) or False

    # Two extra tokens must be --alldevices together with an output format.
    if argCount == 3:
        return args.alldevices and (args.json or args.csv)

    return False
|
2023-09-24 02:29:07 -05:00
|
|
|
|
2020-07-15 06:01:40 -04:00
|
|
|
if __name__ == '__main__':
    # Command-line entry point. The flow is:
    #   1. declare every option and parse the command line
    #   2. initialise the SMI library and the logger
    #   3. re-launch through sudo for options that change device state
    #   4. resolve the list of devices to operate on
    #   5. dispatch each requested show/set/reset action in a fixed order
    #   6. emit JSON/CSV if requested, shut the library down and exit
    parser = argparse.ArgumentParser(
        description='AMD ROCm System Management Interface | ROCM-SMI version: %s' % __version__,
        formatter_class=lambda prog: argparse.HelpFormatter(prog, max_help_position=90, width=120))
    groupVersion = parser.add_argument_group()
    groupDev = parser.add_argument_group()
    groupDisplayOpt = parser.add_argument_group('Display Options')
    groupDisplayTop = parser.add_argument_group('Topology')
    groupDisplayPages = parser.add_argument_group('Pages information')
    groupDisplayHw = parser.add_argument_group('Hardware-related information')
    groupDisplay = parser.add_argument_group('Software-related/controlled information')
    groupAction = parser.add_argument_group('Set options')
    groupActionReset = parser.add_argument_group('Reset options')
    groupActionGpuReset = parser.add_mutually_exclusive_group()
    groupFile = parser.add_mutually_exclusive_group()
    groupResponse = parser.add_argument_group('Auto-response options')
    groupActionOutput = parser.add_argument_group('Output options')

    groupVersion.add_argument('-V', '--version', help='Show version information', action='store_true')
    groupDev.add_argument('-d', '--device', help='Execute command on specified device', type=int, nargs='+')
    groupDisplayOpt.add_argument('--alldevices', action='store_true')  # ------------- function deprecated, no help menu
    groupDisplayOpt.add_argument('--showhw', help='Show Hardware details', action='store_true')
    groupDisplayOpt.add_argument('-a', '--showallinfo', help='Show Temperature, Fan and Clock values',
                                 action='store_true')
    groupDisplayTop.add_argument('-i', '--showid', help='Show DEVICE IDs', action='store_true')
    groupDisplayTop.add_argument('-v', '--showvbios', help='Show VBIOS version', action='store_true')
    groupDisplayTop.add_argument('-e', '--showevents', help='Show event list', metavar='EVENT', type=str, nargs='*')
    groupDisplayTop.add_argument('--showdriverversion', help='Show kernel driver version', action='store_true')
    groupDisplayTop.add_argument('--showtempgraph', help='Show Temperature Graph', action='store_true')
    groupDisplayTop.add_argument('--showfwinfo', help='Show FW information', metavar='BLOCK', type=str, nargs='*')
    groupDisplayTop.add_argument('--showmclkrange', help='Show mclk range', action='store_true')
    groupDisplayTop.add_argument('--showmemvendor', help='Show GPU memory vendor', action='store_true')
    groupDisplayTop.add_argument('--showsclkrange', help='Show sclk range', action='store_true')
    groupDisplayTop.add_argument('--showproductname', help='Show product details', action='store_true')
    groupDisplayTop.add_argument('--showserial', help='Show GPU\'s Serial Number', action='store_true')
    groupDisplayTop.add_argument('--showuniqueid', help='Show GPU\'s Unique ID', action='store_true')
    groupDisplayTop.add_argument('--showvoltagerange', help='Show voltage range', action='store_true')
    groupDisplayTop.add_argument('--showbus', help='Show PCI bus number', action='store_true')
    groupDisplayPages.add_argument('--showpagesinfo', help='Show retired, pending and unreservable pages',
                                   action='store_true')
    groupDisplayPages.add_argument('--showpendingpages', help='Show pending retired pages', action='store_true')
    groupDisplayPages.add_argument('--showretiredpages', help='Show retired pages', action='store_true')
    groupDisplayPages.add_argument('--showunreservablepages', help='Show unreservable pages', action='store_true')
    groupDisplayHw.add_argument('-f', '--showfan', help='Show current fan speed', action='store_true')
    groupDisplayHw.add_argument('-P', '--showpower', help='Show current average or instant socket graphics package power consumption',
                                action='store_true')
    groupDisplayHw.add_argument('-t', '--showtemp', help='Show current temperature', action='store_true')
    groupDisplayHw.add_argument('-u', '--showuse', help='Show current GPU use', action='store_true')
    groupDisplayHw.add_argument('--showmemuse', help='Show current GPU memory used', action='store_true')
    groupDisplayHw.add_argument('--showvoltage', help='Show current GPU voltage', action='store_true')
    groupDisplay.add_argument('-b', '--showbw', help='Show estimated PCIe use', action='store_true')
    groupDisplay.add_argument('-c', '--showclocks', help='Show current clock frequencies', action='store_true')
    groupDisplay.add_argument('-g', '--showgpuclocks', help='Show current GPU clock frequencies', action='store_true')
    groupDisplay.add_argument('-l', '--showprofile', help='Show Compute Profile attributes', action='store_true')
    groupDisplay.add_argument('-M', '--showmaxpower', help='Show maximum graphics package power this GPU will consume',
                              action='store_true')
    groupDisplay.add_argument('-m', '--showmemoverdrive', help='Show current GPU Memory Clock OverDrive level',
                              action='store_true')
    groupDisplay.add_argument('-o', '--showoverdrive', help='Show current GPU Clock OverDrive level',
                              action='store_true')
    groupDisplay.add_argument('-p', '--showperflevel', help='Show current DPM Performance Level', action='store_true')
    groupDisplay.add_argument('-S', '--showclkvolt', help='Show supported GPU and Memory Clocks and Voltages',
                              action='store_true')
    groupDisplay.add_argument('-s', '--showclkfrq', help='Show supported GPU and Memory Clock', action='store_true')
    groupDisplay.add_argument('--showmeminfo', help='Show Memory usage information for given block(s) TYPE',
                              metavar='TYPE', type=str, nargs='+')
    groupDisplay.add_argument('--showpids', help='Show current running KFD PIDs (pass details to VERBOSE for detailed information)',
                              metavar='VERBOSE', const="summary", type=str, nargs='?')
    groupDisplay.add_argument('--showpidgpus', help='Show GPUs used by specified KFD PIDs (all if no arg given)',
                              nargs='*')
    groupDisplay.add_argument('--showreplaycount', help='Show PCIe Replay Count', action='store_true')
    groupDisplay.add_argument('--showrasinfo',
                              help='Show RAS enablement information and error counts for the specified block(s) (all if no arg given)',
                              nargs='*')
    groupDisplay.add_argument('--showvc', help='Show voltage curve', action='store_true')
    groupDisplay.add_argument('--showxgmierr', help='Show XGMI error information since last read', action='store_true')
    groupDisplay.add_argument('--showtopo', help='Show hardware topology information', action='store_true')
    groupDisplay.add_argument('--showtopoaccess', help='Shows the link accessibility between GPUs ', action='store_true')
    groupDisplay.add_argument('--showtopoweight', help='Shows the relative weight between GPUs ', action='store_true')
    groupDisplay.add_argument('--showtopohops', help='Shows the number of hops between GPUs ', action='store_true')
    groupDisplay.add_argument('--showtopotype', help='Shows the link type between GPUs ', action='store_true')
    groupDisplay.add_argument('--showtoponuma', help='Shows the numa nodes ', action='store_true')
    groupDisplay.add_argument('--showenergycounter', help='Energy accumulator that stores amount of energy consumed',
                              action='store_true')
    # The help text here used to be a verbatim copy of --showtoponuma's.
    # NOTE(review): wording inferred from the option/handler name (showNodesBw) - confirm.
    groupDisplay.add_argument('--shownodesbw', help='Shows the bandwidth between nodes ', action='store_true')
    groupDisplay.add_argument('--showcomputepartition', help='Shows current compute partitioning ', action='store_true')
    groupDisplay.add_argument('--showmemorypartition', help='Shows current memory partition ', action='store_true')
    groupDisplay.add_argument('--showmetrics', help='Show current gpu metric data ', action='store_true')

    groupActionReset.add_argument('-r', '--resetclocks', help='Reset clocks and OverDrive to default',
                                  action='store_true')
    groupActionReset.add_argument('--resetfans', help='Reset fans to automatic (driver) control', action='store_true')
    groupActionReset.add_argument('--resetprofile', help='Reset Power Profile back to default', action='store_true')
    groupActionReset.add_argument('--resetpoweroverdrive',
                                  help='Set the maximum GPU power back to the device default state',
                                  action='store_true')
    groupActionReset.add_argument('--resetxgmierr', help='Reset XGMI error count', action='store_true')
    groupActionReset.add_argument('--resetperfdeterminism', help='Disable performance determinism', action='store_true')
    groupAction.add_argument('--setclock',
                             help='Set Clock Frequency Level(s) for specified clock (requires manual Perf level)',
                             metavar=('TYPE','LEVEL'), nargs=2)
    groupAction.add_argument('--setsclk', help='Set GPU Clock Frequency Level(s) (requires manual Perf level)',
                             type=int, metavar='LEVEL', nargs='+')
    groupAction.add_argument('--setmclk', help='Set GPU Memory Clock Frequency Level(s) (requires manual Perf level)',
                             type=int, metavar='LEVEL', nargs='+')
    groupAction.add_argument('--setpcie', help='Set PCIE Clock Frequency Level(s) (requires manual Perf level)',
                             type=int, metavar='LEVEL', nargs='+')
    groupAction.add_argument('--setslevel',
                             help='Change GPU Clock frequency (MHz) and Voltage (mV) for a specific Level',
                             metavar=('SCLKLEVEL', 'SCLK', 'SVOLT'), nargs=3)
    groupAction.add_argument('--setmlevel',
                             help='Change GPU Memory clock frequency (MHz) and Voltage (mV) for a specific Level',
                             metavar=('MCLKLEVEL', 'MCLK', 'MVOLT'), nargs=3)
    groupAction.add_argument('--setvc', help='Change SCLK Voltage Curve (MHz mV) for a specific point',
                             metavar=('POINT', 'SCLK', 'SVOLT'), nargs=3)
    groupAction.add_argument('--setsrange', help='Set min and max SCLK speed', metavar=('SCLKMIN', 'SCLKMAX'), nargs=2)
    groupAction.add_argument('--setextremum', help='Set min/max of SCLK/MCLK speed', metavar=('min|max', "sclk|mclk", 'CLK'), nargs=3)
    groupAction.add_argument('--setmrange', help='Set min and max MCLK speed', metavar=('MCLKMIN', 'MCLKMAX'), nargs=2)
    groupAction.add_argument('--setfan', help='Set GPU Fan Speed (Level or %%)', metavar='LEVEL')
    groupAction.add_argument('--setperflevel', help='Set Performance Level', metavar='LEVEL')
    groupAction.add_argument('--setoverdrive', help='Set GPU OverDrive level (requires manual|high Perf level)',
                             metavar='%')
    groupAction.add_argument('--setmemoverdrive',
                             help='Set GPU Memory Overclock OverDrive level (requires manual|high Perf level)',
                             metavar='%')
    groupAction.add_argument('--setpoweroverdrive', help='Set the maximum GPU power using Power OverDrive in Watts',
                             metavar='WATTS')
    groupAction.add_argument('--setprofile',
                             help='Specify Power Profile level (#) or a quoted string of CUSTOM Profile attributes "# '
                                  '# # #..." (requires manual Perf level)')
    groupAction.add_argument('--setperfdeterminism',
                             help='Set clock frequency limit to get minimal performance variation', type=int,
                             metavar='SCLK', nargs=1)
    groupAction.add_argument('--setcomputepartition', help='Set compute partition',
                             choices=compute_partition_type_l + [x.lower() for x in compute_partition_type_l],
                             type=str, nargs=1)
    groupAction.add_argument('--setmemorypartition', help='Set memory partition',
                             choices=memory_partition_type_l + [x.lower() for x in memory_partition_type_l],
                             type=str, nargs=1)
    groupAction.add_argument('--rasenable', help='Enable RAS for specified block and error type', type=str, nargs=2,
                             metavar=('BLOCK', 'ERRTYPE'))
    groupAction.add_argument('--rasdisable', help='Disable RAS for specified block and error type', type=str, nargs=2,
                             metavar=('BLOCK', 'ERRTYPE'))
    groupAction.add_argument('--rasinject',
                             help='Inject RAS poison for specified block (ONLY WORKS ON UNSECURED BOARDS)', type=str,
                             metavar='BLOCK', nargs=1)
    groupActionGpuReset.add_argument('--gpureset', help='Reset specified GPU (One GPU must be specified)',
                                     action='store_true')

    groupFile.add_argument('--load', help='Load Clock, Fan, Performance and Profile settings from FILE', metavar='FILE')
    groupFile.add_argument('--save', help='Save Clock, Fan, Performance and Profile settings to FILE', metavar='FILE')

    groupResponse.add_argument('--autorespond',
                               help='Response to automatically provide for all prompts (NOT RECOMMENDED)',
                               metavar='RESPONSE')

    groupActionOutput.add_argument('--loglevel',
                                   help='How much output will be printed for what program is doing, one of debug/info/warning/error/critical',
                                   metavar='LEVEL')
    groupActionOutput.add_argument('--json', help='Print output in JSON format', action='store_true')
    groupActionOutput.add_argument('--csv', help='Print output in CSV format', action='store_true')

    args = parser.parse_args()

    # Must set PRINT_JSON early so the prints can be silenced
    if args.json or args.csv:
        PRINT_JSON = True

    # Initialize the rocm SMI library
    initializeRsmi()

    if args.version:
        showVersion(isCSV=args.csv)
        sys.exit()

    logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.WARNING)
    if args.loglevel is not None:
        # An unrecognised level name silently falls back to WARNING.
        numericLogLevel = getattr(logging, args.loglevel.upper(), logging.WARNING)
        logging.getLogger().setLevel(numericLogLevel)

    # Every option that modifies device state goes through relaunchAsSudo() first.
    if args.setsclk or args.setmclk or args.setpcie or args.resetfans or args.setfan or args.setperflevel or args.load \
            or args.resetclocks or args.setprofile or args.resetprofile or args.setoverdrive or args.setmemoverdrive \
            or args.setpoweroverdrive or args.resetpoweroverdrive or args.rasenable or args.rasdisable or \
            args.rasinject or args.gpureset or args.setperfdeterminism or args.setslevel or args.setmlevel or \
            args.setvc or args.setsrange or args.setextremum or args.setmrange or args.setclock or \
            args.setcomputepartition or args.setmemorypartition:
        relaunchAsSudo()

    # If there is one or more device specified, use that for all commands, otherwise use a
    # list of all available devices. Also use "is not None" as device 0 would
    # have args.device=0, and "if 0" returns false.
    if args.device is not None:
        deviceList = []
        for device in args.device:
            if not doesDeviceExist(device):
                logging.warning('No such device card%s', str(device))
                sys.exit()
            if device is None:
                printLog(None, 'ERROR: No DRM devices detected. Exiting', None)
                sys.exit()
            # Skip duplicates; non-AMD devices are only kept when --alldevices is given.
            if (isAmdDevice(device) or args.alldevices) and device not in deviceList:
                deviceList.append(device)
    else:
        deviceList = listDevices()

    if deviceList is None:
        printLog(None, 'ERROR: No DRM devices available. Exiting', None)
        sys.exit(1)

    # If we want JSON/CSV output, initialize the keys (devices)
    if PRINT_JSON:
        for device in deviceList:
            JSON_DATA['card' + str(device)] = {}

    if not PRINT_JSON:
        print('\n')
    if not isConciseInfoRequested(args) and not args.showhw:
        printLogSpacer(headerString)

    # --showallinfo is shorthand for switching on the individual show options below.
    if args.showallinfo:
        args.list = True
        args.showid = True
        args.showvbios = True
        args.showdriverversion = True
        args.showfwinfo = 'all'
        args.showmclkrange = True
        args.showmemvendor = True
        args.showsclkrange = True
        args.showproductname = True
        args.showserial = True
        args.showuniqueid = True
        args.showvoltagerange = True
        args.showbus = True
        args.showpagesinfo = True
        args.showfan = True
        args.showpower = True
        args.showtemp = True
        args.showuse = True
        args.showenergycounter = True
        args.showmemuse = True
        args.showvoltage = True
        args.showclocks = True
        args.showmaxpower = True
        args.showmemoverdrive = True
        args.showoverdrive = True
        args.showperflevel = True
        args.showpids = "summary"
        args.showpidgpus = []
        args.showreplaycount = True
        args.showvc = True
        args.showcomputepartition = True
        args.showmemorypartition = True
        args.showmetrics = True

        # These three are only added to --showallinfo for human-readable output.
        if not PRINT_JSON:
            args.showprofile = True
            args.showclkfrq = True
            args.showclkvolt = True

    # Don't do reset in combination with any other command
    if args.gpureset:
        if not args.device:
            logging.error('No device specified. One device must be specified for GPU reset')
            printLogSpacer()
            sys.exit(1)
        logging.debug('Only executing GPU reset, no other commands will be executed')
        resetGpu(args.device)
        sys.exit(RETCODE)

    if not checkAmdGpus(deviceList):
        logging.warning('No AMD GPUs specified')
    if not check_runtime_status():
        # Query each device's DRM device ID in an attempt to wake it; an 'N/A'
        # result or any exception is treated as a failed wake-up.
        wake_device_failed = False
        logging.debug('Using DRM device ID call to wake suspended devices')
        for device in deviceList:
            try:
                device_id = getDRMDeviceId(device, silent=True)
                if device_id == 'N/A':
                    wake_device_failed = True
                    logging.debug(f'Failed to wake device {device} via DRM call')
            except Exception as e:
                wake_device_failed = True
                logging.debug(f'Exception waking device {device}: {str(e)}')
        if wake_device_failed:
            logging.warning('AMD GPU device(s) is/are in a low-power state. Check power control/runtime_status\n')
    if isConciseInfoRequested(args):
        showAllConcise(deviceList)
    if args.showhw:
        showAllConciseHw(deviceList)
    if args.showdriverversion:
        showDriverVersion(deviceList, rsmi_sw_component_t.RSMI_SW_COMP_DRIVER)
    if args.showtempgraph:
        showTempGraph(deviceList)
    if args.showid:
        showId(deviceList)
    if args.showuniqueid:
        showUId(deviceList)
    if args.showvbios:
        showVbiosVersion(deviceList)
    # Options declared with nargs='*' are None when the flag is absent and a
    # (possibly empty) list when it is present, so "is not None" is the test
    # for "flag was given", with or without values.
    if args.showevents is not None:
        showEvents(deviceList, args.showevents)
    if args.resetclocks:
        resetClocks(deviceList)
    if args.showtemp:
        showCurrentTemps(deviceList)
    if args.showclocks:
        showCurrentClocks(deviceList)
    if args.showgpuclocks:
        showCurrentClocks(deviceList, 'sclk')
    if args.showfan:
        showCurrentFans(deviceList)
    if args.showperflevel:
        showPerformanceLevel(deviceList)
    if args.showoverdrive:
        showOverDrive(deviceList, 'sclk')
    if args.showmemoverdrive:
        showOverDrive(deviceList, 'mclk')
    if args.showmaxpower:
        showMaxPower(deviceList)
    if args.showprofile:
        showProfile(deviceList)
    if args.showpower:
        showPower(deviceList)
    if args.showclkfrq:
        showClocks(deviceList)
    if args.showuse:
        showGpuUse(deviceList)
    if args.showmemuse:
        showMemUse(deviceList)
    if args.showmemvendor:
        showMemVendor(deviceList)
    if args.showbw:
        showPcieBw(deviceList)
    if args.showreplaycount:
        showPcieReplayCount(deviceList)
    if args.showserial:
        showSerialNumber(deviceList)
    if args.showpids is not None:
        showPids(args.showpids)
    if args.showpidgpus is not None:
        showGpusByPid(args.showpidgpus)
    if args.showclkvolt:
        showPowerPlayTable(deviceList)
    if args.showvoltage:
        showVoltage(deviceList)
    if args.showbus:
        showBus(deviceList)
    if args.showmeminfo:
        showMemInfo(deviceList, args.showmeminfo)
    if args.showrasinfo is not None:
        showRasInfo(deviceList, args.showrasinfo)
    # --showfwinfo may be given without any BLOCK argument, in which case
    # argparse stores an empty list. Testing against None (rather than
    # truthiness) lets that bare form through, as well as the 'all' value
    # assigned by --showallinfo above.
    if args.showfwinfo is not None:
        showFwInfo(deviceList, args.showfwinfo)
    if args.showproductname:
        showProduct(deviceList)
    if args.showxgmierr:
        showXgmiErr(deviceList)
    if args.shownodesbw:
        showNodesBw(deviceList)
    if args.showtopo:
        showHwTopology(deviceList)
    if args.showtopoaccess:
        showAccessibleTopology(deviceList)
    if args.showtopoweight:
        showWeightTopology(deviceList)
    if args.showtopohops:
        showHopsTopology(deviceList)
    if args.showtopotype:
        showTypeTopology(deviceList)
    if args.showtoponuma:
        showNumaTopology(deviceList)
    if args.showpagesinfo:
        showRetiredPages(deviceList)
    if args.showretiredpages:
        showRetiredPages(deviceList, 'reserved')
    if args.showpendingpages:
        showRetiredPages(deviceList, 'pending')
    if args.showunreservablepages:
        showRetiredPages(deviceList, 'unreservable')
    if args.showsclkrange:
        showRange(deviceList, 'sclk')
    if args.showmclkrange:
        showRange(deviceList, 'mclk')
    if args.showvoltagerange:
        showRange(deviceList, 'voltage')
    if args.showvc:
        showVoltageCurve(deviceList)
    if args.showenergycounter:
        showEnergy(deviceList)
    if args.showcomputepartition:
        showComputePartition(deviceList)
    if args.showmemorypartition:
        showMemoryPartition(deviceList)
    if args.showmetrics:
        showGPUMetrics(deviceList)
    if args.setclock:
        setClocks(deviceList, args.setclock[0], [int(args.setclock[1])])
    if args.setsclk:
        setClocks(deviceList, 'sclk', args.setsclk)
    if args.setmclk:
        setClocks(deviceList, 'mclk', args.setmclk)
    if args.setpcie:
        setClocks(deviceList, 'pcie', args.setpcie)
    if args.setslevel:
        setPowerPlayTableLevel(deviceList, 'sclk', args.setslevel[0], args.setslevel[1], args.setslevel[2],
                               args.autorespond)
    if args.setmlevel:
        setPowerPlayTableLevel(deviceList, 'mclk', args.setmlevel[0], args.setmlevel[1], args.setmlevel[2],
                               args.autorespond)
    if args.resetfans:
        resetFans(deviceList)
    if args.setfan:
        setFanSpeed(deviceList, args.setfan)
    if args.setperflevel:
        setPerformanceLevel(deviceList, args.setperflevel)
    if args.setoverdrive:
        setClockOverDrive(deviceList, 'sclk', args.setoverdrive, args.autorespond)
    if args.setmemoverdrive:
        setClockOverDrive(deviceList, 'mclk', args.setmemoverdrive, args.autorespond)
    if args.setpoweroverdrive:
        setPowerOverDrive(deviceList, args.setpoweroverdrive, args.autorespond)
    if args.resetpoweroverdrive:
        resetPowerOverDrive(deviceList, args.autorespond)
    if args.setprofile:
        setProfile(deviceList, args.setprofile)
    if args.setvc:
        setVoltageCurve(deviceList, args.setvc[0], args.setvc[1], args.setvc[2], args.autorespond)
    if args.setextremum:
        setClockExtremum(deviceList, args.setextremum[0], args.setextremum[1], args.setextremum[2], args.autorespond)
    if args.setsrange:
        setClockRange(deviceList, 'sclk', args.setsrange[0], args.setsrange[1], args.autorespond)
    if args.setmrange:
        setClockRange(deviceList, 'mclk', args.setmrange[0], args.setmrange[1], args.autorespond)
    if args.setperfdeterminism:
        setPerfDeterminism(deviceList, args.setperfdeterminism[0])
    if args.setcomputepartition:
        setComputePartition(deviceList, args.setcomputepartition[0])
    if args.setmemorypartition:
        setMemoryPartition(deviceList, args.setmemorypartition[0], args.autorespond)
    if args.resetprofile:
        resetProfile(deviceList)
    if args.resetxgmierr:
        resetXgmiErr(deviceList)
    if args.resetperfdeterminism:
        resetPerfDeterminism(deviceList)
    if args.rasenable:
        setRas(deviceList, 'enable', args.rasenable[0], args.rasenable[1])
    if args.rasdisable:
        setRas(deviceList, 'disable', args.rasdisable[0], args.rasdisable[1])
    if args.rasinject:
        # --rasinject is declared with nargs=1 (BLOCK only), so the parsed list
        # never has a second element; indexing [1] raised IndexError on every
        # use. Pass None for the error type instead.
        # NOTE(review): confirm setRas() accepts None as its error-type argument.
        setRas(deviceList, 'inject', args.rasinject[0], None)
    if args.load:
        load(args.load, args.autorespond)
    if args.save:
        save(deviceList, args.save)

    if RETCODE and not PRINT_JSON:
        logging.debug(' \t\t One or more commands failed.')
    # Force RETCODE to 0 when running at the default verbosity, i.e. when
    # --loglevel is absent or resolves to WARNING (unrecognised names also
    # resolve to WARNING). A non-zero RETCODE is only kept for other levels.
    if args.loglevel is None or getattr(logging, args.loglevel.upper(), logging.WARNING) == logging.WARNING:
        RETCODE = 0

    if PRINT_JSON:
        # Check that we have some actual data to print, instead of the
        # empty list that we initialized above
        for device in deviceList:
            if not JSON_DATA['card' + str(device)]:
                JSON_DATA.pop('card' + str(device))
        if not JSON_DATA:
            logging.warning("No JSON data to report")
            sys.exit(RETCODE)

        if not args.csv:
            print(json.dumps(JSON_DATA))
        else:
            devCsv = ''
            sysCsv = ''
            # JSON won't have any 'system' data without one of these flags
            if args.showdriverversion and not args.showallinfo:
                sysCsv = formatCsv(['system'])
                print('%s' % (sysCsv))
            elif args.showallinfo:
                sysCsv = formatCsv(['system'])
                devCsv = formatCsv(deviceList)
                print('%s\n%s' % (sysCsv, devCsv))
            else:
                devCsv = formatCsv(deviceList)
                print(devCsv)

    if not isConciseInfoRequested(args) and not args.showhw:
        printLogSpacer(footerString)

    rsmi_ret_ok(rocmsmi.rsmi_shut_down())
    # sys.exit rather than the site-provided exit(): the latter is meant for
    # interactive use and is missing when the site module is not loaded.
    sys.exit(RETCODE)
|