[SWDEV-568642] amd-smi: Add amd-smi --rocm-smi for rocm-smi compatibility mode (#2363)
Implement new --rocm-smi flag that displays GPU information in ROCm-SMI compatible format Signed-off-by: Sumanth Gavini <sumanth.gavini@amd.com>
이 커밋은 다음에 포함됨:
@@ -19,6 +19,7 @@ add_custom_command(
|
||||
${PY_PACKAGE_DIR}/amdsmi_logger.py
|
||||
${PY_PACKAGE_DIR}/amdsmi_parser.py
|
||||
${PY_PACKAGE_DIR}/amdsmi_cli_exceptions.py
|
||||
${PY_PACKAGE_DIR}/amdsmi_rocm_smi_compat.py
|
||||
${PY_PACKAGE_DIR}/BDF.py
|
||||
${PY_PACKAGE_DIR}/README.md
|
||||
${PY_PACKAGE_DIR}/Release_Notes.md
|
||||
@@ -32,6 +33,7 @@ add_custom_command(
|
||||
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_CURRENT_SOURCE_DIR}/amdsmi_logger.py ${PY_PACKAGE_DIR}/
|
||||
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_CURRENT_SOURCE_DIR}/amdsmi_parser.py ${PY_PACKAGE_DIR}/
|
||||
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_CURRENT_SOURCE_DIR}/amdsmi_cli_exceptions.py ${PY_PACKAGE_DIR}/
|
||||
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_CURRENT_SOURCE_DIR}/amdsmi_rocm_smi_compat.py ${PY_PACKAGE_DIR}/
|
||||
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_CURRENT_SOURCE_DIR}/BDF.py ${PY_PACKAGE_DIR}/
|
||||
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_CURRENT_SOURCE_DIR}/README.md ${PY_PACKAGE_DIR}/
|
||||
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_CURRENT_SOURCE_DIR}/Release_Notes.md ${PY_PACKAGE_DIR}/)
|
||||
@@ -49,6 +51,7 @@ add_custom_target(
|
||||
${PY_PACKAGE_DIR}/amdsmi_logger.py
|
||||
${PY_PACKAGE_DIR}/amdsmi_parser.py
|
||||
${PY_PACKAGE_DIR}/amdsmi_cli_exceptions.py
|
||||
${PY_PACKAGE_DIR}/amdsmi_rocm_smi_compat.py
|
||||
${PY_PACKAGE_DIR}/BDF.py
|
||||
${PY_PACKAGE_DIR}/README.md
|
||||
${PY_PACKAGE_DIR}/Release_Notes.md)
|
||||
|
||||
@@ -123,6 +123,11 @@ def configure_logging_and_execute(args, amd_smi_commands):
|
||||
|
||||
logging.debug(args)
|
||||
|
||||
# Check if --rocm-smi flag is set (top-level flag, not a subcommand)
|
||||
if hasattr(args, 'rocm_smi') and args.rocm_smi:
|
||||
amd_smi_commands.rocm_smi(args)
|
||||
return
|
||||
|
||||
# Execute subcommands
|
||||
try:
|
||||
args.func(args)
|
||||
@@ -160,6 +165,7 @@ if __name__ == "__main__":
|
||||
amd_smi_commands.partition,
|
||||
amd_smi_commands.ras,
|
||||
amd_smi_commands.node,
|
||||
amd_smi_commands.rocm_smi,
|
||||
amd_smi_commands.default,
|
||||
sys_argv=sys.argv,
|
||||
helpers=amd_smi_helpers)
|
||||
@@ -170,7 +176,7 @@ if __name__ == "__main__":
|
||||
|
||||
# Store possible subcommands & aliases for later errors
|
||||
valid_commands = amd_smi_parser.possible_commands
|
||||
valid_commands += ['--help', '-h']
|
||||
valid_commands += ['--help', '-h', '--rocm-smi']
|
||||
|
||||
# Convert arguments to lowercase, but preserve case for folder path values
|
||||
processed_argv = []
|
||||
|
||||
@@ -7779,3 +7779,62 @@ class AMDSMICommands():
|
||||
print(e)
|
||||
|
||||
listener.stop()
|
||||
|
||||
|
||||
def rocm_smi(self, args):
|
||||
"""
|
||||
Display GPU information in ROCm-SMI compatible format (showAllConcise).
|
||||
This provides a drop-in replacement for rocm-smi --showallconcise using amdsmi backend.
|
||||
|
||||
Args:
|
||||
args: Parsed arguments (unused for this command)
|
||||
"""
|
||||
try:
|
||||
# Import the ROCm-SMI compatible functions from the compatibility module
|
||||
import sys
|
||||
import os
|
||||
# Add the current directory to path if needed
|
||||
current_dir = os.path.dirname(os.path.abspath(__file__))
|
||||
if current_dir not in sys.path:
|
||||
sys.path.insert(0, current_dir)
|
||||
|
||||
import amdsmi_rocm_smi_compat
|
||||
showAllConcise = amdsmi_rocm_smi_compat.showAllConcise
|
||||
listDevices = amdsmi_rocm_smi_compat.listDevices
|
||||
initializeRsmi = amdsmi_rocm_smi_compat.initializeRsmi
|
||||
check_runtime_status = amdsmi_rocm_smi_compat.check_runtime_status
|
||||
|
||||
# Initialize AMD SMI
|
||||
if not initializeRsmi():
|
||||
logging.error("Failed to initialize AMD SMI")
|
||||
return
|
||||
|
||||
try:
|
||||
# Get processor handles
|
||||
deviceList = listDevices()
|
||||
|
||||
if not deviceList:
|
||||
logging.error("No AMD GPU devices found")
|
||||
return
|
||||
|
||||
# Check runtime status (low power state warning)
|
||||
if not check_runtime_status():
|
||||
print("\nWARNING: AMD GPU device(s) is/are in a low-power state. Check power control/runtime_status\n")
|
||||
|
||||
# Display ROCm-SMI compatible output
|
||||
showAllConcise(deviceList)
|
||||
|
||||
finally:
|
||||
# Shutdown AMD SMI
|
||||
try:
|
||||
amdsmi_interface.amdsmi_shut_down()
|
||||
except:
|
||||
pass
|
||||
|
||||
except ImportError as e:
|
||||
logging.error(f"Could not import ROCm-SMI compatibility module: {e}")
|
||||
logging.error("Make sure amdsmi_rocm_smi_compat.py is in the amdsmi_cli directory")
|
||||
print("ERROR: ROCm-SMI compatibility mode not available")
|
||||
except Exception as e:
|
||||
logging.error(f"Error in ROCm-SMI compatibility mode: {e}")
|
||||
print(f"ERROR: {e}")
|
||||
|
||||
@@ -70,7 +70,7 @@ class AMDSMIParser(argparse.ArgumentParser):
|
||||
"""
|
||||
def __init__(self, version, list, static, firmware, bad_pages, metric,
|
||||
process, profile, event, topology, set_value, reset, monitor,
|
||||
xgmi, partition, ras, node, default, sys_argv=None,
|
||||
xgmi, partition, ras, node, rocm_smi, default, sys_argv=None,
|
||||
helpers=None):
|
||||
|
||||
# Helper variables
|
||||
@@ -113,6 +113,17 @@ class AMDSMIParser(argparse.ArgumentParser):
|
||||
add_help=True,
|
||||
prog=self.program_name)
|
||||
|
||||
# Add top-level --rocm-smi flag
|
||||
self.add_argument('--rocm-smi', action='store_true',
|
||||
help='Display GPU information in ROCm-SMI compatible format')
|
||||
|
||||
# Add top-level command modifiers (for --rocm-smi and other top-level flags)
|
||||
loglevel_choices = ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]
|
||||
loglevel_help = f"Set the logging level from the possible choices: {', '.join(loglevel_choices)}"
|
||||
self.add_argument('--loglevel', action='store', type=str.upper, required=False,
|
||||
help=loglevel_help, default='ERROR', metavar='LEVEL',
|
||||
choices=loglevel_choices)
|
||||
|
||||
# Setup subparsers
|
||||
self.subparsers = self.add_subparsers(
|
||||
title="AMD-SMI Commands",
|
||||
|
||||
@@ -0,0 +1,941 @@
|
||||
#!/usr/bin/env python3
|
||||
"""ROCm_SMI_LIB CLI Tool - AMD-SMI Backend
|
||||
|
||||
This tool acts as a command line interface for manipulating
|
||||
and monitoring the amdgpu kernel, using the AMD-SMI library
|
||||
instead of the rocm_smi_lib API.
|
||||
This tool uses the amdsmi Python package to call the AMD SMI library.
|
||||
Recommended: At least one AMD GPU with ROCm driver installed
|
||||
Required: AMD SMI library installed (pip install amdsmi)
|
||||
"""
|
||||
|
||||
from __future__ import print_function
|
||||
import sys
|
||||
import os
|
||||
import logging
|
||||
|
||||
# Version information
|
||||
SMI_MAJ = 4
|
||||
SMI_MIN = 0
|
||||
SMI_PAT = 0
|
||||
SMI_HASH = 'amdsmi'
|
||||
__version__ = '%s.%s.%s+%s' % (SMI_MAJ, SMI_MIN, SMI_PAT, SMI_HASH)
|
||||
|
||||
# Set to 1 if an error occurs
|
||||
RETCODE = 0
|
||||
|
||||
# If we want JSON format output instead
|
||||
PRINT_JSON = False
|
||||
PRINT_CSV = False
|
||||
|
||||
# Output formatting
|
||||
headerString = ' ROCm System Management Interface '
|
||||
footerString = ' End of ROCm SMI Log '
|
||||
appWidth = 106
|
||||
|
||||
# DRM ioctl calculation helpers
|
||||
# Reference: include/uapi/asm-generic/ioctl.h from Linux kernel
|
||||
# https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/uapi/asm-generic/ioctl.h
|
||||
_IOC_NRBITS = 8 # Number of bits for command number
|
||||
_IOC_TYPEBITS = 8 # Number of bits for device type
|
||||
_IOC_SIZEBITS = 14 # Number of bits for data size
|
||||
_IOC_DIRBITS = 2 # Number of bits for direction
|
||||
|
||||
_IOC_NRSHIFT = 0
|
||||
_IOC_TYPESHIFT = _IOC_NRSHIFT + _IOC_NRBITS
|
||||
_IOC_SIZESHIFT = _IOC_TYPESHIFT + _IOC_TYPEBITS
|
||||
_IOC_DIRSHIFT = _IOC_SIZESHIFT + _IOC_SIZEBITS
|
||||
|
||||
_IOC_WRITE = 1 # ioctl direction: write (userland -> kernel)
|
||||
|
||||
def _IOC(dir, type, nr, size):
|
||||
"""Create ioctl command number
|
||||
Reference: _IOC() macro from include/uapi/asm-generic/ioctl.h
|
||||
"""
|
||||
return (dir << _IOC_DIRSHIFT) | (type << _IOC_TYPESHIFT) | (nr << _IOC_NRSHIFT) | (size << _IOC_SIZESHIFT)
|
||||
|
||||
def _IOW(type, nr, size):
|
||||
"""Create write ioctl command number
|
||||
Reference: _IOW() macro from include/uapi/asm-generic/ioctl.h
|
||||
"""
|
||||
return _IOC(_IOC_WRITE, type, nr, size)
|
||||
|
||||
# DRM and AMDGPU ioctl constants
|
||||
# Reference: include/uapi/drm/drm.h and include/uapi/drm/amdgpu_drm.h from Linux kernel
|
||||
# https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/uapi/drm/drm.h
|
||||
# https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/uapi/drm/amdgpu_drm.h
|
||||
DRM_IOCTL_BASE = ord('d') # 0x64 - DRM ioctl type character (from drm.h)
|
||||
DRM_COMMAND_BASE = 0x40 # Base for driver-specific commands (from drm.h)
|
||||
DRM_AMDGPU_INFO = 0x05 # AMDGPU info command number (from amdgpu_drm.h, line 51)
|
||||
# Size of struct drm_amdgpu_info - we use minimal size for the ioctl call
|
||||
# Reference: struct drm_amdgpu_info in include/uapi/drm/amdgpu_drm.h
|
||||
DRM_AMDGPU_INFO_SIZE = 32
|
||||
|
||||
# Calculate DRM_IOCTL_AMDGPU_INFO
|
||||
# Reference: Line 68 in amdgpu_drm.h:
|
||||
# #define DRM_IOCTL_AMDGPU_INFO DRM_IOW(DRM_COMMAND_BASE + DRM_AMDGPU_INFO, struct drm_amdgpu_info)
|
||||
DRM_IOCTL_AMDGPU_INFO = _IOW(DRM_IOCTL_BASE, DRM_COMMAND_BASE + DRM_AMDGPU_INFO, DRM_AMDGPU_INFO_SIZE)
|
||||
|
||||
# Setup logging
|
||||
logging.basicConfig(level=logging.WARNING, format='%(levelname)s: %(message)s')
|
||||
|
||||
def driverInitialized():
|
||||
""" Returns true if amdgpu is found in the list of initialized modules
|
||||
"""
|
||||
driverInitialized = False
|
||||
if os.path.exists("/sys/module/amdgpu"):
|
||||
if os.path.exists("/sys/module/amdgpu/initstate"):
|
||||
# amdgpu is loadable module
|
||||
with open("/sys/module/amdgpu/initstate") as initstate:
|
||||
if 'live' in initstate.read():
|
||||
driverInitialized = True
|
||||
else:
|
||||
# amdgpu is built into the kernel
|
||||
driverInitialized = True
|
||||
return driverInitialized
|
||||
|
||||
def find_kfd_topology_path():
|
||||
"""Find KFD topology path dynamically"""
|
||||
possible_paths = [
|
||||
"/sys/class/kfd/kfd/topology/nodes",
|
||||
"/sys/devices/virtual/kfd/kfd/topology/nodes"
|
||||
]
|
||||
for path in possible_paths:
|
||||
if os.path.exists(path):
|
||||
return path
|
||||
return None
|
||||
|
||||
def find_drm_path():
|
||||
"""Find DRM class path dynamically"""
|
||||
possible_paths = [
|
||||
"/sys/class/drm",
|
||||
"/sys/devices/virtual/drm"
|
||||
]
|
||||
for path in possible_paths:
|
||||
if os.path.exists(path):
|
||||
return path
|
||||
return None
|
||||
|
||||
def initializeRsmi():
|
||||
"""Initialize AMD SMI library - equivalent to rocm_smi.initializeRsmi()"""
|
||||
try:
|
||||
import amdsmi
|
||||
amdsmi.amdsmi_init()
|
||||
return True
|
||||
except Exception as e:
|
||||
logging.error(f"Failed to initialize AMD SMI: {e}")
|
||||
return False
|
||||
|
||||
def listDevices():
|
||||
"""Get list of GPU devices - equivalent to rocm_smi.listDevices()"""
|
||||
try:
|
||||
import amdsmi
|
||||
processors = amdsmi.amdsmi_get_processor_handles()
|
||||
return processors
|
||||
except Exception as e:
|
||||
logging.error(f"Failed to get processor handles: {e}")
|
||||
return []
|
||||
|
||||
def checkAmdGpus(deviceList):
|
||||
"""Check if AMD GPUs are present - equivalent to rocm_smi.checkAmdGpus()"""
|
||||
if not deviceList:
|
||||
logging.warning('No AMD GPUs specified')
|
||||
return False
|
||||
return True
|
||||
|
||||
def check_runtime_pm_status(card_path):
|
||||
"""
|
||||
Check if a specific GPU card is in runtime PM suspended state.
|
||||
Equivalent to rocm_smi.check_runtime_pm_status()
|
||||
|
||||
Args:
|
||||
card_path: Path to the card (e.g., /sys/class/drm/card0)
|
||||
|
||||
Returns:
|
||||
bool: True if suspended, False if active or runtime PM disabled
|
||||
"""
|
||||
try:
|
||||
runtime_status = os.path.join(card_path, "device/power/runtime_status")
|
||||
runtime_enabled = os.path.join(card_path, "device/power/runtime_enabled")
|
||||
|
||||
# Check if runtime PM is enabled
|
||||
if os.path.exists(runtime_enabled):
|
||||
with open(runtime_enabled) as f:
|
||||
enabled = f.read().strip()
|
||||
if "disabled" in enabled or "forbidden" in enabled:
|
||||
return False
|
||||
|
||||
# Check runtime status
|
||||
if os.path.exists(runtime_status):
|
||||
with open(runtime_status) as f:
|
||||
status = f.read().strip()
|
||||
return "suspended" in status
|
||||
|
||||
return False
|
||||
except:
|
||||
return False
|
||||
|
||||
def wake_device(card_path):
|
||||
"""
|
||||
Wake a GPU device from runtime PM suspended state.
|
||||
Equivalent to rocm_smi.wake_device()
|
||||
|
||||
Uses DRM_IOCTL_AMDGPU_INFO ioctl to wake the device from runtime suspend.
|
||||
|
||||
Args:
|
||||
card_path: Path to the card (e.g., /sys/class/drm/card0)
|
||||
|
||||
Returns:
|
||||
bool: True if device was woken or already active, False on error
|
||||
"""
|
||||
try:
|
||||
import re
|
||||
import fcntl
|
||||
import struct
|
||||
import errno
|
||||
|
||||
# Extract card number from card_path (e.g., card0 -> 0)
|
||||
card_name = os.path.basename(card_path)
|
||||
card_match = re.match(r'card(\d+)', card_name)
|
||||
if not card_match:
|
||||
return False
|
||||
|
||||
# Find renderD device in the same DRM directory
|
||||
drm_path = os.path.dirname(card_path)
|
||||
render_name = None
|
||||
|
||||
# Look for renderD* in the drm directory
|
||||
for entry in os.listdir(drm_path):
|
||||
if re.match(r'renderD\d+', entry):
|
||||
# Check if this renderD is associated with our card
|
||||
# by verifying they point to the same device
|
||||
entry_link = os.path.join(drm_path, entry)
|
||||
card_link = card_path
|
||||
|
||||
try:
|
||||
entry_real = os.path.realpath(entry_link)
|
||||
card_real = os.path.realpath(card_link)
|
||||
|
||||
# They should share the same parent device directory
|
||||
if os.path.dirname(entry_real) == os.path.dirname(card_real):
|
||||
render_name = entry
|
||||
break
|
||||
except:
|
||||
continue
|
||||
|
||||
if not render_name:
|
||||
return False
|
||||
|
||||
render_path = f"/dev/dri/{render_name}"
|
||||
|
||||
# Open the DRM device and send ioctl to wake it
|
||||
if os.path.exists(render_path):
|
||||
try:
|
||||
fd = os.open(render_path, os.O_RDWR | os.O_CLOEXEC)
|
||||
|
||||
# Create empty drm_amdgpu_info struct (all zeros)
|
||||
request = struct.pack('I' * 8, *([0] * 8)) # 32 bytes of zeros
|
||||
|
||||
# Send DRM_IOCTL_AMDGPU_INFO ioctl to wake the device
|
||||
try:
|
||||
fcntl.ioctl(fd, DRM_IOCTL_AMDGPU_INFO, request)
|
||||
except OSError as e:
|
||||
# EINVAL is acceptable - device may already be awake
|
||||
if e.errno != errno.EINVAL:
|
||||
os.close(fd)
|
||||
return False
|
||||
|
||||
os.close(fd)
|
||||
return True
|
||||
except OSError:
|
||||
return False
|
||||
|
||||
return False
|
||||
except:
|
||||
return False
|
||||
|
||||
def check_runtime_status():
|
||||
"""
|
||||
Check if any GPUs are in low power state and wake them if needed.
|
||||
Equivalent to rocm_smi.check_runtime_status()
|
||||
|
||||
Returns:
|
||||
bool: True if all devices are active, False if any are suspended
|
||||
"""
|
||||
try:
|
||||
drm_path = find_drm_path()
|
||||
if not drm_path:
|
||||
return True
|
||||
|
||||
all_active = True
|
||||
for card in os.listdir(drm_path):
|
||||
if card.startswith("card") and "-" not in card:
|
||||
card_path = os.path.join(drm_path, card)
|
||||
|
||||
# Check if this card is suspended
|
||||
is_suspended = check_runtime_pm_status(card_path)
|
||||
if is_suspended:
|
||||
all_active = False
|
||||
# Try to wake the device
|
||||
wake_device(card_path)
|
||||
|
||||
return all_active
|
||||
except:
|
||||
return True
|
||||
|
||||
def get_bdf(processor):
|
||||
"""Get BDF (Bus:Device.Function)"""
|
||||
try:
|
||||
import amdsmi
|
||||
bdf = amdsmi.amdsmi_get_gpu_device_bdf(processor)
|
||||
return bdf
|
||||
except:
|
||||
return None
|
||||
|
||||
def bdf_to_location_id(bdf):
|
||||
"""Convert BDF string to location_id integer used by KFD
|
||||
BDF format: "0000:44:00.0" -> location_id calculation
|
||||
"""
|
||||
try:
|
||||
# Parse BDF: domain:bus:device.function
|
||||
parts = bdf.replace(':', ' ').replace('.', ' ').split()
|
||||
if len(parts) >= 4:
|
||||
domain = int(parts[0], 16)
|
||||
bus = int(parts[1], 16)
|
||||
device = int(parts[2], 16)
|
||||
function = int(parts[3], 16)
|
||||
|
||||
# KFD location_id = (bus << 8) | (device << 3) | function
|
||||
location_id = (bus << 8) | (device << 3) | function
|
||||
return location_id
|
||||
except:
|
||||
pass
|
||||
return None
|
||||
|
||||
def get_device_info(processor, bdf=None):
|
||||
"""Get device ID and GPU ID (GUID) from KFD topology"""
|
||||
try:
|
||||
import amdsmi
|
||||
asic_info = amdsmi.amdsmi_get_gpu_asic_info(processor)
|
||||
device_id = asic_info.get('device_id', 'N/A')
|
||||
|
||||
# Get GPU ID from KFD topology by matching location_id
|
||||
kfd_path = find_kfd_topology_path()
|
||||
gpu_id = 'N/A'
|
||||
|
||||
if kfd_path and os.path.exists(kfd_path) and bdf:
|
||||
location_id = bdf_to_location_id(bdf)
|
||||
if location_id is not None:
|
||||
# Look through KFD nodes to find matching location_id
|
||||
for node_dir in os.listdir(kfd_path):
|
||||
props_file = os.path.join(kfd_path, node_dir, "properties")
|
||||
if os.path.exists(props_file):
|
||||
with open(props_file) as f:
|
||||
props_dict = {}
|
||||
for line in f:
|
||||
if ' ' in line:
|
||||
parts = line.strip().split(None, 1)
|
||||
if len(parts) == 2:
|
||||
props_dict[parts[0]] = parts[1]
|
||||
|
||||
# Match by location_id
|
||||
if props_dict.get('location_id') == str(location_id):
|
||||
gpu_id_file = os.path.join(kfd_path, node_dir, "gpu_id")
|
||||
if os.path.exists(gpu_id_file):
|
||||
with open(gpu_id_file) as gf:
|
||||
gpu_id = gf.read().strip()
|
||||
break
|
||||
|
||||
return {
|
||||
'device_id': device_id,
|
||||
'guid': gpu_id
|
||||
}
|
||||
except Exception as e:
|
||||
return {'device_id': 'N/A', 'guid': 'N/A'}
|
||||
|
||||
def get_kfd_node_id(processor, bdf=None):
|
||||
"""Get KFD node ID by matching BDF to location_id in topology"""
|
||||
try:
|
||||
import amdsmi
|
||||
|
||||
if not bdf:
|
||||
return amdsmi.amdsmi_topo_get_numa_node_number(processor)
|
||||
|
||||
kfd_path = find_kfd_topology_path()
|
||||
if kfd_path and os.path.exists(kfd_path):
|
||||
location_id = bdf_to_location_id(bdf)
|
||||
if location_id is not None:
|
||||
# Look through KFD nodes to find matching location_id
|
||||
for node_dir in os.listdir(kfd_path):
|
||||
props_file = os.path.join(kfd_path, node_dir, "properties")
|
||||
if os.path.exists(props_file):
|
||||
with open(props_file) as f:
|
||||
props_dict = {}
|
||||
for line in f:
|
||||
if ' ' in line:
|
||||
parts = line.strip().split(None, 1)
|
||||
if len(parts) == 2:
|
||||
props_dict[parts[0]] = parts[1]
|
||||
|
||||
# Match by location_id
|
||||
if props_dict.get('location_id') == str(location_id):
|
||||
return int(node_dir)
|
||||
|
||||
# Fallback to NUMA node
|
||||
return amdsmi.amdsmi_topo_get_numa_node_number(processor)
|
||||
except:
|
||||
return None
|
||||
|
||||
def get_temperature(processor, temp_type='edge'):
|
||||
"""Get GPU temperature"""
|
||||
try:
|
||||
import amdsmi
|
||||
# Map temp_type string to enum
|
||||
temp_type_map = {
|
||||
'edge': amdsmi.AmdSmiTemperatureType.EDGE,
|
||||
'junction': amdsmi.AmdSmiTemperatureType.HOTSPOT,
|
||||
'hotspot': amdsmi.AmdSmiTemperatureType.HOTSPOT,
|
||||
'memory': amdsmi.AmdSmiTemperatureType.VRAM,
|
||||
'vram': amdsmi.AmdSmiTemperatureType.VRAM,
|
||||
}
|
||||
|
||||
temp_enum = temp_type_map.get(temp_type.lower(), amdsmi.AmdSmiTemperatureType.EDGE)
|
||||
|
||||
temp = amdsmi.amdsmi_get_temp_metric(
|
||||
processor,
|
||||
temp_enum,
|
||||
amdsmi.AmdSmiTemperatureMetric.CURRENT
|
||||
)
|
||||
return temp
|
||||
except:
|
||||
# Try other temperature types
|
||||
try:
|
||||
for ttype in [amdsmi.AmdSmiTemperatureType.HOTSPOT,
|
||||
amdsmi.AmdSmiTemperatureType.EDGE,
|
||||
amdsmi.AmdSmiTemperatureType.VRAM]:
|
||||
try:
|
||||
temp = amdsmi.amdsmi_get_temp_metric(
|
||||
processor,
|
||||
ttype,
|
||||
amdsmi.AmdSmiTemperatureMetric.CURRENT
|
||||
)
|
||||
return temp
|
||||
except:
|
||||
continue
|
||||
except:
|
||||
pass
|
||||
return None
|
||||
|
||||
def getTemperatureLabel(deviceList):
|
||||
"""Discover the first available temperature label - matches rocm-smi priority"""
|
||||
import amdsmi
|
||||
if not deviceList:
|
||||
return 'edge'
|
||||
|
||||
# ROCm-SMI always prefers edge temperature first, then junction, then memory
|
||||
# This matches the original rocm-smi behavior
|
||||
processor = deviceList[0]
|
||||
temp_types = [
|
||||
('edge', amdsmi.AmdSmiTemperatureType.EDGE),
|
||||
('junction', amdsmi.AmdSmiTemperatureType.HOTSPOT),
|
||||
('memory', amdsmi.AmdSmiTemperatureType.VRAM),
|
||||
]
|
||||
|
||||
for label, temp_type in temp_types:
|
||||
try:
|
||||
temp = amdsmi.amdsmi_get_temp_metric(
|
||||
processor,
|
||||
temp_type,
|
||||
amdsmi.AmdSmiTemperatureMetric.CURRENT
|
||||
)
|
||||
if temp is not None and temp > 0:
|
||||
return label
|
||||
except:
|
||||
continue
|
||||
|
||||
return 'edge'
|
||||
|
||||
def getPowerLabel(deviceList):
|
||||
"""Discover the first available power label - matches rocm-smi output"""
|
||||
import amdsmi
|
||||
if not deviceList:
|
||||
return '(Avg)'
|
||||
|
||||
# ROCm-SMI always uses "(Avg)" for Average Graphics Package Power
|
||||
# Return (Avg) to match rocm-smi behavior
|
||||
return '(Avg)'
|
||||
|
||||
def get_power_from_sysfs(bdf):
|
||||
"""Get power from sysfs without waking GPU"""
|
||||
try:
|
||||
if not bdf:
|
||||
return None
|
||||
|
||||
drm_path = find_drm_path()
|
||||
if not drm_path:
|
||||
return None
|
||||
|
||||
for card in os.listdir(drm_path):
|
||||
if card.startswith("card") and "-" not in card:
|
||||
card_bdf_path = os.path.join(drm_path, card, "device/uevent")
|
||||
if os.path.exists(card_bdf_path):
|
||||
with open(card_bdf_path) as f:
|
||||
content = f.read()
|
||||
if bdf in content:
|
||||
hwmon_path = os.path.join(drm_path, card, "device/hwmon")
|
||||
if os.path.exists(hwmon_path):
|
||||
for hwmon in os.listdir(hwmon_path):
|
||||
power_file = os.path.join(hwmon_path, hwmon, "power1_average")
|
||||
if os.path.exists(power_file):
|
||||
with open(power_file) as f:
|
||||
power_uw = int(f.read().strip())
|
||||
return power_uw / 1000000.0
|
||||
break
|
||||
return None
|
||||
except:
|
||||
return None
|
||||
|
||||
def get_power(processor, bdf=None):
|
||||
"""Get power consumption"""
|
||||
try:
|
||||
import amdsmi
|
||||
# Try sysfs first (doesn't wake GPU)
|
||||
power_sysfs = get_power_from_sysfs(bdf)
|
||||
if power_sysfs is not None:
|
||||
return power_sysfs
|
||||
|
||||
# Fallback to API
|
||||
power_dict = amdsmi.amdsmi_get_power_info(processor)
|
||||
if 'average_socket_power' in power_dict:
|
||||
power_w = power_dict['average_socket_power']
|
||||
if power_w != 'N/A' and isinstance(power_w, (int, float)):
|
||||
return float(power_w)
|
||||
if 'socket_power' in power_dict:
|
||||
power_w = power_dict['socket_power']
|
||||
if power_w != 'N/A' and isinstance(power_w, (int, float)):
|
||||
return float(power_w)
|
||||
return None
|
||||
except:
|
||||
return None
|
||||
|
||||
def get_clock_freq_from_sysfs(bdf, clk_type):
|
||||
"""Get clock frequency from sysfs without waking GPU"""
|
||||
try:
|
||||
import amdsmi
|
||||
if not bdf:
|
||||
return None
|
||||
|
||||
clk_file_map = {
|
||||
amdsmi.AmdSmiClkType.SYS: 'pp_dpm_sclk',
|
||||
amdsmi.AmdSmiClkType.MEM: 'pp_dpm_mclk',
|
||||
}
|
||||
|
||||
if clk_type not in clk_file_map:
|
||||
return None
|
||||
|
||||
drm_path = find_drm_path()
|
||||
if not drm_path:
|
||||
return None
|
||||
|
||||
for card in os.listdir(drm_path):
|
||||
if card.startswith("card") and "-" not in card:
|
||||
card_bdf_path = os.path.join(drm_path, card, "device/uevent")
|
||||
if os.path.exists(card_bdf_path):
|
||||
with open(card_bdf_path) as f:
|
||||
content = f.read()
|
||||
if bdf in content:
|
||||
dpm_file = os.path.join(drm_path, card, f"device/{clk_file_map[clk_type]}")
|
||||
if os.path.exists(dpm_file):
|
||||
with open(dpm_file) as f:
|
||||
for line in f:
|
||||
if '*' in line:
|
||||
parts = line.split(':')
|
||||
if len(parts) >= 2:
|
||||
freq_str = parts[1].replace('*', '').strip().replace('Mhz', '').replace('MHz', '')
|
||||
return float(freq_str)
|
||||
break
|
||||
return None
|
||||
except:
|
||||
return None
|
||||
|
||||
def get_clock_freq(processor, clk_type, bdf=None):
|
||||
"""Get clock frequency"""
|
||||
try:
|
||||
import amdsmi
|
||||
# Try sysfs first (doesn't wake GPU)
|
||||
freq_sysfs = get_clock_freq_from_sysfs(bdf, clk_type)
|
||||
if freq_sysfs is not None:
|
||||
return freq_sysfs
|
||||
|
||||
# Fallback to API
|
||||
freq_dict = amdsmi.amdsmi_get_clk_freq(processor, clk_type)
|
||||
if 'current' in freq_dict and 'frequency' in freq_dict:
|
||||
current_idx = freq_dict['current']
|
||||
frequencies = freq_dict['frequency']
|
||||
if current_idx < len(frequencies):
|
||||
freq_hz = frequencies[current_idx]
|
||||
return freq_hz / 1000000
|
||||
return None
|
||||
except:
|
||||
return None
|
||||
|
||||
def get_gpu_usage_from_sysfs(bdf):
|
||||
"""Get GPU usage from sysfs without waking GPU"""
|
||||
try:
|
||||
if not bdf:
|
||||
return None
|
||||
|
||||
drm_path = find_drm_path()
|
||||
if not drm_path:
|
||||
return None
|
||||
|
||||
for card in os.listdir(drm_path):
|
||||
if card.startswith("card") and "-" not in card:
|
||||
card_bdf_path = os.path.join(drm_path, card, "device/uevent")
|
||||
if os.path.exists(card_bdf_path):
|
||||
with open(card_bdf_path) as f:
|
||||
content = f.read()
|
||||
if bdf in content:
|
||||
busy_file = os.path.join(drm_path, card, "device/gpu_busy_percent")
|
||||
if os.path.exists(busy_file):
|
||||
with open(busy_file) as f:
|
||||
return int(f.read().strip())
|
||||
break
|
||||
return None
|
||||
except:
|
||||
return None
|
||||
|
||||
def get_gpu_usage(processor, bdf=None):
|
||||
"""Get GPU utilization percentage"""
|
||||
try:
|
||||
import amdsmi
|
||||
# Try sysfs first (doesn't wake GPU)
|
||||
usage_sysfs = get_gpu_usage_from_sysfs(bdf)
|
||||
if usage_sysfs is not None:
|
||||
return usage_sysfs
|
||||
|
||||
# Fallback to API
|
||||
usage = amdsmi.amdsmi_get_gpu_busy_percent(processor)
|
||||
return usage
|
||||
except:
|
||||
return None
|
||||
|
||||
def get_memory_usage(processor):
|
||||
"""Get VRAM usage percentage"""
|
||||
try:
|
||||
import amdsmi
|
||||
vram_used = amdsmi.amdsmi_get_gpu_memory_usage(processor, amdsmi.AmdSmiMemoryType.VRAM)
|
||||
vram_total = amdsmi.amdsmi_get_gpu_memory_total(processor, amdsmi.AmdSmiMemoryType.VRAM)
|
||||
if vram_total > 0:
|
||||
return (vram_used / vram_total) * 100
|
||||
return None
|
||||
except:
|
||||
return None
|
||||
|
||||
def get_fan_speed(processor):
|
||||
"""Get fan speed percentage"""
|
||||
try:
|
||||
import amdsmi
|
||||
fan_speed = amdsmi.amdsmi_get_gpu_fan_speed(processor, 0)
|
||||
return fan_speed
|
||||
except:
|
||||
# Fan not supported (e.g., OAM modules) - return 0
|
||||
return 0
|
||||
|
||||
def get_perf_level(processor):
|
||||
"""Get performance level"""
|
||||
try:
|
||||
import amdsmi
|
||||
perf_level = amdsmi.amdsmi_get_gpu_perf_level(processor)
|
||||
if isinstance(perf_level, str):
|
||||
if 'AUTO' in perf_level:
|
||||
return 'auto'
|
||||
elif 'LOW' in perf_level:
|
||||
return 'low'
|
||||
elif 'HIGH' in perf_level:
|
||||
return 'high'
|
||||
elif 'MANUAL' in perf_level:
|
||||
return 'manual'
|
||||
elif 'STABLE_STD' in perf_level:
|
||||
return 'stable_std'
|
||||
elif 'STABLE_PEAK' in perf_level:
|
||||
return 'stable_peak'
|
||||
elif 'STABLE_MIN_MCLK' in perf_level:
|
||||
return 'stable_min_mclk'
|
||||
elif 'STABLE_MIN_SCLK' in perf_level:
|
||||
return 'stable_min_sclk'
|
||||
return perf_level
|
||||
except:
|
||||
return None
|
||||
|
||||
def get_power_cap(processor):
|
||||
"""Get power cap in Watts"""
|
||||
try:
|
||||
import amdsmi
|
||||
power_cap_info = amdsmi.amdsmi_get_power_cap_info(processor)
|
||||
if 'power_cap' in power_cap_info:
|
||||
return power_cap_info['power_cap'] / 1000000.0
|
||||
return None
|
||||
except:
|
||||
return None
|
||||
|
||||
def get_memory_partition(processor):
|
||||
"""Get memory partition"""
|
||||
try:
|
||||
import amdsmi
|
||||
partition = amdsmi.amdsmi_get_gpu_memory_partition(processor)
|
||||
# partition is already a string like 'NPS1', 'NPS2', etc.
|
||||
if isinstance(partition, str):
|
||||
return partition
|
||||
# If it's an enum, convert it
|
||||
partition_names = {
|
||||
amdsmi.AmdSmiMemoryPartitionType.NPS1: 'NPS1',
|
||||
amdsmi.AmdSmiMemoryPartitionType.NPS2: 'NPS2',
|
||||
amdsmi.AmdSmiMemoryPartitionType.NPS4: 'NPS4',
|
||||
amdsmi.AmdSmiMemoryPartitionType.NPS8: 'NPS8',
|
||||
}
|
||||
return partition_names.get(partition, str(partition))
|
||||
except:
|
||||
return 'N/A'
|
||||
|
||||
def get_compute_partition(processor):
|
||||
"""Get compute partition"""
|
||||
try:
|
||||
import amdsmi
|
||||
partition = amdsmi.amdsmi_get_gpu_compute_partition(processor)
|
||||
# partition is already a string like 'SPX', 'DPX', etc.
|
||||
if isinstance(partition, str):
|
||||
return partition
|
||||
# If it's an enum, convert it
|
||||
partition_names = {
|
||||
amdsmi.AmdSmiComputePartitionType.SPX: 'SPX',
|
||||
amdsmi.AmdSmiComputePartitionType.DPX: 'DPX',
|
||||
amdsmi.AmdSmiComputePartitionType.TPX: 'TPX',
|
||||
amdsmi.AmdSmiComputePartitionType.QPX: 'QPX',
|
||||
amdsmi.AmdSmiComputePartitionType.CPX: 'CPX',
|
||||
}
|
||||
return partition_names.get(partition, str(partition))
|
||||
except:
|
||||
return 'N/A'
|
||||
|
||||
def printLogSpacer(displayString=None, fill='=', contentSizeToFit=0):
|
||||
""" Prints formatted spacer line
|
||||
|
||||
:param displayString: String to display in the center
|
||||
:param fill: Character to use for padding
|
||||
:param contentSizeToFit: Width to fit content (overrides appWidth)
|
||||
"""
|
||||
global appWidth, PRINT_JSON
|
||||
resizeValue = appWidth
|
||||
if contentSizeToFit != 0:
|
||||
resizeValue = contentSizeToFit
|
||||
if resizeValue % 2: # if odd -> make even
|
||||
resizeValue += 1
|
||||
|
||||
if not PRINT_JSON:
|
||||
if displayString:
|
||||
if len(displayString) % 2:
|
||||
displayString += fill
|
||||
logSpacer = fill * int((resizeValue - len(displayString)) / 2) + displayString + fill * int(
|
||||
(resizeValue - len(displayString)) / 2)
|
||||
else:
|
||||
logSpacer = fill * resizeValue
|
||||
print(logSpacer)
|
||||
|
||||
def printLog(device, logstr, logname, useItalics=False):
|
||||
""" Print out to the log
|
||||
|
||||
:param device: Device to print (can be None)
|
||||
:param logstr: String to print
|
||||
:param logname: Log name (can be None)
|
||||
:param useItalics: Whether to use italics (not used in this implementation)
|
||||
"""
|
||||
global PRINT_JSON
|
||||
if not PRINT_JSON:
|
||||
print(logstr)
|
||||
|
||||
def showAllConcise(deviceList):
|
||||
"""
|
||||
Equivalent to rocm_smi.showAllConcise()
|
||||
Display critical info for all devices in a concise format
|
||||
"""
|
||||
global PRINT_JSON, headerString, footerString
|
||||
import amdsmi
|
||||
|
||||
if PRINT_JSON:
|
||||
print('NOT_SUPPORTED: Cannot print JSON/CSV output for concise output')
|
||||
sys.exit(1)
|
||||
|
||||
silent = True
|
||||
|
||||
# CRITICAL: Cache BDF first to minimize GPU wake-up
|
||||
bdf_cache = {}
|
||||
for i, processor in enumerate(deviceList):
|
||||
bdf_cache[i] = get_bdf(processor)
|
||||
|
||||
# Detect temperature and power labels
|
||||
available_temp_type = getTemperatureLabel(deviceList)
|
||||
temp_label = f"({available_temp_type.capitalize()})"
|
||||
power_label = getPowerLabel(deviceList)
|
||||
|
||||
# Header and subheader setup
|
||||
header = ['Device', 'Node', 'IDs', '', 'Temp', 'Power', 'Partitions',
|
||||
'SCLK', 'MCLK', 'Fan', 'Perf', 'PwrCap', 'VRAM%', 'GPU%']
|
||||
subheader = ['', '', '(DID,', 'GUID)', temp_label, power_label,
|
||||
'(Mem, Compute, ID)',
|
||||
'', '', '', '', '', '', '']
|
||||
|
||||
# Add additional spaces to match header
|
||||
for idx, item in enumerate(subheader):
|
||||
header_size = len(header[idx])
|
||||
subheader_size = len(subheader[idx])
|
||||
if header_size != subheader_size:
|
||||
numSpacesToFill_subheader = header_size - subheader_size
|
||||
numSpacesToFill_header = subheader_size - header_size
|
||||
if numSpacesToFill_subheader > 0:
|
||||
subheader[idx] = subheader[idx] + (' ' * numSpacesToFill_subheader)
|
||||
if numSpacesToFill_header > 0:
|
||||
header[idx] = header[idx] + (' ' * numSpacesToFill_header)
|
||||
|
||||
head_widths = [len(head) + 2 for head in header]
|
||||
values = {}
|
||||
degree_sign = u'\N{DEGREE SIGN}'
|
||||
|
||||
# Collect data for each device
|
||||
for i, processor in enumerate(deviceList):
|
||||
bdf = bdf_cache.get(i)
|
||||
|
||||
# Read sysfs values FIRST (before other API calls that might wake GPU)
|
||||
sclk = get_clock_freq(processor, amdsmi.AmdSmiClkType.SYS, bdf)
|
||||
sclk_str = f"{sclk:.0f}Mhz" if sclk is not None else "N/A"
|
||||
|
||||
mclk = get_clock_freq(processor, amdsmi.AmdSmiClkType.MEM, bdf)
|
||||
mclk_str = f"{mclk:.0f}Mhz" if mclk is not None else "N/A"
|
||||
|
||||
gpu_usage = get_gpu_usage(processor, bdf)
|
||||
gpu_str = f"{gpu_usage}%" if gpu_usage is not None else "N/A"
|
||||
|
||||
power = get_power(processor, bdf)
|
||||
power_str = f"{power:.1f}W" if power is not None else "N/A"
|
||||
|
||||
# Now get other info (may wake GPU but critical sysfs values already read)
|
||||
node = get_kfd_node_id(processor, bdf)
|
||||
if node is None:
|
||||
node = i
|
||||
|
||||
device_info = get_device_info(processor, bdf)
|
||||
device_id = device_info['device_id']
|
||||
guid_str = str(device_info['guid'])
|
||||
if len(guid_str) > 10:
|
||||
guid_str = guid_str[-5:]
|
||||
|
||||
temp = get_temperature(processor, available_temp_type)
|
||||
temp_val = f"{temp:.1f}{degree_sign}C" if temp is not None else "N/A"
|
||||
|
||||
mem_part = get_memory_partition(processor)
|
||||
comp_part = get_compute_partition(processor)
|
||||
combined_partition_data = f"{mem_part}, {comp_part}, 0"
|
||||
|
||||
fan = get_fan_speed(processor)
|
||||
fan_str = f"{fan}%" if fan is not None else "N/A"
|
||||
|
||||
perf = get_perf_level(processor)
|
||||
perf_str = perf if perf is not None else "N/A"
|
||||
|
||||
pwr_cap = get_power_cap(processor)
|
||||
pwr_cap_str = f"{pwr_cap:.1f}W" if pwr_cap is not None else "N/A"
|
||||
|
||||
vram = get_memory_usage(processor)
|
||||
vram_str = f"{vram:.0f}%" if vram is not None else "N/A"
|
||||
|
||||
# Store values
|
||||
values[f'card{i}'] = [i, node,
|
||||
str(device_id) + ", ",
|
||||
str(guid_str),
|
||||
temp_val, power_str,
|
||||
combined_partition_data,
|
||||
sclk_str, mclk_str, fan_str, perf_str,
|
||||
pwr_cap_str,
|
||||
vram_str,
|
||||
gpu_str]
|
||||
|
||||
# Calculate column widths
|
||||
val_widths = {}
|
||||
for i in range(len(deviceList)):
|
||||
val_widths[i] = [len(str(val)) + 2 for val in values[f'card{i}']]
|
||||
|
||||
max_widths = head_widths
|
||||
for i in range(len(deviceList)):
|
||||
for col in range(len(val_widths[i])):
|
||||
max_widths[col] = max(max_widths[col], val_widths[i][col])
|
||||
|
||||
# Display concise info
|
||||
header_output = "".join(word.ljust(max_widths[col]) for col, word in zip(range(len(max_widths)), header))
|
||||
subheader_output = "".join(word.ljust(max_widths[col]) for col, word in zip(range(len(max_widths)), subheader))
|
||||
|
||||
printLogSpacer(headerString, contentSizeToFit=len(header_output))
|
||||
printLogSpacer(' Concise Info ', contentSizeToFit=len(header_output))
|
||||
printLog(None, header_output, None)
|
||||
printLog(None, subheader_output, None, useItalics=True)
|
||||
printLogSpacer(fill='=', contentSizeToFit=len(header_output))
|
||||
|
||||
for i in range(len(deviceList)):
|
||||
printLog(None, "".join(str(word).ljust(max_widths[col]) for col, word in
|
||||
zip(range(len(max_widths)), values[f'card{i}'])), None)
|
||||
|
||||
printLogSpacer(footerString, contentSizeToFit=len(header_output))
|
||||
|
||||
def main():
|
||||
"""
|
||||
Main function - equivalent to rocm_smi.py main execution
|
||||
"""
|
||||
global RETCODE
|
||||
|
||||
# Import amdsmi
|
||||
try:
|
||||
import amdsmi
|
||||
except ImportError:
|
||||
print("ERROR: Could not import amdsmi module")
|
||||
print("Install with: pip install amdsmi")
|
||||
return 1
|
||||
|
||||
# Check if driver is initialized
|
||||
if not driverInitialized():
|
||||
logging.error('Unable to detect amdgpu driver. Exiting...')
|
||||
return 1
|
||||
|
||||
# Initialize AMD SMI (equivalent to rocm_smi.initializeRsmi())
|
||||
if not initializeRsmi():
|
||||
logging.error("Failed to initialize AMD SMI")
|
||||
return 1
|
||||
|
||||
try:
|
||||
# Get processor handles (equivalent to rocm_smi.listDevices())
|
||||
deviceList = listDevices()
|
||||
|
||||
if not deviceList:
|
||||
logging.error("No AMD GPU devices found")
|
||||
return 1
|
||||
|
||||
# Check AMD GPUs (equivalent to rocm_smi.checkAmdGpus())
|
||||
if not checkAmdGpus(deviceList):
|
||||
logging.warning('No AMD GPUs specified')
|
||||
|
||||
# Check runtime status (equivalent to rocm_smi.check_runtime_status())
|
||||
if not check_runtime_status():
|
||||
print("\nWARNING: AMD GPU device(s) is/are in a low-power state. Check power control/runtime_status\n")
|
||||
|
||||
# Call showAllConcise (equivalent to rocm_smi.showAllConcise(deviceList))
|
||||
showAllConcise(deviceList)
|
||||
|
||||
finally:
|
||||
# Shutdown AMD SMI
|
||||
try:
|
||||
amdsmi.amdsmi_shut_down()
|
||||
except:
|
||||
pass
|
||||
|
||||
return RETCODE
|
||||
|
||||
if __name__ == '__main__':
|
||||
sys.exit(main())
|
||||
새 이슈에서 참조
사용자 차단