From 4b81912baaf8a377f76db6dab58c1fd9f2f7887e Mon Sep 17 00:00:00 2001 From: Sumanth Gavini Date: Mon, 26 Jan 2026 14:37:46 -0600 Subject: [PATCH] [SWDEV-568642] amd-smi: Add amd-smi --rocm-smi for rocm-smi compatibility mode (#2363) Implement new --rocm-smi flag that displays GPU information in ROCm-SMI compatible format Signed-off-by: Sumanth Gavini --- projects/amdsmi/amdsmi_cli/CMakeLists.txt | 3 + projects/amdsmi/amdsmi_cli/amdsmi_cli.py | 8 +- projects/amdsmi/amdsmi_cli/amdsmi_commands.py | 59 ++ projects/amdsmi/amdsmi_cli/amdsmi_parser.py | 13 +- .../amdsmi_cli/amdsmi_rocm_smi_compat.py | 941 ++++++++++++++++++ 5 files changed, 1022 insertions(+), 2 deletions(-) create mode 100755 projects/amdsmi/amdsmi_cli/amdsmi_rocm_smi_compat.py diff --git a/projects/amdsmi/amdsmi_cli/CMakeLists.txt b/projects/amdsmi/amdsmi_cli/CMakeLists.txt index d9728176c2..df4b8e335b 100644 --- a/projects/amdsmi/amdsmi_cli/CMakeLists.txt +++ b/projects/amdsmi/amdsmi_cli/CMakeLists.txt @@ -19,6 +19,7 @@ add_custom_command( ${PY_PACKAGE_DIR}/amdsmi_logger.py ${PY_PACKAGE_DIR}/amdsmi_parser.py ${PY_PACKAGE_DIR}/amdsmi_cli_exceptions.py + ${PY_PACKAGE_DIR}/amdsmi_rocm_smi_compat.py ${PY_PACKAGE_DIR}/BDF.py ${PY_PACKAGE_DIR}/README.md ${PY_PACKAGE_DIR}/Release_Notes.md @@ -32,6 +33,7 @@ add_custom_command( COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_CURRENT_SOURCE_DIR}/amdsmi_logger.py ${PY_PACKAGE_DIR}/ COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_CURRENT_SOURCE_DIR}/amdsmi_parser.py ${PY_PACKAGE_DIR}/ COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_CURRENT_SOURCE_DIR}/amdsmi_cli_exceptions.py ${PY_PACKAGE_DIR}/ + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_CURRENT_SOURCE_DIR}/amdsmi_rocm_smi_compat.py ${PY_PACKAGE_DIR}/ COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_CURRENT_SOURCE_DIR}/BDF.py ${PY_PACKAGE_DIR}/ COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_CURRENT_SOURCE_DIR}/README.md ${PY_PACKAGE_DIR}/ COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_CURRENT_SOURCE_DIR}/Release_Notes.md ${PY_PACKAGE_DIR}/) @@ -49,6 +51,7 @@ add_custom_target( ${PY_PACKAGE_DIR}/amdsmi_logger.py ${PY_PACKAGE_DIR}/amdsmi_parser.py ${PY_PACKAGE_DIR}/amdsmi_cli_exceptions.py + ${PY_PACKAGE_DIR}/amdsmi_rocm_smi_compat.py ${PY_PACKAGE_DIR}/BDF.py ${PY_PACKAGE_DIR}/README.md ${PY_PACKAGE_DIR}/Release_Notes.md) diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_cli.py b/projects/amdsmi/amdsmi_cli/amdsmi_cli.py index d7fcf198af..b7ec6b5efb 100755 --- a/projects/amdsmi/amdsmi_cli/amdsmi_cli.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_cli.py @@ -123,6 +123,11 @@ def configure_logging_and_execute(args, amd_smi_commands): logging.debug(args) + # Check if --rocm-smi flag is set (top-level flag, not a subcommand) + if hasattr(args, 'rocm_smi') and args.rocm_smi: + amd_smi_commands.rocm_smi(args) + return + # Execute subcommands try: args.func(args) @@ -160,6 +165,7 @@ if __name__ == "__main__": amd_smi_commands.partition, amd_smi_commands.ras, amd_smi_commands.node, + amd_smi_commands.rocm_smi, amd_smi_commands.default, sys_argv=sys.argv, helpers=amd_smi_helpers) @@ -170,7 +176,7 @@ if __name__ == "__main__": # Store possible subcommands & aliases for later errors valid_commands = amd_smi_parser.possible_commands - valid_commands += ['--help', '-h'] + valid_commands += ['--help', '-h', '--rocm-smi'] # Convert arguments to lowercase, but preserve case for folder path values processed_argv = [] diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py index ee822e16c6..e1a2222362 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py @@ -7779,3 +7779,62 @@ class AMDSMICommands(): print(e) listener.stop() + + + def rocm_smi(self, args): + """ + Display GPU information in ROCm-SMI compatible format (showAllConcise). + This provides a drop-in replacement for rocm-smi --showallconcise using amdsmi backend. + + Args: + args: Parsed arguments (unused for this command) + """ + try: + # Import the ROCm-SMI compatible functions from the compatibility module + import sys + import os + # Add the current directory to path if needed + current_dir = os.path.dirname(os.path.abspath(__file__)) + if current_dir not in sys.path: + sys.path.insert(0, current_dir) + + import amdsmi_rocm_smi_compat + showAllConcise = amdsmi_rocm_smi_compat.showAllConcise + listDevices = amdsmi_rocm_smi_compat.listDevices + initializeRsmi = amdsmi_rocm_smi_compat.initializeRsmi + check_runtime_status = amdsmi_rocm_smi_compat.check_runtime_status + + # Initialize AMD SMI + if not initializeRsmi(): + logging.error("Failed to initialize AMD SMI") + return + + try: + # Get processor handles + deviceList = listDevices() + + if not deviceList: + logging.error("No AMD GPU devices found") + return + + # Check runtime status (low power state warning) + if not check_runtime_status(): + print("\nWARNING: AMD GPU device(s) is/are in a low-power state. Check power control/runtime_status\n") + + # Display ROCm-SMI compatible output + showAllConcise(deviceList) + + finally: + # Shutdown AMD SMI + try: + amdsmi_interface.amdsmi_shut_down() + except: + pass + + except ImportError as e: + logging.error(f"Could not import ROCm-SMI compatibility module: {e}") + logging.error("Make sure amdsmi_rocm_smi_compat.py is in the amdsmi_cli directory") + print("ERROR: ROCm-SMI compatibility mode not available") + except Exception as e: + logging.error(f"Error in ROCm-SMI compatibility mode: {e}") + print(f"ERROR: {e}") diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_parser.py b/projects/amdsmi/amdsmi_cli/amdsmi_parser.py index 2097faffc2..e345999c6e 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_parser.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_parser.py @@ -70,7 +70,7 @@ class AMDSMIParser(argparse.ArgumentParser): """ def __init__(self, version, list, static, firmware, bad_pages, metric, process, profile, event, topology, set_value, reset, monitor, - xgmi, partition, ras, node, default, sys_argv=None, + xgmi, partition, ras, node, rocm_smi, default, sys_argv=None, helpers=None): # Helper variables @@ -113,6 +113,17 @@ class AMDSMIParser(argparse.ArgumentParser): add_help=True, prog=self.program_name) + # Add top-level --rocm-smi flag + self.add_argument('--rocm-smi', action='store_true', + help='Display GPU information in ROCm-SMI compatible format') + + # Add top-level command modifiers (for --rocm-smi and other top-level flags) + loglevel_choices = ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"] + loglevel_help = f"Set the logging level from the possible choices: {', '.join(loglevel_choices)}" + self.add_argument('--loglevel', action='store', type=str.upper, required=False, + help=loglevel_help, default='ERROR', metavar='LEVEL', + choices=loglevel_choices) + # Setup subparsers self.subparsers = self.add_subparsers( title="AMD-SMI Commands", diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_rocm_smi_compat.py b/projects/amdsmi/amdsmi_cli/amdsmi_rocm_smi_compat.py new file mode 100755 index 0000000000..9bb845d9fa --- /dev/null +++ b/projects/amdsmi/amdsmi_cli/amdsmi_rocm_smi_compat.py @@ -0,0 +1,941 @@ +#!/usr/bin/env python3 +"""ROCm_SMI_LIB CLI Tool - AMD-SMI Backend + +This tool acts as a command line interface for manipulating +and monitoring the amdgpu kernel, using the AMD-SMI library +instead of the rocm_smi_lib API. +This tool uses the amdsmi Python package to call the AMD SMI library. +Recommended: At least one AMD GPU with ROCm driver installed +Required: AMD SMI library installed (pip install amdsmi) +""" + +from __future__ import print_function +import sys +import os +import logging + +# Version information +SMI_MAJ = 4 +SMI_MIN = 0 +SMI_PAT = 0 +SMI_HASH = 'amdsmi' +__version__ = '%s.%s.%s+%s' % (SMI_MAJ, SMI_MIN, SMI_PAT, SMI_HASH) + +# Set to 1 if an error occurs +RETCODE = 0 + +# If we want JSON format output instead +PRINT_JSON = False +PRINT_CSV = False + +# Output formatting +headerString = ' ROCm System Management Interface ' +footerString = ' End of ROCm SMI Log ' +appWidth = 106 + +# DRM ioctl calculation helpers +# Reference: include/uapi/asm-generic/ioctl.h from Linux kernel +# https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/uapi/asm-generic/ioctl.h +_IOC_NRBITS = 8 # Number of bits for command number +_IOC_TYPEBITS = 8 # Number of bits for device type +_IOC_SIZEBITS = 14 # Number of bits for data size +_IOC_DIRBITS = 2 # Number of bits for direction + +_IOC_NRSHIFT = 0 +_IOC_TYPESHIFT = _IOC_NRSHIFT + _IOC_NRBITS +_IOC_SIZESHIFT = _IOC_TYPESHIFT + _IOC_TYPEBITS +_IOC_DIRSHIFT = _IOC_SIZESHIFT + _IOC_SIZEBITS + +_IOC_WRITE = 1 # ioctl direction: write (userland -> kernel) + +def _IOC(dir, type, nr, size): + """Create ioctl command number + Reference: _IOC() macro from include/uapi/asm-generic/ioctl.h + """ + return (dir << _IOC_DIRSHIFT) | (type << _IOC_TYPESHIFT) | (nr << _IOC_NRSHIFT) | (size << _IOC_SIZESHIFT) + +def _IOW(type, nr, size): + """Create write ioctl command number + Reference: _IOW() macro from include/uapi/asm-generic/ioctl.h + """ + return _IOC(_IOC_WRITE, type, nr, size) + +# DRM and AMDGPU ioctl constants +# Reference: include/uapi/drm/drm.h and include/uapi/drm/amdgpu_drm.h from Linux kernel +# https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/uapi/drm/drm.h +# https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/uapi/drm/amdgpu_drm.h +DRM_IOCTL_BASE = ord('d') # 0x64 - DRM ioctl type character (from drm.h) +DRM_COMMAND_BASE = 0x40 # Base for driver-specific commands (from drm.h) +DRM_AMDGPU_INFO = 0x05 # AMDGPU info command number (from amdgpu_drm.h, line 51) +# Size of struct drm_amdgpu_info - we use minimal size for the ioctl call +# Reference: struct drm_amdgpu_info in include/uapi/drm/amdgpu_drm.h +DRM_AMDGPU_INFO_SIZE = 32 + +# Calculate DRM_IOCTL_AMDGPU_INFO +# Reference: Line 68 in amdgpu_drm.h: +# #define DRM_IOCTL_AMDGPU_INFO DRM_IOW(DRM_COMMAND_BASE + DRM_AMDGPU_INFO, struct drm_amdgpu_info) +DRM_IOCTL_AMDGPU_INFO = _IOW(DRM_IOCTL_BASE, DRM_COMMAND_BASE + DRM_AMDGPU_INFO, DRM_AMDGPU_INFO_SIZE) + +# Setup logging +logging.basicConfig(level=logging.WARNING, format='%(levelname)s: %(message)s') + +def driverInitialized(): + """ Returns true if amdgpu is found in the list of initialized modules + """ + driverInitialized = False + if os.path.exists("/sys/module/amdgpu"): + if os.path.exists("/sys/module/amdgpu/initstate"): + # amdgpu is loadable module + with open("/sys/module/amdgpu/initstate") as initstate: + if 'live' in initstate.read(): + driverInitialized = True + else: + # amdgpu is built into the kernel + driverInitialized = True + return driverInitialized + +def find_kfd_topology_path(): + """Find KFD topology path dynamically""" + possible_paths = [ + "/sys/class/kfd/kfd/topology/nodes", + "/sys/devices/virtual/kfd/kfd/topology/nodes" + ] + for path in possible_paths: + if os.path.exists(path): + return path + return None + +def find_drm_path(): + """Find DRM class path dynamically""" + possible_paths = [ + "/sys/class/drm", + "/sys/devices/virtual/drm" + ] + for path in possible_paths: + if os.path.exists(path): + return path + return None + +def initializeRsmi(): + """Initialize AMD SMI library - equivalent to rocm_smi.initializeRsmi()""" + try: + import amdsmi + amdsmi.amdsmi_init() + return True + except Exception as e: + logging.error(f"Failed to initialize AMD SMI: {e}") + return False + +def listDevices(): + """Get list of GPU devices - equivalent to rocm_smi.listDevices()""" + try: + import amdsmi + processors = amdsmi.amdsmi_get_processor_handles() + return processors + except Exception as e: + logging.error(f"Failed to get processor handles: {e}") + return [] + +def checkAmdGpus(deviceList): + """Check if AMD GPUs are present - equivalent to rocm_smi.checkAmdGpus()""" + if not deviceList: + logging.warning('No AMD GPUs specified') + return False + return True + +def check_runtime_pm_status(card_path): + """ + Check if a specific GPU card is in runtime PM suspended state. + Equivalent to rocm_smi.check_runtime_pm_status() + + Args: + card_path: Path to the card (e.g., /sys/class/drm/card0) + + Returns: + bool: True if suspended, False if active or runtime PM disabled + """ + try: + runtime_status = os.path.join(card_path, "device/power/runtime_status") + runtime_enabled = os.path.join(card_path, "device/power/runtime_enabled") + + # Check if runtime PM is enabled + if os.path.exists(runtime_enabled): + with open(runtime_enabled) as f: + enabled = f.read().strip() + if "disabled" in enabled or "forbidden" in enabled: + return False + + # Check runtime status + if os.path.exists(runtime_status): + with open(runtime_status) as f: + status = f.read().strip() + return "suspended" in status + + return False + except: + return False + +def wake_device(card_path): + """ + Wake a GPU device from runtime PM suspended state. + Equivalent to rocm_smi.wake_device() + + Uses DRM_IOCTL_AMDGPU_INFO ioctl to wake the device from runtime suspend. + + Args: + card_path: Path to the card (e.g., /sys/class/drm/card0) + + Returns: + bool: True if device was woken or already active, False on error + """ + try: + import re + import fcntl + import struct + import errno + + # Extract card number from card_path (e.g., card0 -> 0) + card_name = os.path.basename(card_path) + card_match = re.match(r'card(\d+)', card_name) + if not card_match: + return False + + # Find renderD device in the same DRM directory + drm_path = os.path.dirname(card_path) + render_name = None + + # Look for renderD* in the drm directory + for entry in os.listdir(drm_path): + if re.match(r'renderD\d+', entry): + # Check if this renderD is associated with our card + # by verifying they point to the same device + entry_link = os.path.join(drm_path, entry) + card_link = card_path + + try: + entry_real = os.path.realpath(entry_link) + card_real = os.path.realpath(card_link) + + # They should share the same parent device directory + if os.path.dirname(entry_real) == os.path.dirname(card_real): + render_name = entry + break + except: + continue + + if not render_name: + return False + + render_path = f"/dev/dri/{render_name}" + + # Open the DRM device and send ioctl to wake it + if os.path.exists(render_path): + try: + fd = os.open(render_path, os.O_RDWR | os.O_CLOEXEC) + + # Create empty drm_amdgpu_info struct (all zeros) + request = struct.pack('I' * 8, *([0] * 8)) # 32 bytes of zeros + + # Send DRM_IOCTL_AMDGPU_INFO ioctl to wake the device + try: + fcntl.ioctl(fd, DRM_IOCTL_AMDGPU_INFO, request) + except OSError as e: + # EINVAL is acceptable - device may already be awake + if e.errno != errno.EINVAL: + os.close(fd) + return False + + os.close(fd) + return True + except OSError: + return False + + return False + except: + return False + +def check_runtime_status(): + """ + Check if any GPUs are in low power state and wake them if needed. + Equivalent to rocm_smi.check_runtime_status() + + Returns: + bool: True if all devices are active, False if any are suspended + """ + try: + drm_path = find_drm_path() + if not drm_path: + return True + + all_active = True + for card in os.listdir(drm_path): + if card.startswith("card") and "-" not in card: + card_path = os.path.join(drm_path, card) + + # Check if this card is suspended + is_suspended = check_runtime_pm_status(card_path) + if is_suspended: + all_active = False + # Try to wake the device + wake_device(card_path) + + return all_active + except: + return True + +def get_bdf(processor): + """Get BDF (Bus:Device.Function)""" + try: + import amdsmi + bdf = amdsmi.amdsmi_get_gpu_device_bdf(processor) + return bdf + except: + return None + +def bdf_to_location_id(bdf): + """Convert BDF string to location_id integer used by KFD + BDF format: "0000:44:00.0" -> location_id calculation + """ + try: + # Parse BDF: domain:bus:device.function + parts = bdf.replace(':', ' ').replace('.', ' ').split() + if len(parts) >= 4: + domain = int(parts[0], 16) + bus = int(parts[1], 16) + device = int(parts[2], 16) + function = int(parts[3], 16) + + # KFD location_id = (bus << 8) | (device << 3) | function + location_id = (bus << 8) | (device << 3) | function + return location_id + except: + pass + return None + +def get_device_info(processor, bdf=None): + """Get device ID and GPU ID (GUID) from KFD topology""" + try: + import amdsmi + asic_info = amdsmi.amdsmi_get_gpu_asic_info(processor) + device_id = asic_info.get('device_id', 'N/A') + + # Get GPU ID from KFD topology by matching location_id + kfd_path = find_kfd_topology_path() + gpu_id = 'N/A' + + if kfd_path and os.path.exists(kfd_path) and bdf: + location_id = bdf_to_location_id(bdf) + if location_id is not None: + # Look through KFD nodes to find matching location_id + for node_dir in os.listdir(kfd_path): + props_file = os.path.join(kfd_path, node_dir, "properties") + if os.path.exists(props_file): + with open(props_file) as f: + props_dict = {} + for line in f: + if ' ' in line: + parts = line.strip().split(None, 1) + if len(parts) == 2: + props_dict[parts[0]] = parts[1] + + # Match by location_id + if props_dict.get('location_id') == str(location_id): + gpu_id_file = os.path.join(kfd_path, node_dir, "gpu_id") + if os.path.exists(gpu_id_file): + with open(gpu_id_file) as gf: + gpu_id = gf.read().strip() + break + + return { + 'device_id': device_id, + 'guid': gpu_id + } + except Exception as e: + return {'device_id': 'N/A', 'guid': 'N/A'} + +def get_kfd_node_id(processor, bdf=None): + """Get KFD node ID by matching BDF to location_id in topology""" + try: + import amdsmi + + if not bdf: + return amdsmi.amdsmi_topo_get_numa_node_number(processor) + + kfd_path = find_kfd_topology_path() + if kfd_path and os.path.exists(kfd_path): + location_id = bdf_to_location_id(bdf) + if location_id is not None: + # Look through KFD nodes to find matching location_id + for node_dir in os.listdir(kfd_path): + props_file = os.path.join(kfd_path, node_dir, "properties") + if os.path.exists(props_file): + with open(props_file) as f: + props_dict = {} + for line in f: + if ' ' in line: + parts = line.strip().split(None, 1) + if len(parts) == 2: + props_dict[parts[0]] = parts[1] + + # Match by location_id + if props_dict.get('location_id') == str(location_id): + return int(node_dir) + + # Fallback to NUMA node + return amdsmi.amdsmi_topo_get_numa_node_number(processor) + except: + return None + +def get_temperature(processor, temp_type='edge'): + """Get GPU temperature""" + try: + import amdsmi + # Map temp_type string to enum + temp_type_map = { + 'edge': amdsmi.AmdSmiTemperatureType.EDGE, + 'junction': amdsmi.AmdSmiTemperatureType.HOTSPOT, + 'hotspot': amdsmi.AmdSmiTemperatureType.HOTSPOT, + 'memory': amdsmi.AmdSmiTemperatureType.VRAM, + 'vram': amdsmi.AmdSmiTemperatureType.VRAM, + } + + temp_enum = temp_type_map.get(temp_type.lower(), amdsmi.AmdSmiTemperatureType.EDGE) + + temp = amdsmi.amdsmi_get_temp_metric( + processor, + temp_enum, + amdsmi.AmdSmiTemperatureMetric.CURRENT + ) + return temp + except: + # Try other temperature types + try: + for ttype in [amdsmi.AmdSmiTemperatureType.HOTSPOT, + amdsmi.AmdSmiTemperatureType.EDGE, + amdsmi.AmdSmiTemperatureType.VRAM]: + try: + temp = amdsmi.amdsmi_get_temp_metric( + processor, + ttype, + amdsmi.AmdSmiTemperatureMetric.CURRENT + ) + return temp + except: + continue + except: + pass + return None + +def getTemperatureLabel(deviceList): + """Discover the first available temperature label - matches rocm-smi priority""" + import amdsmi + if not deviceList: + return 'edge' + + # ROCm-SMI always prefers edge temperature first, then junction, then memory + # This matches the original rocm-smi behavior + processor = deviceList[0] + temp_types = [ + ('edge', amdsmi.AmdSmiTemperatureType.EDGE), + ('junction', amdsmi.AmdSmiTemperatureType.HOTSPOT), + ('memory', amdsmi.AmdSmiTemperatureType.VRAM), + ] + + for label, temp_type in temp_types: + try: + temp = amdsmi.amdsmi_get_temp_metric( + processor, + temp_type, + amdsmi.AmdSmiTemperatureMetric.CURRENT + ) + if temp is not None and temp > 0: + return label + except: + continue + + return 'edge' + +def getPowerLabel(deviceList): + """Discover the first available power label - matches rocm-smi output""" + import amdsmi + if not deviceList: + return '(Avg)' + + # ROCm-SMI always uses "(Avg)" for Average Graphics Package Power + # Return (Avg) to match rocm-smi behavior + return '(Avg)' + +def get_power_from_sysfs(bdf): + """Get power from sysfs without waking GPU""" + try: + if not bdf: + return None + + drm_path = find_drm_path() + if not drm_path: + return None + + for card in os.listdir(drm_path): + if card.startswith("card") and "-" not in card: + card_bdf_path = os.path.join(drm_path, card, "device/uevent") + if os.path.exists(card_bdf_path): + with open(card_bdf_path) as f: + content = f.read() + if bdf in content: + hwmon_path = os.path.join(drm_path, card, "device/hwmon") + if os.path.exists(hwmon_path): + for hwmon in os.listdir(hwmon_path): + power_file = os.path.join(hwmon_path, hwmon, "power1_average") + if os.path.exists(power_file): + with open(power_file) as f: + power_uw = int(f.read().strip()) + return power_uw / 1000000.0 + break + return None + except: + return None + +def get_power(processor, bdf=None): + """Get power consumption""" + try: + import amdsmi + # Try sysfs first (doesn't wake GPU) + power_sysfs = get_power_from_sysfs(bdf) + if power_sysfs is not None: + return power_sysfs + + # Fallback to API + power_dict = amdsmi.amdsmi_get_power_info(processor) + if 'average_socket_power' in power_dict: + power_w = power_dict['average_socket_power'] + if power_w != 'N/A' and isinstance(power_w, (int, float)): + return float(power_w) + if 'socket_power' in power_dict: + power_w = power_dict['socket_power'] + if power_w != 'N/A' and isinstance(power_w, (int, float)): + return float(power_w) + return None + except: + return None + +def get_clock_freq_from_sysfs(bdf, clk_type): + """Get clock frequency from sysfs without waking GPU""" + try: + import amdsmi + if not bdf: + return None + + clk_file_map = { + amdsmi.AmdSmiClkType.SYS: 'pp_dpm_sclk', + amdsmi.AmdSmiClkType.MEM: 'pp_dpm_mclk', + } + + if clk_type not in clk_file_map: + return None + + drm_path = find_drm_path() + if not drm_path: + return None + + for card in os.listdir(drm_path): + if card.startswith("card") and "-" not in card: + card_bdf_path = os.path.join(drm_path, card, "device/uevent") + if os.path.exists(card_bdf_path): + with open(card_bdf_path) as f: + content = f.read() + if bdf in content: + dpm_file = os.path.join(drm_path, card, f"device/{clk_file_map[clk_type]}") + if os.path.exists(dpm_file): + with open(dpm_file) as f: + for line in f: + if '*' in line: + parts = line.split(':') + if len(parts) >= 2: + freq_str = parts[1].replace('*', '').strip().replace('Mhz', '').replace('MHz', '') + return float(freq_str) + break + return None + except: + return None + +def get_clock_freq(processor, clk_type, bdf=None): + """Get clock frequency""" + try: + import amdsmi + # Try sysfs first (doesn't wake GPU) + freq_sysfs = get_clock_freq_from_sysfs(bdf, clk_type) + if freq_sysfs is not None: + return freq_sysfs + + # Fallback to API + freq_dict = amdsmi.amdsmi_get_clk_freq(processor, clk_type) + if 'current' in freq_dict and 'frequency' in freq_dict: + current_idx = freq_dict['current'] + frequencies = freq_dict['frequency'] + if current_idx < len(frequencies): + freq_hz = frequencies[current_idx] + return freq_hz / 1000000 + return None + except: + return None + +def get_gpu_usage_from_sysfs(bdf): + """Get GPU usage from sysfs without waking GPU""" + try: + if not bdf: + return None + + drm_path = find_drm_path() + if not drm_path: + return None + + for card in os.listdir(drm_path): + if card.startswith("card") and "-" not in card: + card_bdf_path = os.path.join(drm_path, card, "device/uevent") + if os.path.exists(card_bdf_path): + with open(card_bdf_path) as f: + content = f.read() + if bdf in content: + busy_file = os.path.join(drm_path, card, "device/gpu_busy_percent") + if os.path.exists(busy_file): + with open(busy_file) as f: + return int(f.read().strip()) + break + return None + except: + return None + +def get_gpu_usage(processor, bdf=None): + """Get GPU utilization percentage""" + try: + import amdsmi + # Try sysfs first (doesn't wake GPU) + usage_sysfs = get_gpu_usage_from_sysfs(bdf) + if usage_sysfs is not None: + return usage_sysfs + + # Fallback to API + usage = amdsmi.amdsmi_get_gpu_busy_percent(processor) + return usage + except: + return None + +def get_memory_usage(processor): + """Get VRAM usage percentage""" + try: + import amdsmi + vram_used = amdsmi.amdsmi_get_gpu_memory_usage(processor, amdsmi.AmdSmiMemoryType.VRAM) + vram_total = amdsmi.amdsmi_get_gpu_memory_total(processor, amdsmi.AmdSmiMemoryType.VRAM) + if vram_total > 0: + return (vram_used / vram_total) * 100 + return None + except: + return None + +def get_fan_speed(processor): + """Get fan speed percentage""" + try: + import amdsmi + fan_speed = amdsmi.amdsmi_get_gpu_fan_speed(processor, 0) + return fan_speed + except: + # Fan not supported (e.g., OAM modules) - return 0 + return 0 + +def get_perf_level(processor): + """Get performance level""" + try: + import amdsmi + perf_level = amdsmi.amdsmi_get_gpu_perf_level(processor) + if isinstance(perf_level, str): + if 'AUTO' in perf_level: + return 'auto' + elif 'LOW' in perf_level: + return 'low' + elif 'HIGH' in perf_level: + return 'high' + elif 'MANUAL' in perf_level: + return 'manual' + elif 'STABLE_STD' in perf_level: + return 'stable_std' + elif 'STABLE_PEAK' in perf_level: + return 'stable_peak' + elif 'STABLE_MIN_MCLK' in perf_level: + return 'stable_min_mclk' + elif 'STABLE_MIN_SCLK' in perf_level: + return 'stable_min_sclk' + return perf_level + except: + return None + +def get_power_cap(processor): + """Get power cap in Watts""" + try: + import amdsmi + power_cap_info = amdsmi.amdsmi_get_power_cap_info(processor) + if 'power_cap' in power_cap_info: + return power_cap_info['power_cap'] / 1000000.0 + return None + except: + return None + +def get_memory_partition(processor): + """Get memory partition""" + try: + import amdsmi + partition = amdsmi.amdsmi_get_gpu_memory_partition(processor) + # partition is already a string like 'NPS1', 'NPS2', etc. + if isinstance(partition, str): + return partition + # If it's an enum, convert it + partition_names = { + amdsmi.AmdSmiMemoryPartitionType.NPS1: 'NPS1', + amdsmi.AmdSmiMemoryPartitionType.NPS2: 'NPS2', + amdsmi.AmdSmiMemoryPartitionType.NPS4: 'NPS4', + amdsmi.AmdSmiMemoryPartitionType.NPS8: 'NPS8', + } + return partition_names.get(partition, str(partition)) + except: + return 'N/A' + +def get_compute_partition(processor): + """Get compute partition""" + try: + import amdsmi + partition = amdsmi.amdsmi_get_gpu_compute_partition(processor) + # partition is already a string like 'SPX', 'DPX', etc. + if isinstance(partition, str): + return partition + # If it's an enum, convert it + partition_names = { + amdsmi.AmdSmiComputePartitionType.SPX: 'SPX', + amdsmi.AmdSmiComputePartitionType.DPX: 'DPX', + amdsmi.AmdSmiComputePartitionType.TPX: 'TPX', + amdsmi.AmdSmiComputePartitionType.QPX: 'QPX', + amdsmi.AmdSmiComputePartitionType.CPX: 'CPX', + } + return partition_names.get(partition, str(partition)) + except: + return 'N/A' + +def printLogSpacer(displayString=None, fill='=', contentSizeToFit=0): + """ Prints formatted spacer line + + :param displayString: String to display in the center + :param fill: Character to use for padding + :param contentSizeToFit: Width to fit content (overrides appWidth) + """ + global appWidth, PRINT_JSON + resizeValue = appWidth + if contentSizeToFit != 0: + resizeValue = contentSizeToFit + if resizeValue % 2: # if odd -> make even + resizeValue += 1 + + if not PRINT_JSON: + if displayString: + if len(displayString) % 2: + displayString += fill + logSpacer = fill * int((resizeValue - len(displayString)) / 2) + displayString + fill * int( + (resizeValue - len(displayString)) / 2) + else: + logSpacer = fill * resizeValue + print(logSpacer) + +def printLog(device, logstr, logname, useItalics=False): + """ Print out to the log + + :param device: Device to print (can be None) + :param logstr: String to print + :param logname: Log name (can be None) + :param useItalics: Whether to use italics (not used in this implementation) + """ + global PRINT_JSON + if not PRINT_JSON: + print(logstr) + +def showAllConcise(deviceList): + """ + Equivalent to rocm_smi.showAllConcise() + Display critical info for all devices in a concise format + """ + global PRINT_JSON, headerString, footerString + import amdsmi + + if PRINT_JSON: + print('NOT_SUPPORTED: Cannot print JSON/CSV output for concise output') + sys.exit(1) + + silent = True + + # CRITICAL: Cache BDF first to minimize GPU wake-up + bdf_cache = {} + for i, processor in enumerate(deviceList): + bdf_cache[i] = get_bdf(processor) + + # Detect temperature and power labels + available_temp_type = getTemperatureLabel(deviceList) + temp_label = f"({available_temp_type.capitalize()})" + power_label = getPowerLabel(deviceList) + + # Header and subheader setup + header = ['Device', 'Node', 'IDs', '', 'Temp', 'Power', 'Partitions', + 'SCLK', 'MCLK', 'Fan', 'Perf', 'PwrCap', 'VRAM%', 'GPU%'] + subheader = ['', '', '(DID,', 'GUID)', temp_label, power_label, + '(Mem, Compute, ID)', + '', '', '', '', '', '', ''] + + # Add additional spaces to match header + for idx, item in enumerate(subheader): + header_size = len(header[idx]) + subheader_size = len(subheader[idx]) + if header_size != subheader_size: + numSpacesToFill_subheader = header_size - subheader_size + numSpacesToFill_header = subheader_size - header_size + if numSpacesToFill_subheader > 0: + subheader[idx] = subheader[idx] + (' ' * numSpacesToFill_subheader) + if numSpacesToFill_header > 0: + header[idx] = header[idx] + (' ' * numSpacesToFill_header) + + head_widths = [len(head) + 2 for head in header] + values = {} + degree_sign = u'\N{DEGREE SIGN}' + + # Collect data for each device + for i, processor in enumerate(deviceList): + bdf = bdf_cache.get(i) + + # Read sysfs values FIRST (before other API calls that might wake GPU) + sclk = get_clock_freq(processor, amdsmi.AmdSmiClkType.SYS, bdf) + sclk_str = f"{sclk:.0f}Mhz" if sclk is not None else "N/A" + + mclk = get_clock_freq(processor, amdsmi.AmdSmiClkType.MEM, bdf) + mclk_str = f"{mclk:.0f}Mhz" if mclk is not None else "N/A" + + gpu_usage = get_gpu_usage(processor, bdf) + gpu_str = f"{gpu_usage}%" if gpu_usage is not None else "N/A" + + power = get_power(processor, bdf) + power_str = f"{power:.1f}W" if power is not None else "N/A" + + # Now get other info (may wake GPU but critical sysfs values already read) + node = get_kfd_node_id(processor, bdf) + if node is None: + node = i + + device_info = get_device_info(processor, bdf) + device_id = device_info['device_id'] + guid_str = str(device_info['guid']) + if len(guid_str) > 10: + guid_str = guid_str[-5:] + + temp = get_temperature(processor, available_temp_type) + temp_val = f"{temp:.1f}{degree_sign}C" if temp is not None else "N/A" + + mem_part = get_memory_partition(processor) + comp_part = get_compute_partition(processor) + combined_partition_data = f"{mem_part}, {comp_part}, 0" + + fan = get_fan_speed(processor) + fan_str = f"{fan}%" if fan is not None else "N/A" + + perf = get_perf_level(processor) + perf_str = perf if perf is not None else "N/A" + + pwr_cap = get_power_cap(processor) + pwr_cap_str = f"{pwr_cap:.1f}W" if pwr_cap is not None else "N/A" + + vram = get_memory_usage(processor) + vram_str = f"{vram:.0f}%" if vram is not None else "N/A" + + # Store values + values[f'card{i}'] = [i, node, + str(device_id) + ", ", + str(guid_str), + temp_val, power_str, + combined_partition_data, + sclk_str, mclk_str, fan_str, perf_str, + pwr_cap_str, + vram_str, + gpu_str] + + # Calculate column widths + val_widths = {} + for i in range(len(deviceList)): + val_widths[i] = [len(str(val)) + 2 for val in values[f'card{i}']] + + max_widths = head_widths + for i in range(len(deviceList)): + for col in range(len(val_widths[i])): + max_widths[col] = max(max_widths[col], val_widths[i][col]) + + # Display concise info + header_output = "".join(word.ljust(max_widths[col]) for col, word in zip(range(len(max_widths)), header)) + subheader_output = "".join(word.ljust(max_widths[col]) for col, word in zip(range(len(max_widths)), subheader)) + + printLogSpacer(headerString, contentSizeToFit=len(header_output)) + printLogSpacer(' Concise Info ', contentSizeToFit=len(header_output)) + printLog(None, header_output, None) + printLog(None, subheader_output, None, useItalics=True) + printLogSpacer(fill='=', contentSizeToFit=len(header_output)) + + for i in range(len(deviceList)): + printLog(None, "".join(str(word).ljust(max_widths[col]) for col, word in + zip(range(len(max_widths)), values[f'card{i}'])), None) + + printLogSpacer(footerString, contentSizeToFit=len(header_output)) + +def main(): + """ + Main function - equivalent to rocm_smi.py main execution + """ + global RETCODE + + # Import amdsmi + try: + import amdsmi + except ImportError: + print("ERROR: Could not import amdsmi module") + print("Install with: pip install amdsmi") + return 1 + + # Check if driver is initialized + if not driverInitialized(): + logging.error('Unable to detect amdgpu driver. Exiting...') + return 1 + + # Initialize AMD SMI (equivalent to rocm_smi.initializeRsmi()) + if not initializeRsmi(): + logging.error("Failed to initialize AMD SMI") + return 1 + + try: + # Get processor handles (equivalent to rocm_smi.listDevices()) + deviceList = listDevices() + + if not deviceList: + logging.error("No AMD GPU devices found") + return 1 + + # Check AMD GPUs (equivalent to rocm_smi.checkAmdGpus()) + if not checkAmdGpus(deviceList): + logging.warning('No AMD GPUs specified') + + # Check runtime status (equivalent to rocm_smi.check_runtime_status()) + if not check_runtime_status(): + print("\nWARNING: AMD GPU device(s) is/are in a low-power state. Check power control/runtime_status\n") + + # Call showAllConcise (equivalent to rocm_smi.showAllConcise(deviceList)) + showAllConcise(deviceList) + + finally: + # Shutdown AMD SMI + try: + amdsmi.amdsmi_shut_down() + except: + pass + + return RETCODE + +if __name__ == '__main__': + sys.exit(main())