class BDF(object):
    """PCI Bus:Device.Function address that casts and compares pythonically.

    Accepts ``[SSSS:]BB:DD.F`` strings (hexadecimal fields, the segment is
    optional and defaults to 0), the ``repr`` form ``BDF(...)``, or another
    BDF object to copy.

    Attributes:
        segment: PCI segment (domain), 0..0xFFFF
        bus: PCI bus number, 0..0xFF
        device: PCI device number, 0..0x1F
        function: PCI function number, 0..7
        operating_system: value of platform.system() when the object was built
    """

    # (name, inclusive upper bound) of every field, in parse order
    _FIELD_LIMITS = (("Segment", 0xFFFF), ("Bus", 0xFF), ("Device", 0x1F), ("Function", 0x7))

    def __init__(self, bdf):
        """Init a BDF object from a BDF string or by copying another BDF.

        Raises:
            ValueError: the string is not a valid BDF or a field is out of range
        """
        # Tell if this is baremetal vs Virtualization; set on every path so
        # copies carry the attribute as well
        self.operating_system = platform.system()

        if isinstance(bdf, BDF):
            self.segment, self.bus, self.device, self.function = tuple(bdf)
            return

        try:
            fields = self._parse(bdf)
        except ValueError:
            logging.error(f"Invalid string passed: {bdf}")
            raise
        self.segment, self.bus, self.device, self.function = fields

    @classmethod
    def _parse(cls, text):
        """Return (segment, bus, device, function) parsed from text, or raise ValueError."""
        if text.startswith("BDF("):
            text = text.replace('BDF(', '').replace(')', '')

        components = [int(x, 16) for x in re.split('[:.]', text)]
        if len(components) == 3:
            # Segment omitted, default it to 0
            components.insert(0, 0)
        elif len(components) != 4:
            raise ValueError(f"BDF needs 3 or 4 components, got {len(components)}")

        for (name, limit), value in zip(cls._FIELD_LIMITS, components):
            if value < 0:
                raise ValueError(f"BDF {name} can't be negative")
            if value > limit:
                raise ValueError(f"BDF {name} can't be greater than {limit}")
        return tuple(components)

    def __eq__(self, passed_bdf):
        """Overrides the == operator and allows for BDF objects to be compared to BDF strings"""
        # Only accept strings and BDF objects
        if isinstance(passed_bdf, str):
            try:
                other = self._parse(passed_bdf)
            except ValueError:
                # Empty or malformed text can never equal a valid BDF
                return False
        elif isinstance(passed_bdf, BDF):
            other = tuple(passed_bdf)
        else:
            return False
        return tuple(self) == other

    def __ne__(self, passed_bdf):
        """Overrides the != operator and allows for BDF objects to be compared to BDF strings"""
        # Since we overrided the == operator we can use that to make this simple
        return not self == passed_bdf

    def __hash__(self):
        """Hash on the four fields so equal BDF objects can share a set or dict slot.

        Note: a BDF compares equal to its string form but does not hash like it.
        """
        return hash(tuple(self))

    def __add__(self, passed_bdf):
        """Overrides the + operator and allows for concatenation"""
        return str(self) + passed_bdf

    def __radd__(self, passed_bdf):
        """Overrides the + operator and allows for concatenation"""
        return passed_bdf + str(self)

    def __str__(self):
        """Cast BDF object to a string in the canonical SSSS:BB:DD.F form"""
        return "{:04X}:{:02X}:{:02X}.{}".format(self.segment, self.bus, self.device, self.function)

    def __repr__(self):
        """How the BDF object is represented"""
        return f"BDF({self})"

    def __iter__(self):
        """Make the BDF object iterable over its 4 values"""
        yield from (self.segment, self.bus, self.device, self.function)

    def __contains__(self, passed_bdf):
        """Overrides the 'in' comparator: True when passed_bdf names this BDF.

        The argument is normalised through BDF(), so invalid input raises
        ValueError exactly like the constructor does.
        """
        return self == BDF(passed_bdf)
class AMD_SMI_Commands(object):
    """Handlers for every amd-smi subcommand.

    Each method receives the argparse namespace of its subparser and is
    installed as that subparser's ``func`` default. All handlers are
    placeholders that only print a marker line for now.
    """
    # Planned design (not implemented yet):
    #   - __init__ takes an AMD_SMI_Logger; handlers call
    #     logger.store_output(gpu_id, string) to fill an ordered dict
    #   - every handler ends with logger.print_output(args.json, args.csv),
    #     which picks the output format
    #   - check whether __init__ can accept the parsed args so it can drive
    #     the watch/looping functions

    def version(self, args):
        """Print the tool version and the kernel version."""
        kernel_version = 123  # TODO: placeholder, not read from the system yet
        print(f'AMD-SMI version: {__version__} | Kernel version: {kernel_version}')

    def discovery(self, args):
        """Handle the 'discovery' subcommand."""
        print('discovery test')

    def static(self, args):
        """Handle the 'static' subcommand; echoes the asic, bus and driver flags."""
        # This is where the arg handling comes through
        print(args.asic)
        print(args.bus)
        print(args.driver)
        print('static test')

    def firmware(self, args):
        """Handle the 'firmware' subcommand."""
        print('firmware test')

    def bad_pages(self, args):
        """Handle the 'bad_pages' (retired pages) subcommand."""
        print('Bad Pages test')

    def metric(self, args):
        """Handle the 'metric' subcommand."""
        print('Metric test')

    def process(self, args):
        """Handle the 'process' subcommand."""
        print('Process Test')

    def profile(self, args):
        """Handle the 'profile' subcommand."""
        print('Profile test')

    def event(self, args):
        """Handle the 'event' subcommand."""
        print('event test')

    def topology(self, args):
        """Handle the 'topology' subcommand."""
        print('topology test')

    def set_value(self, args):
        """Handle the 'set_value' subcommand."""
        print('set_value test')

    def reset(self, args):
        """Handle the 'reset' subcommand."""
        print('reset test')

    def misc(self, args):
        """Handle the 'misc' subcommand."""
        print('misc test')

    def gpu_v(self, args):
        """Handle the 'gpu_v' subcommand."""
        # Used to print the marker copied from misc(), which made the two
        # handlers indistinguishable in the output
        print('gpu_v test')
class AMD_SMI_Helpers(object):
    """Detects the platform amd-smi runs on and offers small lookup helpers.

    The platform flags are computed once in __init__ and exposed through the
    is_*() accessors, which the argument parser uses to decide which options
    to offer.
    """

    def __init__(self) -> None:
        # implement basic config for debug logs
        self.operating_system = platform.system()
        self._is_hypervisor = False
        self._is_virtual_os = False
        self._is_baremetal = False
        self._is_linux = False
        self._is_windows = False

        self.virtual_operating_systems_product_names = ['KVM', 'VirtualBox', 'VMware']  # @TODO get KVM example

        if self.operating_system.startswith('Linux'):
            self._is_linux = True
            # KVM hypervisor check @TODO
            # An unknown (empty) product name matches nothing, so the system
            # falls through to baremetal below
            product_name = self._read_product_name()
            self._is_virtual_os = product_name.startswith(
                tuple(self.virtual_operating_systems_product_names))
        elif self.operating_system.startswith('VMkernel'):
            self._is_hypervisor = True
        elif self.operating_system.startswith('Window'):
            # Check Powershell for Hyper-V enablement @TODO
            # Get-CimInstance -ClassName Win32_ComputerSystem Manufacturer
            self._is_windows = True

        # Baremetal is deduced from the other two flags, so it has to be
        # computed only after every platform check above has run
        self._is_baremetal = not self._is_hypervisor and not self._is_virtual_os

    @staticmethod
    def _read_product_name():
        """Return the DMI product name, or '' when it cannot be read."""
        try:
            return Path('/sys/class/dmi/id/product_name').read_text().strip()
        except OSError:
            # Missing file (non-DMI system) or no permission
            return ''

    def os_info(self):
        """Return OS info; placeholder that always returns True for now."""
        return True

    def is_virtual_os(self):
        """Return True if the DMI product name matched a known virtual machine."""
        return self._is_virtual_os

    def is_hypervisor(self):
        """Return True if hypervisor is enabled on the system."""
        return self._is_hypervisor

    def is_baremetal(self):
        """Return True if the system is neither a hypervisor nor a virtual OS."""
        return self._is_baremetal

    def is_linux(self):
        """Return True if platform.system() reported Linux."""
        return self._is_linux

    def is_windows(self):
        """Return True if platform.system() reported Windows."""
        return self._is_windows

    def get_gpu_choices(self):
        """Return the selectable GPUs in the format {gpu_index: (BDF, UUID)}.

        Placeholder: returns one hard-coded entry until device enumeration
        is wired in.
        """
        gpu_index = '1'
        gpu_bdf = BDF('0000:23:00.0')
        gpu_uuid = '1234'
        return {gpu_index: (gpu_bdf, gpu_uuid)}

    def get_devices(self):
        """Not implemented yet."""
        pass

    def get_device_from_socket(self):
        """Not implemented yet."""
        pass

    def get_amd_gpu_bdfs(self):
        """Not implemented yet."""
        pass

    def get_amd_cpu_bdfs(self):
        """Not implemented yet."""
        pass
# Using basic python logging for user errors and development
# logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.DEBUG) # Logging for Development
logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.ERROR)  # User level logging

# On initial import set initialized variable
amd_smi_initialized = False


def check_return(return_code, error_statment):  # @TODO would raising an exception be better?
    """Log error_statment and exit the process when return_code is not success.

    The exit status is the failing return code itself.
    """
    # NOTE(review): this compares against amdsmi_status while the messages
    # below format amdsmi_status_t -- confirm which name the bindings export
    if return_code != amdsmi_status.AMDSMI_STATUS_SUCCESS:
        logging.error(error_statment)
        sys.exit(return_code)


def check_amdgpu_driver():  # @TODO Handle KVM logic
    """ Returns true if amdgpu is found in the list of initialized modules """
    try:
        init_state = Path("/sys/module/amdgpu/initstate").read_text().strip()
    except OSError:
        # Module not loaded (file absent) or state not readable
        return False
    return init_state == 'live'


def init_amd_smi(flag=amdsmi_init_flags.AMD_SMI_INIT_AMD_GPUS):
    """ Initializes AMD-SMI; exits the process if the amdgpu driver is not live """
    # Check if amdgpu driver is up
    if not check_amdgpu_driver():
        logging.error('Driver not initialized (amdgpu not found in modules)')
        # sys.exit instead of the builtin exit: the latter is injected by the
        # site module and is missing under `python -S` or in frozen builds
        sys.exit(-1)

    # Only init AMD GPUs for now, waiting for future support for AMD CPUs
    init_status = amdsmi.amdsmi_init(flag)
    check_return(return_code=init_status, error_statment=f'AMD SMI initialization returned {init_status} (the expected value is {amdsmi_status_t.AMDSMI_STATUS_SUCCESS})')
    logging.info('amd-smi initialized successfully')


def amdsmi_shut_down():
    """ Shutdown AMD-SMI """
    shut_down_status = amdsmi.amdsmi_shut_down()
    check_return(return_code=shut_down_status, error_statment=f'AMD SMI Shutdown code returned {shut_down_status} (the expected value is {amdsmi_status_t.AMDSMI_STATUS_SUCCESS})')
    logging.debug('amd-smi shutdown successfully')


def signal_handler(sig, frame):
    """Exit cleanly on a signal so the atexit shutdown hook still runs."""
    logging.debug(f'Handling signal: {sig}')
    sys.exit(0)


# Python does not re-execute a module on repeated imports, so this runs once
# per process
if not amd_smi_initialized:
    init_amd_smi()
    amd_smi_initialized = True
    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)
    atexit.register(amdsmi_shut_down)
class AMD_SMI_Modules(object):
    """Thin Python wrappers around the amdsmi C library calls.

    Every wrapper hands the library's return code to check_return(), which
    logs and exits the process on failure.
    """

    # Size in bytes of the buffer handed to amdsmi_get_socket_info
    _SOCKET_INFO_LEN = 128

    def __init__(self) -> None:
        pass

    def get_socket_handles(self):
        """Return a tuple of (socket count, ctypes array of socket handles)."""
        # First call with no buffer only reports how many sockets exist
        socket_count = c_uint32(0)
        return_code = amdsmi.amdsmi_get_socket_handles(byref(socket_count), None)
        check_return(return_code=return_code, error_statment="Invalid get_socket_handles request")

        # (c_void_p * n) is an array type, not a multiplication; instances
        # start out zero-filled
        socket_handles = (c_void_p * socket_count.value)()
        return_code = amdsmi.amdsmi_get_socket_handles(byref(socket_count), socket_handles)
        check_return(return_code=return_code, error_statment=f"Invalid get_socket_handles with {socket_count.value} sockets")
        return (socket_count.value, socket_handles)

    def get_device_handles(self, socket_handle):
        """Gets the Device Handles that are in the current socket

        Returns a tuple of (device count, ctypes array of device handles).
        """
        # First call with no buffer only reports how many devices exist
        device_count = c_uint32(0)
        return_code = amdsmi.amdsmi_get_device_handles(socket_handle, byref(device_count), None)
        check_return(return_code=return_code, error_statment="Invalid get_device_handles request")

        device_handles = (c_void_p * device_count.value)()
        # Pass the array itself, as get_socket_handles does: ctypes hands it
        # over as a pointer to its first element
        return_code = amdsmi.amdsmi_get_device_handles(socket_handle, byref(device_count), device_handles)
        check_return(return_code=return_code, error_statment=f"Invalid get_device_handles with {device_count.value} devices")
        return (device_count.value, device_handles)

    def get_socket_info(self, socket_handle):
        """ Given a socket_handle, return the socket_info, which is just a BDF object"""
        socket_info = create_string_buffer(self._SOCKET_INFO_LEN)
        return_code = amdsmi.amdsmi_get_socket_info(socket_handle, byref(socket_info), c_size_t(self._SOCKET_INFO_LEN))
        check_return(return_code=return_code, error_statment="Invalid get_socket_info request")
        return BDF.BDF(socket_info.value.decode())

    def get_device_type(self, device_handle, format=True):
        """Return the device type of device_handle.

        Args:
            format: True to get the type name as a string, False for the raw int
        """
        dev_type = c_int(0)
        return_code = amdsmi.amdsmi_get_device_type(device_handle, byref(dev_type))
        check_return(return_code=return_code, error_statment="Invalid get_device_type request")

        if format:
            return device_type__enumvalues[dev_type.value]
        return dev_type.value

    def get_device_bdf(self, device_handle):
        """Return the amdsmi_bdf_t filled in by the library for device_handle."""
        # amdsmi_bdf_t is a Union of ('bdf_submodule', bdf_submodule) and
        # ('as_uint', c_uint64)
        bdf = amdsmi_bdf_t()
        # Out-parameter: the library writes into it, so hand over a reference
        # rather than a by-value copy
        return_code = amdsmi.amdsmi_get_device_bdf(device_handle, byref(bdf))
        check_return(return_code=return_code, error_statment="Invalid amdsmi_get_device_bdf request")
        return bdf

    def get_device_handle_from_bdf(self, bdf):
        """Not implemented yet."""
        pass

    def get_fan_speed(self, bdf):
        """Not implemented yet."""
        pass

    def show_retired_pages(self):
        """Not implemented yet."""
        # num_pages = c_uint32()
        # records = rsmi_retired_page_record_t()
        pass
Helper variables + self.amd_smi_helpers = AMD_SMI_Helpers() + self.gpu_choices = self.amd_smi_helpers.get_gpu_choices() + self.vf_choices = ['3','2','1'] + + # Adjust argument parser options + super().__init__( + formatter_class=lambda prog: argparse.ArgumentDefaultsHelpFormatter(prog, max_help_position=80, width=90), + # formatter_class=argparse.ArgumentDefaultsHelpFormatter, + description=f'AMD System Management Interface | Version: {__version__}', #@TODO add the enviornment + add_help=True, + prog='amd-smi') + + # Setup subparsers + subparsers = self.add_subparsers( + title="AMD-SMI Commands", + parser_class=argparse.ArgumentParser, + required=True, + help='Descriptions:', + # dest='cmd', + metavar="") + + # Add all subparsers + # Add --json, --csv,--file,--loglevel, watch, watch_time, & iterations && backwards compatability --gpuvsmi --rocmsmi + self.add_version_parser(subparsers, version) + self.add_discovery_parser(subparsers, discovery) + self.add_static_parser(subparsers, static) + self.add_firmware_parser(subparsers, firmware) + self.add_bad_pages_parser(subparsers, bad_pages) + self.add_metric_parser(subparsers, metric) + self.add_process_parser(subparsers, process) + self.add_profile_parser(subparsers, profile) + self.add_event_parser(subparsers, event) + self.add_topology_parser(subparsers, topology) + # self.add_set_value_parser(subparsers, set_value) + self.add_reset_parser(subparsers, reset) + self.add_misc_parser(subparsers, misc) + self.add_gpu_v_parser(subparsers, misc) + + + def add_version_parser(self, subparsers, func): + # Subparser help text + version_help = "Display version information" + + # Create version subparser + version_parser = subparsers.add_parser('version', help=version_help, description=None) + version_parser._optionals.title = None + version_parser.formatter_class=lambda prog: argparse.ArgumentDefaultsHelpFormatter(prog, max_help_position=80, width=90) + version_parser.set_defaults(func=func) + + + def add_discovery_parser(self, 
subparsers, func): + # Subparser help text + discovery_help = "Display discovery information" + discovery_subcommand_help = """Lists all the devices on the system and the links between devices. + Lists all the sockets and for each socket, GPUs and/or CPUs associated to + that socket alongside some basic information for each device. + In virtualization environment, it can also list VFs associated to each + GPU with some basic information for each VF.""" + + # Create discovery subparser + discovery_parser = subparsers.add_parser('discovery', help=discovery_help, description=discovery_subcommand_help) + discovery_parser.formatter_class=lambda prog: argparse.ArgumentDefaultsHelpFormatter(prog, max_help_position=80, width=90) + discovery_parser.set_defaults(func=func) + + + def add_static_parser(self, subparsers, func): + # Subparser help text + static_help = "Gets static information about the specified GPU" + static_subcommand_help = """If no argument is provided, return static information for all GPUs on the system. + If no static argument is specified all static information will be displayed.""" + static_optionals_title = "Static Arguments" + + # Optional arguments help text + gpu_help = "Select a GPU from the possible choices" + vf_help = """Gets general information about the specified VF (timeslice, fb info, …). + Available only on virtualization OSs""" + asic_help = "All asic information" + bus_help = "All bus information" + vbios_help = "All video bios information (if available)" + limit_help = "All limit metric values (i.e. 
power and thermal limits)" + driver_help = "Displays driver version" + caps_help = "All caps information" + + # Options arguments help text for Hypervisors and Baremetal + ras_help = "Displays RAS features information" + board_help = "All board information" # Linux Baremetal only @TODO is applicable to Azure + + # Options arguments help text for Hypervisors + dfc_help = "All DFC FW table information" + fb_help = "Displays Frame Buffer information" + num_vf_help = "Displays number of supported and enabled VFs" + + # Create static subparser + static_parser = subparsers.add_parser('static', help=static_help, description=static_subcommand_help) + static_parser._optionals.title = static_optionals_title + static_parser.formatter_class=lambda prog: argparse.ArgumentDefaultsHelpFormatter(prog, max_help_position=80, width=90) + static_parser.set_defaults(func=func) + + # Mutually Exclusive Args within the subparser + device_args = static_parser.add_mutually_exclusive_group() + device_args.add_argument('--gpu', action='store', help=gpu_help, choices=self.gpu_choices) + + # Optional Args + static_parser.add_argument('-a', '--asic', action='store_true', required=False, help=asic_help) + static_parser.add_argument('-b', '--bus', action='store_true', required=False, help=bus_help) + static_parser.add_argument('-v', '--vbios', action='store_true', required=False, help=vbios_help) + static_parser.add_argument('-l', '--limit', action='store_true', required=False, help=limit_help) + static_parser.add_argument('-d', '--driver', action='store_true', required=False, help=driver_help) + static_parser.add_argument('-c', '--caps', action='store_true', required=False, help=caps_help) + + # Options to display on Hypervisors and Baremetal + if self.amd_smi_helpers.is_hypervisor() or self.amd_smi_helpers.is_baremetal(): + static_parser.add_argument('-r', '--ras', action='store_true', required=False, help=ras_help) + if self.amd_smi_helpers.is_linux(): #@TODO Check if applicable to Azure + 
static_parser.add_argument('-B', '--board', action='store_true', required=False, help=board_help) + + # Options to only display on a Hypervisor + if self.amd_smi_helpers.is_hypervisor(): + device_args.add_argument('--vf', action='store', help=vf_help, choices=self.vf_choices) + static_parser.add_argument('-du', '--dfc-ucode', action='store_true', required=False, help=dfc_help) + static_parser.add_argument('-f', '--fb-info', action='store_true', required=False, help=fb_help) + static_parser.add_argument('-n', '--num-vf', action='store_true', required=False, help=num_vf_help) + + + def add_firmware_parser(self, subparsers, func): + # Subparser help text + firmware_help = "Gets firmware information about the specified GPU" + firmware_subcommand_help = "If no argument is provided, return firmware information for all GPUs on the system." + firmware_optionals_title = "Firmware Arguments" + + # Optional arguments help text + gpu_help = "Select a GPU from the possible choices" + vf_help = """Gets general information about the specified VF (timeslice, fb info, …). + Available only on virtualization OSs""" + fw_list_help = "All FW list information" + err_records_help = "All error records information" + + # Create firmware subparser + firmware_parser = subparsers.add_parser('firmware', help=firmware_help, description=firmware_subcommand_help) + firmware_parser._optionals.title = firmware_optionals_title + firmware_parser.formatter_class=lambda prog: argparse.ArgumentDefaultsHelpFormatter(prog, max_help_position=80, width=90) + firmware_parser.set_defaults(func=func) + + # Mutually Exclusive Args within the subparser + device_args = firmware_parser.add_mutually_exclusive_group() + device_args.add_argument('--gpu', action='store', help=gpu_help, choices=self.gpu_choices) + + # Optional Args + firmware_parser.add_argument('-f', '--fw-list', action='store_true', required=False, help=fw_list_help) # Redundant? 
+ + # Options to only display on a Hypervisor + if self.amd_smi_helpers.is_hypervisor(): + device_args.add_argument('--vf', action='store', help=vf_help, choices=self.vf_choices) + firmware_parser.add_argument('-e', '--error-records', action='store_true', required=False, help=err_records_help) + + + def add_bad_pages_parser(self, subparsers, func): #@TODO Retired pages? + if not (self.amd_smi_helpers.is_baremetal() and self.amd_smi_helpers.is_linux()): + # The bad_pages subcommand is only applicable to Linux Baremetal systems + return + + # Subparser help text + bad_pages_help = "Gets bad page information about the specified GPU" + bad_pages_subcommand_help = "If no argument is provided, return bad page information for all GPUs on the system." + bad_pages_optionals_title = "Bad pages Arguments" + + # Optional arguments help text + gpu_help = "Select a GPU from the possible choices" + pending_help = "Displays all pending retired pages" + retired_help = "Displays retired pages" #@TODO Wording + un_res_help = "Displays unreservable pages" + + # Create bad_pages subparser + bad_pages_parser = subparsers.add_parser('bad_pages', help=bad_pages_help, description=bad_pages_subcommand_help) + bad_pages_parser._optionals.title = bad_pages_optionals_title + bad_pages_parser.formatter_class=lambda prog: argparse.ArgumentDefaultsHelpFormatter(prog, max_help_position=80, width=90) + bad_pages_parser.set_defaults(func=func) + + # Mutually Exclusive Args within the subparser + device_args = bad_pages_parser.add_mutually_exclusive_group() + device_args.add_argument('--gpu', action='store', help=gpu_help, choices=self.gpu_choices) + + # Optional Args + bad_pages_parser.add_argument('-p', '--pending', action='store_true', required=False, help=pending_help) + bad_pages_parser.add_argument('-r', '--retired', action='store_true', required=False, help=retired_help) + bad_pages_parser.add_argument('-u', '--un-res', action='store_true', required=False, help=un_res_help) + + + def 
add_metric_parser(self, subparsers, func): + # Subparser help text + metric_help = "Gets metric/performance information about the specified GPU" + metric_subcommand_help = """If no argument is provided, return metric information for all GPUs on the system. + If no metric argument is specified all metric information will be displayed.""" + metric_optionals_title = "Metric arguments" + + # Optional arguments help text + gpu_help = "Select a GPU from the possible choices" + vf_help = """Gets general information about the specified VF (timeslice, fb info, …). + Available only on virtualization OSs""" + usage_help = "All metrics usage information" + + # Help text for Arguments only Available on Virtual OS and Baremetal platforms + fb_usage_help = "Total and used framebuffer" + + # Help text for Arguments only on Hypervisor and Baremetal platforms + power_help = "Current power usage" + clock_help = "Average, max, and current clock frequencies" + temperature_help = "Current temperatures" + ecc_help = "Number of ECC errors" + pcie_help = "Current PCIe speed and width" + voltage_help = "Current GPU voltages" + + # Help text for Arguments only on Linux Baremetal platforms + fan_help = "Current fan speed" + pcie_usage_help = "Estimated PCIe link usage" + vc_help = "Display voltage curve" + overdrive_help = "Current GPU clock overdrive level" + mo_help = "Current memory clock overdrive level" + perf_level_help = "Current DPM performance level" + replay_count_help = "PCIe replay count" + xgmi_err_help = "XGMI error information since last read" + energy_help = "Amount of energy consumed" #@TODO ? 
Available only on host Linux Baremetal platforms + + # Help text for Arguments only on Hypervisors + schedule_help = "All scheduling information" + guard_help = "All guard information" + guest_help = "All guest data information" + + # Create metric subparser + metric_parser = subparsers.add_parser('metric', help=metric_help, description=metric_subcommand_help) + metric_parser._optionals.title = metric_optionals_title + metric_parser.formatter_class=lambda prog: argparse.ArgumentDefaultsHelpFormatter(prog, max_help_position=80, width=90) + metric_parser.set_defaults(func=func) + + # Mutually Exclusive Args within the subparser + device_args = metric_parser.add_mutually_exclusive_group() + device_args.add_argument('--gpu', action='store', help=gpu_help, choices=self.gpu_choices) + + # Optional Args + metric_parser.add_argument('-u', '--usage', action='store_true', required=False, help=usage_help) + + # Optional Args for Virtual OS and Baremetal systems + if self.amd_smi_helpers.is_virtual_os() or self.amd_smi_helpers.is_baremetal(): + metric_parser.add_argument('-b', '--fb-usage', action='store_true', required=False, help=fb_usage_help) + + # Optional Args for Hypervisors and Baremetal systems + if self.amd_smi_helpers.is_hypervisor() or self.amd_smi_helpers.is_baremetal(): + metric_parser.add_argument('-p', '--power', action='store_true', required=False, help=power_help) + metric_parser.add_argument('-c', '--clock', action='store_true', required=False, help=clock_help) + metric_parser.add_argument('-t', '--temperature', action='store_true', required=False, help=temperature_help) + metric_parser.add_argument('-e', '--ecc', action='store_true', required=False, help=ecc_help) + metric_parser.add_argument('-P', '--pcie', action='store_true', required=False, help=pcie_help) + metric_parser.add_argument('-v', '--voltage', action='store_true', required=False, help=voltage_help) + + # Optional Args for Linux Baremetal Systems #@TODO Discuss logic if Linux Hypervisors would 
be allowed to have this + if self.amd_smi_helpers.is_baremetal() and self.amd_smi_helpers.is_linux(): + metric_parser.add_argument('-f', '--fan', action='store_true', required=False, help=fan_help) + metric_parser.add_argument('-s', '--pcie-usage', action='store_true', required=False, help=pcie_usage_help) + metric_parser.add_argument('-V', '--voltage-curve', action='store_true', required=False, help=vc_help) + metric_parser.add_argument('-o', '--overdrive', action='store_true', required=False, help=overdrive_help) + metric_parser.add_argument('-m', '--mem-overdrive', action='store_true', required=False, help=mo_help) + metric_parser.add_argument('-l', '--perf-level', action='store_true', required=False, help=perf_level_help) + metric_parser.add_argument('-r', '--replay-count', action='store_true', required=False, help=replay_count_help) + metric_parser.add_argument('-x', '--xgmi-err', action='store_true', required=False, help=xgmi_err_help) + metric_parser.add_argument('-E', '--energy', action='store_true', required=False, help=energy_help) + + # Options to only display to Hypervisors + if self.amd_smi_helpers.is_hypervisor(): + device_args.add_argument('--vf', action='store', help=vf_help, choices=self.vf_choices) + metric_parser.add_argument('-s', '--schedule', action='store_true', required=False, help=schedule_help) + metric_parser.add_argument('-g', '--guard', action='store_true', required=False, help=guard_help) + metric_parser.add_argument('-G', '--guest', action='store_true', required=False, help=guest_help) + + + def add_process_parser(self, subparsers, func): + if self.amd_smi_helpers.is_hypervisor(): + # Don't add this subparser on Hypervisors + return + + # Subparser help text + process_help = "Lists general process information running on the specified GPU" + process_subcommand_help = """If no argument is provided, returns information for all GPUs on the system. 
+ If no argument is provided all process information will be displayed.""" + process_optionals_title = "Process arguments" + + # Required arguments help text + gpu_help = "Select a GPU from the possible choices" + + # Help text for Arguments only on Guest and BM platforms + general_help = "pid, process name, memory usage" + engine_help = "All engine usages" + pid_help = "Gets all process information about the specified process based on Process ID" + name_help = """Gets all process information about the specified process based on Process Name. + If multiple processes have the same name information is returned for all of them.""" #@TODO wording + + # Create process subparser + process_parser = subparsers.add_parser('process', help=process_help, description=process_subcommand_help) + process_parser._optionals.title = process_optionals_title + process_parser.formatter_class=lambda prog: argparse.ArgumentDefaultsHelpFormatter(prog, max_help_position=80, width=90) + process_parser.set_defaults(func=func) + + # Mutually Exclusive Args within the subparser + device_args = process_parser.add_mutually_exclusive_group() + device_args.add_argument('--gpu', action='store', help=gpu_help, choices=self.gpu_choices) + + # Optional Args + process_parser.add_argument('-g', '--general', action='store_true', required=False, help=general_help) + process_parser.add_argument('-e', '--engine', action='store_true', required=False, help=engine_help) + process_parser.add_argument('-p', '--pid', action='store', required=False, help=pid_help) + process_parser.add_argument('-n', '--name', action='store', required=False, help=name_help) + + + def add_profile_parser(self, subparsers, func): + if not (self.amd_smi_helpers.is_windows() and self.amd_smi_helpers.is_hypervisor()): + # This subparser only applies to Azure Hyper-V systems + return + + # Subparser help text + profile_help = "Displays information about all profiles and current profile" + profile_subcommand_help = "If no argument is 
provided, returns information for all GPUs on the system." + profile_optionals_title = "Profile Arguments" + + # Required arguments help text + gpu_help = "Select a GPU from the possible choices" + + # Create profile subparser + profile_parser = subparsers.add_parser('profile', help=profile_help, description=profile_subcommand_help) + profile_parser._optionals.title = profile_optionals_title + profile_parser.formatter_class=lambda prog: argparse.ArgumentDefaultsHelpFormatter(prog, max_help_position=80, width=90) + profile_parser.set_defaults(func=func) + + # Mutually Exclusive Args within the subparser + device_args = profile_parser.add_mutually_exclusive_group() + device_args.add_argument('--gpu', action='store', help=gpu_help, choices=self.gpu_choices) + + + def add_event_parser(self, subparsers, func): + if self.amd_smi_helpers.is_linux() and not self.amd_smi_helpers.is_virtual_os(): + # This subparser only applies to Linux BareMetal & Linux Hypervisors + return + + # Subparser help text + event_help = "Displays event information for the given GPU" + event_subcommand_help = "If no argument is provided, returns event information for all GPUs on the system." 
+ event_optionals_title = "Event Arguments" + + # Required arguments help text + gpu_help = "Select a GPU from the possible choices" + + # Create event subparser + event_parser = subparsers.add_parser('event', help=event_help, description=event_subcommand_help) + event_parser._optionals.title = event_optionals_title + event_parser.formatter_class=lambda prog: argparse.ArgumentDefaultsHelpFormatter(prog, max_help_position=80, width=90) + event_parser.set_defaults(func=func) + + # Mutually Exclusive Args within the subparser + device_args = event_parser.add_mutually_exclusive_group() + device_args.add_argument('--gpu', action='store', help=gpu_help, choices=self.gpu_choices) + + + def add_topology_parser(self, subparsers, func): + if not(self.amd_smi_helpers.is_baremetal() and self.amd_smi_helpers.is_linux()): + # This subparser is only applicable to Baremetal Linux @TODO confirm how KVM should work + return + + # Subparser help text + topology_help = "Displays topology information of the devices." + topology_subcommand_help = "If no argument is provided, returns information for all GPUs on the system." + topology_optionals_title = "Topology arguments" + + # Required arguments help text + gpu_help = "Select a GPU from the possible choices" + + # Help text for Arguments only on Guest and BM platforms + topo_access_help = "Displays link accessibility between GPUs" + topo_weight_help = "Displays relative weight between GPUs" + topo_hops_help = "Displays the number of hops between GPUs" + topo_type_help = "Displays the link type between GPUs." + topo_numa_help = "Displays the numa nodes." 
+ + # Create topology subparser + topology_parser = subparsers.add_parser('topology', help=topology_help, description=topology_subcommand_help) + topology_parser._optionals.title = topology_optionals_title + topology_parser.formatter_class=lambda prog: argparse.ArgumentDefaultsHelpFormatter(prog, max_help_position=80, width=90) + topology_parser.set_defaults(func=func) + + # Mutually Exclusive Args within the subparser + device_args = topology_parser.add_mutually_exclusive_group() + device_args.add_argument('--gpu', action='store', help=gpu_help, choices=self.gpu_choices) + + # Optional Args + topology_parser.add_argument('-a', '--topo-access', action='store_true', required=False, help=topo_access_help) + topology_parser.add_argument('-w', '--topo-weight', action='store_true', required=False, help=topo_weight_help) + topology_parser.add_argument('-o', '--topo-hops', action='store_true', required=False, help=topo_hops_help) + topology_parser.add_argument('-t', '--topo-type', action='store_true', required=False, help=topo_type_help) + topology_parser.add_argument('-n', '--topo-numa', action='store_true', required=False, help=topo_numa_help) + + + def add_set_value_parser(self, subparsers, func): + if not(self.amd_smi_helpers.is_baremetal() and self.amd_smi_helpers.is_linux()): + # This subparser is only applicable to Baremetal Linux @TODO confirm how KVM should work + return + + # Subparser help text + set_value_help = "Set options for devices." + set_value_subcommand_help = "The user must specify one of the options for the set configuration." 
+ set_value_optionals_title = "Set Arguments" + + # Required arguments help text + gpu_help = "Select a GPU from the possible choices" + + # Help text for Arguments only on Guest and BM platforms + set_clk_help = "Sets clock frequency levels for specified clocks" + set_sclk_help = "Sets GPU clock frequency levels" + set_mclk_help = "Sets memory clock frequency levels" + set_pcie_help = "Sets PCIe clock frequency levels" + set_slevel_help = "Change GPU clock frequency and voltage for a specific level" + set_mlevel_help = "Change GPU memory frequency and voltage for a specific level" + set_vc_help = "Change SCLK voltage curve for a specified point" + set_srange_help = "Sets min and max SCLK speed" + set_mrange_help = "Sets min and max MCLK speed" + set_fan_help = "Sets GPU fan speed (level or %)" + set_perf_level_help = "Sets performance level" + set_overdrive_help = "Set GPU overdrive level" + set_mem_overdrive_help = "Set memory overclock overdrive level" + set_power_overdrive_help = "Set the maximum GPU power using power overdrive in Watts" + set_profile_help = "Set power profile level (#) or a quoted string of custom profile attributes" + set_perf_det_help = "Set GPU clock frequency limit to get minimal performance variation" + ras_enable_help = "Enable RAS for specified block and error type" + ras_disable_help = "Disable RAS for specified block and error type." + ras_inject_help = "Inject RAS poison for specified block" + +# -c, --setclk +# . +# -s, --setsclk +# . +# -m, --setmclk +# . +# -p, --setpcie +# . +# -S, --setslevel +# . +# -M, --setmlevel +# . +# -v, --setvc +# . +# -r, --setsrange +# +# -R, --setmrange +# . +# -f, --setfan +# +# -pl, --setperflevel +# +# -o, --setoverdrive % +# Set GPU overdrive level. +# -O, --setmemoverdrive % +# Set memory overclock overdrive level. +# -po, --setpoweroverdrive +# Set the maximum GPU power using power overdrive in Watts. 
+# -P, --setprofile +# Set power profile level (#) or a quoted string of custom profile attributes (“ # # # # “) +# -pd, --setperfdet +# Set GPU clock frequency limit to get minimal performance variation. +# -re, --rasenable +# Enable RAS for specified block and error type. +# -rd, --rasdisable +# Disable RAS for specified block and error type. +# -ri, --rasinject +# Inject RAS poison for specified block + + # Create set_value subparser + set_value_parser = subparsers.add_parser('set', help=set_value_help, description=set_value_subcommand_help) + set_value_parser._optionals.title = set_value_optionals_title + set_value_parser.formatter_class=lambda prog: argparse.ArgumentDefaultsHelpFormatter(prog, max_help_position=80, width=90) + set_value_parser.set_defaults(func=func) + + # Mutually Exclusive Args within the subparser + device_args = set_value_parser.add_mutually_exclusive_group(required=True) + device_args.add_argument('--gpu', action='store', help=gpu_help, choices=self.gpu_choices) + + # Optional Args + set_value_parser.add_argument('-c', '--setclk', action='store', required=False, help=topo_access_help) + set_value_parser.add_argument('-s', '--topo-weight', action='store', required=False, help=topo_weight_help) + set_value_parser.add_argument('-m', '--topo-hops', action='store', required=False, help=topo_hops_help) + set_value_parser.add_argument('-p', '--topo-type', action='store', required=False, help=topo_type_help) + set_value_parser.add_argument('-S', '--topo-numa', action='store', required=False, help=topo_numa_help) + set_value_parser.add_argument('-M', '--topo-numa', action='store', required=False, help=topo_numa_help) + set_value_parser.add_argument('-v', '--topo-numa', action='store', required=False, help=topo_numa_help) + set_value_parser.add_argument('-r', '--topo-numa', action='store', required=False, help=topo_numa_help) + set_value_parser.add_argument('-R', '--topo-numa', action='store', required=False, help=topo_numa_help) + 
set_value_parser.add_argument('-f', '--topo-numa', action='store', required=False, help=topo_numa_help) + set_value_parser.add_argument('-pl', '--topo-numa', action='store', required=False, help=topo_numa_help) + set_value_parser.add_argument('-o' '--topo-numa', action='store', required=False, help=topo_numa_help) + set_value_parser.add_argument('-O', '--topo-numa', action='store', required=False, help=topo_numa_help) + set_value_parser.add_argument('-po', '--topo-numa', action='store', required=False, help=topo_numa_help) + set_value_parser.add_argument('-P', '--topo-numa', action='store', required=False, help=topo_numa_help) + set_value_parser.add_argument('-pd', '--topo-numa', action='store', required=False, help=topo_numa_help) + set_value_parser.add_argument('-re', '--topo-numa', action='store', required=False, help=topo_numa_help) + set_value_parser.add_argument('-rd', '--topo-numa', action='store', required=False, help=topo_numa_help) + set_value_parser.add_argument('-ri', '--topo-numa', action='store', required=False, help=topo_numa_help) + + + def add_reset_parser(self, subparsers, func): + if not(self.amd_smi_helpers.is_baremetal() and self.amd_smi_helpers.is_linux()): + # This subparser is only applicable to Baremetal Linux @TODO confirm how KVM should work + return + + # Subparser help text + reset_help = "Reset options for devices." + reset_subcommand_help = "The user must specify one of the options to reset devices." 
+ reset_optionals_title = "Reset Arguments" + + # Required arguments help text + gpu_help = "Select a GPU from the possible choices" + + # Help text for Arguments only on Guest and BM platforms + gpureset_help = "Reset the specified GPU" + resetclk_help = "Reset clocks and overdrive to default" + resetfans_help = "Reset fans to automatic (driver) control" + resetprofile_help = "Reset power profile back to default" + resetpoweroverdrive_help = "Set the maximum GPU power back to the device default state" + resetxgmierr_help = "Reset XGMI error counts" + resetperfdet_help = "Disable performance determinism" + + # Create reset subparser + reset_parser = subparsers.add_parser('reset', help=reset_help, description=reset_subcommand_help) + reset_parser._optionals.title = reset_optionals_title + reset_parser.formatter_class=lambda prog: argparse.ArgumentDefaultsHelpFormatter(prog, max_help_position=80, width=90) + reset_parser.set_defaults(func=func) + + # Mutually Exclusive Args within the subparser + device_args = reset_parser.add_mutually_exclusive_group(required=True) + device_args.add_argument('--gpu', action='store', help=gpu_help, choices=self.gpu_choices) + + # Optional Args + reset_parser.add_argument('-g', '--gpureset', action='store_true', required=False, help=gpureset_help) + reset_parser.add_argument('-c', '--resetclk', action='store_true', required=False, help=resetclk_help) + reset_parser.add_argument('-f', '--resetfans', action='store_true', required=False, help=resetfans_help) + reset_parser.add_argument('-p', '--resetprofile', action='store_true', required=False, help=resetprofile_help) + reset_parser.add_argument('-o', '--resetpoweroverdrive', action='store_true', required=False, help=resetpoweroverdrive_help) + reset_parser.add_argument('-x', '--resetxgmierr', action='store_true', required=False, help=resetxgmierr_help) + reset_parser.add_argument('-d', '--resetperfdet', action='store_true', required=False, help=resetperfdet_help) + + + def 
add_misc_parser(self, subparsers, func): + if not(self.amd_smi_helpers.is_baremetal() and self.amd_smi_helpers.is_linux()): + # This subparser is only applicable to Baremetal Linux @TODO confirm how KVM should work + return + + # Subparser help text + misc_help = "The miscellaneous options" + misc_subcommand_help = "The user must specify one of the options to reset devices." + misc_optionals_title = "Misc Arguments" + + # Optional arguments help text + gpu_help = "Select a GPU from the possible choices" + load_help = "Load clock, fan, performance, and profile settings from a given file." + save_help = "Save clock, fan, performance, and profile settings to a given file." + + # Create misc subparser + misc_parser = subparsers.add_parser('misc', help=misc_help, description=misc_subcommand_help) + misc_parser._optionals.title = misc_optionals_title + misc_parser.formatter_class=lambda prog: argparse.ArgumentDefaultsHelpFormatter(prog, max_help_position=80, width=90) + misc_parser.set_defaults(func=func) + + # Mutually Exclusive Args within the subparser + device_args = misc_parser.add_mutually_exclusive_group(required=True) + device_args.add_argument('--gpu', action='store', help=gpu_help, choices=self.gpu_choices) + + # Optional Args + misc_parser.add_argument('-l', '--load', action='store', type=open, required=False, help=load_help) + misc_parser.add_argument('-s', '--save', action='store', type=open, required=False, help=save_help) + + + # def add_gpu_v_parser(self, subparsers, func): + # if not(self.amd_smi_helpers.is_baremetal() and self.amd_smi_helpers.is_linux()): + # # This subparser is only applicable to Baremetal Linux @TODO confirm how KVM should work + # return + + # # Subparser help text + # gpu_v_help = "The gpu_v options" + # gpu_v_subcommand_help = "The user must specify one of the options to reset devices." 
+ # gpu_v_optionals_title = "gpu_v Arguments" + + # # Optional arguments help text + # gpu_help = "Select a GPU from the possible choices" + # load_help = "Load clock, fan, performance, and profile settings from a given file." + # save_help = "Save clock, fan, performance, and profile settings to a given file." + + # # Create gpu_v subparser + # gpu_v_parser = subparsers.add_parser('gpu_v', help=gpu_v_help, description=gpu_v_subcommand_help) + # gpu_v_parser._optionals.title = gpu_v_optionals_title + # gpu_v_parser.formatter_class=lambda prog: argparse.ArgumentDefaultsHelpFormatter(prog, max_help_position=80, width=90) + # gpu_v_parser.set_defaults(func=func) + + # # Mutually Exclusive Args within the subparser + # device_args = gpu_v_parser.add_mutually_exclusive_group(required=True) + # device_args.add_argument('--gpu', action='store', help=gpu_help, choices=self.gpu_choices) + + # # Optional Args + # gpu_v_parser.add_argument('-l', '--load', action='store', type=open, required=False, help=load_help) + # gpu_v_parser.add_argument('-s', '--save', action='store', type=open, required=False, help=save_help) diff --git a/amd_smi_cli/amdsmiBindings.py b/amd_smi_cli/amdsmiBindings.py new file mode 100644 index 0000000000..df9222d537 --- /dev/null +++ b/amd_smi_cli/amdsmiBindings.py @@ -0,0 +1,818 @@ +#!/usr/bin/env python3 +"""! 
@brief AMD_SMI FFI""" + +from ctypes import * +from enum import Enum +import os + + +path_amdsmi = '/opt/rocm/lib/libamd_smi64.so' #@TODO make this dynamic + +try: + cdll.LoadLibrary(path_amdsmi) + amdsmi = CDLL(path_amdsmi) +except OSError: + print("Unable to load libamd_smi64.so library\n") + exit(1) + +## +# @brief Initialization flags +# +# may be OR'd together and passed to smi.amdsmi_init() +## + +class amdsmi_init_flags(c_int): + AMD_SMI_INIT_ALL_DEVICES = 0x0 # Default option + AMD_SMI_INIT_AMD_CPUS = (1 << 0) + AMD_SMI_INIT_AMD_GPUS = (1 << 1) + AMD_SMI_INIT_NON_AMD_CPUS = (1 << 2) + AMD_SMI_INIT_NON_AMD_GPUS = (1 << 3) + +# Maximum size definitions GPUVSMI +AMDSMI_MAX_MM_IP_COUNT = 8 +AMDSMI_MAX_DATE_LENGTH = 32 # YYYY-MM-DD:HH:MM:SS.MSC # +AMDSMI_MAX_STRING_LENGTH = 64 +AMDSMI_NORMAL_STRING_LENGTH = 32 +AMDSMI_MAX_DEVICES = 32 +AMDSMI_MAX_NAME = 32 +AMDSMI_MAX_DRIVER_VERSION_LENGTH = 80 +AMDSMI_PRODUCT_NAME_LENGTH = 128 +AMDSMI_MAX_CONTAINER_TYPE = 2 + +AMDSMI_GPU_UUID_SIZE = 38 + + +class amdsmi_mm_ip(c_int): + MM_UVD = 0 + MM_VCE = 1 + MM_VCN = 2 + MM__MAX = 3 + + +class amdsmi_container_types(c_int): + CONTAINER_LXC = 0 + CONTAINER_DOCKER = 1 + +# ! opaque handler point to underlying implementation +amdsmi_device_handle = POINTER(c_uint) +amdsmi_socket_handle = POINTER(c_uint) + +class device_type(c_int): + UNKNOWN = 0 + AMD_GPU = 1 + AMD_CPU = 2 + NON_AMD_GPU = 3 + NON_AMD_CPU = 4 + +device_type__enumvalues = { + 0: 'UNKNOWN', + 1: 'AMD_GPU', + 2: 'AMD_CPU', + 3: 'NON_AMD_GPU', + 4: 'NON_AMD_CPU', +} + +#Error codes retured by amd_smi_lib functions +class amdsmi_status(c_int): + AMDSMI_STATUS_SUCCESS = 0 # Call succeeded + AMDSMI_STATUS_INVAL = 1 # Invalid parameters + AMDSMI_STATUS_NOT_SUPPORTED = 2 # Command not supported + AMDSMI_STATUS_FILE_ERROR = 3 # Problem accessing a file. 
+ AMDSMI_STATUS_NO_PERM = 4 # Permission Denied + AMDSMI_STATUS_OUT_OF_RESOURCES = 5 # Not enough memory + AMDSMI_STATUS_INTERNAL_EXCEPTION = 6 # An internal exception was caught + AMDSMI_STATUS_INPUT_OUT_OF_BOUNDS = 7 # The provided input is out of allowable or safe range + AMDSMI_STATUS_INIT_ERROR = 8 # An error occurred when initializing internal data structures + AMDSMI_STATUS_NOT_YET_IMPLEMENTED = 9 # Not implemented yet + AMDSMI_STATUS_NOT_FOUND = 10 # Device Not found + AMDSMI_STATUS_INSUFFICIENT_SIZE = 11 # Not enough resources were available for the operation + AMDSMI_STATUS_INTERRUPT = 12 # An interrupt occurred during execution of function + AMDSMI_STATUS_UNEXPECTED_SIZE = 13 # An unexpected amount of data was read + AMDSMI_STATUS_NO_DATA = 14 # No data was found for a given input + AMDSMI_STATUS_UNEXPECTED_DATA = 15 # The data read or provided to function is not what was expected + AMDSMI_STATUS_BUSY = 16 # Device busy + AMDSMI_STATUS_REFCOUNT_OVERFLOW = 17 # An internal reference counter exceeded INT32_MAX + AMDSMI_LIB_START = 1000 + AMDSMI_STATUS_FAIL_LOAD_MODULE = AMDSMI_LIB_START # Fail to load lib + AMDSMI_STATUS_FAIL_LOAD_SYMBOL = 1001 + AMDSMI_STATUS_DRM_ERROR = 1002 # Error when call libdrm + AMDSMI_STATUS_IO = 1003 # Error + AMDSMI_STATUS_FAULT = 1004 # Bad address + AMDSMI_STATUS_API_FAILED = 1005 # API call failed + AMDSMI_STATUS_TIMEOUT = 1006 # Timeout in API call + AMDSMI_STATUS_NO_SLOT = 1007 # No more free slot + AMDSMI_STATUS_RETRY = 1008 # Retry operation + AMDSMI_STATUS_NOT_INIT = 1009 # Device not initialized + AMDSMI_STATUS_UNKNOWN_ERROR = 0xFFFFFFFF # An unknown error occurred + +amdsmi_status_t = amdsmi_status +#Clock types +class amdsmi_clk_type (c_int): + CLK_TYPE_SYS = 0x0, # System clock + CLK_TYPE_FIRST = CLK_TYPE_SYS + CLK_TYPE_GFX = CLK_TYPE_SYS + CLK_TYPE_DF = 0x1 # Data Fabric clock (for ASICs + # running on a separate clock) + CLK_TYPE_DCEF = 0x2 # Display Controller Engine clock + CLK_TYPE_SOC = 0x3 + CLK_TYPE_MEM = 0x4 
+ CLK_TYPE_PCIE = 0x5 + CLK_TYPE_VCLK0 = 0x6 + CLK_TYPE_VCLK1 = 0x7 + CLK_TYPE_DCLK0 = 0x8 + CLK_TYPE_DCLK1 = 0x9 + CLK_TYPE__MAX = CLK_TYPE_DCLK1 + +amdsmi_clk_type_t = amdsmi_clk_type +#This enumeration is used to indicate from which part of the device a +# temperature reading should be obtained +class amdsmi_temperature_type (c_int): + TEMPERATURE_TYPE_EDGE = 0 + TEMPERATURE_TYPE_FIRST = TEMPERATURE_TYPE_EDGE + TEMPERATURE_TYPE_JUNCTION = 1 + TEMPERATURE_TYPE_VRAM = 2 + TEMPERATURE_TYPE_HBM_0 = 3 + TEMPERATURE_TYPE_HBM_1 = 4 + TEMPERATURE_TYPE_HBM_2 = 5 + TEMPERATURE_TYPE_HBM_3 = 6 + TEMPERATURE_TYPE_PLX = 7 + TEMPERATURE_TYPE__MAX = TEMPERATURE_TYPE_PLX + +#The values of this enum are used to identify the various firmware +#blocks. +class amdsmi_fw_block_t (c_int): + FW_ID_SMU = 1 + FW_ID_FIRST = FW_ID_SMU + FW_ID_CP_CE = 2 + FW_ID_CP_PFP = 3 + FW_ID_CP_ME = 4 + FW_ID_CP_MEC_JT1 = 5 + FW_ID_CP_MEC_JT2 = 6 + FW_ID_CP_MEC1 = 7 + FW_ID_CP_MEC2 = 8 + FW_ID_RLC = 9 + FW_ID_SDMA0 = 10 + FW_ID_SDMA1 = 11 + FW_ID_SDMA2 = 12 + FW_ID_SDMA3 = 13 + FW_ID_SDMA4 = 14 + FW_ID_SDMA5 = 15 + FW_ID_SDMA6 = 16 + FW_ID_SDMA7 = 17 + FW_ID_VCN = 18 + FW_ID_UVD = 19 + FW_ID_VCE = 20 + FW_ID_ISP = 21 + FW_ID_DMCU_ERAM = 22 # eRAM + FW_ID_DMCU_ISR = 23 # ISR + FW_ID_RLC_RESTORE_LIST_GPM_MEM = 24 + FW_ID_RLC_RESTORE_LIST_SRM_MEM = 25 + FW_ID_RLC_RESTORE_LIST_CNTL = 26 + FW_ID_RLC_V = 27 + FW_ID_MMSCH = 28 + FW_ID_PSP_SYSDRV = 29 + FW_ID_PSP_SOSDRV = 30 + FW_ID_PSP_TOC = 31 + FW_ID_PSP_KEYDB = 32 + FW_ID_DFC = 33 + FW_ID_PSP_SPL = 34 + FW_ID_DRV_CAP = 35 + FW_ID_MC = 36 + FW_ID_PSP_BL = 37 + FW_ID_CP_PM4 = 38 + FW_ID_ASD = 39 + FW_ID_TA_RAS = 40 + FW_ID_XGMI = 41 + FW_ID_RLC_SRLG = 42 + FW_ID_RLC_SRLS = 43 + FW_ID_SMC = 44 + FW_ID_DMCU = 45 + FW_ID__MAX = 46 + +#This structure represents a range (e.g., frequencies or voltages) + +class amdsmi_range_t (Structure): + _fields_ = [ + ('lower_bound', c_uint64), + ('upper_bound', c_uint64), + ] + +class amdsmi_xgmi_info_t (Structure): + 
_fields_ = [ + ('xgmi_lanes', c_uint8), + ('xgmi_hive_id', c_uint64), + ('xgmi_node_id', c_uint64), + ('index', c_uint32), + ] + +#GPU Capability info + +class gfx (Structure): + _fields_ = [ + ('gfxip_major', c_uint32), + ('gfxip_minor', c_uint32), + ('gfxip_cu_count', c_uint16)] + +class mm (Structure): + _fields_ = [ + ('mm_ip_count', c_uint8), + ('mm_ip_list', c_uint8 * AMDSMI_MAX_MM_IP_COUNT) + ] +class amdsmi_gpu_caps_t (Structure): + _fields_ = [ + ('gfx', gfx), + ('mm', mm), + ('ras_supported', c_bool), + ('max_vf_num', c_uint8), + ('gfx_ip_count', c_uint32), + ('dma_ip_count', c_uint32) + ] + +class amdsmi_vram_info (Structure): + _fields_ = [ + ('vram_total', c_uint32), + ('vram_used', c_uint32), + ] + +class amdsmi_frequency_range_t(Structure): + _fields_ = [ + ('supported_freq_range', amdsmi_range_t), + ('current_freq_range', amdsmi_range_t), + ] + +class bdf_submodule (Structure): + _fields_ = [ + ('function_number', c_uint64, 3), + ('device_number', c_uint64, 5), + ('bus_number', c_uint64, 8), + ('domain_number', c_uint64, 48), + ] +class amdsmi_bdf_t (Union): + _fields_ = [ + ('bdf_submodule', bdf_submodule), + ('as_uint', c_uint64) + ] + +class amdsmi_power_cap_info_t (Structure): + _fields_ = [ + ('power_cap', c_uint64), + ('default_power_cap', c_uint64), + ('dpm_cap', c_uint64), + ('min_power_cap', c_uint64), + ('max_power_cap', c_uint64) + ] + +class amdsmi_vbios_info_t (Structure): + _fields_ =[ + ('name', c_char * AMDSMI_MAX_STRING_LENGTH), + ('vbios_version', c_uint32), + ('build_date', c_char * AMDSMI_MAX_DATE_LENGTH), + ('part_number', c_char * AMDSMI_MAX_STRING_LENGTH), + ('vbios_version_string', c_char * AMDSMI_NORMAL_STRING_LENGTH) + ] + +class fw_info_list (Structure): + _fields_ = [ + ('fw_id', amdsmi_fw_block_t), + ('fw_version', c_uint64) + ] +class amdsmi_fw_info_t (Structure): + _fields_ =[ + ('num_fw_info', c_uint8), + ('fw_info_list', fw_info_list * amdsmi_fw_block_t.FW_ID__MAX) + ] + +class amdsmi_asic_info_t (Structure): + 
_fields_ = [ + ('market_name', c_char * AMDSMI_MAX_STRING_LENGTH), + ('family', c_uint32), + ('vendor_id', c_uint32), + ('subvendor_id', c_uint32), + ('device_id', c_uint64), + ('rev_id', c_uint32), + ('asic_serial', c_char * AMDSMI_NORMAL_STRING_LENGTH) + ] + +class amdsmi_board_info (Structure): + _fields_ = [ + ('serial_number', c_uint64), + ('is_master', c_bool), + ('model_number', c_char * AMDSMI_NORMAL_STRING_LENGTH), + ('product_serial', c_char * AMDSMI_NORMAL_STRING_LENGTH), + ('fru_id', c_char * AMDSMI_NORMAL_STRING_LENGTH), + ('product_name', c_char * AMDSMI_PRODUCT_NAME_LENGTH), + ('manufacturer_name', c_char * AMDSMI_NORMAL_STRING_LENGTH), + ] + +class amdsmi_temperature_t (Structure): + _fields_ = [ + ('cur_temp', c_uint32) + ] + +class amdsmi_temperature_limit_t (Structure): + _fields_ = [ + ('limit', c_uint32) + ] + +class amdsmi_power_limit_t (Structure): + _fields_ = [ + ('limit', c_uint32) + ] + +class amdsmi_power_measure (Structure): + _fields_ = [ + ('average_socket_power', c_uint32), + ('energy_accumulator', c_uint64), + ('voltage_gfx', c_uint32), + ('voltage_soc', c_uint32), + ('voltage_mem', c_uint32), + ] + +class amdsmi_clk_measure_t (Structure): + _fields_ = [ + ('cur_clk', c_uint32), + ('avg_clk', c_uint32), + ('min_clk', c_uint32), + ('max_clk', c_uint32) + ] + +class amdsmi_engine_usage_t (Structure): + _fields_ = [ + ('gfx_activity', c_uint32), + ('umc_activity', c_uint32), + ('mm_activity', c_uint32 * AMDSMI_MAX_MM_IP_COUNT) + ] + +amdsmi_process_handle = c_uint32 + +class memory_usage (Structure): + _fields_ = [ + ('gtt_mem', c_uint64), + ('cpu_mem', c_uint64), + ('vram_mem', c_uint64) + ] + + +class engine_usage (Structure): + _fields_ = [ + ('gfx', c_uint16 * AMDSMI_MAX_MM_IP_COUNT), + ('compute', c_uint16 * AMDSMI_MAX_MM_IP_COUNT), + ('sdma', c_uint16 * AMDSMI_MAX_MM_IP_COUNT), + ('enc', c_uint16 * AMDSMI_MAX_MM_IP_COUNT), + ('dec',c_uint16 * AMDSMI_MAX_MM_IP_COUNT) + ] +class amdsmi_proc_info_t(Structure): + _fields_ = [ + 
('name', c_char * AMDSMI_NORMAL_STRING_LENGTH), + ('pid', amdsmi_process_handle), + ('mem', c_uint64), + ('engine_usage', engine_usage), + ('memory_usage', memory_usage), + ('container_name', c_char * AMDSMI_NORMAL_STRING_LENGTH) + + ] +amdsmi_process_info = amdsmi_proc_info_t + +# Guaranteed maximum possible number of supported frequencies +AMDSMI_MAX_NUM_FREQUENCIES = 32 + +# The number of points that make up a voltage-frequency curve definition +AMDSMI_NUM_VOLTAGE_CURVE_POINTS = 3 + +class amdsmi_dev_perf_level_t (c_int): + AMDSMI_DEV_PERF_LEVEL_AUTO = 0 # Performance level is "auto" + AMDSMI_DEV_PERF_LEVEL_FIRST = AMDSMI_DEV_PERF_LEVEL_AUTO + AMDSMI_DEV_PERF_LEVEL_HIGH = 1 # Keep PowerPlay levels "high", regardless of workload + AMDSMI_DEV_PERF_LEVEL_MANUAL = 2 # Only use values defined by manually setting the AMDSMI_CLK_TYPE_SYS speed + AMDSMI_DEV_PERF_LEVEL_STABLE_STD = 3 # Stable power state with profiling clocks + AMDSMI_DEV_PERF_LEVEL_STABLE_PEAK = 4 # Stable power state with peak clocks + AMDSMI_DEV_PERF_LEVEL_STABLE_MIN_MCLK = 5 # Stable power state with minimum memory clock + AMDSMI_DEV_PERF_LEVEL_STABLE_MIN_SCLK = 6 # Stable power state with minimum system clock + AMDSMI_DEV_PERF_LEVEL_DETERMINISM = 7 # Performance determinism state + AMDSMI_DEV_PERF_LEVEL_LAST = AMDSMI_DEV_PERF_LEVEL_DETERMINISM + AMDSMI_DEV_PERF_LEVEL_UNKNOWN = 0x100 # Unknown performance level + +amdsmi_dev_perf_level = amdsmi_dev_perf_level_t + +class amdsmi_sw_component_t (c_int): + AMDSMI_SW_COMP_FIRST = 0x0 + AMDSMI_SW_COMP_DRIVER = AMDSMI_SW_COMP_FIRST # Driver + AMDSMI_SW_COMP_LAST = AMDSMI_SW_COMP_DRIVER + +amdsmi_event_handle_t = c_uint64 + + +#Event Groups +# Enum denoting an event group. The value of the enum is the +# base value for all the event enums in the group. 
class amdsmi_event_group_t(c_int):
    """Event groups.

    The value of each member is the base value for all the event enums
    in that group.
    """
    AMDSMI_EVNT_GRP_XGMI = 0            # Data Fabric (XGMI) related events
    AMDSMI_EVNT_GRP_XGMI_DATA_OUT = 10  # XGMI outbound data
    AMDSMI_EVNT_GRP_INVALID = 0xFFFFFFFF


# Event types
# Event type enum. Events belonging to a particular event group
# ::amdsmi_event_group_t should begin enumerating at the ::amdsmi_event_group_t
# value for that group.

class amdsmi_event_type_t(c_int):
    """Event types; each group's events start at that group's base value."""
    AMDSMI_EVNT_FIRST = amdsmi_event_group_t.AMDSMI_EVNT_GRP_XGMI
    AMDSMI_EVNT_XGMI_FIRST = amdsmi_event_group_t.AMDSMI_EVNT_GRP_XGMI
    AMDSMI_EVNT_XGMI_0_NOP_TX = AMDSMI_EVNT_XGMI_FIRST  # NOPs sent to neighbor 0
    AMDSMI_EVNT_XGMI_0_REQUEST_TX = 1
    AMDSMI_EVNT_XGMI_0_RESPONSE_TX = 2
    AMDSMI_EVNT_XGMI_0_BEATS_TX = 3
    AMDSMI_EVNT_XGMI_1_NOP_TX = 4
    AMDSMI_EVNT_XGMI_1_REQUEST_TX = 5
    AMDSMI_EVNT_XGMI_1_RESPONSE_TX = 6
    AMDSMI_EVNT_XGMI_1_BEATS_TX = 7
    AMDSMI_EVNT_XGMI_LAST = 7
    AMDSMI_EVNT_XGMI_DATA_OUT_FIRST = 10
    AMDSMI_EVNT_XGMI_DATA_OUT_0 = 10
    AMDSMI_EVNT_XGMI_DATA_OUT_1 = 11
    AMDSMI_EVNT_XGMI_DATA_OUT_2 = 12
    AMDSMI_EVNT_XGMI_DATA_OUT_3 = 13
    AMDSMI_EVNT_XGMI_DATA_OUT_4 = 14
    AMDSMI_EVNT_XGMI_DATA_OUT_5 = 15
    AMDSMI_EVNT_XGMI_DATA_OUT_LAST = AMDSMI_EVNT_XGMI_DATA_OUT_5
    AMDSMI_EVNT_LAST = AMDSMI_EVNT_XGMI_DATA_OUT_LAST


class amdsmi_counter_command_t(c_int):
    """Commands accepted by an event counter."""
    AMDSMI_CNTR_CMD_START = 0
    AMDSMI_CNTR_CMD_STOP = 1


class amdsmi_counter_value_t(Structure):
    """Counter reading: the value plus the enabled/running time fields."""
    _fields_ = [
        ('value', c_uint64),
        ('time_enabled', c_uint64),
        ('time_running', c_uint64),
    ]


class amdsmi_evt_notification_type_t(c_int):
    """Event notification types (1-based ids).

    Every member must be a plain int. A trailing comma after a value would
    silently turn it into a 1-tuple, which breaks arithmetic such as the
    ``i - 1`` shift in AMDSMI_EVENT_MASK_FROM_INDEX.
    """
    AMDSMI_EVT_NOTIF_VMFAULT = 1  # VM page fault
    AMDSMI_EVT_NOTIF_FIRST = AMDSMI_EVT_NOTIF_VMFAULT
    AMDSMI_EVT_NOTIF_THERMAL_THROTTLE = 2
    AMDSMI_EVT_NOTIF_GPU_PRE_RESET = 3
    AMDSMI_EVT_NOTIF_GPU_POST_RESET = 4
    AMDSMI_EVT_NOTIF_LAST = AMDSMI_EVT_NOTIF_GPU_POST_RESET


def AMDSMI_EVENT_MASK_FROM_INDEX(i):
    """Generate the event bitmask for event id ``i``.

    Args:
        i: Integer event id; bit ``i - 1`` is set in the result, so ids
            are 1-based (id 1 maps to mask 0x1).

    Returns:
        A ``c_ulonglong`` holding ``1 << (i - 1)``.

    Raises:
        ValueError: If ``i`` is less than 1 (negative shift count).
    """
    return c_ulonglong(1 << (i - 1))
# Length of the message buffer in amdsmi_evt_notification_data_t
MAX_EVENT_NOTIFICATION_MSG_SIZE = 64


class amdsmi_evt_notification_data_t(Structure):
    """Event notification data returned from event notification API."""

    _fields_ = [
        # Handler of device that corresponds to the event
        ('device_handle', c_void_p),
        # Event type
        ('event', amdsmi_evt_notification_type_t),
        # Event message
        ('message', c_char * MAX_EVENT_NOTIFICATION_MSG_SIZE),
    ]


class amdsmi_temperature_metric_t(c_int):
    """Temperature metrics.

    This enum is used to identify various temperature metrics.
    Corresponding values will be in millidegrees Celsius.
    """

    # Temperature current value
    AMDSMI_TEMP_CURRENT = 0
    # Temperature max value
    AMDSMI_TEMP_MAX = 1
    # Temperature min value
    AMDSMI_TEMP_MIN = 2
    # Temperature hysteresis value for max limit
    # (this is an absolute temperature, not a delta)
    AMDSMI_TEMP_MAX_HYST = 3
    # Temperature hysteresis value for min limit (not a delta)
    AMDSMI_TEMP_MIN_HYST = 4
    # Temperature critical max value, typically greater than corresponding
    # temp_max values
    AMDSMI_TEMP_CRITICAL = 5
    # Temperature hysteresis value for critical limit (not a delta)
    AMDSMI_TEMP_CRITICAL_HYST = 6
    # Temperature emergency max value, for chips supporting more than two
    # upper temperature limits. Must be equal or greater than corresponding
    # temp_crit values.
    AMDSMI_TEMP_EMERGENCY = 7
    # Temperature hysteresis value for emergency limit (not a delta)
    AMDSMI_TEMP_EMERGENCY_HYST = 8
    # Temperature critical min value, typically lower than corresponding
    # temperature min values
    AMDSMI_TEMP_CRIT_MIN = 9
    # Temperature hysteresis value for critical minimum limit (not a delta)
    AMDSMI_TEMP_CRIT_MIN_HYST = 10
    # Temperature offset which is added to the temperature reading by the chip
    AMDSMI_TEMP_OFFSET = 11
    # Historical minimum temperature
    AMDSMI_TEMP_LOWEST = 12
    # Historical maximum temperature
    AMDSMI_TEMP_HIGHEST = 13

    # Range markers
    AMDSMI_TEMP_FIRST = AMDSMI_TEMP_CURRENT
    AMDSMI_TEMP_LAST = AMDSMI_TEMP_HIGHEST


class amdsmi_voltage_metric_t(c_int):
    """Voltage metrics."""

    # Voltage current value
    AMDSMI_VOLT_CURRENT = 0
    # Voltage max value
    AMDSMI_VOLT_MAX = 1
    # Voltage critical min value
    AMDSMI_VOLT_MIN_CRIT = 2
    # Voltage min value
    AMDSMI_VOLT_MIN = 3
    # Voltage critical max value
    AMDSMI_VOLT_MAX_CRIT = 4
    # Average voltage
    AMDSMI_VOLT_AVERAGE = 5
    # Historical minimum voltage
    AMDSMI_VOLT_LOWEST = 6
    # Historical maximum voltage
    AMDSMI_VOLT_HIGHEST = 7

    # Range markers
    AMDSMI_VOLT_FIRST = AMDSMI_VOLT_CURRENT
    AMDSMI_VOLT_LAST = AMDSMI_VOLT_HIGHEST


class amdsmi_voltage_type_t(c_int):
    """Indicates which type of voltage reading should be obtained."""

    # Vddgfx GPU voltage
    AMDSMI_VOLT_TYPE_VDDGFX = 0
    # Invalid type
    AMDSMI_VOLT_TYPE_INVALID = 0xFFFFFFFF

    # Range markers
    AMDSMI_VOLT_TYPE_FIRST = AMDSMI_VOLT_TYPE_VDDGFX
    AMDSMI_VOLT_TYPE_LAST = AMDSMI_VOLT_TYPE_VDDGFX


# Pre-set Profile Selections. These bitmasks can be AND'd with the
# ::amdsmi_power_profile_status_t.available_profiles returned from
# ::amdsmi_dev_power_profile_presets_get to determine which power profiles
# are supported by the system.
class amdsmi_power_profile_preset_masks_t(c_int):
    """Bitmasks identifying the pre-set power profiles."""

    # NOTE(review): AMDSMI_PWR_PROF_PRST_INVALID below is a 64-bit value while
    # this class derives from c_int. The class attributes are plain Python
    # ints, so the value itself is preserved, but a Structure field of this
    # type only occupies sizeof(c_int) bytes -- confirm this matches the size
    # of the C enum.
    AMDSMI_PWR_PROF_PRST_CUSTOM_MASK = 0x1  # Custom Power Profile
    AMDSMI_PWR_PROF_PRST_VIDEO_MASK = 0x2  # Video Power Profile
    AMDSMI_PWR_PROF_PRST_POWER_SAVING_MASK = 0x4  # Power Saving Profile
    AMDSMI_PWR_PROF_PRST_COMPUTE_MASK = 0x8  # Compute Saving Profile
    AMDSMI_PWR_PROF_PRST_VR_MASK = 0x10  # VR Power Profile

    # 3D Full Screen Power Profile
    AMDSMI_PWR_PROF_PRST_3D_FULL_SCR_MASK = 0x20
    AMDSMI_PWR_PROF_PRST_BOOTUP_DEFAULT = 0x40  # Default Boot Up Profile
    AMDSMI_PWR_PROF_PRST_LAST = AMDSMI_PWR_PROF_PRST_BOOTUP_DEFAULT

    # Invalid power profile
    AMDSMI_PWR_PROF_PRST_INVALID = 0xFFFFFFFFFFFFFFFF


class amdsmi_gpu_block_t(c_int):
    """GPU blocks; every valid block is a distinct single-bit mask."""

    # NOTE(review): AMDSMI_GPU_BLOCK_RESERVED does not fit in a c_int field;
    # see the note on amdsmi_power_profile_preset_masks_t -- confirm against
    # the C enum size.
    AMDSMI_GPU_BLOCK_INVALID = 0x0000000000000000  # Used to indicate an invalid block
    AMDSMI_GPU_BLOCK_FIRST = 0x0000000000000001

    AMDSMI_GPU_BLOCK_UMC = AMDSMI_GPU_BLOCK_FIRST  # UMC block
    AMDSMI_GPU_BLOCK_SDMA = 0x0000000000000002  # SDMA block
    AMDSMI_GPU_BLOCK_GFX = 0x0000000000000004  # GFX block
    AMDSMI_GPU_BLOCK_MMHUB = 0x0000000000000008  # MMHUB block
    AMDSMI_GPU_BLOCK_ATHUB = 0x0000000000000010  # ATHUB block
    AMDSMI_GPU_BLOCK_PCIE_BIF = 0x0000000000000020  # PCIE_BIF block
    AMDSMI_GPU_BLOCK_HDP = 0x0000000000000040  # HDP block
    AMDSMI_GPU_BLOCK_XGMI_WAFL = 0x0000000000000080  # XGMI block
    AMDSMI_GPU_BLOCK_DF = 0x0000000000000100  # DF block
    AMDSMI_GPU_BLOCK_SMN = 0x0000000000000200  # SMN block
    AMDSMI_GPU_BLOCK_SEM = 0x0000000000000400  # SEM block
    AMDSMI_GPU_BLOCK_MP0 = 0x0000000000000800  # MP0 block
    AMDSMI_GPU_BLOCK_MP1 = 0x0000000000001000  # MP1 block
    AMDSMI_GPU_BLOCK_FUSE = 0x0000000000002000  # Fuse block

    AMDSMI_GPU_BLOCK_LAST = AMDSMI_GPU_BLOCK_FUSE  # The highest bit position for supported blocks
    AMDSMI_GPU_BLOCK_RESERVED = 0x8000000000000000


class amdsmi_ras_err_state_t(c_int):
    """RAS/ECC error states."""

    AMDSMI_RAS_ERR_STATE_NONE = 0  # No current errors
    AMDSMI_RAS_ERR_STATE_DISABLED = 1  # ECC is disabled
    AMDSMI_RAS_ERR_STATE_PARITY = 2  # ECC errors present, but type unknown
    AMDSMI_RAS_ERR_STATE_SING_C = 3  # Single correctable error
    AMDSMI_RAS_ERR_STATE_MULT_UC = 4  # Multiple uncorrectable errors
    AMDSMI_RAS_ERR_STATE_POISON = 5  # Firmware detected error and isolated page. Treat as uncorrectable.
    AMDSMI_RAS_ERR_STATE_ENABLED = 6  # ECC is enabled

    AMDSMI_RAS_ERR_STATE_LAST = AMDSMI_RAS_ERR_STATE_ENABLED
    AMDSMI_RAS_ERR_STATE_INVALID = 0xFFFFFFFF


class amdsmi_memory_type_t(c_int):
    """Memory types."""

    AMDSMI_MEM_TYPE_FIRST = 0

    AMDSMI_MEM_TYPE_VRAM = AMDSMI_MEM_TYPE_FIRST  # VRAM memory
    AMDSMI_MEM_TYPE_VIS_VRAM = 1  # VRAM memory that is visible
    AMDSMI_MEM_TYPE_GTT = 2  # GTT memory

    AMDSMI_MEM_TYPE_LAST = AMDSMI_MEM_TYPE_GTT


class amdsmi_freq_ind_t(c_int):
    """Indices into a (min, max) frequency pair."""

    AMDSMI_FREQ_IND_MIN = 0  # Index used for the minimum frequency value
    AMDSMI_FREQ_IND_MAX = 1  # Index used for the maximum frequency value
    AMDSMI_FREQ_IND_INVALID = 0xFFFFFFFF  # An invalid frequency index


class amdsmi_xgmi_status_t(c_int):
    """XGMI error status."""

    AMDSMI_XGMI_STATUS_NO_ERRORS = 0
    AMDSMI_XGMI_STATUS_ERROR = 1
    AMDSMI_XGMI_STATUS_MULTIPLE_ERRORS = 2


# Bit-field type. This must be the ctypes *type* (not an instance) so that it
# can be used in _fields_, POINTER() and argtypes like every other *_t alias.
amdsmi_bit_field_t = c_uint64
amdsmi_bit_field = amdsmi_bit_field_t


# Reserved Memory Page States
class amdsmi_memory_page_status_t(c_int):
    AMDSMI_MEM_PAGE_STATUS_RESERVED = 0  # Reserved. This gpu page is reserved and not available for use
    AMDSMI_MEM_PAGE_STATUS_PENDING = 1  # Pending. This gpu page is marked as bad
    # and will be marked reserved at the next window.
    AMDSMI_MEM_PAGE_STATUS_UNRESERVABLE = 2  # Unable to reserve this page


# Types for IO Link
class AMDSMI_IO_LINK_TYPE(c_int):
    AMDSMI_IOLINK_TYPE_UNDEFINED = 0  # unknown type.
    AMDSMI_IOLINK_TYPE_PCIEXPRESS = 1  # PCI Express
    AMDSMI_IOLINK_TYPE_XGMI = 2  # XGMI
    AMDSMI_IOLINK_TYPE_NUMIOLINKTYPES = 3  # Number of IO Link types
    AMDSMI_IOLINK_TYPE_SIZE = 0xFFFFFFFF  # Max of IO Link types


# The utilization counter type
class AMDSMI_UTILIZATION_COUNTER_TYPE(c_int):
    AMDSMI_UTILIZATION_COUNTER_FIRST = 0  # GFX Activity
    AMDSMI_COARSE_GRAIN_GFX_ACTIVITY = AMDSMI_UTILIZATION_COUNTER_FIRST
    AMDSMI_COARSE_GRAIN_MEM_ACTIVITY = 1  # Memory Activity
    AMDSMI_UTILIZATION_COUNTER_LAST = AMDSMI_COARSE_GRAIN_MEM_ACTIVITY


# The utilization counter data
# NOTE(review): this structure previously carried the reserved-memory-page
# fields (page_address/page_size/status), which belong to the page record
# below. The (type, value) layout follows the C API's
# amdsmi_utilization_counter_t -- confirm against amdsmi.h.
class amdsmi_utilization_counter_t(Structure):
    _fields_ = [
        ('type', AMDSMI_UTILIZATION_COUNTER_TYPE),  # Utilization counter type
        ('value', c_uint64),  # Utilization counter value
    ]


# Reserved Memory Page Record
class amdsmi_retired_page_record_t(Structure):
    _fields_ = [
        ('page_address', c_uint64),
        ('page_size', c_uint64),
        ('status', amdsmi_memory_page_status_t),
    ]


# Number of possible power profiles that a system could support
# (one per bit of amdsmi_bit_field_t)
AMDSMI_MAX_NUM_POWER_PROFILES = (sizeof(amdsmi_bit_field_t) * 8)


# This structure contains information about which power profiles are
# supported by the system for a given device, and which power profile is
# currently active.
class amdsmi_power_profile_status_t(Structure):
    _fields_ = [
        ('available_profiles', c_uint64),  # Which profiles are supported by this system
        ('current', amdsmi_power_profile_preset_masks_t),  # Which power profile is currently active
        ('num_profiles', c_uint32)  # How many power profiles are available
    ]


# This structure holds information about clock frequencies.
class amdsmi_frequencies_t(Structure):
    _fields_ = [
        ('num_supported', c_uint32),  # The number of supported frequencies
        ('current', c_uint32),  # The current frequency index
        ('frequency', c_uint64 * AMDSMI_MAX_NUM_FREQUENCIES),  # List of frequencies.
        # Only the first num_supported frequencies are valid.
    ]


# This structure holds information about the possible PCIe
# bandwidths. Specifically, the possible transfer rates and their
# associated numbers of lanes are stored here.
class amdsmi_pcie_bandwidth_t(Structure):
    """Possible PCIe transfer rates and their associated numbers of lanes."""

    _fields_ = [
        # Transfer rates (T/s) that are possible
        ('transfer_rate', amdsmi_frequencies_t),
        # List of lanes for corresponding transfer rate.
        # Only the first num_supported bandwidths are valid.
        ('lanes', c_uint32 * AMDSMI_MAX_NUM_FREQUENCIES),
    ]


class amdsmi_version_t(Structure):
    """Version information."""

    _fields_ = [
        # Major version
        ('major', c_uint32),
        # Minor version
        ('minor', c_uint32),
        # Patch, build or stepping version
        ('patch', c_uint32),
        # Build string
        ('build', c_char_p),
    ]


class amdsmi_od_vddc_point_t(Structure):
    """A point on the frequency-voltage plane."""

    _fields_ = [
        # Frequency coordinate (in Hz)
        ('frequency', c_uint64),
        # Voltage coordinate (in mV)
        ('voltage', c_uint64),
    ]


class amdsmi_freq_volt_region_t(Structure):
    """Two ::amdsmi_range_t's, one for frequency and one for voltage.

    These 2 ranges indicate the range of possible values for the
    corresponding ::amdsmi_od_vddc_point_t.
    """

    _fields_ = [
        # The frequency range for this VDDC Curve point
        ('freq_range', amdsmi_range_t),
        # The voltage range for this VDDC Curve point
        ('volt_range', amdsmi_range_t),
    ]


class amdsmi_od_volt_curve_t(Structure):
    """Voltage-frequency curve.

    Array of ::AMDSMI_NUM_VOLTAGE_CURVE_POINTS ::amdsmi_od_vddc_point_t's
    that make up the voltage frequency curve points.
    """

    _fields_ = [
        ('vc_points', amdsmi_od_vddc_point_t * AMDSMI_NUM_VOLTAGE_CURVE_POINTS)
    ]


# This structure holds the frequency-voltage values for a device.
class amdsmi_od_volt_freq_data_t(Structure):
    """Frequency-voltage values for a device."""

    _fields_ = [
        # The current SCLK frequency range
        ('curr_sclk_range', amdsmi_range_t),
        # The current MCLK frequency range; (upper bound only)
        ('curr_mclk_range', amdsmi_range_t),
        # The range possible of SCLK values
        ('sclk_freq_limits', amdsmi_range_t),
        # The range possible of MCLK values
        ('mclk_freq_limits', amdsmi_range_t),
        # The current voltage curve
        ('curve', amdsmi_od_volt_curve_t),
        # The number of voltage curve regions
        ('num_regions', c_uint32),
    ]


# The following structures hold the gpu metrics values for a device.

class amd_metrics_table_header_t(Structure):
    """Size and version information of metrics data."""

    _fields_ = [
        ('structure_size', c_uint16),
        ('format_revision', c_ubyte),
        ('content_revision', c_ubyte),
    ]


# GPU metrics format / content revisions
AMDSMI_GPU_METRICS_API_FORMAT_VER = 1
AMDSMI_GPU_METRICS_API_CONTENT_VER_1 = 1
AMDSMI_GPU_METRICS_API_CONTENT_VER_2 = 2
AMDSMI_GPU_METRICS_API_CONTENT_VER_3 = 3

# This should match NUM_HBM_INSTANCES
AMDSMI_NUM_HBM_INSTANCES = 4
# Unit conversion factor for HBM temperatures
CENTRIGRADE_TO_MILLI_CENTIGRADE = 1000


class amdsmi_gpu_metrics_t(Structure):
    """GPU metrics table: header followed by the metric fields."""

    _fields_ = [
        ('common_header', amd_metrics_table_header_t),

        # Temperature
        ('temperature_edge', c_uint16),
        ('temperature_hotspot', c_uint16),
        ('temperature_mem', c_uint16),
        ('temperature_vrgfx', c_uint16),
        ('temperature_vrsoc', c_uint16),
        ('temperature_vrmem', c_uint16),

        # Utilization
        ('average_gfx_activity', c_uint16),
        ('average_umc_activity', c_uint16),
        ('average_mm_activity', c_uint16),

        # Power/Energy
        ('average_socket_power', c_uint16),
        ('energy_accumulator', c_uint64),

        # Driver attached timestamp (in ns)
        ('system_clock_counter', c_uint64),

        # Average clocks
        ('average_gfxclk_frequency', c_uint16),
        ('average_socclk_frequency', c_uint16),
        ('average_uclk_frequency', c_uint16),
        ('average_vclk0_frequency', c_uint16),
        ('average_dclk0_frequency', c_uint16),
        ('average_vclk1_frequency', c_uint16),
        ('average_dclk1_frequency', c_uint16),

        # Current clocks
        ('current_gfxclk', c_uint16),
        ('current_socclk', c_uint16),
        ('current_uclk', c_uint16),
        ('current_vclk0', c_uint16),
        ('current_dclk0', c_uint16),
        ('current_vclk1', c_uint16),
        ('current_dclk1', c_uint16),

        # Throttle status
        ('throttle_status', c_uint32),

        # Fans
        ('current_fan_speed', c_uint16),

        # Link width/speed
        # v1 mod. (8->16)
        ('pcie_link_width', c_uint16),
        # in 0.1 GT/s; v1 mod. (8->16)
        ('pcie_link_speed', c_uint16),

        # The remaining fields are new in v1
        ('padding', c_uint16),
        ('gfx_activity_acc', c_uint32),
        ('mem_actvity_acc', c_uint32),
        ('temperature_hbm', c_uint16 * AMDSMI_NUM_HBM_INSTANCES),
    ]


class amdsmi_error_count_t(Structure):
    """Error counts."""

    _fields_ = [
        # Accumulated correctable errors
        ('correctable_count', c_uint64),
        # Accumulated uncorrectable errors
        ('uncorrectable_count', c_uint64),
    ]


class amdsmi_pcie_info_t(Structure):
    """PCIe info."""

    _fields_ = [
        ('pcie_lanes', c_uint16),
        ('pcie_speed', c_uint16),
    ]


class amdsmi_process_info_t(Structure):
    """Per-process usage information."""

    _fields_ = [
        # Process ID
        ('process_id', c_uint32),
        # PASID
        ('pasid', c_uint32),
        # VRAM usage
        ('vram_usage', c_uint64),
        # SDMA usage in microseconds
        ('sdma_usage', c_uint64),
        # Compute Unit usage in percent
        ('cu_occupancy', c_uint32),
    ]


class amdsmi_func_id_iter_handle(Structure):
    """Opaque handle to function-support object."""


amdsmi_func_id_iter_handle_t = POINTER(amdsmi_func_id_iter_handle)

# Place-holder "variant" for functions that don't have any variants,
# but do have monitors or sensors.
+ +AMDSMI_DEFAULT_VARIANT = 0xFFFFFFFFFFFFFFFF + +class submodule_union(Union): + _fields_ = [ + ('memory_type', amdsmi_memory_type_t), + ('temp_metric', amdsmi_temperature_metric_t), + ('evnt_type', amdsmi_event_type_t), + ('evnt_group', amdsmi_event_group_t), + ('clk_type', amdsmi_clk_type_t), + ('fw_block', amdsmi_fw_block_t), + ('gpu_block_type', amdsmi_gpu_block_t), + ] +class amdsmi_func_id_value_t (Union): + _fields_ = [ + ('id', c_uint64), + ('name', c_char_p), + ('submodule', submodule_union) + ] + +amd_id = amdsmi_func_id_value_t \ No newline at end of file diff --git a/example/amd_smi_drm_example.cc b/example/amd_smi_drm_example.cc index 69d9ef12b0..b4228dc3d9 100644 --- a/example/amd_smi_drm_example.cc +++ b/example/amd_smi_drm_example.cc @@ -259,6 +259,8 @@ int main() { // Get device type. Since the amdsmi is initialized with // AMD_SMI_INIT_AMD_GPUS, the device_type must be AMD_GPU. device_type_t device_type = {}; + std::cout << "Device Handle: " << device_handles[j] << std::endl; + ret = amdsmi_get_device_type(device_handles[j], &device_type); CHK_AMDSMI_RET(ret) if (device_type != AMD_GPU) {