#!/usr/bin/env python3 # # Copyright (C) Advanced Micro Devices. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy of # this software and associated documentation files (the "Software"), to deal in # the Software without restriction, including without limitation the rights to # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of # the Software, and to permit persons to whom the Software is furnished to do so, # subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS # FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. import argparse import errno import os import sys import time import collections from amdsmi import amdsmi_interface from typing import Optional from typing import Union from pathlib import Path from _version import __version__ from amdsmi_helpers import AMDSMIHelpers import amdsmi_cli_exceptions # Custom Help Formatter for increasing the action max length class AMDSMIParserHelpFormatter(argparse.HelpFormatter): def __init__(self, prog): super().__init__(prog=prog, indent_increment=2, max_help_position=24, width=90) self._action_max_length = 20 # Custom Help Formatter for not duplicating the metavar in the subparsers class AMDSMISubparserHelpFormatter(argparse.RawTextHelpFormatter): def __init__(self, prog): super().__init__(prog, indent_increment=2, max_help_position=80, width=90) self._action_max_length = 20 def _format_action_invocation(self, action): if not action.option_strings or action.nargs == 0: return super()._format_action_invocation(action) default = self._get_default_metavar_for_optional(action) args_string = self._format_args(action, default) return ', '.join(action.option_strings) + ' ' + args_string class AMDSMIParser(argparse.ArgumentParser): """Unified Parser for AMDSMI CLI. This parser doesn't access amdsmi's lib directly,but via AMDSMIHelpers, this allows for us to use this parser with future OS & Platform integration. Args: argparse (ArgumentParser): argparse.ArgumentParser """ def __init__(self, version, list, static, firmware, bad_pages, metric, process, profile, event, topology, set_value, reset, monitor, xgmi, partition, ras, node, rocm_smi, default, sys_argv=None, helpers=None): # Helper variables if helpers is None: # If helpers is not provided, create a new instance self.helpers = AMDSMIHelpers() else: self.helpers = helpers # Get choices based on driver initialized if self.helpers.is_amdgpu_initialized(): self.gpu_choices, self.gpu_choices_str = self.helpers.get_gpu_choices() else: self.gpu_choices = {} self.gpu_choices_str = "" if self.helpers.is_amd_hsmp_initialized(): self.cpu_choices, self.cpu_choices_str = self.helpers.get_cpu_choices() self.core_choices, self.core_choices_str = self.helpers.get_core_choices() else: self.cpu_choices = {} self.cpu_choices_str = "" self.core_choices = {} self.core_choices_str = "" self.vf_choices = ['3', '2', '1'] self.version_string = f"Version: {__version__}" self.platform_string = f"Platform: {self.helpers.os_info()}" self.rocm_version = self.helpers.get_rocm_version() self.rocm_version_string = f"ROCm version: {self.rocm_version}" self.program_name = 'amd-smi' self.description = f"AMD System Management Interface | {self.version_string} | {self.rocm_version_string} | {self.platform_string}" # Adjust argument parser options super().__init__( formatter_class= lambda prog: AMDSMIParserHelpFormatter(prog), description=self.description, epilog="For detailed help on specific commands: amd-smi [command] -h", add_help=True, prog=self.program_name) # Add top-level --rocm-smi flag self.add_argument('--rocm-smi', action='store_true', help='Display GPU information in ROCm-SMI compatible format') # Add top-level command modifiers (for --rocm-smi and other top-level flags) loglevel_choices = ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"] loglevel_help = f"Set the logging level from the possible choices: {', '.join(loglevel_choices)}" self.add_argument('--loglevel', action='store', type=str.upper, required=False, help=loglevel_help, default='ERROR', metavar='LEVEL', choices=loglevel_choices) # Setup subparsers self.subparsers = self.add_subparsers( title="AMD-SMI Commands", parser_class=argparse.ArgumentParser, help="Descriptions:", metavar='') # Store possible subcommands & aliases for later errors self.possible_commands = ['version', 'list', 'static', 'firmware', 'ucode', 'bad-pages', 'metric', 'process', 'profile', 'event', 'topology', 'set', 'reset', 'monitor', 'dmon', 'xgmi', 'partition', 'ras', 'node', 'default'] # Add all subparsers if sys_argv is not None: if any(arg in sys_argv for arg in ['--help', '-h']): self._add_version_parser(self.subparsers, version) self._add_list_parser(self.subparsers, list) self._add_static_parser(self.subparsers, static) self._add_firmware_parser(self.subparsers, firmware) self._add_bad_pages_parser(self.subparsers, bad_pages) self._add_metric_parser(self.subparsers, metric) self._add_process_parser(self.subparsers, process) self._add_profile_parser(self.subparsers, profile) self._add_event_parser(self.subparsers, event) self._add_topology_parser(self.subparsers, topology) self._add_set_value_parser(self.subparsers, set_value) self._add_reset_parser(self.subparsers, reset) self._add_monitor_parser(self.subparsers, monitor) self._add_xgmi_parser(self.subparsers, xgmi) self._add_partition_parser(self.subparsers, partition) self._add_ras_parser(self.subparsers, ras) self._add_node_parser(self.subparsers, node) elif any(arg in sys_argv for arg in ['version']): self._add_version_parser(self.subparsers, version) elif any(arg in sys_argv for arg in ['list']): self._add_list_parser(self.subparsers, list) elif any(arg in sys_argv for arg in ['static']): self._add_static_parser(self.subparsers, static) elif any(arg in sys_argv for arg in ['firmware', 'ucode']): self._add_firmware_parser(self.subparsers, firmware) elif any(arg in sys_argv for arg in ['bad-pages']): self._add_bad_pages_parser(self.subparsers, bad_pages) elif any(arg in sys_argv for arg in ['metric']): self._add_metric_parser(self.subparsers, metric) elif any(arg in sys_argv for arg in ['process']): self._add_process_parser(self.subparsers, process) elif any(arg in sys_argv for arg in ['profile']): self._add_profile_parser(self.subparsers, profile) elif any(arg in sys_argv for arg in ['event']): self._add_event_parser(self.subparsers, event) elif any(arg in sys_argv for arg in ['topology']): self._add_topology_parser(self.subparsers, topology) elif any(arg in sys_argv for arg in ['set']): self._add_set_value_parser(self.subparsers, set_value) elif any(arg in sys_argv for arg in ['reset']): self._add_reset_parser(self.subparsers, reset) elif any(arg in sys_argv for arg in ['monitor', 'dmon']): self._add_monitor_parser(self.subparsers, monitor) elif any(arg in sys_argv for arg in ['xgmi']): self._add_xgmi_parser(self.subparsers, xgmi) elif any(arg in sys_argv for arg in ['partition']): self._add_partition_parser(self.subparsers, partition) elif any(arg in sys_argv for arg in ['ras']): self._add_ras_parser(self.subparsers, ras) elif any(arg in sys_argv for arg in ['node']): self._add_node_parser(self.subparsers, node) else: # If no subcommand is given, add the default parser self._add_default_parser(self.subparsers, default) def _not_negative_int(self, int_value, sub_arg=None): # Argument type validator if int_value.isdigit(): # Is digit doesn't work on negative numbers return int(int_value) outputformat = self.helpers.get_output_format() if int_value == "": raise amdsmi_cli_exceptions.AmdSmiMissingParameterValueException(sub_arg, outputformat) else: raise amdsmi_cli_exceptions.AmdSmiInvalidParameterValueException(sys.argv[1], int_value, outputformat) def _positive_int(self, int_value, sub_arg=None): # Argument type validator if int_value.isdigit(): # Is digit doesn't work on negative numbers if int(int_value) > 0: return int(int_value) outputformat = self.helpers.get_output_format() if int_value == "": raise amdsmi_cli_exceptions.AmdSmiMissingParameterValueException(sub_arg, outputformat) else: raise amdsmi_cli_exceptions.AmdSmiInvalidParameterValueException(sys.argv[1], int_value, outputformat) def _is_valid_string(self, string_value, sub_arg=None): # Argument type validator # This is for triggering a cli exception if an empty string is detected if string_value: return string_value outputformat = self.helpers.get_output_format() if string_value == "": raise amdsmi_cli_exceptions.AmdSmiMissingParameterValueException(sub_arg, outputformat) else: raise amdsmi_cli_exceptions.AmdSmiInvalidParameterValueException(sys.argv[1], string_value, outputformat) def _is_command_supported(self, user_input, acceptable_values, command_name): if acceptable_values == "N/A": outputformat = self.helpers.get_output_format() raise amdsmi_cli_exceptions.AmdSmiPermissionDeniedException(command_name, outputformat) elif str(user_input).upper() not in acceptable_values: print(f"Valid inputs are {acceptable_values}") raise amdsmi_cli_exceptions.AmdSmiInvalidParameterValueException(sys.argv[1], str(user_input).upper(), self.helpers.get_output_format()) else: return str(user_input).upper() def _limit_select(self): """Custom action for setting clock limits""" output_format = self.helpers.get_output_format() class AMDSMILimitArgs(argparse.Action): def __call__(self, parser: AMDSMIParser, namespace: argparse.Namespace, values: Union[str, list, None], option_string: Optional[str] = None) -> None: # valid values valid_clk_types = ('sclk', 'mclk') valid_lim_types = ('min', 'max') clk_type, lim_type, val = values # Check if the sclk and mclk parameters are valid if clk_type not in valid_clk_types: raise amdsmi_cli_exceptions.AmdSmiInvalidParameterException(sys.argv[1], clk_type, output_format) if lim_type not in valid_lim_types: raise amdsmi_cli_exceptions.AmdSmiInvalidParameterException(sys.argv[1], lim_type, output_format) # Check if the val is a valid integer value if not val.isdigit(): raise amdsmi_cli_exceptions.AmdSmiInvalidParameterValueException(sys.argv[1], val, output_format) val = int(val) if val < 0: raise amdsmi_cli_exceptions.AmdSmiInvalidParameterValueException(sys.argv[1], val, output_format) clk_limit_args = collections.namedtuple('clk_limit_args', ['clk_type', 'lim_type', 'val']) setattr(namespace, self.dest, clk_limit_args(clk_type, lim_type, val)) return AMDSMILimitArgs def _level_select(self): """Custom action for setting clock frequencies to particular performance level""" output_format = self.helpers.get_output_format() class AMDSMIFreqArgs(argparse.Action): def __call__(self, parser: AMDSMIParser, namespace: argparse.Namespace, values: list, option_string: Optional[str] = None) -> None: # valid values valid_clk_types = ('sclk', 'mclk', 'pcie', 'fclk', 'socclk') clk_type = values[0] perf_levels_str = values[1:] # Check if the sclk and mclk parameters are valid if clk_type not in valid_clk_types: raise amdsmi_cli_exceptions.AmdSmiInvalidParameterException(sys.argv[1], clk_type, output_format) perf_levels = [] # Check if every item in perf level is valid for level in perf_levels_str: if not level.isdigit(): raise amdsmi_cli_exceptions.AmdSmiInvalidParameterValueException(sys.argv[1], level, output_format) level = int(level) if level < 0: raise amdsmi_cli_exceptions.AmdSmiInvalidParameterValueException(sys.argv[1], level, output_format) perf_levels.append(level) clk_level_args = collections.namedtuple('clk_level_args', ['clk_type', 'perf_levels']) setattr(namespace, self.dest, clk_level_args(clk_type, perf_levels)) return AMDSMIFreqArgs def _power_cap_options(self): """Custom action for setting power cap options""" output_format = self.helpers.get_output_format() class AMDSMIPowerCapArgs(argparse.Action): def __call__(self, parser: AMDSMIParser, namespace: argparse.Namespace, values: list, option_string: Optional[str] = None) -> None: if len(values) == 1: # Only wattage provided - set all available power cap types power_cap_value = values[0] power_cap_type = None # None means all available sensors elif len(values) == 2: # Both power type and wattage provided power_cap_value = values[0] power_cap_type = values[1] if power_cap_type not in ['ppt0', 'ppt1']: raise amdsmi_cli_exceptions.AmdSmiInvalidParameterException(sys.argv[1], power_cap_type, output_format) else: raise amdsmi_cli_exceptions.AmdSmiInvalidParameterException(sys.argv[1], values, output_format) if not power_cap_value.isdigit(): raise amdsmi_cli_exceptions.AmdSmiInvalidParameterValueException(sys.argv[1], power_cap_value, output_format) power_cap_args = collections.namedtuple('power_cap_args', ['pwr_type', 'watts']) setattr(namespace, self.dest, power_cap_args(power_cap_type, int(power_cap_value))) return AMDSMIPowerCapArgs def _check_folder_path(self): """ Argument action validator: Returns a path to folder from the folder path provided. If the path doesn't exist create it. """ class CheckOutputFilePath(argparse.Action): outputformat = self.helpers.get_output_format() # Checks the values def __call__(self, parser, args, values, option_string=None): path = Path(values) try: path.mkdir(parents=True, exist_ok=True) except OSError as e: raise amdsmi_cli_exceptions.AmdSmiInvalidFilePathException(path, CheckOutputFilePath.outputformat, f"Unable to make '{path}' a folder.") if not path.exists(): raise amdsmi_cli_exceptions.AmdSmiInvalidFilePathException(path, CheckOutputFilePath.outputformat) elif path.is_dir(): setattr(args, self.dest, path) return CheckOutputFilePath def _check_output_file_path(self): """ Argument action validator: Returns a path to a file from the output file path provided. If the path is a directory then create a file within it and return that file path If the path is a file and it exists return the file path If the path is a file and it doesn't exist create and return the file path """ class CheckOutputFilePath(argparse.Action): outputformat = self.helpers.get_output_format() # Checks the values def __call__(self, parser, args, values, option_string=None): path = Path(values) if not path.exists(): if path.parent.is_dir(): path.touch() setattr(args, self.dest, path) return else: raise amdsmi_cli_exceptions.AmdSmiInvalidFilePathException(path, CheckOutputFilePath.outputformat) if path.is_dir(): file_name = f"{int(time.time())}-amdsmi-output" if args.json: file_name += ".json" elif args.csv: file_name += ".csv" else: file_name += ".txt" path = path / file_name path.touch() setattr(args, self.dest, path) elif path.is_file(): # Check if --append or --overwrite flags are present in command line has_append = '--append' in sys.argv has_overwrite = '--overwrite' in sys.argv if has_append or getattr(args, 'append', False): setattr(args, self.dest, path) return if has_overwrite or getattr(args, 'overwrite', False): path.open('w').close() path.touch() setattr(args, self.dest, path) return # Prompt if neither --append nor --overwrite are specified try: resp = input(f"File '{path}' exists. Overwrite (o) / Append (a) / Cancel (N) ? [o/a/N]: ").strip().lower() except Exception: sys.exit('Confirmation not given. Exiting without setting value') if resp in ('a', 'append'): setattr(args, self.dest, path) return elif resp in ('o', 'yes'): path.open('w').close() setattr(args, self.dest, path) return else: # User declined to overwrite raise amdsmi_cli_exceptions.AmdSmiInvalidFilePathException( path, CheckOutputFilePath.outputformat, "User declined to overwrite or append existing file.") else: raise amdsmi_cli_exceptions.AmdSmiInvalidFilePathException(path, CheckOutputFilePath.outputformat) return CheckOutputFilePath def _check_cper_file_path(self): """ Argument action validator: Returns a path to a file from the input file path provided. If the file doesn't exist, is empty, or is invalid, raise an error. """ class _CheckInputFilePath(argparse.Action): # Checks the values outputformat=self.helpers.get_output_format() def __call__(self, parser, args, values, option_string=None): path = Path(values) try: if not path.exists(): raise FileNotFoundError(f"CPER file could not be read. Make sure the path '{path}' is correct.") if path.is_dir(): raise IsADirectoryError(f"Invalid Path: {path} is a directory when it needs to be a specific file.") if path.is_file(): if os.stat(values).st_size == 0: raise ValueError(f"Invalid Path: {path} Input file is empty.") setattr(args, self.dest, path) else: raise FileNotFoundError(f"Invalid Path: {path} Could not determine if the value given is a valid path.") except Exception as root_cause: raise amdsmi_cli_exceptions.AmdSmiInvalidFilePathException(path, _CheckInputFilePath.outputformat) from root_cause return _CheckInputFilePath def _check_watch_selected(self): """ Validate that the -w/--watch argument was selected This is because -W/--watch_time and -i/--iterations are dependent on watch """ class WatchSelectedAction(argparse.Action): # Checks the values def __call__(self, parser, args, values, option_string=None): if args.watch is None: raise argparse.ArgumentError(self, f"invalid argument: '{self.dest}' needs to be paired with -w/--watch") else: setattr(args, self.dest, values) return WatchSelectedAction def _gpu_select(self, gpu_choices): """ Custom argparse action to return the device handle(s) for the gpu(s) selected This will set the destination (args.gpu) to a list of 1 or more device handles If 1 or more device handles are not found then raise an ArgumentError for the first invalid gpu seen """ amdsmi_helpers = self.helpers class _GPUSelectAction(argparse.Action): outputformat=self.helpers.get_output_format() # Checks the values def __call__(self, parser, args, values, option_string=None): if "all" in gpu_choices: del gpu_choices["all"] status, gpu_format, selected_device_handles = amdsmi_helpers.get_device_handles_from_gpu_selections(gpu_selections=values, gpu_choices=gpu_choices) if status: setattr(args, self.dest, selected_device_handles) else: if selected_device_handles == '': raise amdsmi_cli_exceptions.AmdSmiMissingParameterValueException("--gpu", _GPUSelectAction.outputformat) elif not gpu_format: raise amdsmi_cli_exceptions.AmdSmiInvalidParameterValueException(sys.argv[1], selected_device_handles, _GPUSelectAction.outputformat) else: raise amdsmi_cli_exceptions.AmdSmiDeviceNotFoundException(selected_device_handles, _GPUSelectAction.outputformat, True, False, False) return _GPUSelectAction def _cpu_select(self, cpu_choices): """ Custom argparse action to return the device handle(s) for the cpu(s) selected This will set the destination (args.cpu) to a list of 1 or more device handles If 1 or more device handles are not found then raise an ArgumentError for the first invalid cpu seen """ amdsmi_helpers = self.helpers class _CPUSelectAction(argparse.Action): outputformat=self.helpers.get_output_format() # Checks the values def __call__(self, parser, args, values, option_string=None): if "all" in cpu_choices: del cpu_choices["all"] status, cpu_format, selected_device_handles = amdsmi_helpers.get_device_handles_from_cpu_selections(cpu_selections=values, cpu_choices=cpu_choices) if status: setattr(args, self.dest, selected_device_handles) else: if selected_device_handles == '': raise amdsmi_cli_exceptions.AmdSmiMissingParameterValueException("--cpu", _CPUSelectAction.outputformat) elif not cpu_format: raise amdsmi_cli_exceptions.AmdSmiInvalidParameterValueException(sys.argv[1], selected_device_handles, _CPUSelectAction.outputformat) else: raise amdsmi_cli_exceptions.AmdSmiDeviceNotFoundException(selected_device_handles, _CPUSelectAction.outputformat, False, True, False) return _CPUSelectAction def _core_select(self, core_choices): """ Custom argparse action to return the device handle(s) for the core(s) selected This will set the destination (args.core) to a list of 1 or more device handles If 1 or more device handles are not found then raise an ArgumentError for the first invalid core seen """ amdsmi_helpers = self.helpers class _CoreSelectAction(argparse.Action): outputformat=self.helpers.get_output_format() # Checks the values def __call__(self, parser, args, values, option_string=None): if "all" in core_choices: del core_choices["all"] status, core_format, selected_device_handles = amdsmi_helpers.get_device_handles_from_core_selections(core_selections=values, core_choices=core_choices) if status: setattr(args, self.dest, selected_device_handles) else: if selected_device_handles == '': raise amdsmi_cli_exceptions.AmdSmiMissingParameterValueException("--core", _CoreSelectAction.outputformat) elif not core_format: raise amdsmi_cli_exceptions.AmdSmiInvalidParameterValueException(sys.argv[1], selected_device_handles, _CoreSelectAction.outputformat) else: raise amdsmi_cli_exceptions.AmdSmiDeviceNotFoundException(selected_device_handles, _CoreSelectAction.outputformat, False, False, True) return _CoreSelectAction def _add_watch_arguments(self, subcommand_parser): # Device arguments help text watch_help = "Reprint the command in a loop of INTERVAL seconds" watch_time_help = "The total TIME to watch the given command" iterations_help = "Total number of ITERATIONS to loop on the given command" # Mutually Exclusive Args within the subparser subcommand_parser.add_argument('-w', '--watch', action='store', metavar='INTERVAL', type=lambda value: self._positive_int(value, '--watch'), required=False, help=watch_help) subcommand_parser.add_argument('-W', '--watch_time', action=self._check_watch_selected(), metavar='TIME', type=lambda value: self._positive_int(value, '--watch_time'), required=False, help=watch_time_help) subcommand_parser.add_argument('-i', '--iterations', action=self._check_watch_selected(), metavar='ITERATIONS', type=lambda value: self._positive_int(value, '--iterations'), required=False, help=iterations_help) def _validate_cpu_core(self, value): if value == '': outputformat = self.helpers.get_output_format() raise amdsmi_cli_exceptions.AmdSmiMissingParameterValueException(value, outputformat) if isinstance(value, str): if value.lower() == "all": return value if value.isdigit(): if int(value) < 0: outputformat = self.helpers.get_output_format() raise amdsmi_cli_exceptions.AmdSmiInvalidParameterValueException(sys.argv[1], value, outputformat) else: outputformat = self.helpers.get_output_format() raise amdsmi_cli_exceptions.AmdSmiInvalidParameterValueException(sys.argv[1], value, outputformat) if isinstance(value, int): if int(value) < 0: outputformat = self.helpers.get_output_format() raise amdsmi_cli_exceptions.AmdSmiInvalidParameterValueException(sys.argv[1], value, outputformat) return value def _validate_set_clock(self, validate_clock_type=True): """ Validate Clock input""" amdsmi_helpers = self.helpers class _ValidateClockType(argparse.Action): # Checks the clock type and clock values def __call__(self, parser, args, values, option_string=None): if validate_clock_type: clock_type = values[0] clock_types = amdsmi_helpers.get_clock_types()[0] valid_clock_type, amdsmi_clock_type = amdsmi_helpers.validate_clock_type(input_clock_type=clock_type) if not valid_clock_type: raise argparse.ArgumentError(self, f"Invalid argument: '{clock_type}' needs to be a valid clock type:{clock_types}") clock_levels = values[1:] else: clock_levels = values freq_bitmask = 0 for level in clock_levels: if level > 63: raise argparse.ArgumentError(self, f"Invalid argument: '{level}' needs to be a valid clock level 0-63") freq_bitmask |= (1 << level) if validate_clock_type: setattr(args, self.dest, (amdsmi_clock_type, freq_bitmask)) else: setattr(args, self.dest, freq_bitmask) return _ValidateClockType def _prompt_spec_warning(self): """ Prompt out of spec warning""" amdsmi_helpers = self.helpers class _PromptSpecWarning(argparse.Action): # Checks the values def __call__(self, parser, args, values, option_string=None): amdsmi_helpers.confirm_out_of_spec_warning() setattr(args, self.dest, values) return _PromptSpecWarning @staticmethod def _custom_ceil(x): """ Custom ceiling function to round up float values to the nearest integer. This is used to ensure that fan speed percentages are rounded up correctly. """ if x == int(x): # If x is already an integer return int(x) elif x > 0: # For positive numbers, floor division + 1 return int(x) + 1 else: # For negative numbers, floor division directly gives the ceiling return int(x) def _validate_fan_speed(self): """ Validate fan speed input""" amdsmi_helpers = self.helpers class _ValidateFanSpeed(argparse.Action): # Checks the values def __call__(self, parser, args, values, option_string=None): if isinstance(values, str): # Convert percentage to fan level if '%' in values: try: amdsmi_helpers.confirm_out_of_spec_warning() # Convert percentage to fan speed level values = (int(values[:-1]) / 100) * 255 values = AMDSMIParser._custom_ceil(values) # Round up (Ceiling) setattr(args, self.dest, values) except ValueError as e: raise argparse.ArgumentError(self, f"Invalid argument: '{values}' needs to be 0-100%") else: # Store the fan level as fan_speed values = int(values) if 0 <= values <= 255: amdsmi_helpers.confirm_out_of_spec_warning() setattr(args, self.dest, values) else: raise argparse.ArgumentError(self, f"Invalid argument: '{values}' needs to be 0-255") else: raise argparse.ArgumentError(self, f"Invalid argument: '{values}' needs to be 0-255 or 0-100%") return _ValidateFanSpeed def _validate_overdrive_percent(self): """ Validate overdrive percentage input""" amdsmi_helpers = self.helpers class _ValidateOverdrivePercent(argparse.Action): # Checks the values def __call__(self, parser, args, values, option_string=None): if isinstance(values, str): try: if values[-1] == '%': over_drive_percent = int(values[:-1]) else: over_drive_percent = int(values) if 0 <= over_drive_percent <= 20: amdsmi_helpers.confirm_out_of_spec_warning() setattr(args, self.dest, over_drive_percent) else: raise argparse.ArgumentError(self, f"Invalid argument: '{values}' needs to be within range 0-20 or 0-20%") except ValueError: raise argparse.ArgumentError(self, f"Invalid argument: '{values}' needs to be 0-20 or 0-20%") else: raise argparse.ArgumentError(self, f"Invalid argument: '{values}' needs to be 0-20 or 0-20%") return _ValidateOverdrivePercent def _validate_ptl_format(self): """Validate and normalize --ptl-format FRMT1,FRMT2.""" valid_names = [name for name in amdsmi_interface.AmdSmiPtlData.__members__ if name != 'INVALID'] class _ValidatePtlFormat(argparse.Action): def __call__(self, parser, args, values, option_string=None): if not isinstance(values, str): raise argparse.ArgumentError( self, f"Invalid argument for {option_string}: '{values}' " f"(expected string like I8,F32)" ) parts = values.split(',') if len(parts) != 2: raise argparse.ArgumentError( self, f"{option_string} expects exactly two comma-separated formats, " f"e.g. I8,F32 (got '{values}')" ) enums = [] for raw in parts: token = raw.strip().upper() if token == '': raise argparse.ArgumentError( self, f"Empty PTL format in '{values}'. Expected formats like I8,F32." ) try: enum_val = amdsmi_interface.AmdSmiPtlData[token] except KeyError: raise argparse.ArgumentError( self, f"Invalid PTL format '{raw}'. Valid formats are: " + ", ".join(valid_names) ) if enum_val == amdsmi_interface.AmdSmiPtlData.INVALID: raise argparse.ArgumentError( self, f"INVALID is not a usable PTL format." ) enums.append(enum_val) if enums[0] == enums[1]: raise argparse.ArgumentError( self, f"PTL formats must be different (got '{values}')." ) setattr(args, self.dest, (enums[0], enums[1])) return _ValidatePtlFormat ### Building parsers ### def _add_device_arguments(self, subcommand_parser: argparse.ArgumentParser, required=False): # Device arguments help text gpu_help = f"Select a GPU ID, BDF, or UUID from the possible choices:\n{self.gpu_choices_str}" vf_help = "Gets general information about the specified VF (timeslice, fb info, …).\ \nAvailable only on virtualization OSs" cpu_help = f"Select a CPU ID from the possible choices:\n{self.cpu_choices_str}" core_help = f"Select a Core ID from the possible choices:\n{self.core_choices_str}" # Create argument group for all the devices device_group = subcommand_parser.add_argument_group('Device Arguments') # Mutually Exclusive Args within the subparser device_args = device_group.add_mutually_exclusive_group(required=required) if self.helpers.is_amdgpu_initialized(): device_args.add_argument('-g', '--gpu', action=self._gpu_select(self.gpu_choices), nargs='+', help=gpu_help) if self.helpers.is_amd_hsmp_initialized(): device_args.add_argument('-U', '--cpu', type=self._validate_cpu_core, action=self._cpu_select(self.cpu_choices), nargs='+', help=cpu_help) if subcommand_parser._optionals.title != "Static Arguments": device_args.add_argument('-O', '--core', type=self._validate_cpu_core, action=self._core_select(self.core_choices), nargs='+', help=core_help) if self.helpers.is_hypervisor(): device_args.add_argument('-v', '--vf', action='store', nargs='+', help=vf_help, choices=self.vf_choices) def _add_command_modifiers(self, subcommand_parser: argparse.ArgumentParser): json_help = "Displays output in JSON format" csv_help = "Displays output in CSV format" file_help = "Saves output into a file on the provided path" loglevel_choices = ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"] loglevel_choices_str = ", ".join(loglevel_choices) loglevel_help = ( f"Set the logging level from the possible choices:\n " f"{loglevel_choices_str}" ) command_modifier_group = subcommand_parser.add_argument_group('Command Modifiers') # Output Format options logging_args = command_modifier_group.add_mutually_exclusive_group() logging_args.add_argument('--json', action='store_true', required=False, help=json_help) logging_args.add_argument('--csv', action='store_true', required=False, help=csv_help) command_modifier_group.add_argument('--file', action=self._check_output_file_path(), type=str, required=False, help=file_help) command_modifier_group.add_argument('--overwrite', action='store_true', required=False, help="Overwrite the file") command_modifier_group.add_argument('--append', action='store_true', required=False, help="Append to the file") # Placing loglevel outside the subcommands so it can be used with any subcommand command_modifier_group.add_argument('--loglevel', action='store', type=str.upper, required=False, help=loglevel_help, default='ERROR', metavar='LEVEL', choices=loglevel_choices) return command_modifier_group def _add_watch_arguments(self, subcommand_parser: argparse.ArgumentParser): # Device arguments help text watch_help = "Reprint the command in a loop of INTERVAL seconds" watch_time_help = "The total duration of TIME to watch the command" iterations_help = "The total number of ITERATIONS to repeat the command" watch_arguments_group = subcommand_parser.add_argument_group('Watch Arguments') # Mutually Exclusive Args within the subparser watch_arguments_group.add_argument('-w', '--watch', action='store', metavar='INTERVAL', type=lambda value: self._positive_int(value, '--watch'), required=False, help=watch_help) watch_arguments_group.add_argument('-W', '--watch_time', action=self._check_watch_selected(), metavar='TIME', type=lambda value: self._positive_int(value, '--watch_time'), required=False, help=watch_time_help) watch_arguments_group.add_argument('-i', '--iterations', action=self._check_watch_selected(), metavar='ITERATIONS', type=lambda value: self._positive_int(value, '--iterations'), required=False, help=iterations_help) return watch_arguments_group def _add_default_parser(self, subparsers: argparse._SubParsersAction, func): # there should be no args to parse here so let this be a dummy function to preserve later logic default_parser = subparsers.add_parser('default', description=None) default_parser._optionals.title = None default_parser.formatter_class=lambda prog: AMDSMISubparserHelpFormatter(prog) default_parser.set_defaults(func=func) # Add Universal Arguments self._add_command_modifiers(default_parser) def _add_version_parser(self, subparsers: argparse._SubParsersAction, func): # Subparser help text version_help = "Display version information" description = self.description # Create version subparser version_parser = subparsers.add_parser('version', help=version_help, description=description) version_parser._optionals.title = None version_parser.formatter_class=lambda prog: AMDSMISubparserHelpFormatter(prog) version_parser.set_defaults(func=func) # Add Universal Arguments self._add_command_modifiers(version_parser) # help info gpu_version_help = "Display the current amdgpu driver version" cpu_version_help = "Display the current amd_hsmp or hsmp_acpi driver version" # Add GPU and CPU version Arguments version_parser.add_argument('-g', '--gpu_version', action='store_true', required=False, help=gpu_version_help, default=None) version_parser.add_argument('-c', '--cpu_version', action='store_true', required=False, help=cpu_version_help, default=None) def _add_list_parser(self, subparsers: argparse._SubParsersAction, func): if not self.helpers.is_amdgpu_initialized(): # The list subcommand is only applicable to systems with amdgpu initialized return # Subparser help text list_help = "List GPU information" list_optionals_title = "List Arguments" list_subcommand_help = f"{self.description}\n\nLists all detected devices on the system.\ \nLists the BDF, UUID, KFD_ID, NODE_ID, and Partition ID for each GPU and/or CPUs.\ \nIn virtualization environments, it can also list VFs associated to each\ \nGPU with some basic information for each VF." enumeration_help = "Enumeration mapping to other features.\ \n Includes CARD, RENDER, HSA_ID, HIP_ID, and HIP_UUID." # Create list subparser list_parser = subparsers.add_parser('list', help=list_help, description=list_subcommand_help) list_parser._optionals.title = list_optionals_title list_parser.formatter_class=lambda prog: AMDSMISubparserHelpFormatter(prog) list_parser.set_defaults(func=func) # Create -e subparser list_parser.add_argument("-e", action="store_true", help=enumeration_help) # Add Universal Arguments self._add_device_arguments(list_parser, required=False) self._add_command_modifiers(list_parser) def _add_static_parser(self, subparsers: argparse._SubParsersAction, func): # Subparser help text static_help = "Gets static information about the specified GPU" static_subcommand_help = f"{self.description}\n\nIf no GPU is specified, returns static information for all GPUs on the system.\ \nIf no static argument is provided, all static information will be displayed." static_optionals_title = "Static Arguments" # Optional arguments help text asic_help = "All asic information" bus_help = "All bus information" vbios_help = "All video bios/IFWI information (if available)" limit_help = "All limit metric values (i.e. power and thermal limits)" driver_help = "Displays driver version" vram_help = "All vram information" cache_help = "All cache information" board_help = "All board information" soc_pstate_help = "The available soc pstate policy" xgmi_plpd_help = "The available XGMI per-link power down policy" process_isolation_help = "The process isolation status" profile_help = "Display current and available power profiles" clk_options = self.helpers.get_clock_types()[0] clk_options.remove('PCIE') clk_option_str = ", ".join(clk_options) + ", ALL" clock_help = f"Show one or more valid clock frequency levels. Available options:\n\t{clk_option_str}" # Options arguments help text for Hypervisors and Baremetal # Might be able to remove Sudo requirement in ROCm 7.0 ras_help = "Displays RAS features information;\n\tSudo may be required for some features" numa_help = "All numa node information" # Linux Baremetal only partition_help = "Partition information:\n\t" \ "No longer available in default output.\n\tArgument is required to display." \ "\n\tEx. `amd-smi static -p` or use" \ "\n\t`amd-smi partition -c -m`/`sudo amd-smi partition -a`" # Options arguments help text for Hypervisors dfc_help = "All DFC FW table information" fb_help = "Displays Frame Buffer information" num_vf_help = "Displays number of supported and enabled VFs" # Options arguments help text for CPUs smu_help = "All SMU FW information" interface_help = "Displays hsmp interface version" # Create static subparser static_parser = subparsers.add_parser('static', help=static_help, description=static_subcommand_help) static_parser._optionals.title = static_optionals_title static_parser.formatter_class=lambda prog: AMDSMISubparserHelpFormatter(prog) static_parser.set_defaults(func=func) # Handle GPU Options if self.helpers.is_amdgpu_initialized(): static_parser.add_argument('-a', '--asic', action='store_true', required=False, help=asic_help) static_parser.add_argument('-b', '--bus', action='store_true', required=False, help=bus_help) # Accept vbios args without displaying them static_parser.add_argument('-V', '--vbios', dest='vbios', action='store_true', required=False, help=argparse.SUPPRESS) static_parser.add_argument('-I', '--ifwi', dest='vbios', action='store_true', required=False, help=vbios_help) static_parser.add_argument('-d', '--driver', action='store_true', required=False, help=driver_help) static_parser.add_argument('-v', '--vram', action='store_true', required=False, help=vram_help) static_parser.add_argument('-c', '--cache', action='store_true', required=False, help=cache_help) static_parser.add_argument('-B', '--board', action='store_true', required=False, help=board_help) static_parser.add_argument('-R', '--process-isolation', action='store_true', required=False, help=process_isolation_help) static_parser.add_argument('-r', '--ras', action='store_true', required=False, help=ras_help) static_parser.add_argument('-C', '--clock', action='store', default=False, nargs='*', type=str, required=False, help=clock_help) static_parser.add_argument('-p', '--partition', action='store_true', required=False, help=partition_help) # Options to display on Hypervisors and Baremetal if self.helpers.is_hypervisor() or self.helpers.is_baremetal(): static_parser.add_argument('-l', '--limit', action='store_true', required=False, help=limit_help) static_parser.add_argument('-P', '--soc-pstate', action='store_true', required=False, help=soc_pstate_help) static_parser.add_argument('-x', '--xgmi-plpd', action='store_true', required=False, help=xgmi_plpd_help) static_parser.add_argument('-o', '--profile', action='store_true', required=False, help=profile_help) if self.helpers.is_linux() and not self.helpers.is_virtual_os(): static_parser.add_argument('-u', '--numa', action='store_true', required=False, help=numa_help) # Options to only display on a Hypervisor TODO: Add hypervisor driver check if self.helpers.is_hypervisor(): static_parser.add_argument('-d', '--dfc-ucode', action='store_true', required=False, help=dfc_help) static_parser.add_argument('-f', '--fb-info', action='store_true', required=False, help=fb_help) static_parser.add_argument('-n', '--num-vf', action='store_true', required=False, help=num_vf_help) # Handle CPU Options if self.helpers.is_amd_hsmp_initialized(): cpu_group = static_parser.add_argument_group("CPU Arguments") cpu_group.add_argument('-s', '--smu', action='store_true', required=False, help=smu_help) cpu_group.add_argument('-i', '--interface-ver', action='store_true', required=False, help=interface_help) # Add Universal Arguments self._add_device_arguments(static_parser, required=False) self._add_command_modifiers(static_parser) def _add_firmware_parser(self, subparsers: argparse._SubParsersAction, func): if not self.helpers.is_amdgpu_initialized(): # The firmware subcommand is only applicable to systems with amdgpu initialized return # Subparser help text firmware_help = "Gets firmware information about the specified GPU" firmware_subcommand_help = f"{self.description}\n\nIf no GPU is specified, return firmware information for all GPUs on the system." firmware_optionals_title = "Firmware Arguments" # Optional arguments help text fw_list_help = "All FW list information" err_records_help = "All error records information" # Create firmware subparser firmware_parser = subparsers.add_parser('firmware', help=firmware_help, description=firmware_subcommand_help, aliases=['ucode']) firmware_parser._optionals.title = firmware_optionals_title firmware_parser.formatter_class=lambda prog: AMDSMISubparserHelpFormatter(prog) firmware_parser.set_defaults(func=func) # Optional Args firmware_parser.add_argument('-f', '--ucode-list', '--fw-list', dest='fw_list', action='store_true', required=False, help=fw_list_help, default=True) # Options to only display on a Hypervisor if self.helpers.is_hypervisor(): firmware_parser.add_argument('-e', '--error-records', action='store_true', required=False, help=err_records_help) # Add Universal Arguments self._add_device_arguments(firmware_parser, required=False) self._add_command_modifiers(firmware_parser) def _add_bad_pages_parser(self, subparsers: argparse._SubParsersAction, func): if not (self.helpers.is_baremetal() and self.helpers.is_linux()): # The bad_pages subcommand is only applicable to Linux Baremetal systems return if not self.helpers.is_amdgpu_initialized(): # The bad_pages subcommand is only applicable to systems with amdgpu initialized return # Subparser help text bad_pages_help = "Gets bad page information about the specified GPU" bad_pages_subcommand_help = f"{self.description}\n\nIf no GPU is specified, return bad page information for all GPUs on the system." bad_pages_optionals_title = "Bad Pages Arguments" # Optional arguments help text pending_help = "Displays all pending retired pages" retired_help = "Displays retired pages" un_res_help = "Displays unreservable pages" hex_help = "Displays page addresses and sizes in hexadecimal format" # Create bad_pages subparser bad_pages_parser = subparsers.add_parser('bad-pages', help=bad_pages_help, description=bad_pages_subcommand_help) bad_pages_parser._optionals.title = bad_pages_optionals_title bad_pages_parser.formatter_class=lambda prog: AMDSMISubparserHelpFormatter(prog) bad_pages_parser.set_defaults(func=func) # Optional Args bad_pages_parser.add_argument('-p', '--pending', action='store_true', required=False, help=pending_help) bad_pages_parser.add_argument('-r', '--retired', action='store_true', required=False, help=retired_help) bad_pages_parser.add_argument('-u', '--un-res', action='store_true', required=False, help=un_res_help) bad_pages_parser.add_argument('-x', '--hex', action='store_true', required=False, help=hex_help) # Add Universal Arguments self._add_device_arguments(bad_pages_parser, required=False) self._add_command_modifiers(bad_pages_parser) def _add_metric_parser(self, subparsers: argparse._SubParsersAction, func): # Subparser help text metric_help = "Gets metric/performance information about the specified GPU" metric_subcommand_help = f"{self.description}\n\nIf no GPU is specified, returns metric information for all GPUs on the system.\ \nIf no metric argument is provided, all metric information will be displayed." metric_optionals_title = "Metric arguments" # Optional arguments help text usage_help = "Displays engine usage information" # Help text for Arguments only Available on Linux Virtual OS and Baremetal platforms mem_usage_help = "Memory usage per block" # Help text for Arguments only on Hypervisor and Baremetal platforms power_help = "Current power usage" clock_help = "Average, max, and current clock frequencies" temperature_help = "Current temperatures" ecc_help = "Total number of ECC errors" ecc_blocks_help = "Number of ECC errors per block" pcie_help = "Current PCIe speed, width, and replay count" voltage_help = "GPU voltage" base_board_help = "base_board temperatures" gpu_board_help = "gpu_board temperatures" # Help text for Arguments only on Linux Baremetal platforms fan_help = "Current fan speed" vc_help = "Display voltage curve" overdrive_help = "Current GFX and MEM clock overdrive level" perf_level_help = "Current DPM performance level" xgmi_err_help = "XGMI error information since last read" energy_help = "Amount of energy consumed" throttle_help = "Displays throttle accumulators;\n Only available for MI300 or newer ASICs" # Help text for Arguments only on Hypervisors schedule_help = "All scheduling information" guard_help = "All guard information" guest_data_help = "All guest data information" fb_usage_help = "Displays total and used Frame Buffer usage information" xgmi_help = "Table of current XGMI metrics information" # Help text for cpu options cpu_power_metrics_help = "CPU power metrics" cpu_proc_help = "Displays prochot status" cpu_freq_help = "Displays currentFclkMemclk frequencies and cclk frequency limit" cpu_c0_res_help = "Displays C0 residency" cpu_lclk_dpm_help = "Displays lclk dpm level range. Requires socket ID and NBOID as inputs" cpu_pwr_svi_telemetry_rails_help = "Displays svi based telemetry for all rails" cpu_io_bandwidth_help = "Displays current IO bandwidth for the selected CPU.\ \n input parameters are bandwidth type(1) and link ID encodings\ \n i.e. P2, P3, G0 - G7" cpu_xgmi_bandwidth_help = "Displays current XGMI bandwidth for the selected CPU\ \n input parameters are bandwidth type(1,2,4) and link ID encodings\ \n i.e. P2, P3, G0 - G7" cpu_metrics_ver_help = "Displays metrics table version" cpu_metrics_table_help = "Displays metric table" cpu_socket_energy_help = "Displays socket energy for the selected CPU socket" cpu_ddr_bandwidth_help = "Displays per socket max ddr bw, current utilized bw,\ \n and current utilized ddr bw in percentage" cpu_temp_help = "Displays cpu socket temperature" cpu_dimm_temp_range_rate_help = "Displays dimm temperature range and refresh rate" cpu_dimm_pow_consumption_help = "Displays dimm power consumption" cpu_dimm_thermal_sensor_help = "Displays dimm thermal sensor" cpu_dfcstate_ctrl_help = "Displays DFCState control status" cpu_railisofreq_policy_help = "Displays CPU ISO frequency policy" # Help text for core options core_energy_help = "Displays core energy for the selected core" core_boost_limit_help = "Get boost limit for the selected cores" core_curr_active_freq_core_limit_help = "Get Current CCLK limit set per Core" # Create metric subparser metric_parser = subparsers.add_parser('metric', help=metric_help, description=metric_subcommand_help) metric_parser._optionals.title = metric_optionals_title metric_parser.formatter_class=lambda prog: AMDSMISubparserHelpFormatter(prog) metric_parser.set_defaults(func=func) # Optional Args for Linux Virtual OS and Baremetal systems if not self.helpers.is_hypervisor() and not self.helpers.is_windows(): metric_parser.add_argument('-m', '--mem-usage', action='store_true', required=False, help=mem_usage_help) if self.helpers.is_amdgpu_initialized(): # Optional Args for Hypervisors and Baremetal systems if self.helpers.is_hypervisor() or self.helpers.is_baremetal() or self.helpers.is_linux(): metric_parser.add_argument('-u', '--usage', action='store_true', required=False, help=usage_help) metric_parser.add_argument('-p', '--power', action='store_true', required=False, help=power_help) metric_parser.add_argument('-c', '--clock', action='store_true', required=False, help=clock_help) metric_parser.add_argument('-t', '--temperature', action='store_true', required=False, help=temperature_help) metric_parser.add_argument('-P', '--pcie', action='store_true', required=False, help=pcie_help) metric_parser.add_argument('-e', '--ecc', action='store_true', required=False, help=ecc_help) metric_parser.add_argument('-k', '--ecc-blocks', action='store_true', required=False, help=ecc_blocks_help) metric_parser.add_argument('-V', '--voltage', action='store_true', required=False, help=voltage_help) metric_parser.add_argument('-b', '--base-board', action='store_true', required=False, help=base_board_help, default=False) metric_parser.add_argument('-G', '--gpu-board', action='store_true', required=False, help=gpu_board_help, default=False) # Options that only apply to Hypervisors and Baremetal Linux if self.helpers.is_hypervisor() or (self.helpers.is_baremetal() and self.helpers.is_linux()): pass # Optional Args for Linux Baremetal Systems if self.helpers.is_baremetal() and self.helpers.is_linux(): metric_parser.add_argument('-f', '--fan', action='store_true', required=False, help=fan_help) metric_parser.add_argument('-C', '--voltage-curve', action='store_true', required=False, help=vc_help) metric_parser.add_argument('-o', '--overdrive', action='store_true', required=False, help=overdrive_help) metric_parser.add_argument('-l', '--perf-level', action='store_true', required=False, help=perf_level_help) metric_parser.add_argument('-x', '--xgmi-err', action='store_true', required=False, help=xgmi_err_help) metric_parser.add_argument('-E', '--energy', action='store_true', required=False, help=energy_help) metric_parser.add_argument('-v', '--violation', dest='throttle', action='store_true', required=False, help=throttle_help) metric_parser.add_argument('-T', '--throttle', dest='throttle', action='store_true', required=False, help=argparse.SUPPRESS) # Options to only display to Hypervisors # Need to resolve the -G for guard, but technically should never intersect since it's VF only if self.helpers.is_hypervisor(): metric_parser.add_argument('-s', '--schedule', action='store_true', required=False, help=schedule_help) metric_parser.add_argument('-G', '--guard', action='store_true', required=False, help=guard_help) metric_parser.add_argument('-u', '--guest-data', action='store_true', required=False, help=guest_data_help) metric_parser.add_argument('-f', '--fb-usage', action='store_true', required=False, help=fb_usage_help) metric_parser.add_argument('-m', '--xgmi', action='store_true', required=False, help=xgmi_help) if self.helpers.is_amd_hsmp_initialized(): # Optional Args for CPUs cpu_group = metric_parser.add_argument_group("CPU Arguments") cpu_group.add_argument('--cpu-power-metrics', action='store_true', required=False, help=cpu_power_metrics_help) cpu_group.add_argument('--cpu-prochot', action='store_true', required=False, help=cpu_proc_help) cpu_group.add_argument('--cpu-freq-metrics', action='store_true', required=False, help=cpu_freq_help) cpu_group.add_argument('--cpu-c0-res', action='store_true', required=False, help=cpu_c0_res_help) cpu_group.add_argument('--cpu-lclk-dpm-level', action='append', required=False, type=self._not_negative_int, nargs=1, metavar=("NBIOID"), help=cpu_lclk_dpm_help) cpu_group.add_argument('--cpu-pwr-svi-telemetry-rails', action='store_true', required=False, help=cpu_pwr_svi_telemetry_rails_help) cpu_group.add_argument('--cpu-io-bandwidth', action='append', required=False, nargs=2, metavar=("IO_BW", "LINKID_NAME"), help=cpu_io_bandwidth_help) cpu_group.add_argument('--cpu-xgmi-bandwidth', action='append', required=False, nargs=2, metavar=("XGMI_BW", "LINKID_NAME"), help=cpu_xgmi_bandwidth_help) cpu_group.add_argument('--cpu-metrics-ver', action='store_true', required=False, help=cpu_metrics_ver_help) cpu_group.add_argument('--cpu-metrics-table', action='store_true', required=False, help=cpu_metrics_table_help) cpu_group.add_argument('--cpu-socket-energy', action='store_true', required=False, help=cpu_socket_energy_help) cpu_group.add_argument('--cpu-ddr-bandwidth', action='store_true', required=False, help=cpu_ddr_bandwidth_help) cpu_group.add_argument('--cpu-temp', action='store_true', required=False, help=cpu_temp_help) cpu_group.add_argument('--cpu-dimm-temp-range-rate', action='append', required=False, type=lambda x: int(x, 0), nargs=1, metavar=("DIMM_ADDR"), help=cpu_dimm_temp_range_rate_help) cpu_group.add_argument('--cpu-dimm-pow-consumption', action='append', required=False, type=lambda x: int(x, 0), nargs=1, metavar=("DIMM_ADDR"), help=cpu_dimm_pow_consumption_help) cpu_group.add_argument('--cpu-dimm-thermal-sensor', action='append', required=False, type=lambda x: int(x, 0), nargs=1, metavar=("DIMM_ADDR"), help=cpu_dimm_thermal_sensor_help) cpu_group.add_argument('--cpu-dfcstate-ctrl', action='store_true', required=False, help=cpu_dfcstate_ctrl_help) cpu_group.add_argument('--cpu-railisofreq-policy', action='store_true', required=False, help=cpu_railisofreq_policy_help) # Optional Args for CPU cores core_group = metric_parser.add_argument_group("CPU Core Arguments") core_group.add_argument('--core-boost-limit', action='store_true', required=False, help=core_boost_limit_help) core_group.add_argument('--core-curr-active-freq-core-limit', action='store_true', required=False, help=core_curr_active_freq_core_limit_help) core_group.add_argument('--core-energy', action='store_true', required=False, help=core_energy_help) # Add Universal Arguments & watch Args self._add_watch_arguments(metric_parser) self._add_device_arguments(metric_parser, required=False) self._add_command_modifiers(metric_parser) def _add_process_parser(self, subparsers: argparse._SubParsersAction, func): if self.helpers.is_hypervisor(): # Don't add this subparser on Hypervisors # This subparser is only available to Guest and Baremetal systems return if not self.helpers.is_amdgpu_initialized(): # The process subcommand is currently only applicable to systems with amdgpu initialized return # Subparser help text process_help = "Lists compute process information running on the specified GPU" process_subcommand_help = f"{self.description}\n\nIf no GPU is specified, returns information for all GPUs on the system.\ \nIf no process argument is provided, all process information will be displayed." process_optionals_title = "Process arguments" # Optional Arguments help text general_help = "pid, process name, memory usage" engine_help = "All engine usages" pid_help = "Gets compute process GPU information about the specified process based on Process ID" name_help = "Gets compute process GPU information about the specified process based on Process Name.\ \nIf multiple processes have the same name, information is returned for all of them.\ \nProcess Name may require elevated permissions." # Create process subparser process_parser = subparsers.add_parser('process', help=process_help, description=process_subcommand_help) process_parser._optionals.title = process_optionals_title process_parser.formatter_class=lambda prog: AMDSMISubparserHelpFormatter(prog) process_parser.set_defaults(func=func) # Optional Args process_parser.add_argument('-G', '--general', action='store_true', required=False, help=general_help) process_parser.add_argument('-e', '--engine', action='store_true', required=False, help=engine_help) process_parser.add_argument('-p', '--pid', action='store', type=lambda value: self._not_negative_int(value, '--pid'), required=False, help=pid_help) process_parser.add_argument('-n', '--name', action='store', type=lambda value: self._is_valid_string(value, '--name'), required=False, help=name_help) # Add Universal Arguments & watch Args self._add_watch_arguments(process_parser) self._add_device_arguments(process_parser, required=False) self._add_command_modifiers(process_parser) def _add_profile_parser(self, subparsers: argparse._SubParsersAction, func): if not (self.helpers.is_windows() and self.helpers.is_hypervisor()): # This subparser only applies to Hypervisors return # Subparser help text profile_help = "Displays information about all profiles and current profile" profile_subcommand_help = f"{self.description}\n\nIf no GPU is specified, returns information for all GPUs on the system." profile_optionals_title = "Profile Arguments" # Create profile subparser profile_parser = subparsers.add_parser('profile', help=profile_help, description=profile_subcommand_help) profile_parser._optionals.title = profile_optionals_title profile_parser.formatter_class=lambda prog: AMDSMISubparserHelpFormatter(prog) profile_parser.set_defaults(func=func) # Add Universal Arguments self._add_device_arguments(profile_parser, required=False) self._add_command_modifiers(profile_parser) def _add_event_parser(self, subparsers: argparse._SubParsersAction, func): if not self.helpers.is_amdgpu_initialized(): # The event subcommand is only applicable to systems with amdgpu initialized return # Subparser help text event_help = "Displays event information for the given GPU" event_subcommand_help = f"{self.description}\n\nIf no GPU is specified, returns event information for all GPUs on the system." event_optionals_title = "Event Arguments" # Create event subparser event_parser = subparsers.add_parser('event', help=event_help, description=event_subcommand_help) event_parser._optionals.title = event_optionals_title event_parser.formatter_class=lambda prog: AMDSMISubparserHelpFormatter(prog) event_parser.set_defaults(func=func) # Add Universal Arguments self._add_device_arguments(event_parser, required=False) self._add_command_modifiers(event_parser) def _add_topology_parser(self, subparsers: argparse._SubParsersAction, func): if not self.helpers.is_amdgpu_initialized(): # The topology subcommand is only applicable to systems with amdgpu initialized return # Subparser help text topology_help = "Displays topology information of the devices" topology_subcommand_help = f"{self.description}\n\nIf no GPU is specified, returns information for all GPUs on the system.\ \nIf no topology argument is provided, all topology information will be displayed." topology_optionals_title = "Topology arguments" # Help text for Arguments only on Guest and BM platforms access_help = "Displays link accessibility between GPUs" weight_help = "Displays relative weight between GPUs" hops_help = "Displays the number of hops between GPUs" link_type_help = "Displays the link type between GPUs" numa_bw_help = "Display max and min bandwidth between nodes" coherent_help = "Display cache coherant (or non-coherant) link capability between nodes" atomics_help = "Display 32 and 64-bit atomic io link capability between nodes" dma_help = "Display P2P direct memory access (DMA) link capability between nodes" bi_dir_help = "Display P2P bi-directional link capability between nodes" # Create topology subparser topology_parser = subparsers.add_parser('topology', help=topology_help, description=topology_subcommand_help) topology_parser._optionals.title = topology_optionals_title topology_parser.formatter_class=lambda prog: AMDSMISubparserHelpFormatter(prog) topology_parser.set_defaults(func=func) # Add Universal Arguments self._add_command_modifiers(topology_parser) self._add_device_arguments(topology_parser, required=False) # Optional Args topology_parser.add_argument('-a', '--access', action='store_true', required=False, help=access_help) topology_parser.add_argument('-w', '--weight', action='store_true', required=False, help=weight_help) topology_parser.add_argument('-o', '--hops', action='store_true', required=False, help=hops_help) topology_parser.add_argument('-t', '--link-type', action='store_true', required=False, help=link_type_help) topology_parser.add_argument('-b', '--numa-bw', action='store_true', required=False, help=numa_bw_help) topology_parser.add_argument('-c', '--coherent', action='store_true', required=False, help=coherent_help) topology_parser.add_argument('-n', '--atomics', action='store_true', required=False, help=atomics_help) topology_parser.add_argument('-d', '--dma', action='store_true', required=False, help=dma_help) topology_parser.add_argument('-z', '--bi-dir', action='store_true', required=False, help=bi_dir_help) def _add_set_value_parser(self, subparsers: argparse._SubParsersAction, func): if not self.helpers.is_linux(): # This subparser is only applicable to Linux return # Subparser help text set_value_help = "Set options for devices" set_value_subcommand_help = f"{self.description}\n\nIf no GPU is specified, will select all GPUs on the system.\ \nA set argument must be provided; Multiple set arguments are accepted.\ \nRequires 'sudo' privileges." set_value_optionals_title = "Set Arguments" # Help text for Arguments only on BM platforms if self.helpers.is_amdgpu_initialized(): if self.helpers.is_baremetal(): fan_support = self.helpers.get_fan_support() set_fan_help = f"Set GPU fan speed ({fan_support})" perf_level_help_choices_str = ", ".join(self.helpers.get_perf_levels()[0][0:-1]) set_perf_level_help = f"Set one of the following performance levels:\n\t{perf_level_help_choices_str}" power_profile_choices_str = ", ".join(self.helpers.get_power_profiles()[0:-1]) set_profile_help = f"Set power profile level (#) or choose one of available profiles:\n\t{power_profile_choices_str}" set_perf_det_help = "Enable performance determinism mode and set GFXCLK softmax limit (in MHz)" (accelerator_set_choices, _) = self.helpers.get_accelerator_choices_types_indices() memory_partition_choices_str = ", ".join(self.helpers.get_memory_partition_types()) accelerator_set_choices_str = ", ".join(accelerator_set_choices) set_compute_partition_help = f"Set one of the following accelerator TYPE or profile INDEX:\n\t{accelerator_set_choices_str}.\n\tUse `sudo amd-smi partition --accelerator` to find acceptable values." set_memory_partition_help = f"Set one of the following the memory partition modes:\n\t{memory_partition_choices_str}" soc_pstate_help_info = ", ".join(self.helpers.get_soc_pstates()) set_soc_pstate_help = f"Set the GPU soc pstate policy using policy id, an integer. Valid id's include:\n\t{soc_pstate_help_info}" xgmi_plpd_help_info = ", ".join(self.helpers.get_xgmi_plpd_policies()) set_xgmi_plpd_help = f"Set the GPU XGMI per-link power down policy using policy id, an integer. Valid id's include:\n\t{xgmi_plpd_help_info}" set_clock_freq_help = "Set one or more sclk (aka gfxclk), mclk, fclk, pcie, or socclk frequency levels.\n\tUse `amd-smi static --clock` to find acceptable levels.\n\tUse `amd-smi static --bus` to find acceptable pcie levels." set_ptl_status_help = "Enable or disable the PTL on a GPU processor:\n 0 for disable and 1 for enable." ptl_format_help_choices_str = ", ".join(self.helpers.get_ptl_values()[0][0:-1]) set_ptl_format_help = f"Set the PTL format on a GPU processor. For example, --ptl-format I8,F32\n\tSet to one of the following PTL formats: {ptl_format_help_choices_str}" ppt0_power_cap_min, ppt0_power_cap_max, ppt1_power_cap_min, ppt1_power_cap_max = self.helpers.get_power_caps() set_power_cap_help = f"Set either PPT0 or PPT1 power capacity limit:\n\tEx: `amd-smi set -o 1300 ppt0`\n\tPPT0 min cap: {ppt0_power_cap_min}, PPT0 max cap: {ppt0_power_cap_max}\n\tPPT1 min cap: {ppt1_power_cap_min}, PPT1 max cap: {ppt1_power_cap_max}" set_clk_limit_help = "Sets the sclk (aka gfxclk) or mclk minimum and maximum frequencies. \n\tex: amd-smi set -L (sclk | mclk) (min | max) value" set_process_isolation_help = "Enable or disable the GPU process isolation on a per partition basis:\n 0 for disable and 1 for enable.\n" # Help text for CPU set options set_cpu_pwr_limit_help = "Set power limit for the given socket. Input parameter is power limit value." set_cpu_xgmi_link_width_help = "Set max and Min linkwidth. Input parameters are min and max link width values" set_cpu_lclk_dpm_level_help = "Sets the max and min dpm level on a given NBIO.\ \n Input parameters are die_index, min dpm, max dpm." set_cpu_pwr_eff_mode_help = "Sets the power efficency mode policy. Input parameter is mode." set_cpu_gmi3_link_width_help = "Sets max and min gmi3 link width range" set_cpu_pcie_link_rate_help = "Sets pcie link rate" set_cpu_df_pstate_range_help = "Sets max and min df-pstates" set_cpu_enable_apb_help = "Enables the DF p-state performance boost algorithm" set_cpu_disable_apb_help = "Disables the DF p-state performance boost algorithm. Input parameter is DFPstate (0-3)" set_soc_boost_limit_help = "Sets the boost limit for the given socket. Input parameter is socket BOOST_LIMIT value" set_cpu_dfcstate_ctrl_help = "Sets the DFCState control. Input parameter is value (0-1)" set_cpu_railisofreq_policy_help = "Sets the CPU ISO frequency policy. Input parameter is value (0-1)" # Help text for CPU Core set options set_core_boost_limit_help = "Sets the boost limit for the given core. Input parameter is core BOOST_LIMIT value" # Create set_value subparser set_value_parser = subparsers.add_parser('set', help=set_value_help, description=set_value_subcommand_help) set_value_parser._optionals.title = set_value_optionals_title set_value_parser.formatter_class=lambda prog: AMDSMISubparserHelpFormatter(prog) set_value_parser.set_defaults(func=func) if self.helpers.is_amdgpu_initialized(): # set value should only take one of these at a time so args below will be mutually exclusive set_value_exclusive_group = set_value_parser.add_mutually_exclusive_group() if self.helpers.is_baremetal(): # Optional GPU Args set_value_exclusive_group.add_argument('-f', '--fan', action=self._validate_fan_speed(), required=False, help=set_fan_help, metavar='%') set_value_exclusive_group.add_argument('-l', '--perf-level', action='store', choices=self.helpers.get_perf_levels()[0], type=str.upper, required=False, help=set_perf_level_help, metavar='LEVEL') set_value_exclusive_group.add_argument('-P', '--profile', action='store', required=False, help=set_profile_help, metavar='PROFILE_LEVEL') set_value_exclusive_group.add_argument('-d', '--perf-determinism', action='store', type=lambda value: self._not_negative_int(value, '--perf-determinism'), required=False, help=set_perf_det_help, metavar='SCLKMAX') set_value_exclusive_group.add_argument('-C', '--compute-partition', action='store', choices=accelerator_set_choices, type=lambda value: self._is_command_supported(value, accelerator_set_choices, '--compute-partition'), required=False, help=set_compute_partition_help, metavar=('TYPE/INDEX')) set_value_exclusive_group.add_argument('-M', '--memory-partition', action='store', choices=self.helpers.get_memory_partition_types(), type=str.upper, required=False, help=set_memory_partition_help, metavar='PARTITION') # Power cap is enabled on guest, maintain order set_value_exclusive_group.add_argument('-o', '--power-cap', action=self._power_cap_options(), nargs='+', required=False, help=set_power_cap_help, metavar=('WATTS', '[PWR_TYPE]')) if self.helpers.is_baremetal(): set_value_exclusive_group.add_argument('-p', '--soc-pstate', action='store', required=False, type=lambda value: self._not_negative_int(value, '--soc-pstate'), help=set_soc_pstate_help, metavar='POLICY_ID') set_value_exclusive_group.add_argument('-x', '--xgmi-plpd', action='store', required=False, type=lambda value: self._not_negative_int(value, '--xgmi-plpd'), help=set_xgmi_plpd_help, metavar='POLICY_ID') set_value_exclusive_group.add_argument('-c', '--clk-level', action=self._level_select(), nargs='+', required=False, help=set_clock_freq_help, metavar=('CLK_TYPE', 'PERF_LEVELS')) set_value_exclusive_group.add_argument('-S', '--ptl-status', action='store', choices=[0,1], type=lambda value: self._not_negative_int(value, '--ptl-status'), required=False, help=set_ptl_status_help, metavar=('STATUS')) set_value_exclusive_group.add_argument('-F', '--ptl-format', action=self._validate_ptl_format(), required=False, help=set_ptl_format_help, metavar=('FRMT1,FRMT2')) set_value_exclusive_group.add_argument('-L', '--clk-limit', action=self._limit_select(), nargs=3, required=False, help=set_clk_limit_help, metavar=('CLK_TYPE', 'LIM_TYPE', 'VALUE')) set_value_exclusive_group.add_argument('-R', '--process-isolation', action='store', choices=[0,1], type=lambda value: self._not_negative_int(value, '--process-isolation'), required=False, help=set_process_isolation_help, metavar='STATUS') if self.helpers.is_amd_hsmp_initialized(): if self.helpers.is_baremetal(): # Optional CPU Args cpu_group = set_value_parser.add_argument_group("CPU Arguments") cpu_group.add_argument('--cpu-pwr-limit', action='append', required=False, type=self._positive_int, nargs=1, metavar=("PWR_LIMIT"), help=set_cpu_pwr_limit_help) cpu_group.add_argument('--cpu-xgmi-link-width', action='append', required=False, type=self._not_negative_int, nargs=2, metavar=("MIN_WIDTH", "MAX_WIDTH"), help=set_cpu_xgmi_link_width_help) cpu_group.add_argument('--cpu-lclk-dpm-level', action='append', required=False, type=self._not_negative_int, nargs=3, metavar=("NBIOID", "MIN_DPM", "MAX_DPM"), help=set_cpu_lclk_dpm_level_help) cpu_group.add_argument('--cpu-pwr-eff-mode', action='append', required=False, type=self._not_negative_int, nargs=1, metavar=("MODE"), help=set_cpu_pwr_eff_mode_help) cpu_group.add_argument('--cpu-gmi3-link-width', action='append', required=False, type=self._not_negative_int, nargs=2, metavar=("MIN_LW", "MAX_LW"), help=set_cpu_gmi3_link_width_help) cpu_group.add_argument('--cpu-pcie-link-rate', action='append', required=False, type=self._not_negative_int, nargs=1, metavar=("LINK_RATE"), help=set_cpu_pcie_link_rate_help) cpu_group.add_argument('--cpu-df-pstate-range', action='append', required=False, type=self._not_negative_int, nargs=2, metavar=("MAX_PSTATE", "MIN_PSTATE"), help=set_cpu_df_pstate_range_help) cpu_group.add_argument('--cpu-enable-apb', action='store_true', required=False, help=set_cpu_enable_apb_help) cpu_group.add_argument('--cpu-disable-apb', action='append', required=False, type=self._not_negative_int, nargs=1, metavar=("DF_PSTATE"), help=set_cpu_disable_apb_help) cpu_group.add_argument('--soc-boost-limit', action='append', required=False, type=self._positive_int, nargs=1, metavar=("BOOST_LIMIT"), help=set_soc_boost_limit_help) cpu_group.add_argument('--cpu-dfcstate-ctrl', action='append', required=False, type=self._not_negative_int, nargs=1, metavar=("VALUE"), help=set_cpu_dfcstate_ctrl_help) cpu_group.add_argument('--cpu-railisofreq-policy', action='append', required=False, type=self._not_negative_int, nargs=1, metavar=("VALUE"), help=set_cpu_railisofreq_policy_help) # Optional CPU Core Args core_group = set_value_parser.add_argument_group("CPU Core Arguments") core_group.add_argument('--core-boost-limit', action='append', required=False, type=self._positive_int, nargs=1, metavar=("BOOST_LIMIT"), help=set_core_boost_limit_help) # Set accepts default devices of all self._add_device_arguments(set_value_parser, required=False) # Add Universal Arguments self._add_command_modifiers(set_value_parser) def _add_reset_parser(self, subparsers: argparse._SubParsersAction, func): if not self.helpers.is_linux(): # This subparser is only applicable to Linux return if not self.helpers.is_amdgpu_initialized(): # The reset subcommand is only applicable to systems with amdgpu initialized return # Subparser help text reset_help = "Reset options for devices" reset_subcommand_help = f"{self.description}\n\nIf no GPU is specified, will select all GPUs on the system.\ \nA reset argument must be provided; Multiple reset arguments are accepted.\ \nRequires 'sudo' privileges." reset_optionals_title = "Reset Arguments" # Help text for Arguments only on Guest and BM platforms gpureset_help = "Reset the specified GPU" reset_clocks_help = "Reset clocks and overdrive to default" reset_fans_help = "Reset fans to automatic (driver) control" reset_profile_help = "Reset power profile back to default" reset_xgmierr_help = "Reset XGMI error counts" reset_perf_det_help = "Disable performance determinism" reset_power_cap_help = "Reset the PPT0 and PPT1 power capacity limit to max capable" reset_gpu_clean_local_data_help = "Clean up local data in LDS/GPRs on a per partition basis" # Create reset subparser reset_parser = subparsers.add_parser('reset', help=reset_help, description=reset_subcommand_help) reset_parser._optionals.title = reset_optionals_title reset_parser.formatter_class=lambda prog: AMDSMISubparserHelpFormatter(prog) reset_parser.set_defaults(func=func) # make reset args mutually exclusive reset_exclusive_group = reset_parser.add_mutually_exclusive_group() if self.helpers.is_baremetal(): # Add Baremetal reset arguments reset_exclusive_group.add_argument('-G', '--gpureset', action='store_true', required=False, help=gpureset_help) reset_exclusive_group.add_argument('-c', '--clocks', action='store_true', required=False, help=reset_clocks_help) reset_exclusive_group.add_argument('-f', '--fans', action='store_true', required=False, help=reset_fans_help) reset_exclusive_group.add_argument('-p', '--profile', action='store_true', required=False, help=reset_profile_help) reset_exclusive_group.add_argument('-x', '--xgmierr', action='store_true', required=False, help=reset_xgmierr_help) reset_exclusive_group.add_argument('-d', '--perf-determinism', action='store_true', required=False, help=reset_perf_det_help) reset_exclusive_group.add_argument('-o', '--power-cap', action='store_true', required=False, help=reset_power_cap_help) # Add Baremetal and Virtual OS reset arguments reset_exclusive_group.add_argument('-l', '--clean-local-data', action='store_true', required=False, help=reset_gpu_clean_local_data_help) # Reset accepts default devices of all self._add_device_arguments(reset_parser, required=False) # Add Universal Arguments self._add_command_modifiers(reset_parser) def _add_monitor_parser(self, subparsers: argparse._SubParsersAction, func): if not self.helpers.is_linux(): # This subparser is only applicable to Linux return if not self.helpers.is_amdgpu_initialized(): # The monitor subcommand is only applicable to systems with amdgpu initialized return # Subparser help text monitor_help = "Monitor metrics for target devices" monitor_subcommand_help = f"{self.description}\n\nMonitor a target device for the specified arguments.\ \nIf no arguments are provided, all arguments will be enabled.\ \nUse the watch arguments to run continuously." monitor_optionals_title = "Monitor Arguments" # Help text for Arguments only on Guest and BM platforms power_usage_help = "Monitor power usage and power cap in Watts" temperature_help = "Monitor temperature in Celsius" base_board_temps_help = "Monitor base board temperatures in Celsius" gpu_board_temps_help = "Monitor GPU board temperatures in Celsius" gfx_util_help = "Monitor graphics utilization (%%) and clock (MHz)" mem_util_help = "Monitor memory utilization (%%) and clock (MHz)" encoder_util_help = "Monitor encoder utilization (%%) and clock (MHz)" decoder_util_help = "Monitor decoder utilization (%%) and clock (MHz)" ecc_help = "Monitor ECC single bit, ECC double bit, and PCIe replay error counts" mem_usage_help = "Monitor memory usage in MB" pcie_bandwidth_help = "Monitor PCIe bandwidth in Mb/s" process_help = "Enable Process information table below monitor output;\n Process Name may require elevated permissions" violation_help = "Monitor power and thermal violation status (%%);\n Only available for MI300 or newer ASICs" # Create monitor subparser monitor_parser = subparsers.add_parser('monitor', help=monitor_help, description=monitor_subcommand_help, aliases=["dmon"]) monitor_parser._optionals.title = monitor_optionals_title monitor_parser.formatter_class=lambda prog: AMDSMISubparserHelpFormatter(prog) monitor_parser.set_defaults(func=func) # Add monitor arguments monitor_parser.add_argument('-p', '--power-usage', action='store_true', required=False, help=power_usage_help) monitor_parser.add_argument('-t', '--temperature', action='store_true', required=False, help=temperature_help) monitor_parser.add_argument('-b', '--base-board-temps', action='store_true', required=False, help=base_board_temps_help) monitor_parser.add_argument('-o', '--gpu-board-temps', action='store_true', required=False, help=gpu_board_temps_help) monitor_parser.add_argument('-u', '--gfx', action='store_true', required=False, help=gfx_util_help) monitor_parser.add_argument('-m', '--mem', action='store_true', required=False, help=mem_util_help) monitor_parser.add_argument('-n', '--encoder', action='store_true', required=False, help=encoder_util_help) monitor_parser.add_argument('-d', '--decoder', action='store_true', required=False, help=decoder_util_help) monitor_parser.add_argument('-e', '--ecc', action='store_true', required=False, help=ecc_help) monitor_parser.add_argument('-v', '--vram-usage', action='store_true', required=False, help=mem_usage_help) monitor_parser.add_argument('-r', '--pcie', action='store_true', required=False, help=pcie_bandwidth_help) monitor_parser.add_argument('-q', '--process', action='store_true', required=False, help=process_help) if not self.helpers.is_virtual_os(): monitor_parser.add_argument('-V', '--violation', action='store_true', required=False, help=violation_help) # Add Universal Arguments & Watch Args self._add_watch_arguments(monitor_parser) self._add_device_arguments(monitor_parser, required=False) self._add_command_modifiers(monitor_parser) def _add_xgmi_parser(self, subparsers: argparse._SubParsersAction, func): if not self.helpers.is_amdgpu_initialized(): # The xgmi subcommand is only applicable to systems with amdgpu initialized return # Subparser help text xgmi_help = "Displays xgmi information of the devices" xgmi_subcommand_help = f"{self.description}\n\nIf no GPU is specified, returns information for all GPUs on the system.\ \nIf no xgmi argument is provided, all xgmi information will be displayed." xgmi_optionals_title = "XGMI arguments" # Help text for Arguments only on Guest and BM platforms metrics_help = "Metric XGMI information" xgmi_source_status_help = "Source GPU XGMI Link information" xgmi_link_status_help = "XGMI Link Status information" # Create xgmi subparser xgmi_parser = subparsers.add_parser('xgmi', help=xgmi_help, description=xgmi_subcommand_help) xgmi_parser._optionals.title = xgmi_optionals_title xgmi_parser.formatter_class=lambda prog: AMDSMISubparserHelpFormatter(prog) xgmi_parser.set_defaults(func=func) # Optional Args xgmi_parser.add_argument('-m', '--metric', action='store_true', required=False, help=metrics_help) xgmi_parser.add_argument('-s', '--source-status', action='store_true', required=False, help=xgmi_source_status_help) xgmi_parser.add_argument('-l', '--link-status', action='store_true', required=False, help=xgmi_link_status_help) # Add Universal Arguments self._add_device_arguments(xgmi_parser, required=False) self._add_command_modifiers(xgmi_parser) def _add_partition_parser(self, subparsers: argparse._SubParsersAction, func): if not self.helpers.is_amdgpu_initialized(): # The partition subcommand is only applicable to systems with amdgpu initialized return # Subparser help text partition_help = "Displays partition information of the devices" partition_subcommand_help = f"{self.description}\n\nIf no GPU is specified, returns information for all GPUs on the system.\ \nIf no partition argument is provided, all partition information will be displayed." partition_optionals_title = "Partition arguments" # Options help text current_help = "display the current partition information" memory_help = "display the current memory partition mode and capabilities" accelerator_help = "display accelerator partition information" # Create partition subparser partition_parser = subparsers.add_parser('partition', help=partition_help, description=partition_subcommand_help) partition_parser._optionals.title = partition_optionals_title partition_parser.formatter_class=lambda prog: AMDSMISubparserHelpFormatter(prog) partition_parser.set_defaults(func=func) # Handle GPU Options partition_parser.add_argument('-c', '--current', action='store_true', required=False, help=current_help) partition_parser.add_argument('-m', '--memory', action='store_true', required=False, help=memory_help) partition_parser.add_argument('-a', '--accelerator', action='store_true', required=False, help=accelerator_help) # Add Universal Arguments self._add_device_arguments(partition_parser, required=False) self._add_command_modifiers(partition_parser) def _add_ras_parser(self, subparsers: argparse._SubParsersAction, func): """ Adds the 'ras' subcommand. Expected command: amd-smi ras --cper --severity=nonfatal-uncorrected,fatal --folder --file-limit=1000 --follow All parameters are provided via options; no positional arguments or optional --file/--gpu are used. """ # Subparser help text ras_help = "Retrieve RAS (CPER) entries from the driver" ras_description = ( f"{self.description}\n\n" "Retrieve and decode RAS (CPER) entries from the kernel driver.\n" "Supports filtering by severity, exporting to different formats, and continuous monitoring.\n" "This command accepts options only; no positional arguments are required." ) ras_optionals_title = "RAS arguments" # Help text for RAS arguments cper_help = "Trigger current CPER data retrieval" afid_help = "Generate an AFID (AMD Field ID) given a CPER record file" severity_choices = ["nonfatal-uncorrected", "fatal", "nonfatal-corrected", "all"] severity_choices_str = ", ".join(severity_choices) severity_help = f"Set the SEVERITY filters from the following:\n {severity_choices_str}" folder_help = "Folder to dump current CPER report files" file_limit_help = "Maximum number of current CPER files in target folder\n Older files beyond limit will be deleted" cper_file_help = "Full path of a retrieved CPER record file to generate the AFID" follow_help = "Continuously monitor for new CPER entries" ras_parser = subparsers.add_parser("ras", help=ras_help, description=ras_description) ras_parser._optionals.title = ras_optionals_title ras_parser.formatter_class = lambda prog: AMDSMISubparserHelpFormatter(prog) ras_parser.set_defaults(func=func) # Create mutually exclusive command ras group (--cper or --afid) ras_exclusive_group = ras_parser.add_mutually_exclusive_group(required=True) ras_exclusive_group.add_argument("--cper", action="store_true", help=cper_help) ras_exclusive_group.add_argument("--afid", action="store_true", help=afid_help) # CPER Arguments remove defaults cper_group = ras_parser.add_argument_group("CPER Arguments") cper_group.add_argument("--severity", type=str.lower, nargs='+', default=['all'], help=severity_help, choices=severity_choices, metavar='SEVERITY') cper_group.add_argument("--folder", type=str, action=self._check_folder_path(), help=folder_help) cper_group.add_argument("--file-limit", type=self._positive_int, action='store', help=file_limit_help) cper_group.add_argument("--follow", action="store_true", help=follow_help) # AFID Arguments afid_group = ras_parser.add_argument_group("AFID Arguments") afid_group.add_argument("--cper-file", action=self._check_cper_file_path(), metavar="CPER_FILE", help=cper_file_help) # Add common modifiers and device selection arguments. self._add_device_arguments(ras_parser, required=False) self._add_command_modifiers(ras_parser) def _add_node_parser(self, subparsers: argparse._SubParsersAction, func): # Subparser help text node_help = "Gets power information for the node" node_subcommand_help = f"{self.description}\n\nReturns information for node 0 on the system.\ \nIf no node argument is provided, all node information will be displayed." node_optionals_title = "Node arguments" # Help text for Node arguments power_management_help = "Displays power management information" node_parser = subparsers.add_parser("node", help=node_help, description=node_subcommand_help) node_parser._optionals.title = node_optionals_title node_parser.formatter_class = lambda prog: AMDSMISubparserHelpFormatter(prog) node_parser.set_defaults(func=func) # Optional Args node_parser.add_argument('-p', '--power-management', action='store_true', required=False, help=power_management_help) # Add Universal Arguments self._add_command_modifiers(node_parser) def error(self, message): outputformat = self.helpers.get_output_format() if "argument : invalid choice: " in message: l = len("argument : invalid choice: ") + 1 message = message[l:] message = message.split("'")[0] # Check if the command is possible in other system configurations and error accordingly if message in self.possible_commands: raise amdsmi_cli_exceptions.AmdSmiCommandNotSupportedException(message, outputformat) raise amdsmi_cli_exceptions.AmdSmiInvalidCommandException(message, outputformat) elif "unrecognized arguments: " in message: l = len("unrecognized arguments: ") message = message[l:] raise amdsmi_cli_exceptions.AmdSmiInvalidParameterException(sys.argv[1], message, outputformat) else: print(message)