From b7f779182deb871ae2b5391846e03da98020c8d9 Mon Sep 17 00:00:00 2001 From: gabrpham Date: Thu, 12 Sep 2024 13:54:18 -0500 Subject: [PATCH] [SWDEV-448738] Added rocmsmi extremum command as 'set -L' Change-Id: I997c630bd20cc61673813a2301eb5e3002619a32 Signed-off-by: gabrpham Change-Id: Ifa884303f9a0fa058af093a23f5be449bba54f29 --- CHANGELOG.md | 19 +++++++++++-------- amdsmi_cli/amdsmi_commands.py | 28 +++++++++++++++++++++------- amdsmi_cli/amdsmi_parser.py | 24 ++++++++++++++++++++++++ py-interface/__init__.py | 1 + py-interface/amdsmi_interface.py | 30 ++++++++++++++++++++++++++++++ 5 files changed, 87 insertions(+), 15 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 59de456980..817b77109a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,25 +7,28 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/pr ## amd_smi_lib for ROCm 6.3.0 ### Changes -- **Moved python tests directory path install location.** - - `/opt//share/amd_smi/pytest/.. to /opt//share/amd_smi/tests/python_unittest/..` -- **On amd-smi-lib-tests uninstall, the amd_smi tests folder is removed.** -- **Removed pytest dependency, our python testing depends only on unittest framework.** +- **Moved python tests directory path install location**. + - `/opt//share/amd_smi/pytest/..` to `/opt//share/amd_smi/tests/python_unittest/..` + - On amd-smi-lib-tests uninstall, the amd_smi tests folder is removed. + - Removed pytest dependency, our python testing now only depends on the unittest framework. - **Added more supported utilization count types to `amdsmi_get_utilization_count()`**. +- **Added `amd-smi set -L/--clk-limit ...` command**. +Equivalent to rocm-smi's '--extremum' command which sets sclk's or mclk's soft minimum or soft maximum clock frequency. + - **Added Pytest functionality to test amdsmi API calls in Python**. - **Changed the `power` parameter in `amdsmi_get_energy_count()` to `energy_accumulator`**. 
Changes propagate forwards into the python interface as well, however we are maintaing backwards compatibility and keeping the `power` field in the python API until ROCm 6.4. -- **Added GPU memory overdrive percentage to `amd-smi metric -o`**. +- **Added GPU memory overdrive percentage to `amd-smi metric -o`**. Added `amdsmi_get_gpu_mem_overdrive_level()` function to amd-smi C and Python Libraries. -- **Added Subsystem Device ID to `amd-smi static --asic`**. +- **Added Subsystem Device ID to `amd-smi static --asic`**. No underlying changes to amdsmi_get_gpu_asic_info -- **Added retrieving connection type and P2P capabilities between two GPUs**. +- **Added retrieving connection type and P2P capabilities between two GPUs**. - Added `amdsmi_topo_get_p2p_status` function to amd-smi C and Python Libraries. - Added retrieving P2P link capabilities to CLI `amd-smi topology`. @@ -276,7 +279,7 @@ GPU: 1 ### Removals -- **Removed usage of _validate_positive in Parser and replaced with _positive_int and _not_negative_int as appropriate** +- **Removed usage of _validate_positive in Parser and replaced with _positive_int and _not_negative_int as appropriate**. 
This will allow 0 to be a valid input for several options in setting CPUs where appropriate (for example, as a mode or NBIOID) ### Optimizations diff --git a/amdsmi_cli/amdsmi_commands.py b/amdsmi_cli/amdsmi_commands.py index aa4e54e9c1..086c8bd104 100644 --- a/amdsmi_cli/amdsmi_commands.py +++ b/amdsmi_cli/amdsmi_commands.py @@ -3665,7 +3665,7 @@ class AMDSMICommands(): def set_gpu(self, args, multiple_devices=False, gpu=None, fan=None, perf_level=None, profile=None, perf_determinism=None, compute_partition=None, memory_partition=None, power_cap=None, soc_pstate=None, xgmi_plpd = None, - process_isolation=None): + process_isolation=None, clk_limit=None): """Issue reset commands to target gpu(s) Args: @@ -3712,6 +3712,8 @@ class AMDSMICommands(): args.xgmi_plpd = xgmi_plpd if process_isolation: args.process_isolation = process_isolation + if clk_limit: + args.clk_limit = clk_limit # Handle No GPU passed if args.gpu == None: @@ -3734,7 +3736,8 @@ class AMDSMICommands(): args.power_cap is not None, args.soc_pstate is not None, args.xgmi_plpd is not None, - args.process_isolation is not None]): + args.process_isolation is not None, + args.clk_limit is not None]): command = " ".join(sys.argv[1:]) raise AmdSmiRequiredCommandException(command, self.logger.format) @@ -3860,6 +3863,17 @@ class AMDSMICommands(): raise ValueError(f"Unable to set process isolation to {status_string} on {gpu_string}") from e self.logger.store_output(args.gpu, 'process_isolation', result) + if isinstance(args.clk_limit, tuple): + try: + clk_type = args.clk_limit.clk_type + lim_type = args.clk_limit.lim_type + val = args.clk_limit.val + amdsmi_interface.amdsmi_set_gpu_clk_limit(args.gpu, clk_type, lim_type, val) + except amdsmi_exception.AmdSmiLibraryException as e: + if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: + raise PermissionError('Command requires elevation') from e + raise ValueError(f"Unable to set {args.clk_limit.lim_type} of {args.clk_limit.clk_type} 
to {args.clk_limit.val} on {gpu_string}") from e + self.logger.store_output(args.gpu, 'clk_limit', f"Successfully changed {args.clk_limit.lim_type} of {args.clk_limit.clk_type} to {args.clk_limit.val}") if multiple_devices: self.logger.store_multiple_device_output() @@ -3875,7 +3889,7 @@ class AMDSMICommands(): cpu_pwr_eff_mode=None, cpu_gmi3_link_width=None, cpu_pcie_link_rate=None, cpu_df_pstate_range=None, cpu_enable_apb=None, cpu_disable_apb=None, soc_boost_limit=None, core=None, core_boost_limit=None, soc_pstate=None, xgmi_plpd=None, - process_isolation=None): + process_isolation=None, clk_limit=None): """Issue reset commands to target gpu(s) Args: @@ -3926,8 +3940,8 @@ class AMDSMICommands(): # Check if a GPU argument has been set gpu_args_enabled = False gpu_attributes = ["fan", "perf_level", "profile", "perf_determinism", "compute_partition", - "memory_partition", "power_cap", "soc_pstate", "xgmi_plpd", "process_isolation", - ] + "memory_partition", "power_cap", "soc_pstate", "xgmi_plpd", + "process_isolation", "clk_limit"] for attr in gpu_attributes: if hasattr(args, attr): if getattr(args, attr) is not None: @@ -3983,7 +3997,7 @@ class AMDSMICommands(): self.set_gpu(args, multiple_devices, gpu, fan, perf_level, profile, perf_determinism, compute_partition, memory_partition, power_cap, soc_pstate, xgmi_plpd, - process_isolation) + process_isolation, clk_limit) elif self.helpers.is_amd_hsmp_initialized(): # Only CPU is initialized if args.cpu == None and args.core == None: raise ValueError('No CPU or CORE provided, specific target(s) are needed') @@ -4003,7 +4017,7 @@ class AMDSMICommands(): self.set_gpu(args, multiple_devices, gpu, fan, perf_level, profile, perf_determinism, compute_partition, memory_partition, power_cap, soc_pstate, xgmi_plpd, - process_isolation) + process_isolation, clk_limit) def reset(self, args, multiple_devices=False, gpu=None, gpureset=None, diff --git a/amdsmi_cli/amdsmi_parser.py b/amdsmi_cli/amdsmi_parser.py index 
85f0307305..54008b8a3f 100644 --- a/amdsmi_cli/amdsmi_parser.py +++ b/amdsmi_cli/amdsmi_parser.py @@ -26,6 +26,7 @@ import errno import os import sys import time +import collections from pathlib import Path @@ -172,6 +173,27 @@ class AMDSMIParser(argparse.ArgumentParser): raise amdsmi_cli_exceptions.AmdSmiInvalidParameterValueException(string_value, outputformat)
+    def _limit_select(self):
+        """Return the argparse action class that validates `-L CLK_TYPE LIM_TYPE VALUE`.
+
+        It stores a clk_limit_args(clk_type, lim_type, val) namedtuple, val as int, on the namespace."""
+        output_format = self.helpers.get_output_format()
+        valid = (('sclk', 'mclk'), ('min', 'max'))  # allowed CLK_TYPE names, allowed LIM_TYPE names
+        clk_limit_args = collections.namedtuple('clk_limit_args', ['clk_type', 'lim_type', 'val'])  # built once
+
+        class AMDSMILimitArgs(argparse.Action):
+            # No `X | Y` annotations: they are evaluated at class creation and raise TypeError on Python < 3.10
+            def __call__(self, parser, namespace, values, option_string=None):
+                clk_type, lim_type, val = values
+                for name, allowed in zip((clk_type, lim_type), valid):
+                    if name not in allowed:
+                        raise amdsmi_cli_exceptions.AmdSmiInvalidParameterValueException(name, output_format)
+                if not val.isdecimal():  # int() would leak a bare ValueError; a negative would wrap as c_uint64
+                    raise amdsmi_cli_exceptions.AmdSmiInvalidParameterValueException(val, output_format)
+                setattr(namespace, self.dest, clk_limit_args(clk_type, lim_type, int(val)))
+        return AMDSMILimitArgs
+ + def _check_output_file_path(self): """ Argument action validator: Returns a path to a file from the output file path provided. @@ -1014,6 +1036,7 @@ class AMDSMIParser(argparse.ArgumentParser): set_soc_pstate_help = "Set the GPU soc pstate policy using policy id\n" set_xgmi_plpd_help = "Set the GPU XGMI per-link power down policy using policy id\n" set_process_isolation_help = "Enable or disable the GPU process isolation: 0 for disable and 1 for enable.\n" + set_clk_limit_help = "Sets the sclk (aka gfxclk) or mclk minimum and maximum frequencies. \nOf form: amd-smi set -L (sclk | mclk) (min | max) value" # Help text for CPU set options set_cpu_pwr_limit_help = "Set power limit for the given socket.
Input parameter is power limit value." @@ -1052,6 +1075,7 @@ class AMDSMIParser(argparse.ArgumentParser): set_value_parser.add_argument('-o', '--power-cap', action='store', type=self._positive_int, required=False, help=set_power_cap_help, metavar='WATTS') set_value_parser.add_argument('-p', '--soc-pstate', action='store', required=False, type=self._not_negative_int, help=set_soc_pstate_help, metavar='POLICY_ID') set_value_parser.add_argument('-x', '--xgmi-plpd', action='store', required=False, type=self._not_negative_int, help=set_xgmi_plpd_help, metavar='POLICY_ID') + set_value_parser.add_argument('-L', '--clk-limit', action=self._limit_select(), nargs=3, required=False, help=set_clk_limit_help, metavar=('CLK_TYPE', 'LIM_TYPE', 'VALUE')) set_value_parser.add_argument('-R', '--process-isolation', action='store', choices=[0,1], type=self._not_negative_int, required=False, help=set_process_isolation_help, metavar='STATUS') diff --git a/py-interface/__init__.py b/py-interface/__init__.py index e56cf2a1eb..13e3221401 100644 --- a/py-interface/__init__.py +++ b/py-interface/__init__.py @@ -124,6 +124,7 @@ from .amdsmi_interface import amdsmi_set_gpu_pci_bandwidth from .amdsmi_interface import amdsmi_set_power_cap from .amdsmi_interface import amdsmi_set_gpu_power_profile from .amdsmi_interface import amdsmi_set_gpu_clk_range +from .amdsmi_interface import amdsmi_set_gpu_clk_limit from .amdsmi_interface import amdsmi_set_gpu_od_clk_info from .amdsmi_interface import amdsmi_set_gpu_od_volt_info from .amdsmi_interface import amdsmi_set_gpu_perf_level diff --git a/py-interface/amdsmi_interface.py b/py-interface/amdsmi_interface.py index 13ddbc1312..dd5c676a01 100644 --- a/py-interface/amdsmi_interface.py +++ b/py-interface/amdsmi_interface.py @@ -3190,6 +3190,65 @@ def amdsmi_set_gpu_clk_range( ) +
+def amdsmi_set_gpu_clk_limit(
+    processor_handle: amdsmi_wrapper.amdsmi_processor_handle,
+    clk_type: str,
+    limit_type: str,
+    value: int
+    ) -> None:
+    """Set the minimum or maximum frequency limit of a GPU clock.
+
+    Parameters:
+        processor_handle: handle of the GPU to configure
+        clk_type: "sclk" or "mclk" (case-insensitive)
+        limit_type: "min" or "max" (case-insensitive)
+        value: non-negative frequency limit, passed to the library as uint64
+
+    Raises:
+        AmdSmiParameterException: an argument has the wrong type
+        ValueError: clk_type or limit_type is not one of the supported names,
+            or value is negative
+    """
+    if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle):
+        raise AmdSmiParameterException(
+            processor_handle, amdsmi_wrapper.amdsmi_processor_handle
+        )
+    if not isinstance(value, int):
+        raise AmdSmiParameterException(value, int)
+    if not isinstance(clk_type, str):
+        raise AmdSmiParameterException(clk_type, str)
+    if not isinstance(limit_type, str):
+        raise AmdSmiParameterException(limit_type, str)
+    # ctypes.c_uint64 does no overflow checking: a negative number would wrap
+    if value < 0:
+        raise ValueError(f"value must not be negative, got {value}")
+
+    clk_types = {
+        "sclk": amdsmi_wrapper.AMDSMI_CLK_TYPE_SYS,
+        "mclk": amdsmi_wrapper.AMDSMI_CLK_TYPE_MEM,
+    }
+    limit_types = {
+        "min": amdsmi_wrapper.CLK_LIMIT_MIN,
+        "max": amdsmi_wrapper.CLK_LIMIT_MAX,
+    }
+    # Unknown names used to fall through both if/elif branches and fail with
+    # an UnboundLocalError; report them explicitly instead
+    clk_key, limit_key = clk_type.lower(), limit_type.lower()
+    if clk_key not in clk_types:
+        raise ValueError(f"clk_type must be one of {sorted(clk_types)}, got {clk_type!r}")
+    if limit_key not in limit_types:
+        raise ValueError(f"limit_type must be one of {sorted(limit_types)}, got {limit_type!r}")
+
+    _check_res(
+        amdsmi_wrapper.amdsmi_set_gpu_clk_limit(
+            processor_handle,
+            amdsmi_wrapper.amdsmi_clk_type_t(clk_types[clk_key]),
+            amdsmi_wrapper.amdsmi_clk_limit_type_t(limit_types[limit_key]),
+            ctypes.c_uint64(value),
+        )
+    )
+ + def amdsmi_get_gpu_memory_total(processor_handle: amdsmi_wrapper.amdsmi_processor_handle, mem_type: AmdSmiMemoryType): if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException(