[SWDEV-448738] Added rocmsmi extremum command as 'set -L'

Change-Id: I997c630bd20cc61673813a2301eb5e3002619a32
Signed-off-by: gabrpham <Gabriel.Pham@amd.com>

Change-Id: Ifa884303f9a0fa058af093a23f5be449bba54f29
This commit is contained in:
gabrpham
2024-09-12 13:54:18 -05:00
committed by Gabriel Pham
parent ac593f9fa0
commit b7f779182d
5 changed files with 87 additions and 15 deletions
+11 -8
View File
@@ -7,25 +7,28 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/pr
## amd_smi_lib for ROCm 6.3.0
### Changes
- **Moved python tests directory path install location.**
- `/opt/<rocm-path>/share/amd_smi/pytest/.. to /opt/<rocm-path>/share/amd_smi/tests/python_unittest/..`
- **On amd-smi-lib-tests uninstall, the amd_smi tests folder is removed.**
- **Removed pytest dependency, our python testing depends only on unittest framework.**
- **Moved python tests directory path install location**.
- `/opt/<rocm-path>/share/amd_smi/pytest/..` to `/opt/<rocm-path>/share/amd_smi/tests/python_unittest/..`
- On amd-smi-lib-tests uninstall, the amd_smi tests folder is removed.
- Removed pytest dependency, our python testing now only depends on the unittest framework.
- **Added more supported utilization count types to `amdsmi_get_utilization_count()`**.
- **Added `amd-smi set -L/--clk-limit ...` command**.
Equivalent to rocm-smi's `--extremum` command, which sets the soft minimum or soft maximum clock frequency of sclk or mclk.
- **Added Pytest functionality to test amdsmi API calls in Python**.
- **Changed the `power` parameter in `amdsmi_get_energy_count()` to `energy_accumulator`**.
Changes propagate forwards into the python interface as well; however, we are maintaining backwards compatibility and keeping the `power` field in the python API until ROCm 6.4.
- **Added GPU memory overdrive percentage to `amd-smi metric -o`**.
- **Added GPU memory overdrive percentage to `amd-smi metric -o`**.
Added `amdsmi_get_gpu_mem_overdrive_level()` function to amd-smi C and Python Libraries.
- **Added Subsystem Device ID to `amd-smi static --asic`**.
- **Added Subsystem Device ID to `amd-smi static --asic`**.
No underlying changes to amdsmi_get_gpu_asic_info
- **Added retrieving connection type and P2P capabilities between two GPUs**.
- **Added retrieving connection type and P2P capabilities between two GPUs**.
- Added `amdsmi_topo_get_p2p_status` function to amd-smi C and Python Libraries.
- Added retrieving P2P link capabilities to CLI `amd-smi topology`.
@@ -276,7 +279,7 @@ GPU: 1
### Removals
- **Removed usage of _validate_positive in Parser and replaced with _positive_int and _not_negative_int as appropriate**
- **Removed usage of _validate_positive in Parser and replaced with _positive_int and _not_negative_int as appropriate**.
This will allow 0 to be a valid input for several options in setting CPUs where appropriate (for example, as a mode or NBIOID)
### Optimizations
+21 -7
View File
@@ -3665,7 +3665,7 @@ class AMDSMICommands():
def set_gpu(self, args, multiple_devices=False, gpu=None, fan=None, perf_level=None,
profile=None, perf_determinism=None, compute_partition=None,
memory_partition=None, power_cap=None, soc_pstate=None, xgmi_plpd = None,
process_isolation=None):
process_isolation=None, clk_limit=None):
"""Issue reset commands to target gpu(s)
Args:
@@ -3712,6 +3712,8 @@ class AMDSMICommands():
args.xgmi_plpd = xgmi_plpd
if process_isolation:
args.process_isolation = process_isolation
if clk_limit:
args.clk_limit = clk_limit
# Handle No GPU passed
if args.gpu == None:
@@ -3734,7 +3736,8 @@ class AMDSMICommands():
args.power_cap is not None,
args.soc_pstate is not None,
args.xgmi_plpd is not None,
args.process_isolation is not None]):
args.process_isolation is not None,
args.clk_limit is not None]):
command = " ".join(sys.argv[1:])
raise AmdSmiRequiredCommandException(command, self.logger.format)
@@ -3860,6 +3863,17 @@ class AMDSMICommands():
raise ValueError(f"Unable to set process isolation to {status_string} on {gpu_string}") from e
self.logger.store_output(args.gpu, 'process_isolation', result)
if isinstance(args.clk_limit, tuple):
try:
clk_type = args.clk_limit.clk_type
lim_type = args.clk_limit.lim_type
val = args.clk_limit.val
amdsmi_interface.amdsmi_set_gpu_clk_limit(args.gpu, clk_type, lim_type, val)
except amdsmi_exception.AmdSmiLibraryException as e:
if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM:
raise PermissionError('Command requires elevation') from e
raise ValueError(f"Unable to set {args.clk_limit.lim_type} of {args.clk_limit.clk_type} to {args.clk_limit.val} on {gpu_string}") from e
self.logger.store_output(args.gpu, 'clk_limit', f"Successfully changed {args.clk_limit.lim_type} of {args.clk_limit.clk_type} to {args.clk_limit.val}")
if multiple_devices:
self.logger.store_multiple_device_output()
@@ -3875,7 +3889,7 @@ class AMDSMICommands():
cpu_pwr_eff_mode=None, cpu_gmi3_link_width=None, cpu_pcie_link_rate=None,
cpu_df_pstate_range=None, cpu_enable_apb=None, cpu_disable_apb=None,
soc_boost_limit=None, core=None, core_boost_limit=None, soc_pstate=None, xgmi_plpd=None,
process_isolation=None):
process_isolation=None, clk_limit=None):
"""Issue reset commands to target gpu(s)
Args:
@@ -3926,8 +3940,8 @@ class AMDSMICommands():
# Check if a GPU argument has been set
gpu_args_enabled = False
gpu_attributes = ["fan", "perf_level", "profile", "perf_determinism", "compute_partition",
"memory_partition", "power_cap", "soc_pstate", "xgmi_plpd", "process_isolation",
]
"memory_partition", "power_cap", "soc_pstate", "xgmi_plpd",
"process_isolation", "clk_limit"]
for attr in gpu_attributes:
if hasattr(args, attr):
if getattr(args, attr) is not None:
@@ -3983,7 +3997,7 @@ class AMDSMICommands():
self.set_gpu(args, multiple_devices, gpu, fan, perf_level,
profile, perf_determinism, compute_partition,
memory_partition, power_cap, soc_pstate, xgmi_plpd,
process_isolation)
process_isolation, clk_limit)
elif self.helpers.is_amd_hsmp_initialized(): # Only CPU is initialized
if args.cpu == None and args.core == None:
raise ValueError('No CPU or CORE provided, specific target(s) are needed')
@@ -4003,7 +4017,7 @@ class AMDSMICommands():
self.set_gpu(args, multiple_devices, gpu, fan, perf_level,
profile, perf_determinism, compute_partition,
memory_partition, power_cap, soc_pstate, xgmi_plpd,
process_isolation)
process_isolation, clk_limit)
def reset(self, args, multiple_devices=False, gpu=None, gpureset=None,
+24
View File
@@ -26,6 +26,7 @@ import errno
import os
import sys
import time
import collections
from pathlib import Path
@@ -172,6 +173,27 @@ class AMDSMIParser(argparse.ArgumentParser):
raise amdsmi_cli_exceptions.AmdSmiInvalidParameterValueException(string_value, outputformat)
def _limit_select(self):
"""Custom action for setting clock limits"""
output_format = self.helpers.get_output_format()
class AMDSMILimitArgs(argparse.Action):
def __call__(self, parser: AMDSMIParser, namespace: argparse.Namespace,
values: str | list | None, option_string: str | None = None) -> None:
# valid values
valid_clk_types = ('sclk', 'mclk')
valid_lim_types = ('min', 'max')
clk_type, lim_type, val = values
if clk_type not in valid_clk_types:
raise amdsmi_cli_exceptions.AmdSmiInvalidParameterValueException(clk_type, output_format)
if lim_type not in valid_lim_types:
raise amdsmi_cli_exceptions.AmdSmiInvalidParameterValueException(lim_type, output_format)
val = int(val)
clk_limit_args = collections.namedtuple('clk_limit_args', ['clk_type', 'lim_type', 'val'])
setattr(namespace, self.dest, clk_limit_args(clk_type, lim_type, val))
return AMDSMILimitArgs
def _check_output_file_path(self):
""" Argument action validator:
Returns a path to a file from the output file path provided.
@@ -1014,6 +1036,7 @@ class AMDSMIParser(argparse.ArgumentParser):
set_soc_pstate_help = "Set the GPU soc pstate policy using policy id\n"
set_xgmi_plpd_help = "Set the GPU XGMI per-link power down policy using policy id\n"
set_process_isolation_help = "Enable or disable the GPU process isolation: 0 for disable and 1 for enable.\n"
set_clk_limit_help = "Sets the sclk (aka gfxclk) or mclk minimum and maximum frequencies. \nOf form: amd-smi set -L (sclk | mclk) (min | max) value"
# Help text for CPU set options
set_cpu_pwr_limit_help = "Set power limit for the given socket. Input parameter is power limit value."
@@ -1052,6 +1075,7 @@ class AMDSMIParser(argparse.ArgumentParser):
set_value_parser.add_argument('-o', '--power-cap', action='store', type=self._positive_int, required=False, help=set_power_cap_help, metavar='WATTS')
set_value_parser.add_argument('-p', '--soc-pstate', action='store', required=False, type=self._not_negative_int, help=set_soc_pstate_help, metavar='POLICY_ID')
set_value_parser.add_argument('-x', '--xgmi-plpd', action='store', required=False, type=self._not_negative_int, help=set_xgmi_plpd_help, metavar='POLICY_ID')
set_value_parser.add_argument('-L', '--clk-limit', action=self._limit_select(), nargs=3, required=False, help=set_clk_limit_help, metavar=('CLK_TYPE', 'LIM_TYPE', 'VALUE'))
set_value_parser.add_argument('-R', '--process-isolation', action='store', choices=[0,1], type=self._not_negative_int, required=False, help=set_process_isolation_help, metavar='STATUS')
+1
View File
@@ -124,6 +124,7 @@ from .amdsmi_interface import amdsmi_set_gpu_pci_bandwidth
from .amdsmi_interface import amdsmi_set_power_cap
from .amdsmi_interface import amdsmi_set_gpu_power_profile
from .amdsmi_interface import amdsmi_set_gpu_clk_range
from .amdsmi_interface import amdsmi_set_gpu_clk_limit
from .amdsmi_interface import amdsmi_set_gpu_od_clk_info
from .amdsmi_interface import amdsmi_set_gpu_od_volt_info
from .amdsmi_interface import amdsmi_set_gpu_perf_level
+30
View File
@@ -3190,6 +3190,36 @@ def amdsmi_set_gpu_clk_range(
)
def amdsmi_set_gpu_clk_limit(
    processor_handle: amdsmi_wrapper.amdsmi_processor_handle,
    clk_type: str,
    limit_type: str,
    value: int
) -> None:
    """Set a minimum or maximum clock limit on a processor.

    Args:
        processor_handle: handle of the target processor.
        clk_type: "sclk" or "mclk" (case-insensitive); mapped to
            ``AMDSMI_CLK_TYPE_SYS`` / ``AMDSMI_CLK_TYPE_MEM``.
        limit_type: "min" or "max" (case-insensitive); mapped to
            ``CLK_LIMIT_MIN`` / ``CLK_LIMIT_MAX``.
        value: the limit, forwarded to the library as an unsigned 64-bit
            integer (units are defined by the underlying library call).

    Raises:
        AmdSmiParameterException: if an argument has the wrong type.
        ValueError: if ``clk_type`` or ``limit_type`` is a string other
            than the accepted names.
        Whatever ``_check_res`` raises when the library call reports a
        failure status.
    """
    if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle):
        raise AmdSmiParameterException(
            processor_handle, amdsmi_wrapper.amdsmi_processor_handle
        )
    if not isinstance(value, int):
        raise AmdSmiParameterException(value, int)
    # Reject non-strings up front so callers get the module's parameter
    # exception rather than an AttributeError from ``.lower()``.
    if not isinstance(clk_type, str):
        raise AmdSmiParameterException(clk_type, str)
    if not isinstance(limit_type, str):
        raise AmdSmiParameterException(limit_type, str)

    clk_type_map = {
        "sclk": amdsmi_wrapper.AMDSMI_CLK_TYPE_SYS,
        "mclk": amdsmi_wrapper.AMDSMI_CLK_TYPE_MEM,
    }
    limit_type_map = {
        "min": amdsmi_wrapper.CLK_LIMIT_MIN,
        "max": amdsmi_wrapper.CLK_LIMIT_MAX,
    }

    # An unrecognised name previously fell through the if/elif chain and
    # surfaced as an UnboundLocalError; fail with a clear message instead.
    try:
        clk_type_conversion = clk_type_map[clk_type.lower()]
    except KeyError:
        raise ValueError(
            f"Invalid clk_type {clk_type!r}; expected one of {sorted(clk_type_map)}"
        ) from None
    try:
        limit_type_conversion = limit_type_map[limit_type.lower()]
    except KeyError:
        raise ValueError(
            f"Invalid limit_type {limit_type!r}; expected one of {sorted(limit_type_map)}"
        ) from None

    _check_res(
        amdsmi_wrapper.amdsmi_set_gpu_clk_limit(
            processor_handle,
            amdsmi_wrapper.amdsmi_clk_type_t(clk_type_conversion),
            amdsmi_wrapper.amdsmi_clk_limit_type_t(limit_type_conversion),
            ctypes.c_uint64(value),
        )
    )
def amdsmi_get_gpu_memory_total(processor_handle: amdsmi_wrapper.amdsmi_processor_handle, mem_type: AmdSmiMemoryType):
if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle):
raise AmdSmiParameterException(