[SWDEV-484382] Added new command amd-smi static --clock

Signed-off-by: gabrpham <Gabriel.Pham@amd.com>
Change-Id: I49e1aa2e699734d81c40c76c62da1cecc5bd3c0e


[ROCm/amdsmi commit: bc16e1a5da]
This commit is contained in:
gabrpham
2024-12-04 00:10:32 -06:00
committed by Gabriel Pham
parent df85708b46
commit cb42e8e444
4 changed files with 146 additions and 8 deletions
+41
View File
@@ -3,6 +3,47 @@
Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/projects/amdsmi](https://rocm.docs.amd.com/projects/amdsmi/en/latest/).
***All information listed below is for reference and subject to change.***
## amd_smi_lib for ROCm 6.4.0
### Added
- **Added new option `amd-smi static -C/--clock`**.
This new option displays the current performance level and the available frequency levels of the selected clocks for the selected GPUs.
```shell
amd-smi static --clock all -g 0
GPU: 0
CLOCK:
SYS:
CURRENT LEVEL: 2
FREQUENCY_LEVELS:
0: 300 MHz
1: 904 MHz
2: 1165 MHz
3: 1360 MHz
4: 1440 MHz
5: 1544 MHz
6: 1627 MHz
7: 1720 MHz
8: 1800 MHz
MEM:
CURRENT LEVEL: 0
FREQUENCY_LEVELS:
0: 167 MHz
DF:
CURRENT LEVEL: 0
FREQUENCY_LEVELS:
0: 1400 MHz
SOC:
CURRENT LEVEL: 0
FREQUENCY_LEVELS:
0: 302 MHz
DCEF: N/A
VCLK0: N/A
VCLK1: N/A
DCLK0: N/A
DCLK1: N/A
```
## amd_smi_lib for ROCm 6.4.0
+68 -8
View File
@@ -31,7 +31,7 @@ import os
from _version import __version__
from amdsmi_helpers import AMDSMIHelpers
from amdsmi_logger import AMDSMILogger
from amdsmi_cli_exceptions import AmdSmiRequiredCommandException
from amdsmi_cli_exceptions import AmdSmiRequiredCommandException, AmdSmiInvalidParameterValueException
from rocm_version import get_rocm_version
from amdsmi import amdsmi_interface
from amdsmi import amdsmi_exception
@@ -271,7 +271,7 @@ class AMDSMICommands():
def static_gpu(self, args, multiple_devices=False, gpu=None, asic=None, bus=None, vbios=None,
limit=None, driver=None, ras=None, board=None, numa=None, vram=None,
cache=None, partition=None, dfc_ucode=None, fb_info=None, num_vf=None,
soc_pstate=None, xgmi_plpd=None, process_isolation=None):
soc_pstate=None, xgmi_plpd=None, process_isolation=None, clock=None):
"""Get Static information for target gpu
Args:
@@ -321,12 +321,19 @@ class AMDSMICommands():
args.cache = cache
if process_isolation:
args.process_isolation = process_isolation
if clock:
args.clock = clock
# args.clock defaults to False, so an empty list here means --clock was passed on the command line without any values
if args.clock == []:
args.clock = True
# Store args that are applicable to the current platform
current_platform_args = ["asic", "bus", "vbios", "driver", "ras",
"vram", "cache", "board", "process_isolation"]
"vram", "cache", "board", "process_isolation",
"clock"]
current_platform_values = [args.asic, args.bus, args.vbios, args.driver, args.ras,
args.vram, args.cache, args.board, args.process_isolation]
args.vram, args.cache, args.board, args.process_isolation,
args.clock]
if self.helpers.is_linux() and self.helpers.is_baremetal():
if partition:
@@ -829,6 +836,58 @@ class AMDSMICommands():
logging.debug("Failed to get cache info for gpu %s | %s", gpu_id, e.get_error_info())
static_dict['cache_info'] = cache_info_list
if 'clock' in current_platform_args:
if isinstance(args.clock, bool) and args.clock == True:
args.clock = ['sys', 'mem', 'df', 'soc', 'dcef', 'vclk0', 'vclk1', 'dclk0', 'dclk1']
if isinstance(args.clock, list):
# remove potential duplicates from list
args.clock = list(set(args.clock))
# expand "all" (or an empty selection) to every supported clock type; unrecognized names are rejected in the loop below
if "all" in args.clock or len(args.clock) == 0:
args.clock = ['sys', 'mem', 'df', 'soc', 'dcef', 'vclk0', 'vclk1', 'dclk0', 'dclk1']
clk_dict = {}
for clk in args.clock:
clk_type = clk.lower()
if clk_type == "sys":
clk_type_conversion = amdsmi_interface.AmdSmiClkType.SYS
elif clk_type == "mem":
clk_type_conversion = amdsmi_interface.AmdSmiClkType.MEM
elif clk_type == "df":
clk_type_conversion = amdsmi_interface.AmdSmiClkType.DF
elif clk_type == "soc":
clk_type_conversion = amdsmi_interface.AmdSmiClkType.SOC
elif clk_type == "dcef":
clk_type_conversion = amdsmi_interface.AmdSmiClkType.DCEF
# vclk and dclk currently do not support levels so average clk is given for frequency levels
elif clk_type == "vclk0":
clk_type_conversion = amdsmi_interface.AmdSmiClkType.VCLK0
elif clk_type == "vclk1":
clk_type_conversion = amdsmi_interface.AmdSmiClkType.VCLK1
elif clk_type == "dclk0":
clk_type_conversion = amdsmi_interface.AmdSmiClkType.DCLK0
elif clk_type == "dclk1":
clk_type_conversion = amdsmi_interface.AmdSmiClkType.DCLK1
else:
clk_type_conversion = "N/A"
output_format = self.helpers.get_output_format()
raise AmdSmiInvalidParameterValueException(clk_type, output_format) # clk type given is bad
try:
frequencies = amdsmi_interface.amdsmi_get_clk_freq(args.gpu, clk_type_conversion)
freq_dict = {}
freq_dict.update({'current level':frequencies['current']})
freq_dict.update({'frequency_levels':{}})
for level in range(len(frequencies['frequency'])):
freq = str(self.helpers.convert_SI_unit(frequencies['frequency'][level], AMDSMIHelpers.SI_Unit.MICRO)) + " MHz"
freq_dict['frequency_levels'].update({level:freq})
except amdsmi_exception.AmdSmiLibraryException as e:
freq_dict = "N/A"
clk_dict.update({clk:freq_dict})
static_dict['clock'] = clk_dict
else:
raise amdsmi_exception.AmdSmiParameterException(args.clock, list[str])
# Convert and store output by pid for csv format
multiple_devices_csv_override = False
@@ -864,7 +923,8 @@ class AMDSMICommands():
bus=None, vbios=None, limit=None, driver=None, ras=None,
board=None, numa=None, vram=None, cache=None, partition=None,
dfc_ucode=None, fb_info=None, num_vf=None, cpu=None,
interface_ver=None, soc_pstate=None, xgmi_plpd = None, process_isolation=None):
interface_ver=None, soc_pstate=None, xgmi_plpd = None, process_isolation=None,
clock=None):
"""Get Static information for target gpu and cpu
Args:
@@ -916,7 +976,7 @@ class AMDSMICommands():
gpu_attributes = ["asic", "bus", "vbios", "limit", "driver", "ras",
"board", "numa", "vram", "cache", "partition",
"dfc_ucode", "fb_info", "num_vf", "soc_pstate", "xgmi_plpd",
"process_isolation"]
"process_isolation", "clock"]
for attr in gpu_attributes:
if hasattr(args, attr):
if getattr(args, attr):
@@ -947,7 +1007,7 @@ class AMDSMICommands():
bus, vbios, limit, driver, ras,
board, numa, vram, cache, partition,
dfc_ucode, fb_info, num_vf, soc_pstate,
process_isolation)
process_isolation, clock)
elif self.helpers.is_amd_hsmp_initialized(): # Only CPU is initialized
if args.cpu == None:
args.cpu = self.cpu_handles
@@ -962,7 +1022,7 @@ class AMDSMICommands():
bus, vbios, limit, driver, ras,
board, numa, vram, cache, partition,
dfc_ucode, fb_info, num_vf, soc_pstate, xgmi_plpd,
process_isolation)
process_isolation, clock)
def firmware(self, args, multiple_devices=False, gpu=None, fw_list=True):
@@ -607,6 +607,10 @@ class AMDSMIParser(argparse.ArgumentParser):
soc_pstate_help = "The available soc pstate policy"
xgmi_plpd_help = "The available XGMI per-link power down policy"
process_isolation_help = "The process isolation status"
clk_options = self.helpers.get_clock_types()[0]
clk_options.remove('PCIE')
clk_option_str = ", ".join(clk_options) + ", ALL"
clock_help = f"Show one or more valid clock frequency levels. Available options:\n\t{clk_option_str}"
# Options arguments help text for Hypervisors and Baremetal
ras_help = "Displays RAS features information"
@@ -642,6 +646,7 @@ class AMDSMIParser(argparse.ArgumentParser):
static_parser.add_argument('-B', '--board', action='store_true', required=False, help=board_help)
static_parser.add_argument('-R', '--process-isolation', action='store_true', required=False, help=process_isolation_help)
static_parser.add_argument('-r', '--ras', action='store_true', required=False, help=ras_help)
static_parser.add_argument('-C', '--clock', default=False, nargs='*', type=str, required=False, help=clock_help)
# Options to display on Hypervisors and Baremetal
if self.helpers.is_hypervisor() or self.helpers.is_baremetal():
@@ -158,6 +158,8 @@ Static Arguments:
-B, --board All board information
-R, --process-isolation The process isolation status
-r, --ras Displays RAS features information
-C, --clock [CLOCK ...] Show one or more valid clock frequency levels. Available options:
SYS, DF, DCEF, SOC, MEM, VCLK0, VCLK1, DCLK0, DCLK1, ALL
-p, --partition Partition information
-l, --limit All limit metric values (i.e. power and thermal limits)
-P, --policy The available DPM policy
@@ -855,5 +857,35 @@ GPU: 0
CACHE_LEVEL: 3
MAX_NUM_CU_SHARED: 228
NUM_CACHE_INSTANCE: 1
CLOCK:
SYS:
CURRENT LEVEL: 2
FREQUENCY_LEVELS:
0: 300 MHz
1: 904 MHz
2: 1165 MHz
3: 1360 MHz
4: 1440 MHz
5: 1544 MHz
6: 1627 MHz
7: 1720 MHz
8: 1800 MHz
DF:
CURRENT LEVEL: 0
FREQUENCY_LEVELS:
0: 1400 MHz
DCEF: N/A
SOC:
CURRENT LEVEL: 0
FREQUENCY_LEVELS:
0: 302 MHz
MEM:
CURRENT LEVEL: 0
FREQUENCY_LEVELS:
0: 167 MHz
VCLK0: N/A
VCLK1: N/A
DCLK1: N/A
DCLK0: N/A
...
```