Files
rocm-systems/projects/amdsmi/amdsmi_cli/amdsmi_commands.py
T

5803 řádky
313 KiB
Python
Surový Normální zobrazení Historie

2023-03-20 13:29:28 -05:00
#!/usr/bin/env python3
2023-03-06 06:20:21 -06:00
#
# Copyright (C) Advanced Micro Devices. All rights reserved.
2023-03-06 06:20:21 -06:00
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of
# this software and associated documentation files (the "Software"), to deal in
# the Software without restriction, including without limitation the rights to
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
# the Software, and to permit persons to whom the Software is furnished to do so,
# subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
import logging
2023-09-27 02:37:46 -05:00
import sys
2023-03-28 15:32:17 -05:00
import threading
2023-04-21 08:02:53 -05:00
import time
2023-12-06 03:25:38 -06:00
import json
import multiprocessing
import threading
import os
2023-03-06 06:20:21 -06:00
2023-03-28 15:32:17 -05:00
from _version import __version__
2023-03-17 05:34:24 -05:00
from amdsmi_helpers import AMDSMIHelpers
2023-03-06 06:20:21 -06:00
from amdsmi_logger import AMDSMILogger
from amdsmi_cli_exceptions import AmdSmiRequiredCommandException, AmdSmiInvalidParameterException
2023-12-12 15:54:00 -06:00
from rocm_version import get_rocm_version
2023-03-17 05:34:24 -05:00
from amdsmi import amdsmi_interface
2023-03-28 15:32:17 -05:00
from amdsmi import amdsmi_exception
2023-03-06 06:20:21 -06:00
class AMDSMICommands():
2023-04-21 08:02:53 -05:00
"""This class contains all the commands corresponding to AMDSMIParser
Each command function will interact with AMDSMILogger to handle
2023-09-24 02:03:51 -05:00
displaying the output to the specified format and destination.
2023-04-21 08:02:53 -05:00
"""
2023-09-24 02:03:51 -05:00
def __init__(self, format='human_readable', destination='stdout') -> None:
    """Create the helper and logger objects and discover device handles.

    GPU handles are queried only when the helpers report the amdgpu driver
    as initialized; CPU socket and core handles only when amd_hsmp is
    reported as initialized. If an initialized driver yields no devices the
    process exits with status -1.

    Args:
        format (str, optional): Output format forwarded to AMDSMILogger.
            Defaults to 'human_readable'.
        destination (str, optional): Output destination forwarded to
            AMDSMILogger. Defaults to 'stdout'.

    Raises:
        AmdSmiLibraryException: Re-raised when a handle query fails with an
            error code other than the tolerated "not initialized" /
            "driver not loaded" codes.
        SystemExit: Raised via sys.exit(-1) when no devices are detected for
            an initialized driver.
    """
    self.helpers = AMDSMIHelpers()
    self.logger = AMDSMILogger(format=format, destination=destination)
    self.device_handles = []
    self.cpu_handles = []
    self.core_handles = []
    self.stop = ''

    init_flag = self.helpers.get_amdsmi_init_flag()
    logging.debug(f"AMDSMI Init Flag: {init_flag}")

    should_exit = False

    if self.helpers.is_amdgpu_initialized():
        try:
            self.device_handles = amdsmi_interface.amdsmi_get_processor_handles()
        except amdsmi_exception.AmdSmiLibraryException as err:
            status = amdsmi_interface.amdsmi_wrapper
            if err.err_code not in (status.AMDSMI_STATUS_NOT_INIT,
                                    status.AMDSMI_STATUS_DRIVER_NOT_LOADED):
                raise err
            logging.error('Unable to get devices, driver not initialized (amdgpu not found in modules)')

        # The driver claims to be initialized, so an empty list is fatal
        if not self.device_handles:
            logging.error('Unable to detect any GPU devices, check amdgpu version and module status (sudo modprobe amdgpu)')
            should_exit = True

    if self.helpers.is_amd_hsmp_initialized():
        # CPU socket handles
        try:
            self.cpu_handles = amdsmi_interface.amdsmi_get_cpusocket_handles()
        except amdsmi_exception.AmdSmiLibraryException as err:
            status = amdsmi_interface.amdsmi_wrapper
            if err.err_code not in (status.AMDSMI_STATUS_NOT_INIT,
                                    status.AMDSMI_STATUS_NO_DRV):
                raise err
            logging.info('Unable to detect any CPU devices, check amd_hsmp version and module status (sudo modprobe amd_hsmp)')

        # CPU core handles
        try:
            self.core_handles = amdsmi_interface.amdsmi_get_cpucore_handles()
        except amdsmi_exception.AmdSmiLibraryException as err:
            status = amdsmi_interface.amdsmi_wrapper
            if err.err_code not in (status.AMDSMI_STATUS_NOT_INIT,
                                    status.AMDSMI_STATUS_NO_DRV):
                raise err
            logging.info('Unable to get CORE devices, amd_hsmp driver not loaded (sudo modprobe amd_hsmp)')

        # Fatal only when neither sockets nor cores could be found
        if not self.cpu_handles and not self.core_handles:
            logging.error('Unable to detect any CPU devices, check amd_hsmp version and module status (sudo modprobe amd_hsmp)')
            should_exit = True

    if should_exit:
        sys.exit(-1)
2023-03-17 14:58:50 +01:00
2023-03-06 06:20:21 -06:00
def version(self, args):
    """Print the tool, AMD SMI library, and ROCm version strings.

    The library version and the ROCm version are queried independently, so a
    failure of the library query can no longer leave the ROCm version
    unbound when the output is assembled.

    Args:
        args (Namespace): Namespace containing the parsed CLI args
            (not read by this command)

    Returns:
        None: Print output via AMDSMILogger to destination
    """
    try:
        amdsmi_lib_version = amdsmi_interface.amdsmi_get_lib_version()
        amdsmi_lib_version_str = f"{amdsmi_lib_version['year']}.{amdsmi_lib_version['major']}.{amdsmi_lib_version['minor']}.{amdsmi_lib_version['release']}"
    except amdsmi_exception.AmdSmiLibraryException as e:
        amdsmi_lib_version_str = e.get_error_info()

    # Separate guard: previously this lived in the try block above, so a
    # library-version failure skipped it and the later use of
    # rocm_version_str raised UnboundLocalError.
    try:
        rocm_version_str = get_rocm_version()
    except amdsmi_exception.AmdSmiLibraryException as e:
        rocm_version_str = e.get_error_info()

    self.logger.output['tool'] = 'AMDSMI Tool'
    self.logger.output['version'] = f'{__version__}'
    self.logger.output['amdsmi_library_version'] = f'{amdsmi_lib_version_str}'
    self.logger.output['rocm_version'] = f'{rocm_version_str}'

    if self.logger.is_human_readable_format():
        human_readable_output = f"AMDSMI Tool: {__version__} | " \
                                f"AMDSMI Library version: {amdsmi_lib_version_str} | " \
                                f"ROCm version: {rocm_version_str}"
        # Custom human readable handling for version
        if self.logger.destination == 'stdout':
            print(human_readable_output)
        else:
            # Any non-stdout destination is opened for append via its own
            # open() method (presumably a pathlib.Path -- confirm in AMDSMILogger)
            with self.logger.destination.open('a', encoding="utf-8") as output_file:
                output_file.write(human_readable_output + '\n')
    elif self.logger.is_json_format() or self.logger.is_csv_format():
        self.logger.print_output()
2023-09-18 03:39:03 -05:00
def list(self, args, multiple_devices=False, gpu=None):
    """List identifying information (BDF, UUID, KFD/node/partition ids) for target gpu

    Args:
        args (Namespace): Namespace containing the parsed CLI args
        multiple_devices (bool, optional): True if checking for multiple devices. Defaults to False.
        gpu (device_handle, optional): device_handle for target device. Defaults to None.

    Raises:
        IndexError: Per the original documentation, raised when the gpu list
            is empty (presumably from the multi-GPU helper -- confirm there)

    Returns:
        None: Print output via AMDSMILogger to destination
    """
    # Set args.* to passed in arguments
    if gpu:
        args.gpu = gpu

    # Handle No GPU passed
    if args.gpu is None:
        args.gpu = self.device_handles

    # Handle multiple GPUs; the helper is handed self.list so it can invoke
    # this method again for each device
    handled_multiple_gpus, device_handle = self.helpers.handle_gpus(args, self.logger, self.list)
    if handled_multiple_gpus:
        return # This function is recursive
    args.gpu = device_handle

    # Get gpu_id for logging
    gpu_id = self.helpers.get_gpu_id_from_device_handle(args.gpu)

    try:
        bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(args.gpu)
    except amdsmi_exception.AmdSmiLibraryException as e:
        bdf = e.get_error_info()

    try:
        uuid = amdsmi_interface.amdsmi_get_gpu_device_uuid(args.gpu)
    except amdsmi_exception.AmdSmiLibraryException as e:
        uuid = e.get_error_info()

    # Default all KFD-derived fields up front. Previously partition_id was
    # left unbound when the KFD query failed, which raised UnboundLocalError
    # when the value was stored below.
    kfd_id = node_id = partition_id = "N/A"
    try:
        kfd_info = amdsmi_interface.amdsmi_get_gpu_kfd_info(args.gpu)
        kfd_id = kfd_info['kfd_id']
        node_id = kfd_info['node_id']
        partition_id = kfd_info['current_partition_id']
    except amdsmi_exception.AmdSmiLibraryException as e:
        logging.debug("Failed to get kfd info for gpu %s | %s", gpu_id, e.get_error_info())

    # CSV format is intentionally aligned with Host
    if self.logger.is_csv_format():
        bdf_key, uuid_key = 'gpu_bdf', 'gpu_uuid'
    else:
        bdf_key, uuid_key = 'bdf', 'uuid'

    self.logger.store_output(args.gpu, bdf_key, bdf)
    self.logger.store_output(args.gpu, uuid_key, uuid)
    self.logger.store_output(args.gpu, 'kfd_id', kfd_id)
    self.logger.store_output(args.gpu, 'node_id', node_id)
    self.logger.store_output(args.gpu, 'partition_id', partition_id)

    if multiple_devices:
        self.logger.store_multiple_device_output()
        return # Skip printing when there are multiple devices

    self.logger.print_output()
2023-03-06 06:20:21 -06:00
def static_cpu(self, args, multiple_devices=False, cpu=None, interface_ver=None):
    """Gather static information (SMU firmware, HSMP interface version) for target cpu

    Args:
        args (Namespace): Namespace containing the parsed CLI args
        multiple_devices (bool, optional): True if checking for multiple devices. Defaults to False.
        cpu (device_handle, optional): Value override for args.cpu. Defaults to None.
        interface_ver (bool, optional): Value override for args.interface_ver. Defaults to None.

    Returns:
        None: Print output via AMDSMILogger to destination
    """
    # Apply caller-supplied overrides onto the parsed args
    if cpu:
        args.cpu = cpu
    if interface_ver:
        args.interface_ver = interface_ver

    # With no cpu option selected, switch on every option this platform supports
    if not (args.smu or args.interface_ver):
        for option_name in ("smu", "interface_ver"):
            setattr(args, option_name, True)

    # Multiple CPUs are delegated to the helper, which is handed this method
    # so it can invoke it per device
    handled_multiple_cpus, cpu_handle = self.helpers.handle_cpus(args, self.logger, self.static_cpu)
    if handled_multiple_cpus:
        return
    args.cpu = cpu_handle

    # cpu id is only used for log messages
    cpu_id = self.helpers.get_cpu_id_from_device_handle(args.cpu)
    logging.debug(f"Static Arg information for CPU {cpu_id} on {self.helpers.os_info()}")

    static_dict = {}

    if args.smu:
        try:
            fw = amdsmi_interface.amdsmi_get_cpu_smu_fw_version(args.cpu)
            fw_version = (f"{fw['smu_fw_major_ver_num']}."
                          f"{fw['smu_fw_minor_ver_num']}.{fw['smu_fw_debug_ver_num']}")
            static_dict["smu"] = {"FW_VERSION" : fw_version}
        except amdsmi_exception.AmdSmiLibraryException as err:
            static_dict["smu"] = "N/A"
            logging.debug("Failed to get SMU FW for cpu %s | %s", cpu_id, err.get_error_info())

    if args.interface_ver:
        try:
            proto_version = amdsmi_interface.amdsmi_get_cpu_hsmp_proto_ver(args.cpu)
        except amdsmi_exception.AmdSmiLibraryException as err:
            proto_version = "N/A"
            logging.debug("Failed to get proto version for cpu %s | %s", cpu_id, err.get_error_info())
        static_dict["interface_version"] = {"proto version": proto_version}

    self.logger.store_cpu_output(args.cpu, 'values', static_dict)

    if multiple_devices:
        # Output is accumulated per device; the caller prints it later
        self.logger.store_multiple_device_output()
        return

    self.logger.print_output(multiple_device_enabled=False)
def static_gpu(self, args, multiple_devices=False, gpu=None, asic=None, bus=None, vbios=None,
limit=None, driver=None, ras=None, board=None, numa=None, vram=None,
2024-03-20 12:06:24 -05:00
cache=None, partition=None, dfc_ucode=None, fb_info=None, num_vf=None,
soc_pstate=None, xgmi_plpd=None, process_isolation=None, clock=None):
2023-03-06 06:20:21 -06:00
"""Get Static information for target gpu
Args:
args (Namespace): Namespace containing the parsed CLI args
current_platform_args (list): gpu supported platform arguments
current_platform_values (list): gpu supported platform values for each argument
2023-03-06 06:20:21 -06:00
multiple_devices (bool, optional): True if checking for multiple devices. Defaults to False.
gpu (device_handle, optional): device_handle for target device. Defaults to None.
asic (bool, optional): Value override for args.asic. Defaults to None.
bus (bool, optional): Value override for args.bus. Defaults to None.
vbios (bool, optional): Value override for args.vbios. Defaults to None.
limit (bool, optional): Value override for args.limit. Defaults to None.
driver (bool, optional): Value override for args.driver. Defaults to None.
ras (bool, optional): Value override for args.ras. Defaults to None.
board (bool, optional): Value override for args.board. Defaults to None.
2023-04-21 15:10:38 -05:00
numa (bool, optional): Value override for args.numa. Defaults to None.
2023-09-22 05:10:45 -05:00
vram (bool, optional): Value override for args.vram. Defaults to None.
2023-10-10 20:42:52 -05:00
cache (bool, optional): Value override for args.cache. Defaults to None.
partition (bool, optional): Value override for args.partition. Defaults to None.
2023-10-27 09:39:31 -05:00
dfc_ucode (bool, optional): Value override for args.dfc_ucode. Defaults to None.
fb_info (bool, optional): Value override for args.fb_info. Defaults to None.
num_vf (bool, optional): Value override for args.num_vf. Defaults to None.
soc_pstate (bool, optional): Value override for args.soc_pstate. Defaults to None.
2024-03-20 12:06:24 -05:00
xgmi_plpd (bool, optional): Value override for args.xgmi_plpd. Defaults to None.
2024-04-08 10:35:24 -05:00
process_isolation (bool, optional): Value override for args.process_isolation. Defaults to None.
2023-03-06 06:20:21 -06:00
Returns:
None: Print output via AMDSMILogger to destination
"""
2023-03-06 06:20:21 -06:00
if gpu:
args.gpu = gpu
if asic:
args.asic = asic
if bus:
args.bus = bus
if vbios:
args.vbios = vbios
2023-10-05 17:31:34 -05:00
if board:
args.board = board
if driver:
args.driver = driver
if ras:
args.ras = ras
2023-10-05 02:13:36 -05:00
if vram:
args.vram = vram
2023-10-10 20:42:52 -05:00
if cache:
args.cache = cache
2024-05-28 11:26:25 -05:00
if process_isolation:
args.process_isolation = process_isolation
if clock:
args.clock = clock
# args.clock defaults to False so if it was overwritten to empty list, that indicates that it was given as an arguments but with an empty list
if args.clock == []:
args.clock = True
2023-10-27 09:39:31 -05:00
# Store args that are applicable to the current platform
current_platform_args = ["asic", "bus", "vbios", "driver", "ras",
"vram", "cache", "board", "process_isolation",
"clock"]
current_platform_values = [args.asic, args.bus, args.vbios, args.driver, args.ras,
args.vram, args.cache, args.board, args.process_isolation,
args.clock]
2023-10-27 09:39:31 -05:00
if self.helpers.is_linux() and self.helpers.is_baremetal():
if partition:
args.partition = partition
2023-04-19 17:40:03 +02:00
if limit:
args.limit = limit
if soc_pstate:
args.soc_pstate = soc_pstate
2024-03-20 12:06:24 -05:00
if xgmi_plpd:
args.xgmi_plpd = xgmi_plpd
current_platform_args += ["ras", "limit", "partition", "soc_pstate", "xgmi_plpd"]
current_platform_values += [args.ras, args.limit, args.partition, args.soc_pstate, args.xgmi_plpd]
2023-10-27 09:39:31 -05:00
if self.helpers.is_linux() and not self.helpers.is_virtual_os():
if numa:
args.numa = numa
current_platform_args += ["numa"]
current_platform_values += [args.numa]
if self.helpers.is_hypervisor():
if dfc_ucode:
args.dfc_ucode = dfc_ucode
if fb_info:
args.fb_info = fb_info
if num_vf:
args.num_vf = num_vf
current_platform_args += ["dfc_ucode", "fb_info", "num_vf"]
current_platform_values += [args.dfc_ucode, args.fb_info, args.num_vf]
2023-03-06 06:20:21 -06:00
if not any(current_platform_values):
for arg in current_platform_args:
setattr(args, arg, True)
2023-03-06 06:20:21 -06:00
handled_multiple_gpus, device_handle = self.helpers.handle_gpus(args, self.logger, self.static_gpu)
2023-03-28 15:32:17 -05:00
if handled_multiple_gpus:
2023-03-30 10:07:46 -05:00
return # This function is recursive
args.gpu = device_handle
2023-09-24 16:12:30 -05:00
# Get gpu_id for logging
gpu_id = self.helpers.get_gpu_id_from_device_handle(args.gpu)
2023-10-27 09:39:31 -05:00
logging.debug(f"Static Arg information for GPU {gpu_id} on {self.helpers.os_info()}")
logging.debug(f"Applicable Args: {current_platform_args}")
logging.debug(f"Arg Values: {current_platform_values}")
# Populate static dictionary for each enabled argument
2023-10-27 09:39:31 -05:00
static_dict = {}
2023-03-06 06:20:21 -06:00
if args.asic:
asic_dict = {
"market_name" : "N/A",
"vendor_id" : "N/A",
"vendor_name" : "N/A",
"subvendor_id" : "N/A",
"device_id" : "N/A",
"subsystem_id" : "N/A",
"rev_id" : "N/A",
"asic_serial" : "N/A",
"oam_id" : "N/A",
"num_compute_units" : "N/A",
"target_graphics_version" : "N/A"
}
2023-03-28 15:32:17 -05:00
try:
asic_info = amdsmi_interface.amdsmi_get_gpu_asic_info(args.gpu)
for key, value in asic_info.items():
asic_dict[key] = value
2023-03-28 15:32:17 -05:00
except amdsmi_exception.AmdSmiLibraryException as e:
logging.debug("Failed to get asic info for gpu %s | %s", gpu_id, e.get_error_info())
try:
subsystem_id = amdsmi_interface.amdsmi_get_gpu_subsystem_id(args.gpu)
asic_dict["subsystem_id"] = subsystem_id
except amdsmi_exception.AmdSmiLibraryException as e:
2023-09-26 14:45:44 -05:00
logging.debug("Failed to get asic info for gpu %s | %s", gpu_id, e.get_error_info())
static_dict['asic'] = asic_dict
2023-03-06 06:20:21 -06:00
if args.bus:
bus_info = {
'bdf': "N/A",
'max_pcie_width': "N/A",
'max_pcie_speed': "N/A",
'pcie_interface_version': "N/A",
'slot_type': "N/A"
}
2023-03-17 14:58:50 +01:00
2023-03-28 15:32:17 -05:00
try:
2023-10-05 02:26:54 -05:00
bus_info['bdf'] = amdsmi_interface.amdsmi_get_gpu_device_bdf(args.gpu)
except amdsmi_exception.AmdSmiLibraryException as e:
bus_info['bdf'] = "N/A"
logging.debug("Failed to get bdf for gpu %s | %s", gpu_id, e.get_error_info())
try:
2024-03-05 14:01:06 -06:00
pcie_static = amdsmi_interface.amdsmi_get_pcie_info(args.gpu)['pcie_static']
bus_info['max_pcie_width'] = pcie_static['max_pcie_width']
bus_info['max_pcie_speed'] = pcie_static['max_pcie_speed']
bus_info['pcie_interface_version'] = pcie_static['pcie_interface_version']
bus_info['slot_type'] = pcie_static['slot_type']
2023-08-01 06:20:12 -05:00
if bus_info['max_pcie_speed'] % 1000 != 0:
pcie_speed_GTs_value = round(bus_info['max_pcie_speed'] / 1000, 1)
else:
pcie_speed_GTs_value = round(bus_info['max_pcie_speed'] / 1000)
bus_info['max_pcie_speed'] = pcie_speed_GTs_value
if bus_info['pcie_interface_version'] > 0:
bus_info['pcie_interface_version'] = f"Gen {bus_info['pcie_interface_version']}"
# Set the unit for pcie_speed
pcie_speed_unit ='GT/s'
2023-03-17 14:58:50 +01:00
if self.logger.is_human_readable_format():
bus_info['max_pcie_speed'] = f"{bus_info['max_pcie_speed']} {pcie_speed_unit}"
if self.logger.is_json_format():
bus_info['max_pcie_speed'] = {"value" : bus_info['max_pcie_speed'],
"unit" : pcie_speed_unit}
2023-03-28 15:32:17 -05:00
except amdsmi_exception.AmdSmiLibraryException as e:
2023-09-26 14:45:44 -05:00
logging.debug("Failed to get bus info for gpu %s | %s", gpu_id, e.get_error_info())
2023-10-05 02:26:54 -05:00
static_dict['bus'] = bus_info
2023-03-06 06:20:21 -06:00
if args.vbios:
2023-03-28 15:32:17 -05:00
try:
vbios_info = amdsmi_interface.amdsmi_get_gpu_vbios_info(args.gpu)
2023-11-11 02:14:16 -06:00
for key, value in vbios_info.items():
if isinstance(value, str):
if value.strip() == '':
vbios_info[key] = "N/A"
2023-04-21 08:02:53 -05:00
static_dict['vbios'] = vbios_info
2023-03-17 14:58:50 +01:00
except amdsmi_exception.AmdSmiLibraryException as e:
static_dict['vbios'] = "N/A"
2023-09-26 14:45:44 -05:00
logging.debug("Failed to get vbios info for gpu %s | %s", gpu_id, e.get_error_info())
2023-10-27 09:39:31 -05:00
if 'limit' in current_platform_args:
2023-04-19 17:40:03 +02:00
if args.limit:
2023-09-18 18:11:39 -05:00
# Power limits
2023-04-19 17:40:03 +02:00
try:
2023-08-01 01:39:05 -05:00
power_limit_error = False
2023-10-17 00:08:43 -05:00
power_cap_info = amdsmi_interface.amdsmi_get_power_cap_info(args.gpu)
max_power_limit = power_cap_info['max_power_cap']
max_power_limit = self.helpers.convert_SI_unit(max_power_limit, AMDSMIHelpers.SI_Unit.MICRO)
2024-04-04 10:06:47 -05:00
min_power_limit = power_cap_info['min_power_cap']
min_power_limit = self.helpers.convert_SI_unit(min_power_limit, AMDSMIHelpers.SI_Unit.MICRO)
socket_power_limit = power_cap_info['power_cap']
socket_power_limit = self.helpers.convert_SI_unit(socket_power_limit, AMDSMIHelpers.SI_Unit.MICRO)
2023-04-19 17:40:03 +02:00
except amdsmi_exception.AmdSmiLibraryException as e:
2023-08-01 01:39:05 -05:00
power_limit_error = True
max_power_limit = "N/A"
socket_power_limit = "N/A"
2023-09-26 14:45:44 -05:00
logging.debug("Failed to get power cap info for gpu %s | %s", gpu_id, e.get_error_info())
2023-09-18 18:11:55 -05:00
# Edge temperature limits
2023-04-19 17:40:03 +02:00
try:
2023-09-18 18:11:55 -05:00
slowdown_temp_edge_limit_error = False
slowdown_temp_edge_limit = amdsmi_interface.amdsmi_get_temp_metric(args.gpu,
2023-04-19 17:40:03 +02:00
amdsmi_interface.AmdSmiTemperatureType.EDGE, amdsmi_interface.AmdSmiTemperatureMetric.CRITICAL)
except amdsmi_exception.AmdSmiLibraryException as e:
2023-09-18 18:11:55 -05:00
slowdown_temp_edge_limit_error = True
slowdown_temp_edge_limit = "N/A"
2023-09-26 14:45:44 -05:00
logging.debug("Failed to get edge temperature slowdown metric for gpu %s | %s", gpu_id, e.get_error_info())
2023-03-28 15:32:17 -05:00
2023-09-18 18:11:55 -05:00
if slowdown_temp_edge_limit == 0:
slowdown_temp_edge_limit_error = True
slowdown_temp_edge_limit = "N/A"
2023-04-19 17:40:03 +02:00
try:
2023-09-18 18:11:55 -05:00
shutdown_temp_edge_limit_error = False
shutdown_temp_edge_limit = amdsmi_interface.amdsmi_get_temp_metric(args.gpu,
amdsmi_interface.AmdSmiTemperatureType.EDGE, amdsmi_interface.AmdSmiTemperatureMetric.EMERGENCY)
except amdsmi_exception.AmdSmiLibraryException as e:
shutdown_temp_edge_limit_error = True
shutdown_temp_edge_limit = "N/A"
2023-09-26 14:45:44 -05:00
logging.debug("Failed to get edge temperature shutdown metrics for gpu %s | %s", gpu_id, e.get_error_info())
2023-09-18 18:11:55 -05:00
if shutdown_temp_edge_limit == 0:
shutdown_temp_edge_limit_error = True
shutdown_temp_edge_limit = "N/A"
2023-09-18 18:11:55 -05:00
# Hotspot/Junction temperature limits
try:
slowdown_temp_hotspot_limit_error = False
slowdown_temp_hotspot_limit = amdsmi_interface.amdsmi_get_temp_metric(args.gpu,
2023-09-13 09:45:33 -05:00
amdsmi_interface.AmdSmiTemperatureType.HOTSPOT, amdsmi_interface.AmdSmiTemperatureMetric.CRITICAL)
2023-04-19 17:40:03 +02:00
except amdsmi_exception.AmdSmiLibraryException as e:
2023-09-18 18:11:55 -05:00
slowdown_temp_hotspot_limit_error = True
slowdown_temp_hotspot_limit = "N/A"
2023-09-26 14:45:44 -05:00
logging.debug("Failed to get hotspot temperature slowdown metrics for gpu %s | %s", gpu_id, e.get_error_info())
2023-09-18 18:11:55 -05:00
try:
shutdown_temp_hotspot_limit_error = False
shutdown_temp_hotspot_limit = amdsmi_interface.amdsmi_get_temp_metric(args.gpu,
amdsmi_interface.AmdSmiTemperatureType.HOTSPOT, amdsmi_interface.AmdSmiTemperatureMetric.EMERGENCY)
except amdsmi_exception.AmdSmiLibraryException as e:
shutdown_temp_hotspot_limit_error = True
shutdown_temp_hotspot_limit = "N/A"
2023-09-26 14:45:44 -05:00
logging.debug("Failed to get hotspot temperature shutdown metrics for gpu %s | %s", gpu_id, e.get_error_info())
2023-03-28 15:32:17 -05:00
2023-09-18 18:11:55 -05:00
# VRAM temperature limits
2023-04-19 17:40:03 +02:00
try:
2023-09-18 18:11:55 -05:00
slowdown_temp_vram_limit_error = False
slowdown_temp_vram_limit = amdsmi_interface.amdsmi_get_temp_metric(args.gpu,
2023-04-19 17:40:03 +02:00
amdsmi_interface.AmdSmiTemperatureType.VRAM, amdsmi_interface.AmdSmiTemperatureMetric.CRITICAL)
except amdsmi_exception.AmdSmiLibraryException as e:
2023-09-18 18:11:55 -05:00
slowdown_temp_vram_limit_error = True
slowdown_temp_vram_limit = "N/A"
2023-09-26 14:45:44 -05:00
logging.debug("Failed to get vram temperature slowdown metrics for gpu %s | %s", gpu_id, e.get_error_info())
2023-09-18 18:11:55 -05:00
try:
shutdown_temp_vram_limit_error = False
shutdown_temp_vram_limit = amdsmi_interface.amdsmi_get_temp_metric(args.gpu,
amdsmi_interface.AmdSmiTemperatureType.VRAM, amdsmi_interface.AmdSmiTemperatureMetric.EMERGENCY)
except amdsmi_exception.AmdSmiLibraryException as e:
shutdown_temp_vram_limit_error = True
shutdown_temp_vram_limit = "N/A"
2023-09-26 14:45:44 -05:00
logging.debug("Failed to get vram temperature shutdown metrics for gpu %s | %s", gpu_id, e.get_error_info())
2023-03-06 06:20:21 -06:00
2024-02-22 08:38:54 -06:00
# Assign units
power_unit = 'W'
temp_unit_human_readable = '\N{DEGREE SIGN}C'
temp_unit_json = 'C'
2024-04-04 10:06:47 -05:00
if not power_limit_error:
max_power_limit = self.helpers.unit_format(self.logger,
max_power_limit,
power_unit)
min_power_limit = self.helpers.unit_format(self.logger,
min_power_limit,
power_unit)
socket_power_limit = self.helpers.unit_format(self.logger,
socket_power_limit,
power_unit)
2023-03-06 06:20:21 -06:00
2024-04-04 10:06:47 -05:00
if self.logger.is_human_readable_format():
2023-09-18 18:11:55 -05:00
if not slowdown_temp_edge_limit_error:
slowdown_temp_edge_limit = f"{slowdown_temp_edge_limit} {temp_unit_human_readable}"
2023-09-18 18:11:55 -05:00
if not slowdown_temp_hotspot_limit_error:
slowdown_temp_hotspot_limit = f"{slowdown_temp_hotspot_limit} {temp_unit_human_readable}"
2023-09-18 18:11:55 -05:00
if not slowdown_temp_vram_limit_error:
slowdown_temp_vram_limit = f"{slowdown_temp_vram_limit} {temp_unit_human_readable}"
2023-09-18 18:11:55 -05:00
if not shutdown_temp_edge_limit_error:
shutdown_temp_edge_limit = f"{shutdown_temp_edge_limit} {temp_unit_human_readable}"
2023-09-18 18:11:55 -05:00
if not shutdown_temp_hotspot_limit_error:
shutdown_temp_hotspot_limit = f"{shutdown_temp_hotspot_limit} {temp_unit_human_readable}"
2023-09-18 18:11:55 -05:00
if not shutdown_temp_vram_limit_error:
shutdown_temp_vram_limit = f"{shutdown_temp_vram_limit} {temp_unit_human_readable}"
2024-04-04 10:06:47 -05:00
if self.logger.is_json_format():
if not slowdown_temp_edge_limit_error:
slowdown_temp_edge_limit = {"value" : slowdown_temp_edge_limit,
"unit" : temp_unit_json}
if not slowdown_temp_hotspot_limit_error:
slowdown_temp_hotspot_limit = {"value" : slowdown_temp_hotspot_limit,
"unit" : temp_unit_json}
if not slowdown_temp_vram_limit_error:
slowdown_temp_vram_limit = {"value" : slowdown_temp_vram_limit,
"unit" : temp_unit_json}
if not shutdown_temp_edge_limit_error:
shutdown_temp_edge_limit = {"value" : shutdown_temp_edge_limit,
"unit" : temp_unit_json}
if not shutdown_temp_hotspot_limit_error:
shutdown_temp_hotspot_limit = {"value" : shutdown_temp_hotspot_limit,
"unit" : temp_unit_json}
if not shutdown_temp_vram_limit_error:
shutdown_temp_vram_limit = {"value" : shutdown_temp_vram_limit,
"unit" : temp_unit_json}
2023-03-06 06:20:21 -06:00
2023-04-19 17:40:03 +02:00
limit_info = {}
2023-09-18 18:11:39 -05:00
# Power limits
limit_info['max_power'] = max_power_limit
2024-04-04 10:06:47 -05:00
limit_info['min_power'] = min_power_limit
limit_info['socket_power'] = socket_power_limit
2023-03-06 06:20:21 -06:00
2023-09-18 18:11:55 -05:00
# Shutdown limits
limit_info['slowdown_edge_temperature'] = slowdown_temp_edge_limit
limit_info['slowdown_hotspot_temperature'] = slowdown_temp_hotspot_limit
limit_info['slowdown_vram_temperature'] = slowdown_temp_vram_limit
limit_info['shutdown_edge_temperature'] = shutdown_temp_edge_limit
limit_info['shutdown_hotspot_temperature'] = shutdown_temp_hotspot_limit
limit_info['shutdown_vram_temperature'] = shutdown_temp_vram_limit
2023-04-19 17:40:03 +02:00
static_dict['limit'] = limit_info
2023-03-06 06:20:21 -06:00
if args.driver:
driver_info_dict = {"name" : "N/A",
"version" : "N/A"}
2023-09-26 19:13:31 -05:00
2023-03-28 15:32:17 -05:00
try:
2023-07-21 08:26:59 -05:00
driver_info = amdsmi_interface.amdsmi_get_gpu_driver_info(args.gpu)
driver_info_dict["name"] = driver_info["driver_name"]
driver_info_dict["version"] = driver_info["driver_version"]
2023-03-17 14:58:50 +01:00
except amdsmi_exception.AmdSmiLibraryException as e:
2023-09-26 14:45:44 -05:00
logging.debug("Failed to get driver info for gpu %s | %s", gpu_id, e.get_error_info())
static_dict['driver'] = driver_info_dict
if args.board:
static_dict['board'] = {"model_number": "N/A",
"product_serial": "N/A",
"fru_id": "N/A",
"product_name": "N/A",
"manufacturer_name": "N/A"}
try:
board_info = amdsmi_interface.amdsmi_get_gpu_board_info(args.gpu)
for key, value in board_info.items():
if isinstance(value, str):
if value.strip() == '':
board_info[key] = "N/A"
static_dict['board'] = board_info
except amdsmi_exception.AmdSmiLibraryException as e:
logging.debug("Failed to get board info for gpu %s | %s", gpu_id, e.get_error_info())
if 'ras' in current_platform_args:
if args.ras:
ras_dict = {"eeprom_version": "N/A",
"parity_schema" : "N/A",
"single_bit_schema" : "N/A",
"double_bit_schema" : "N/A",
"poison_schema" : "N/A",
"ecc_block_state": "N/A"}
try:
ras_info = amdsmi_interface.amdsmi_get_gpu_ras_feature_info(args.gpu)
for key, value in ras_info.items():
if isinstance(value, int):
if value == 65535:
logging.debug(f"Failed to get ras {key} for gpu {gpu_id}")
ras_info[key] = "N/A"
continue
if key != "eeprom_version":
if value:
ras_info[key] = "ENABLED"
else:
ras_info[key] = "DISABLED"
ras_dict.update(ras_info)
except amdsmi_exception.AmdSmiLibraryException as e:
logging.debug("Failed to get ras info for gpu %s | %s", gpu_id, e.get_error_info())
try:
ras_states = amdsmi_interface.amdsmi_get_gpu_ras_block_features_enabled(args.gpu)
ecc_block_state_dict = {}
for state in ras_states:
ecc_block_state_dict[state["block"]] = state["status"]
ras_dict["ecc_block_state"] = ecc_block_state_dict
except amdsmi_exception.AmdSmiLibraryException as e:
logging.debug("Failed to get ras block features for gpu %s | %s", gpu_id, e.get_error_info())
static_dict["ras"] = ras_dict
if 'partition' in current_platform_args:
if args.partition:
try:
compute_partition = amdsmi_interface.amdsmi_get_gpu_compute_partition(args.gpu)
except amdsmi_exception.AmdSmiLibraryException as e:
compute_partition = "N/A"
logging.debug("Failed to get compute partition info for gpu %s | %s", gpu_id, e.get_error_info())
try:
memory_partition = amdsmi_interface.amdsmi_get_gpu_memory_partition(args.gpu)
except amdsmi_exception.AmdSmiLibraryException as e:
memory_partition = "N/A"
logging.debug("Failed to get memory partition info for gpu %s | %s", gpu_id, e.get_error_info())
try:
2024-05-21 20:30:16 -05:00
kfd_info = amdsmi_interface.amdsmi_get_gpu_kfd_info(args.gpu)
partition_id = kfd_info['current_partition_id']
except amdsmi_exception.AmdSmiLibraryException as e:
partition_id = "N/A"
logging.debug("Failed to get partition ID for gpu %s | %s", gpu_id, e.get_error_info())
static_dict['partition'] = {"compute_partition": compute_partition,
"memory_partition": memory_partition,
"partition_id": partition_id}
if 'soc_pstate' in current_platform_args:
if args.soc_pstate:
2024-02-22 08:38:54 -06:00
try:
policy_info = amdsmi_interface.amdsmi_get_soc_pstate(args.gpu)
2024-02-22 08:38:54 -06:00
except amdsmi_exception.AmdSmiLibraryException as e:
policy_info = "N/A"
logging.debug("Failed to get soc pstate policy info for gpu %s | %s", gpu_id, e.get_error_info())
2024-03-05 14:01:06 -06:00
static_dict['soc_pstate'] = policy_info
2024-03-20 12:06:24 -05:00
if 'xgmi_plpd' in current_platform_args:
if args.xgmi_plpd:
try:
policy_info = amdsmi_interface.amdsmi_get_xgmi_plpd(args.gpu)
except amdsmi_exception.AmdSmiLibraryException as e:
policy_info = "N/A"
logging.debug("Failed to get xgmi_plpd info for gpu %s | %s", gpu_id, e.get_error_info())
static_dict['xgmi_plpd'] = policy_info
2024-04-08 10:35:24 -05:00
if 'process_isolation' in current_platform_args:
if args.process_isolation:
try:
status = amdsmi_interface.amdsmi_get_gpu_process_isolation(args.gpu)
status = "Enabled" if status else "Disabled"
except amdsmi_exception.AmdSmiLibraryException as e:
status = "N/A"
logging.debug("Failed to process isolation for gpu %s | %s", gpu_id, e.get_error_info())
static_dict['process_isolation'] = status
if 'numa' in current_platform_args:
if args.numa:
try:
numa_node_number = amdsmi_interface.amdsmi_topo_get_numa_node_number(args.gpu)
except amdsmi_exception.AmdSmiLibraryException as e:
numa_node_number = "N/A"
logging.debug("Failed to get numa node number for gpu %s | %s", gpu_id, e.get_error_info())
try:
numa_affinity = amdsmi_interface.amdsmi_get_gpu_topo_numa_affinity(args.gpu)
# -1 means No numa node is assigned to the GPU, so there is no numa affinity
if self.logger.is_human_readable_format() and numa_affinity == -1:
numa_affinity = "NONE"
except amdsmi_exception.AmdSmiLibraryException as e:
numa_affinity = "N/A"
logging.debug("Failed to get numa affinity for gpu %s | %s", gpu_id, e.get_error_info())
static_dict['numa'] = {'node' : numa_node_number,
'affinity' : numa_affinity}
2023-10-05 02:13:36 -05:00
if args.vram:
vram_info_dict = {"type" : "N/A",
"vendor" : "N/A",
"size" : "N/A",
"bit_width" : "N/A"}
2023-10-05 02:13:36 -05:00
try:
vram_info = amdsmi_interface.amdsmi_get_gpu_vram_info(args.gpu)
# Get vram type string
vram_type_enum = vram_info['vram_type']
2024-05-24 13:34:26 +02:00
if vram_type_enum == amdsmi_interface.amdsmi_wrapper.AMDSMI_VRAM_TYPE__MAX:
vram_type = "GDDR7"
2023-10-05 02:13:36 -05:00
else:
vram_type = amdsmi_interface.amdsmi_wrapper.amdsmi_vram_type_t__enumvalues[vram_type_enum]
# Remove amdsmi enum prefix
2024-05-24 13:34:26 +02:00
vram_type = vram_type.replace('AMDSMI_VRAM_TYPE_', '').replace('_', '')
2023-10-05 02:13:36 -05:00
# Get vram vendor string
vram_vendor_enum = vram_info['vram_vendor']
vram_vendor = amdsmi_interface.amdsmi_wrapper.amdsmi_vram_vendor_type_t__enumvalues[vram_vendor_enum]
if "PLACEHOLDER" in vram_vendor:
vram_vendor = "N/A"
else:
# Remove amdsmi enum prefix
vram_vendor = vram_vendor.replace('AMDSMI_VRAM_VENDOR__', '')
# Assign cleaned values to vram_info_dict
vram_info_dict['type'] = vram_type
vram_info_dict['vendor'] = vram_vendor
# Populate vram size with unit
2024-04-24 11:16:06 +02:00
vram_info_dict['size'] = vram_info['vram_size']
vram_size_unit = "MB"
2023-10-05 02:13:36 -05:00
if self.logger.is_human_readable_format():
2024-04-24 11:16:06 +02:00
vram_info_dict['size'] = f"{vram_info['vram_size']} {vram_size_unit}"
if self.logger.is_json_format():
2024-04-24 11:16:06 +02:00
vram_info_dict['size'] = {"value" : vram_info['vram_size'],
"unit" : vram_size_unit}
2023-10-05 02:13:36 -05:00
# Populate bit width
vram_info_dict['bit_width'] = vram_info['vram_bit_width']
2023-10-05 02:13:36 -05:00
except amdsmi_exception.AmdSmiLibraryException as e:
logging.debug("Failed to get vram info for gpu %s | %s", gpu_id, e.get_error_info())
static_dict['vram'] = vram_info_dict
2023-10-10 20:42:52 -05:00
if args.cache:
try:
2024-02-20 04:56:52 -06:00
cache_info_list = amdsmi_interface.amdsmi_get_gpu_cache_info(args.gpu)['cache']
logging.debug(f"cache_info dictionary = {cache_info_list}")
for index, cache_info in enumerate(cache_info_list):
new_cache_info = {"cache" : index}
new_cache_info.update(cache_info)
cache_info_list[index] = new_cache_info
2024-05-21 20:30:16 -05:00
logging.debug(f"[after update] cache_info_list = {cache_info_list}")
cache_size_unit = "KB"
2023-10-10 20:42:52 -05:00
if self.logger.is_human_readable_format():
2024-02-07 05:55:04 -06:00
cache_info_dict_format = {}
2024-02-20 04:56:52 -06:00
for cache_dict in cache_info_list:
2024-02-07 05:55:04 -06:00
cache_index = "cache_" + str(cache_dict["cache"])
cache_info_dict_format[cache_index] = cache_dict
# Remove cache index from new dictionary
cache_info_dict_format[cache_index].pop("cache")
# Add cache_size unit
cache_size = f"{cache_info_dict_format[cache_index]['cache_size']} {cache_size_unit}"
cache_info_dict_format[cache_index]["cache_size"] = cache_size
2024-02-07 05:55:04 -06:00
# take cache_properties out of list -> display as string, removing brackets
2024-02-07 05:55:04 -06:00
cache_info_dict_format[cache_index]["cache_properties"] = ", ".join(cache_info_dict_format[cache_index]["cache_properties"])
2024-02-20 04:56:52 -06:00
cache_info_list = cache_info_dict_format
2024-05-21 20:30:16 -05:00
logging.debug(f"[human readable] cache_info_list = {cache_info_list}")
2024-02-07 05:55:04 -06:00
# Add cache_size_unit to json output
if self.logger.is_json_format():
for cache_dict in cache_info_list:
cache_dict["cache_size"] = {"value" : cache_dict["cache_size"],
"unit" : cache_size_unit}
2023-10-10 20:42:52 -05:00
except amdsmi_exception.AmdSmiLibraryException as e:
2024-02-20 04:56:52 -06:00
cache_info_list = "N/A"
2023-10-10 20:42:52 -05:00
logging.debug("Failed to get cache info for gpu %s | %s", gpu_id, e.get_error_info())
2023-10-05 02:13:36 -05:00
2024-02-20 04:56:52 -06:00
static_dict['cache_info'] = cache_info_list
if 'clock' in current_platform_args:
if isinstance(args.clock, bool) and args.clock == True:
args.clock = ['sys', 'mem', 'df', 'soc', 'dcef', 'vclk0', 'vclk1', 'dclk0', 'dclk1']
if isinstance(args.clock, list):
# remove potential duplicates from list
args.clock = list(set(args.clock))
# check that clock is valid option
if "all" in args.clock or len(args.clock) == 0:
args.clock = ['sys', 'mem', 'df', 'soc', 'dcef', 'vclk0', 'vclk1', 'dclk0', 'dclk1']
clk_dict = {}
for clk in args.clock:
clk_type = clk.lower()
if clk_type == "sys":
clk_type_conversion = amdsmi_interface.AmdSmiClkType.SYS
elif clk_type == "mem":
clk_type_conversion = amdsmi_interface.AmdSmiClkType.MEM
elif clk_type == "df":
clk_type_conversion = amdsmi_interface.AmdSmiClkType.DF
elif clk_type == "soc":
clk_type_conversion = amdsmi_interface.AmdSmiClkType.SOC
elif clk_type == "dcef":
clk_type_conversion = amdsmi_interface.AmdSmiClkType.DCEF
# vclk and dclk currently do not support levels so average clk is given for frequency levels
elif clk_type == "vclk0":
clk_type_conversion = amdsmi_interface.AmdSmiClkType.VCLK0
elif clk_type == "vclk1":
clk_type_conversion = amdsmi_interface.AmdSmiClkType.VCLK1
elif clk_type == "dclk0":
clk_type_conversion = amdsmi_interface.AmdSmiClkType.DCLK0
elif clk_type == "dclk1":
clk_type_conversion = amdsmi_interface.AmdSmiClkType.DCLK1
else:
clk_type_conversion = "N/A"
output_format = self.helpers.get_output_format()
raise AmdSmiInvalidParameterException(clk_type, output_format) # clk type given is bad
try:
frequencies = amdsmi_interface.amdsmi_get_clk_freq(args.gpu, clk_type_conversion)
freq_dict = {}
freq_dict.update({'current level':frequencies['current']})
freq_dict.update({'frequency_levels':{}})
for level in range(len(frequencies['frequency'])):
freq = str(self.helpers.convert_SI_unit(frequencies['frequency'][level], AMDSMIHelpers.SI_Unit.MICRO)) + " MHz"
freq_dict['frequency_levels'].update({level:freq})
except amdsmi_exception.AmdSmiLibraryException as e:
freq_dict = "N/A"
clk_dict.update({clk:freq_dict})
static_dict['clock'] = clk_dict
else:
raise amdsmi_exception.AmdSmiParameterException(args.clock, list[str])
2023-03-06 06:20:21 -06:00
2023-04-21 08:02:53 -05:00
# Convert and store output by pid for csv format
multiple_devices_csv_override = False
2023-04-19 17:40:03 +02:00
if self.logger.is_csv_format():
2023-04-21 08:02:53 -05:00
# expand if ras blocks are populated
if self.helpers.is_linux() and self.helpers.is_baremetal() and args.ras:
2023-10-10 14:20:11 -05:00
if isinstance(static_dict['ras']['ecc_block_state'], list):
ecc_block_dicts = static_dict['ras'].pop('ecc_block_state')
2023-04-19 17:40:03 +02:00
multiple_devices_csv_override = True
2023-10-10 14:20:11 -05:00
for ecc_block_dict in ecc_block_dicts:
for key, value in ecc_block_dict.items():
2023-04-19 17:40:03 +02:00
self.logger.store_output(args.gpu, key, value)
self.logger.store_output(args.gpu, 'values', static_dict)
2023-04-19 17:40:03 +02:00
self.logger.store_multiple_device_output()
else:
# Store values if ras has an error
self.logger.store_output(args.gpu, 'values', static_dict)
if self.helpers.is_linux() and self.helpers.is_virtual_os():
2023-04-19 17:40:03 +02:00
self.logger.store_output(args.gpu, 'values', static_dict)
2023-04-21 08:02:53 -05:00
else:
self.logger.store_output(args.gpu, 'values', static_dict)
else:
# Store values in logger.output
self.logger.store_output(args.gpu, 'values', static_dict)
2023-03-06 06:20:21 -06:00
if multiple_devices:
self.logger.store_multiple_device_output()
return # Skip printing when there are multiple devices
2023-04-21 08:02:53 -05:00
self.logger.print_output(multiple_device_enabled=multiple_devices_csv_override)
2023-03-06 06:20:21 -06:00
def static(self, args, multiple_devices=False, gpu=None, asic=None,
           bus=None, vbios=None, limit=None, driver=None, ras=None,
           board=None, numa=None, vram=None, cache=None, partition=None,
           dfc_ucode=None, fb_info=None, num_vf=None, cpu=None,
           interface_ver=None, soc_pstate=None, xgmi_plpd=None, process_isolation=None,
           clock=None):
    """Get Static information for target gpu and cpu

    Dispatches to ``static_cpu`` and/or ``static_gpu`` depending on which
    drivers are initialized and on which devices/arguments were requested.

    Args:
        args (Namespace): Namespace containing the parsed CLI args
        multiple_devices (bool, optional): True if checking for multiple devices. Defaults to False.
        gpu (device_handle, optional): device_handle for target device. Defaults to None.
        asic (bool, optional): Value override for args.asic. Defaults to None.
        bus (bool, optional): Value override for args.bus. Defaults to None.
        vbios (bool, optional): Value override for args.vbios. Defaults to None.
        limit (bool, optional): Value override for args.limit. Defaults to None.
        driver (bool, optional): Value override for args.driver. Defaults to None.
        ras (bool, optional): Value override for args.ras. Defaults to None.
        board (bool, optional): Value override for args.board. Defaults to None.
        numa (bool, optional): Value override for args.numa. Defaults to None.
        vram (bool, optional): Value override for args.vram. Defaults to None.
        cache (bool, optional): Value override for args.cache. Defaults to None.
        partition (bool, optional): Value override for args.partition. Defaults to None.
        dfc_ucode (bool, optional): Value override for args.dfc_ucode. Defaults to None.
        fb_info (bool, optional): Value override for args.fb_info. Defaults to None.
        num_vf (bool, optional): Value override for args.num_vf. Defaults to None.
        cpu (cpu_handle, optional): cpu_handle for target device. Defaults to None.
        interface_ver (bool, optional): Value override for args.interface_ver. Defaults to None
        soc_pstate (bool, optional): Value override for args.soc_pstate. Defaults to None.
        xgmi_plpd (bool, optional): Value override for args.xgmi_plpd. Defaults to None.
        process_isolation (bool, optional): Value override for args.process_isolation. Defaults to None.
        clock (bool or list, optional): Value override for args.clock. Defaults to None.

    Raises:
        IndexError: Index error if gpu list is empty
            (NOTE(review): not raised here directly; presumably propagated
            from static_gpu - confirm)

    Returns:
        None: Print output via AMDSMILogger to destination
    """
    # Mutually exclusive arguments
    if cpu:
        args.cpu = cpu
    if gpu:
        args.gpu = gpu

    # Check if a CPU argument has been set
    cpu_attributes = ["smu", "interface_ver"]
    cpu_args_enabled = any(getattr(args, attr, False) for attr in cpu_attributes)

    # Check if a GPU argument has been set
    gpu_attributes = ["asic", "bus", "vbios", "limit", "driver", "ras",
                      "board", "numa", "vram", "cache", "partition",
                      "dfc_ucode", "fb_info", "num_vf", "soc_pstate", "xgmi_plpd",
                      "process_isolation", "clock"]
    gpu_args_enabled = any(getattr(args, attr, False) for attr in gpu_attributes)

    # static_gpu takes its overrides positionally. Build the argument list
    # once so every call site forwards the same values in the same order
    # (one call site used to drop xgmi_plpd, shifting the trailing values).
    gpu_overrides = (gpu, asic, bus, vbios, limit, driver, ras,
                     board, numa, vram, cache, partition,
                     dfc_ucode, fb_info, num_vf, soc_pstate, xgmi_plpd,
                     process_isolation, clock)

    # Handle CPU and GPU initialization cases
    if self.helpers.is_amd_hsmp_initialized() and self.helpers.is_amdgpu_initialized():
        # Print out all CPU and all GPU static info only if no device was specified.
        # If a GPU or CPU argument is provided only print out the specified device.
        if args.cpu is None and args.gpu is None:
            if not cpu_args_enabled and not gpu_args_enabled:
                args.cpu = self.cpu_handles
                args.gpu = self.device_handles

        # Handle cases where the user has only specified an argument and no specific device
        if args.gpu is None and gpu_args_enabled:
            args.gpu = self.device_handles
        if args.cpu is None and cpu_args_enabled:
            args.cpu = self.cpu_handles

        if args.cpu:
            self.static_cpu(args, multiple_devices, cpu, interface_ver)
        if args.gpu:
            # Reset logger state so GPU output starts from a clean slate
            self.logger.output = {}
            self.logger.clear_multiple_devices_ouput()
            self.static_gpu(args, multiple_devices, *gpu_overrides)
    elif self.helpers.is_amd_hsmp_initialized():  # Only CPU is initialized
        if args.cpu is None:
            args.cpu = self.cpu_handles
        self.static_cpu(args, multiple_devices, cpu, interface_ver)
    elif self.helpers.is_amdgpu_initialized():  # Only GPU is initialized
        if args.gpu is None:
            args.gpu = self.device_handles
        self.logger.clear_multiple_devices_ouput()
        self.static_gpu(args, multiple_devices, *gpu_overrides)
def firmware(self, args, multiple_devices=False, gpu=None, fw_list=True):
    """ Get Firmware information for target gpu

    Args:
        args (Namespace): Namespace containing the parsed CLI args
        multiple_devices (bool, optional): True if checking for multiple devices. Defaults to False.
        gpu (device_handle, optional): device_handle for target device. Defaults to None.
        fw_list (bool, optional): True to get list of all firmware information

    Raises:
        IndexError: Index error if gpu list is empty
            (NOTE(review): not raised here directly; presumably raised by
            helpers.handle_gpus - confirm)

    Returns:
        None: Print output via AMDSMILogger to destination
    """
    if gpu:
        args.gpu = gpu
    if fw_list:
        args.fw_list = fw_list

    # Handle No GPU passed
    if args.gpu is None:
        args.gpu = self.device_handles

    # Handle multiple GPUs
    handled_multiple_gpus, device_handle = self.helpers.handle_gpus(args, self.logger, self.firmware)
    if handled_multiple_gpus:
        return  # This function is recursive
    args.gpu = device_handle

    # Collected output; kept separate from the fw_list parameter
    fw_output = {}

    # Get gpu_id for logging
    gpu_id = self.helpers.get_gpu_id_from_device_handle(args.gpu)

    if args.fw_list:
        try:
            fw_info = amdsmi_interface.amdsmi_get_fw_info(args.gpu)

            for fw_index, fw_entry in enumerate(fw_info['fw_list']):
                # Change fw_name to fw_id
                fw_entry['fw_id'] = fw_entry.pop('fw_name').name.replace("AMDSMI_FW_ID_", "")
                fw_entry['fw_version'] = fw_entry.pop('fw_version')  # popping to ensure order

                # Add custom human readable formatting
                if self.logger.is_human_readable_format():
                    fw_info['fw_list'][fw_index] = {f'FW {fw_index}': fw_entry}
                else:
                    fw_info['fw_list'][fw_index] = fw_entry

            fw_output.update(fw_info)
        except amdsmi_exception.AmdSmiLibraryException as e:
            fw_output['fw_list'] = "N/A"
            logging.debug("Failed to get firmware info for gpu %s | %s", gpu_id, e.get_error_info())

    multiple_devices_csv_override = False
    fw_entries = fw_output.get('fw_list')
    # Convert and store output per firmware entry for csv format. Only a real
    # list can be expanded: on a library error the value is the string "N/A",
    # and when no firmware list was requested the key is absent.
    if self.logger.is_csv_format() and isinstance(fw_entries, list):
        for fw_info_dict in fw_entries:
            for key, value in fw_info_dict.items():
                multiple_devices_csv_override = True
                self.logger.store_output(args.gpu, key, value)
            self.logger.store_multiple_device_output()
    else:
        # Store values in logger.output
        self.logger.store_output(args.gpu, 'values', fw_output)

    if multiple_devices:
        self.logger.store_multiple_device_output()
        return  # Skip printing when there are multiple devices

    self.logger.print_output(multiple_device_enabled=multiple_devices_csv_override)
def bad_pages(self, args, multiple_devices=False, gpu=None, retired=None, pending=None, un_res=None):
    """ Get bad pages information for target gpu

    Args:
        args (Namespace): Namespace containing the parsed CLI args
        multiple_devices (bool, optional): True if checking for multiple devices. Defaults to False.
        gpu (device_handle, optional): device_handle for target device. Defaults to None.
        retired (bool, optional): Value override for args.retired. Defaults to None.
        pending (bool, optional): Value override for args.pending. Defaults to None.
        un_res (bool, optional): Value override for args.un_res. Defaults to None.

    Raises:
        IndexError: Index error if gpu list is empty
            (NOTE(review): not raised here directly; presumably raised by
            helpers.handle_gpus - confirm)

    Returns:
        None: Print output via AMDSMILogger to destination
    """
    # Set args.* to passed in arguments
    if gpu:
        args.gpu = gpu
    if retired:
        args.retired = retired
    if pending:
        args.pending = pending
    if un_res:
        args.un_res = un_res

    # Handle No GPU passed
    if args.gpu is None:
        args.gpu = self.device_handles

    # Handle multiple GPUs
    handled_multiple_gpus, device_handle = self.helpers.handle_gpus(args, self.logger, self.bad_pages)
    if handled_multiple_gpus:
        return  # This function is recursive
    args.gpu = device_handle

    # If all arguments are False, then print all bad_page information
    if not any([args.retired, args.pending, args.un_res]):
        args.retired = args.pending = args.un_res = True

    values_dict = {}

    # Get gpu_id for logging
    gpu_id = self.helpers.get_gpu_id_from_device_handle(args.gpu)
    bad_pages_not_found = "No bad pages found."

    try:
        bad_page_info = amdsmi_interface.amdsmi_get_gpu_bad_page_info(args.gpu)
        # If bad_page_info is an empty list overwrite with not found error statement
        if bad_page_info == []:
            bad_page_info = bad_pages_not_found
            bad_page_error = True
        else:
            bad_page_error = False
    except amdsmi_exception.AmdSmiLibraryException as e:
        bad_page_info = "N/A"
        bad_page_error = True
        logging.debug("Failed to get bad page info for gpu %s | %s", gpu_id, e.get_error_info())

    # Each output key reports the pages carrying one AmdSmiMemoryPageStatus member.
    page_categories = (("retired", "RESERVED"),
                       ("pending", "PENDING"),
                       ("un_res", "UNRESERVABLE"))
    status_prefix = "AMDSMI_MEM_PAGE_STATUS_"

    for category, status_name in page_categories:
        if not getattr(args, category):
            continue
        if bad_page_error:
            # Either the "N/A" placeholder or the "not found" statement
            values_dict[category] = bad_page_info
            continue

        wanted_status = getattr(amdsmi_interface.AmdSmiMemoryPageStatus, status_name)
        status_strings = amdsmi_interface.amdsmi_wrapper.amdsmi_memory_page_status_t__enumvalues
        matching_pages = [
            {"page_address": bad_page["page_address"],
             "page_size": bad_page["page_size"],
             "status": status_strings[bad_page["status"]].replace(status_prefix, "")}
            for bad_page in bad_page_info
            if bad_page["status"] == wanted_status
        ]

        if not matching_pages:
            values_dict[category] = bad_pages_not_found
        elif len(matching_pages) == 1:
            # Remove brackets if there is only one value
            values_dict[category] = matching_pages[0]
        else:
            values_dict[category] = matching_pages

    # Store values in logger.output
    self.logger.store_output(args.gpu, 'values', values_dict)

    if multiple_devices:
        self.logger.store_multiple_device_output()
        return  # Skip printing when there are multiple devices

    self.logger.print_output()
def metric_gpu(self, args, multiple_devices=False, watching_output=False, gpu=None,
2023-09-22 06:19:38 -05:00
usage=None, watch=None, watch_time=None, iterations=None, power=None,
clock=None, temperature=None, ecc=None, ecc_blocks=None, pcie=None,
fan=None, voltage_curve=None, overdrive=None, perf_level=None,
2023-10-27 09:39:31 -05:00
xgmi_err=None, energy=None, mem_usage=None, schedule=None,
2024-05-21 20:30:16 -05:00
guard=None, guest_data=None, fb_usage=None, xgmi=None, throttle=None):
2023-03-06 06:20:21 -06:00
"""Get Metric information for target gpu
Args:
args (Namespace): Namespace containing the parsed CLI args
multiple_devices (bool, optional): True if checking for multiple devices. Defaults to False.
watching_output (bool, optional): True if watch argument has been set. Defaults to False.
2023-03-06 06:20:21 -06:00
gpu (device_handle, optional): device_handle for target device. Defaults to None.
usage (bool, optional): Value override for args.usage. Defaults to None.
watch (Positive int, optional): Value override for args.watch. Defaults to None.
watch_time (Positive int, optional): Value override for args.watch_time. Defaults to None.
iterations (Positive int, optional): Value override for args.iterations. Defaults to None.
power (bool, optional): Value override for args.power. Defaults to None.
clock (bool, optional): Value override for args.clock. Defaults to None.
temperature (bool, optional): Value override for args.temperature. Defaults to None.
ecc (bool, optional): Value override for args.ecc. Defaults to None.
ecc_blocks (bool, optional): Value override for args.ecc. Defaults to None.
2023-03-06 06:20:21 -06:00
pcie (bool, optional): Value override for args.pcie. Defaults to None.
fan (bool, optional): Value override for args.fan. Defaults to None.
voltage_curve (bool, optional): Value override for args.voltage_curve. Defaults to None.
2023-03-06 06:20:21 -06:00
overdrive (bool, optional): Value override for args.overdrive. Defaults to None.
perf_level (bool, optional): Value override for args.perf_level. Defaults to None.
xgmi_err (bool, optional): Value override for args.xgmi_err. Defaults to None.
energy (bool, optional): Value override for args.energy. Defaults to None.
mem_usage (bool, optional): Value override for args.mem_usage. Defaults to None.
2023-10-27 09:39:31 -05:00
schedule (bool, optional): Value override for args.schedule. Defaults to None.
guard (bool, optional): Value override for args.guard. Defaults to None.
guest_data (bool, optional): Value override for args.guest_data. Defaults to None.
fb_usage (bool, optional): Value override for args.fb_usage. Defaults to None.
xgmi (bool, optional): Value override for args.xgmi. Defaults to None.
2024-05-21 20:30:16 -05:00
throttle (bool, optional): Value override for args.throttle. Defaults to None.
2023-03-06 06:20:21 -06:00
Raises:
IndexError: Index error if gpu list is empty
Returns:
None: Print output via AMDSMILogger to destination
"""
# Set args.* to passed in arguments
if gpu:
args.gpu = gpu
if watch:
args.watch = watch
if watch_time:
args.watch_time = watch_time
if iterations:
args.iterations = iterations
2023-10-27 09:39:31 -05:00
# Store args that are applicable to the current platform
current_platform_args = []
current_platform_values = []
if not self.helpers.is_hypervisor() and not self.helpers.is_windows():
2023-09-22 06:19:38 -05:00
if mem_usage:
args.mem_usage = mem_usage
2023-10-27 09:39:31 -05:00
current_platform_args += ["mem_usage"]
current_platform_values += [args.mem_usage]
2023-03-06 06:20:21 -06:00
if self.helpers.is_hypervisor() or self.helpers.is_baremetal() or self.helpers.is_linux():
2023-04-19 17:40:03 +02:00
if usage:
args.usage = usage
if power:
args.power = power
if clock:
args.clock = clock
if temperature:
args.temperature = temperature
if pcie:
args.pcie = pcie
2023-04-19 17:40:03 +02:00
if ecc:
args.ecc = ecc
2024-03-04 21:42:44 -06:00
if ecc_blocks:
args.ecc_blocks = ecc_blocks
current_platform_args += ["usage", "power", "clock", "temperature", "pcie", "ecc", "ecc_blocks"]
current_platform_values += [args.usage, args.power, args.clock,
args.temperature, args.pcie]
current_platform_values += [args.ecc, args.ecc_blocks]
2023-10-27 09:39:31 -05:00
if self.helpers.is_baremetal() and self.helpers.is_linux():
2023-04-19 17:40:03 +02:00
if fan:
args.fan = fan
if voltage_curve:
args.voltage_curve = voltage_curve
2023-04-19 17:40:03 +02:00
if overdrive:
args.overdrive = overdrive
if perf_level:
args.perf_level = perf_level
if xgmi_err:
args.xgmi_err = xgmi_err
if energy:
args.energy = energy
if throttle:
args.throttle = throttle
current_platform_args += ["fan", "voltage_curve", "overdrive", "perf_level",
"xgmi_err", "energy", "throttle"]
current_platform_values += [args.fan, args.voltage_curve, args.overdrive,
args.perf_level, args.xgmi_err, args.energy, args.throttle]
2023-10-27 09:39:31 -05:00
if self.helpers.is_hypervisor():
if schedule:
args.schedule = schedule
if guard:
args.guard = guard
if guest_data:
args.guest_data = guest_data
if fb_usage:
args.fb_usage = fb_usage
if xgmi:
args.xgmi = xgmi
current_platform_args += ["schedule", "guard", "guest_data", "fb_usage", "xgmi"]
current_platform_values += [args.schedule, args.guard, args.guest_data,
args.fb_usage, args.xgmi]
2023-04-19 17:40:03 +02:00
2023-03-06 06:20:21 -06:00
# Handle No GPU passed
2023-09-14 15:13:53 -05:00
if args.gpu == None:
2023-03-06 06:20:21 -06:00
args.gpu = self.device_handles
# Handle watch logic, will only enter this block once
if args.watch:
self.helpers.handle_watch(args=args, subcommand=self.metric_gpu, logger=self.logger)
2023-04-21 08:02:53 -05:00
return
2023-03-06 06:20:21 -06:00
# Handle multiple GPUs
if isinstance(args.gpu, list):
if len(args.gpu) > 1:
2023-04-21 08:02:53 -05:00
# Deepcopy gpus as recursion will destroy the gpu list
stored_gpus = []
for gpu in args.gpu:
stored_gpus.append(gpu)
# Store output from multiple devices
2023-03-06 06:20:21 -06:00
for device_handle in args.gpu:
self.metric_gpu(args, multiple_devices=True, watching_output=watching_output, gpu=device_handle)
2023-04-21 08:02:53 -05:00
# Reload original gpus
args.gpu = stored_gpus
2023-03-28 15:32:17 -05:00
2023-04-21 08:02:53 -05:00
# Print multiple device output
self.logger.print_output(multiple_device_enabled=True, watching_output=watching_output)
# Add output to total watch output and clear multiple device output
2023-03-06 06:20:21 -06:00
if watching_output:
2023-04-21 08:02:53 -05:00
self.logger.store_watch_output(multiple_device_enabled=True)
# Flush the watching output
self.logger.print_output(multiple_device_enabled=True, watching_output=watching_output)
2023-03-06 06:20:21 -06:00
return
2023-03-28 15:32:17 -05:00
elif len(args.gpu) == 1:
2023-03-06 06:20:21 -06:00
args.gpu = args.gpu[0]
else:
raise IndexError("args.gpu should not be an empty list")
2023-09-24 16:12:30 -05:00
# Get gpu_id for logging
gpu_id = self.helpers.get_gpu_id_from_device_handle(args.gpu)
2023-10-27 09:39:31 -05:00
if args.loglevel == "DEBUG":
try:
# Get GPU Metrics table version
gpu_metric_version_info = amdsmi_interface.amdsmi_get_gpu_metrics_header_info(args.gpu)
gpu_metric_version_str = json.dumps(gpu_metric_version_info, indent=4)
logging.debug("GPU Metrics table Version for GPU %s | %s", gpu_id, gpu_metric_version_str)
2024-05-21 20:30:16 -05:00
except amdsmi_exception.AmdSmiLibraryException as e:
logging.debug("Unable to load GPU Metrics table version for %s | %s", gpu_id, e.err_info)
2024-05-21 20:30:16 -05:00
try:
# Get GPU Metrics table
gpu_metric_debug_info = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)
gpu_metric_str = json.dumps(gpu_metric_debug_info, indent=4)
2024-05-21 20:30:16 -05:00
logging.debug("GPU Metrics table for GPU %s | %s", gpu_id, str(gpu_metric_str))
except amdsmi_exception.AmdSmiLibraryException as e:
2024-05-21 20:30:16 -05:00
logging.debug("Unable to load GPU Metrics table for %s | %s", gpu_id, e.err_info)
2023-09-24 16:12:30 -05:00
2023-10-27 09:39:31 -05:00
logging.debug(f"Metric Arg information for GPU {gpu_id} on {self.helpers.os_info()}")
logging.debug(f"Args: {current_platform_args}")
logging.debug(f"Values: {current_platform_values}")
2023-10-27 09:39:31 -05:00
# Set the platform applicable args to True if no args are set
if not any(current_platform_values):
for arg in current_platform_args:
setattr(args, arg, True)
# Add timestamp and store values for specified arguments
values_dict = {}
2024-05-21 20:30:16 -05:00
# Get the GPU metrics info only once per GPU; this speeds up data output
try:
# Get GPU Metrics table
gpu_metric = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)
except amdsmi_exception.AmdSmiLibraryException as e:
logging.debug("Unable to load GPU Metrics table for %s | %s", gpu_id, e.err_info)
# Populate the pcie_dict first due to multiple gpu metrics calls incorrectly increasing bandwidth
if "pcie" in current_platform_args:
if args.pcie:
pcie_dict = {"width": "N/A",
"speed": "N/A",
"bandwidth": "N/A",
"replay_count" : "N/A",
"l0_to_recovery_count" : "N/A",
"replay_roll_over_count" : "N/A",
"nak_sent_count" : "N/A",
"nak_received_count" : "N/A",
"current_bandwidth_sent": "N/A",
"current_bandwidth_received": "N/A",
2024-05-21 20:30:16 -05:00
"max_packet_size": "N/A",
"lc_perf_other_end_recovery": "N/A"}
try:
pcie_metric = amdsmi_interface.amdsmi_get_pcie_info(args.gpu)['pcie_metric']
logging.debug("PCIE Metric for %s | %s", gpu_id, pcie_metric)
pcie_dict['width'] = pcie_metric['pcie_width']
if pcie_metric['pcie_speed'] != "N/A":
if pcie_metric['pcie_speed'] % 1000 != 0:
pcie_speed_GTs_value = round(pcie_metric['pcie_speed'] / 1000, 1)
else:
pcie_speed_GTs_value = round(pcie_metric['pcie_speed'] / 1000)
pcie_dict['speed'] = pcie_speed_GTs_value
pcie_dict['bandwidth'] = pcie_metric['pcie_bandwidth']
pcie_dict['replay_count'] = pcie_metric['pcie_replay_count']
pcie_dict['l0_to_recovery_count'] = pcie_metric['pcie_l0_to_recovery_count']
pcie_dict['replay_roll_over_count'] = pcie_metric['pcie_replay_roll_over_count']
pcie_dict['nak_received_count'] = pcie_metric['pcie_nak_received_count']
pcie_dict['nak_sent_count'] = pcie_metric['pcie_nak_sent_count']
2024-05-21 20:30:16 -05:00
pcie_dict['lc_perf_other_end_recovery'] = pcie_metric['pcie_lc_perf_other_end_recovery_count']
pcie_speed_unit = 'GT/s'
pcie_bw_unit = 'Mb/s'
if self.logger.is_human_readable_format():
if pcie_dict['speed'] != "N/A":
pcie_dict['speed'] = f"{pcie_dict['speed']} {pcie_speed_unit}"
if pcie_dict['bandwidth'] != "N/A":
pcie_dict['bandwidth'] = f"{pcie_dict['bandwidth']} {pcie_bw_unit}"
if self.logger.is_json_format():
if pcie_dict['speed'] != "N/A":
pcie_dict['speed'] = {"value" : pcie_dict['speed'],
"unit" : pcie_speed_unit}
if pcie_dict['bandwidth'] != "N/A":
pcie_dict['bandwidth'] = {"value" : pcie_dict['bandwidth'],
"unit" : pcie_bw_unit}
except amdsmi_exception.AmdSmiLibraryException as e:
logging.debug("Failed to get pcie link status for gpu %s | %s", gpu_id, e.get_error_info())
try:
pcie_bw = amdsmi_interface.amdsmi_get_gpu_pci_throughput(args.gpu)
sent = pcie_bw['sent'] * pcie_bw['max_pkt_sz']
received = pcie_bw['received'] * pcie_bw['max_pkt_sz']
bw_unit = "Mb/s"
packet_size_unit = "B"
if sent > 0:
sent = sent // 1024 // 1024
if received > 0:
received = received // 1024 // 1024
if self.logger.is_human_readable_format():
sent = f"{sent} {bw_unit}"
received = f"{received} {bw_unit}"
pcie_bw['max_pkt_sz'] = f"{pcie_bw['max_pkt_sz']} {packet_size_unit}"
if self.logger.is_json_format():
sent = {"value" : sent,
"unit" : bw_unit}
received = {"value" : received,
"unit" : bw_unit}
pcie_bw['max_pkt_sz'] = {"value" : pcie_bw['max_pkt_sz'],
"unit" : packet_size_unit}
pcie_dict['current_bandwidth_sent'] = sent
pcie_dict['current_bandwidth_received'] = received
pcie_dict['max_packet_size'] = pcie_bw['max_pkt_sz']
except amdsmi_exception.AmdSmiLibraryException as e:
logging.debug("Failed to get pcie bandwidth for gpu %s | %s", gpu_id, e.get_error_info())
2023-10-27 09:39:31 -05:00
if "usage" in current_platform_args:
2023-04-19 17:40:03 +02:00
if args.usage:
try:
engine_usage = amdsmi_interface.amdsmi_get_gpu_activity(args.gpu)
2024-05-21 20:30:16 -05:00
logging.debug(f"engine_usage dictionary = {engine_usage}")
# TODO: move vcn_activity and jpeg_activity into amdsmi_get_gpu_activity
2024-05-21 20:30:16 -05:00
engine_usage['vcn_activity'] = gpu_metric['vcn_activity']
engine_usage['jpeg_activity'] = gpu_metric['jpeg_activity']
num_partition = gpu_metric['num_partition']
engine_usage['gfx_busy_inst'] = "N/A"
engine_usage['jpeg_busy'] = "N/A"
engine_usage['vcn_busy'] = "N/A"
if num_partition != "N/A":
# these are one after another, in order to display each in sub-sections
new_xcp_dict = {}
for current_xcp in range(num_partition):
new_xcp_dict[f"xcp_{current_xcp}"] = gpu_metric['xcp_stats.gfx_busy_inst'][current_xcp]
engine_usage['gfx_busy_inst'] = new_xcp_dict
new_xcp_dict = {}
for current_xcp in range(num_partition):
new_xcp_dict[f"xcp_{current_xcp}"] = gpu_metric['xcp_stats.jpeg_busy'][current_xcp]
engine_usage['jpeg_busy'] = new_xcp_dict
new_xcp_dict = {}
for current_xcp in range(num_partition):
new_xcp_dict[f"xcp_{current_xcp}"] = gpu_metric['xcp_stats.vcn_busy'][current_xcp]
engine_usage['vcn_busy'] = new_xcp_dict
logging.debug(f"After updates to engine_usage dictionary = {engine_usage}")
for key, value in engine_usage.items():
activity_unit = '%'
if self.logger.is_human_readable_format():
2023-12-06 03:25:38 -06:00
if isinstance(value, list):
2024-01-30 20:15:11 -06:00
for index, activity in enumerate(value):
if activity != "N/A":
engine_usage[key][index] = f"{activity} {activity_unit}"
2024-01-30 20:15:11 -06:00
# Convert list to a string for human readable format
engine_usage[key] = '[' + ", ".join(engine_usage[key]) + ']'
2024-05-21 20:30:16 -05:00
elif isinstance(value, dict):
for k, v in value.items():
for index, activity in enumerate(v):
if activity != "N/A":
value[k][index] = f"{activity} {activity_unit}"
# Convert list to a string for human readable format
value[k] = '[' + ", ".join(value[k]) + ']'
2024-01-30 20:15:11 -06:00
elif value != "N/A":
engine_usage[key] = f"{value} {activity_unit}"
if self.logger.is_json_format():
if isinstance(value, list):
for index, activity in enumerate(value):
if activity != "N/A":
engine_usage[key][index] = {"value" : activity,
"unit" : activity_unit}
2024-05-21 20:30:16 -05:00
elif isinstance(value, dict):
for k, v in value.items():
for index, activity in enumerate(v):
if activity != "N/A":
value[k][index] = {"value" : activity,
"unit" : activity_unit}
elif value != "N/A":
engine_usage[key] = {"value" : value,
"unit" : activity_unit}
2023-03-06 06:20:21 -06:00
2023-04-19 17:40:03 +02:00
values_dict['usage'] = engine_usage
2024-05-21 20:30:16 -05:00
except Exception as e:
values_dict['usage'] = "N/A"
2024-05-21 20:30:16 -05:00
logging.debug("Failed to get gpu activity for gpu %s | %s", gpu_id, e)
2023-10-27 09:39:31 -05:00
if "power" in current_platform_args:
2023-04-19 17:40:03 +02:00
if args.power:
power_dict = {'socket_power': "N/A",
'gfx_voltage': "N/A",
'soc_voltage': "N/A",
'mem_voltage': "N/A",
2024-06-06 16:17:49 -05:00
'throttle_status': "N/A",
'power_management': "N/A"}
2023-09-20 14:02:45 -05:00
try:
voltage_unit = "mV"
power_unit = "W"
power_info = amdsmi_interface.amdsmi_get_power_info(args.gpu)
for key, value in power_info.items():
2023-11-22 03:32:55 -06:00
if value == 0xFFFF:
power_info[key] = "N/A"
2024-04-04 10:06:47 -05:00
elif "voltage" in key:
power_info[key] = self.helpers.unit_format(self.logger,
value,
voltage_unit)
elif "power" in key:
if ((key == "current_socket_power" or key == "average_socket_power")
and value != "N/A"):
power_dict['socket_power'] = self.helpers.unit_format(self.logger,
value,
power_unit)
power_info[key] = self.helpers.unit_format(self.logger,
value,
power_unit)
2023-11-22 03:32:55 -06:00
power_dict['gfx_voltage'] = power_info['gfx_voltage']
power_dict['soc_voltage'] = power_info['soc_voltage']
power_dict['mem_voltage'] = power_info['mem_voltage']
except amdsmi_exception.AmdSmiLibraryException as e:
2023-09-26 14:45:44 -05:00
logging.debug("Failed to get power info for gpu %s | %s", gpu_id, e.get_error_info())
2023-09-20 14:02:45 -05:00
try:
is_power_management_enabled = amdsmi_interface.amdsmi_is_gpu_power_management_enabled(args.gpu)
if is_power_management_enabled:
power_dict['power_management'] = "ENABLED"
else:
power_dict['power_management'] = "DISABLED"
except amdsmi_exception.AmdSmiLibraryException as e:
2023-09-26 14:45:44 -05:00
logging.debug("Failed to get power management status for gpu %s | %s", gpu_id, e.get_error_info())
2023-09-20 14:02:45 -05:00
2023-11-22 03:32:55 -06:00
try:
power_dict['throttle_status'] = "N/A"
2024-05-21 20:30:16 -05:00
throttle_status = gpu_metric['throttle_status']
if throttle_status != "N/A":
if throttle_status:
power_dict['throttle_status'] = "THROTTLED"
else:
power_dict['throttle_status'] = "UNTHROTTLED"
2024-05-21 20:30:16 -05:00
except Exception as e:
logging.debug("Failed to get throttle status for gpu %s | %s", gpu_id, e)
2023-11-22 03:32:55 -06:00
2023-04-19 17:40:03 +02:00
values_dict['power'] = power_dict
2023-10-27 09:39:31 -05:00
if "clock" in current_platform_args:
2023-04-19 17:40:03 +02:00
if args.clock:
# Populate Skeleton output with N/A
clocks = {}
2023-03-06 06:20:21 -06:00
for clock_index in range(amdsmi_interface.AMDSMI_MAX_NUM_GFX_CLKS):
gfx_index = f"gfx_{clock_index}"
clocks[gfx_index] = {"clk" : "N/A",
"min_clk" : "N/A",
"max_clk" : "N/A",
"clk_locked" : "N/A",
"deep_sleep" : "N/A"}
clocks["mem_0"] = {"clk" : "N/A",
"min_clk" : "N/A",
"max_clk" : "N/A",
"clk_locked" : "N/A",
"deep_sleep" : "N/A"}
for clock_index in range(amdsmi_interface.AMDSMI_MAX_NUM_CLKS):
vclk_index = f"vclk_{clock_index}"
clocks[vclk_index] = {"clk" : "N/A",
"min_clk" : "N/A",
"max_clk" : "N/A",
"clk_locked" : "N/A",
"deep_sleep" : "N/A"}
for clock_index in range(amdsmi_interface.AMDSMI_MAX_NUM_CLKS):
dclk_index = f"dclk_{clock_index}"
clocks[dclk_index] = {"clk" : "N/A",
"min_clk" : "N/A",
"max_clk" : "N/A",
"clk_locked" : "N/A",
"deep_sleep" : "N/A"}
clocks["fclk_0"] = {"clk" : "N/A",
"min_clk" : "N/A",
"max_clk" : "N/A",
"clk_locked" : "N/A",
"deep_sleep" : "N/A"}
clocks["socclk_0"] = {"clk" : "N/A",
"min_clk" : "N/A",
"max_clk" : "N/A",
"clk_locked" : "N/A",
"deep_sleep" : "N/A"}
clock_unit = "MHz"
# TODO make the deepsleep threshold correspond to the * in sysfs for current deep sleep status
deep_sleep_threshold = 140
# Populate clock values from gpu_metrics_info
try:
# Populate GFX clock values
2024-05-21 20:30:16 -05:00
current_gfx_clocks = gpu_metric["current_gfxclks"]
for clock_index, current_gfx_clock in enumerate(current_gfx_clocks):
# If the current clock is N/A then nothing else applies
if current_gfx_clock == "N/A":
continue
gfx_index = f"gfx_{clock_index}"
clocks[gfx_index]["clk"] = self.helpers.unit_format(self.logger,
current_gfx_clock,
clock_unit)
# Populate clock locked status
2024-05-21 20:30:16 -05:00
if gpu_metric["gfxclk_lock_status"] != "N/A":
gfx_clock_lock_flag = 1 << clock_index # This is the position of the clock lock flag
2024-05-21 20:30:16 -05:00
if gpu_metric["gfxclk_lock_status"] & gfx_clock_lock_flag:
clocks[gfx_index]["clk_locked"] = "ENABLED"
else:
clocks[gfx_index]["clk_locked"] = "DISABLED"
# Populate deep sleep status
if int(current_gfx_clock) <= deep_sleep_threshold:
clocks[gfx_index]["deep_sleep"] = "ENABLED"
else:
clocks[gfx_index]["deep_sleep"] = "DISABLED"
2023-09-23 14:09:38 -05:00
# Populate MEM clock value
2024-05-21 20:30:16 -05:00
current_mem_clock = gpu_metric["current_uclk"] # single value
if current_mem_clock != "N/A":
clocks["mem_0"]["clk"] = self.helpers.unit_format(self.logger,
current_mem_clock,
clock_unit)
if int(current_mem_clock) <= deep_sleep_threshold:
clocks["mem_0"]["deep_sleep"] = "ENABLED"
else:
clocks["mem_0"]["deep_sleep"] = "DISABLED"
# Populate VCLK clock values
2024-05-21 20:30:16 -05:00
current_vclk_clocks = gpu_metric["current_vclk0s"]
for clock_index, current_vclk_clock in enumerate(current_vclk_clocks):
# If the current clock is N/A then nothing else applies
if current_vclk_clock == "N/A":
continue
vclk_index = f"vclk_{clock_index}"
clocks[vclk_index]["clk"] = self.helpers.unit_format(self.logger,
current_vclk_clock,
clock_unit)
if int(current_vclk_clock) <= deep_sleep_threshold:
clocks[vclk_index]["deep_sleep"] = "ENABLED"
2023-11-22 03:32:55 -06:00
else:
clocks[vclk_index]["deep_sleep"] = "DISABLED"
# Populate DCLK clock values
2024-05-21 20:30:16 -05:00
current_dclk_clocks = gpu_metric["current_dclk0s"]
for clock_index, current_dclk_clock in enumerate(current_dclk_clocks):
# If the current clock is N/A then nothing else applies
if current_dclk_clock == "N/A":
continue
dclk_index = f"dclk_{clock_index}"
clocks[dclk_index]["clk"] = self.helpers.unit_format(self.logger,
current_dclk_clock,
clock_unit)
if int(current_dclk_clock) <= deep_sleep_threshold:
clocks[dclk_index]["deep_sleep"] = "ENABLED"
else:
clocks[dclk_index]["deep_sleep"] = "DISABLED"
# Populate FCLK clock value; fclk not present in gpu_metrics so use amdsmi_get_clk_freq
frequency_dict = amdsmi_interface.amdsmi_get_clk_freq(args.gpu, amdsmi_interface.AmdSmiClkType.DF)
current_fclk_clock = frequency_dict['frequency'][frequency_dict['current']]
current_fclk_clock = self.helpers.convert_SI_unit(current_fclk_clock, self.helpers.SI_Unit.MICRO)
clocks["fclk_0"]["clk"] = self.helpers.unit_format(self.logger,
current_fclk_clock,
clock_unit)
if int(current_fclk_clock) <= deep_sleep_threshold:
clocks["fclk_0"]["deep_sleep"] = "ENABLED"
else:
clocks["fclk_0"]["deep_sleep"] = "DISABLED"
# Populate SOCCLK clock value
current_socclk_clock = gpu_metric["current_socclk"]
clocks["socclk_0"]["clk"] = self.helpers.unit_format(self.logger,
current_socclk_clock,
clock_unit)
if int(current_socclk_clock) <= deep_sleep_threshold:
clocks["socclk_0"]["deep_sleep"] = "ENABLED"
else:
clocks["socclk_0"]["deep_sleep"] = "DISABLED"
2024-05-21 20:30:16 -05:00
except Exception as e:
logging.debug("Failed to get gpu_metrics_info for gpu %s | %s", gpu_id, e)
2023-09-24 16:12:30 -05:00
# Populate the max and min clock values from sysfs
# Min and Max values are per clock type, not per clock engine
# GFX min and max clocks
try:
gfx_clock_info_dict = amdsmi_interface.amdsmi_get_clock_info(args.gpu,
amdsmi_interface.AmdSmiClkType.GFX)
for clock_index in range(amdsmi_interface.AMDSMI_MAX_NUM_GFX_CLKS):
gfx_index = f"gfx_{clock_index}"
if clocks[gfx_index]["clk"] == "N/A":
# if the current clock is N/A then we shouldn't populate the max and min values
continue
clocks[gfx_index]["min_clk"] = self.helpers.unit_format(self.logger,
gfx_clock_info_dict["min_clk"],
clock_unit)
clocks[gfx_index]["max_clk"] = self.helpers.unit_format(self.logger,
gfx_clock_info_dict["max_clk"],
clock_unit)
except amdsmi_exception.AmdSmiLibraryException as e:
logging.debug("Failed to get gfx clock info for gpu %s | %s", gpu_id, e.get_error_info())
# MEM min and max clocks
try:
mem_clock_info_dict = amdsmi_interface.amdsmi_get_clock_info(args.gpu,
amdsmi_interface.AmdSmiClkType.MEM)
# if the current clock is N/A then we shouldn't populate the max and min values
if clocks["mem_0"]["clk"] != "N/A":
clocks["mem_0"]["min_clk"] = self.helpers.unit_format(self.logger,
mem_clock_info_dict["min_clk"],
clock_unit)
clocks["mem_0"]["max_clk"] = self.helpers.unit_format(self.logger,
mem_clock_info_dict["max_clk"],
clock_unit)
except amdsmi_exception.AmdSmiLibraryException as e:
logging.debug("Failed to get mem clock info for gpu %s | %s", gpu_id, e.get_error_info())
# VCLK & DCLK min and max clocks
try:
vclk0_clock_info_dict = amdsmi_interface.amdsmi_get_clock_info(args.gpu,
amdsmi_interface.AmdSmiClkType.VCLK0)
dclk0_clock_info_dict = amdsmi_interface.amdsmi_get_clock_info(args.gpu,
amdsmi_interface.AmdSmiClkType.DCLK0)
for clock_index in range(amdsmi_interface.AMDSMI_MAX_NUM_CLKS):
vclk_index = f"vclk_{clock_index}"
# if the current clock is N/A then we shouldn't populate the max and min values
if clocks[vclk_index]["clk"] != "N/A":
clocks[vclk_index]["min_clk"] = self.helpers.unit_format(self.logger,
vclk0_clock_info_dict["min_clk"],
clock_unit)
clocks[vclk_index]["max_clk"] = self.helpers.unit_format(self.logger,
vclk0_clock_info_dict["max_clk"],
clock_unit)
dclk_index = f"dclk_{clock_index}"
if clocks[dclk_index]["clk"] != "N/A":
clocks[dclk_index]["min_clk"] = self.helpers.unit_format(self.logger,
dclk0_clock_info_dict["min_clk"],
clock_unit)
clocks[dclk_index]["max_clk"] = self.helpers.unit_format(self.logger,
dclk0_clock_info_dict["max_clk"],
clock_unit)
except amdsmi_exception.AmdSmiLibraryException as e:
logging.debug("Failed to get vclk and/or dclk clock info for gpu %s | %s", gpu_id, e.get_error_info())
2023-09-23 14:09:38 -05:00
# FCLK min and max clocks
try:
fclk_clk_info_dict = amdsmi_interface.amdsmi_get_clock_info(args.gpu,
amdsmi_interface.AmdSmiClkType.DF)
# if the current clock is N/A then we shouldn't populate the max and min values
if clocks["fclk_0"]["clk"] != "N/A":
clocks["fclk_0"]["min_clk"] = self.helpers.unit_format(self.logger,
fclk_clk_info_dict["min_clk"],
clock_unit)
clocks["fclk_0"]["max_clk"] = self.helpers.unit_format(self.logger,
fclk_clk_info_dict["max_clk"],
clock_unit)
except amdsmi_exception.AmdSmiLibraryException as e:
logging.debug("Failed to get fclk info for gpu %s | %s", gpu_id, e.get_error_info())
# SOCCLK min and max clocks
try:
socclk_clk_info_dict = amdsmi_interface.amdsmi_get_clock_info(args.gpu,
amdsmi_interface.AmdSmiClkType.SOC)
# if the current clock is N/A then we shouldn't populate the max and min values
if clocks["socclk_0"]["clk"] != "N/A":
clocks["socclk_0"]["min_clk"] = self.helpers.unit_format(self.logger,
socclk_clk_info_dict["min_clk"],
clock_unit)
clocks["socclk_0"]["max_clk"] = self.helpers.unit_format(self.logger,
socclk_clk_info_dict["max_clk"],
clock_unit)
except amdsmi_exception.AmdSmiLibraryException as e:
logging.debug("Failed to get socclk info for gpu %s | %s", gpu_id, e.get_error_info())
2023-09-23 14:09:38 -05:00
values_dict['clock'] = clocks
2023-10-27 09:39:31 -05:00
if "temperature" in current_platform_args:
2023-04-19 17:40:03 +02:00
if args.temperature:
try:
temperature_edge_current = amdsmi_interface.amdsmi_get_temp_metric(
2023-04-19 17:40:03 +02:00
args.gpu, amdsmi_interface.AmdSmiTemperatureType.EDGE, amdsmi_interface.AmdSmiTemperatureMetric.CURRENT)
2023-09-14 15:13:53 -05:00
except amdsmi_exception.AmdSmiLibraryException as e:
temperature_edge_current = "N/A"
2023-09-26 14:45:44 -05:00
logging.debug("Failed to get current edge temperature for gpu %s | %s", gpu_id, e.get_error_info())
2023-09-14 15:13:53 -05:00
try:
temperature_edge_limit = amdsmi_interface.amdsmi_get_temp_metric(
args.gpu, amdsmi_interface.AmdSmiTemperatureType.EDGE, amdsmi_interface.AmdSmiTemperatureMetric.CRITICAL)
2023-09-14 15:13:53 -05:00
except amdsmi_exception.AmdSmiLibraryException as e:
temperature_edge_limit = "N/A"
2023-09-26 14:45:44 -05:00
logging.debug("Failed to get edge temperature limit for gpu %s | %s", gpu_id, e.get_error_info())
# If edge limit is reporting 0 then set the current edge temp to N/A
if temperature_edge_limit == 0:
temperature_edge_current = "N/A"
2023-09-14 15:13:53 -05:00
try:
2023-09-13 09:45:33 -05:00
temperature_hotspot_current = amdsmi_interface.amdsmi_get_temp_metric(
args.gpu, amdsmi_interface.AmdSmiTemperatureType.HOTSPOT, amdsmi_interface.AmdSmiTemperatureMetric.CURRENT)
2023-09-14 15:13:53 -05:00
except amdsmi_exception.AmdSmiLibraryException as e:
temperature_hotspot_current = "N/A"
2023-09-26 14:45:44 -05:00
logging.debug("Failed to get current hotspot temperature for gpu %s | %s", gpu_id, e.get_error_info())
2023-09-14 15:13:53 -05:00
try:
temperature_vram_current = amdsmi_interface.amdsmi_get_temp_metric(
2023-04-19 17:40:03 +02:00
args.gpu, amdsmi_interface.AmdSmiTemperatureType.VRAM, amdsmi_interface.AmdSmiTemperatureMetric.CURRENT)
2023-09-14 15:13:53 -05:00
except amdsmi_exception.AmdSmiLibraryException as e:
temperature_vram_current = "N/A"
2023-09-26 14:45:44 -05:00
logging.debug("Failed to get current vram temperature for gpu %s | %s", gpu_id, e.get_error_info())
2023-09-14 15:13:53 -05:00
temperatures = {'edge': temperature_edge_current,
'hotspot': temperature_hotspot_current,
'mem': temperature_vram_current}
2023-03-06 06:20:21 -06:00
temp_unit_human_readable = '\N{DEGREE SIGN}C'
temp_unit_json = 'C'
for temperature_key, temperature_value in temperatures.items():
if 'N/A' not in str(temperature_value):
if self.logger.is_human_readable_format():
temperatures[temperature_key] = f"{temperature_value} {temp_unit_human_readable}"
if self.logger.is_json_format():
temperatures[temperature_key] = {"value" : temperature_value,
"unit" : temp_unit_json}
2023-03-06 06:20:21 -06:00
2023-09-14 15:13:53 -05:00
values_dict['temperature'] = temperatures
# Since pcie bw may increase based on frequent metrics calls, the values were populated first (above); we only add them to the output here
2023-10-27 09:39:31 -05:00
if "pcie" in current_platform_args:
2023-04-19 17:40:03 +02:00
if args.pcie:
2023-09-22 06:34:47 -05:00
values_dict['pcie'] = pcie_dict
if "ecc" in current_platform_args:
if args.ecc:
ecc_count = {}
try:
ecc_count = amdsmi_interface.amdsmi_get_gpu_total_ecc_count(args.gpu)
ecc_count['total_correctable_count'] = ecc_count.pop('correctable_count')
ecc_count['total_uncorrectable_count'] = ecc_count.pop('uncorrectable_count')
ecc_count['total_deferred_count'] = ecc_count.pop('deferred_count')
except amdsmi_exception.AmdSmiLibraryException as e:
ecc_count['total_correctable_count'] = "N/A"
ecc_count['total_uncorrectable_count'] = "N/A"
ecc_count['cache_correctable_count'] = "N/A"
ecc_count['cache_uncorrectable_count'] = "N/A"
logging.debug("Failed to get total ecc count for gpu %s | %s", gpu_id, e.get_error_info())
if ecc_count['total_correctable_count'] != "N/A":
# Get the UMC error count for getting total cache correctable errors
umc_block = amdsmi_interface.AmdSmiGpuBlock['UMC']
try:
umc_count = amdsmi_interface.amdsmi_get_gpu_ecc_count(args.gpu, umc_block)
ecc_count['cache_correctable_count'] = ecc_count['total_correctable_count'] - umc_count['correctable_count']
ecc_count['cache_uncorrectable_count'] = ecc_count['total_uncorrectable_count'] - umc_count['uncorrectable_count']
except amdsmi_exception.AmdSmiLibraryException as e:
ecc_count['cache_correctable_count'] = "N/A"
ecc_count['cache_uncorrectable_count'] = "N/A"
logging.debug("Failed to get cache ecc count for gpu %s at block %s | %s", gpu_id, umc_block, e.get_error_info())
values_dict['ecc'] = ecc_count
if "ecc_blocks" in current_platform_args:
if args.ecc_blocks:
2023-10-27 09:39:31 -05:00
ecc_dict = {}
sysfs_blocks = ["UMC", "SDMA", "GFX", "MMHUB", "PCIE_BIF", "HDP", "XGMI_WAFL"]
2023-10-27 09:39:31 -05:00
try:
ras_states = amdsmi_interface.amdsmi_get_gpu_ras_block_features_enabled(args.gpu)
for state in ras_states:
# Only add enabled blocks that are also in sysfs
2023-10-27 09:39:31 -05:00
if state['status'] == amdsmi_interface.AmdSmiRasErrState.ENABLED.name:
gpu_block = amdsmi_interface.AmdSmiGpuBlock[state['block']]
# if the blocks are uncountable do not add them at all.
if gpu_block.name in sysfs_blocks:
2023-10-27 09:39:31 -05:00
try:
ecc_count = amdsmi_interface.amdsmi_get_gpu_ecc_count(args.gpu, gpu_block)
ecc_dict[state['block']] = {'correctable_count' : ecc_count['correctable_count'],
'uncorrectable_count' : ecc_count['uncorrectable_count'],
'deferred_count' : ecc_count['deferred_count']}
2023-10-27 09:39:31 -05:00
except amdsmi_exception.AmdSmiLibraryException as e:
ecc_dict[state['block']] = {'correctable_count' : "N/A",
'uncorrectable_count' : "N/A",
'deferred_count' : "N/A"}
2023-10-27 09:39:31 -05:00
logging.debug("Failed to get ecc count for gpu %s at block %s | %s", gpu_id, gpu_block, e.get_error_info())
values_dict['ecc_blocks'] = ecc_dict
2023-10-27 09:39:31 -05:00
except amdsmi_exception.AmdSmiLibraryException as e:
values_dict['ecc_blocks'] = "N/A"
2023-10-27 09:39:31 -05:00
logging.debug("Failed to get ecc block features for gpu %s | %s", gpu_id, e.get_error_info())
if "fan" in current_platform_args:
2023-04-19 17:40:03 +02:00
if args.fan:
2023-09-24 02:33:10 -05:00
fan_dict = {"speed" : "N/A",
"max" : "N/A",
"rpm" : "N/A",
"usage" : "N/A"}
2023-04-19 17:40:03 +02:00
try:
fan_speed = amdsmi_interface.amdsmi_get_gpu_fan_speed(args.gpu, 0)
2023-09-24 02:33:10 -05:00
fan_dict["speed"] = fan_speed
2023-04-19 17:40:03 +02:00
except amdsmi_exception.AmdSmiLibraryException as e:
2023-09-24 02:33:10 -05:00
logging.debug("Failed to get fan speed for gpu %s | %s", args.gpu, e.get_error_info())
2023-04-19 17:40:03 +02:00
try:
fan_max = amdsmi_interface.amdsmi_get_gpu_fan_speed_max(args.gpu, 0)
2023-09-24 02:33:10 -05:00
fan_usage = "N/A"
if fan_max > 0 and fan_dict["speed"] != "N/A":
fan_usage = round((float(fan_speed) / float(fan_max)) * 100, 2)
fan_usage_unit = '%'
2023-04-19 17:40:03 +02:00
if self.logger.is_human_readable_format():
fan_usage = f"{fan_usage} {fan_usage_unit}"
if self.logger.is_json_format():
fan_usage = {"value" : fan_usage,
"unit" : fan_usage_unit}
2023-09-24 02:33:10 -05:00
fan_dict["max"] = fan_max
fan_dict["usage"] = fan_usage
2023-04-19 17:40:03 +02:00
except amdsmi_exception.AmdSmiLibraryException as e:
2023-09-24 02:33:10 -05:00
logging.debug("Failed to get fan max speed for gpu %s | %s", args.gpu, e.get_error_info())
2023-03-06 06:20:21 -06:00
2023-04-19 17:40:03 +02:00
try:
fan_rpm = amdsmi_interface.amdsmi_get_gpu_fan_rpms(args.gpu, 0)
2023-09-24 02:33:10 -05:00
fan_dict["rpm"] = fan_rpm
2023-04-19 17:40:03 +02:00
except amdsmi_exception.AmdSmiLibraryException as e:
2023-09-24 02:33:10 -05:00
logging.debug("Failed to get fan rpms for gpu %s | %s", args.gpu, e.get_error_info())
2023-03-06 06:20:21 -06:00
2023-09-24 02:33:10 -05:00
values_dict["fan"] = fan_dict
if "voltage_curve" in current_platform_args:
if args.voltage_curve:
# Populate N/A values per voltage point
voltage_point_dict = {}
for point in range(amdsmi_interface.AMDSMI_NUM_VOLTAGE_CURVE_POINTS):
voltage_point_dict[f'point_{point}_frequency'] = "N/A"
voltage_point_dict[f'point_{point}_voltage'] = "N/A"
try:
od_volt = amdsmi_interface.amdsmi_get_gpu_od_volt_info(args.gpu)
logging.debug(f"OD Voltage info: {od_volt}")
except amdsmi_exception.AmdSmiLibraryException as e:
od_volt = "N/A" # Value not used, but needs to not be a dict
logging.debug("Failed to get voltage curve for gpu %s | %s", gpu_id, e.get_error_info())
# Populate voltage point values
for point in range(amdsmi_interface.AMDSMI_NUM_VOLTAGE_CURVE_POINTS):
if isinstance(od_volt, dict):
logging.debug(f"point_{point} frequency: {od_volt['curve.vc_points'][point].frequency}")
logging.debug(f"point_{point} voltage: {od_volt['curve.vc_points'][point].voltage}")
frequency = int(od_volt["curve.vc_points"][point].frequency / 1000000)
voltage = int(od_volt["curve.vc_points"][point].voltage)
else:
frequency = "N/A"
voltage = "N/A"
if frequency == 0:
frequency = "N/A"
if voltage == 0:
voltage = "N/A"
if frequency != "N/A":
frequency = self.helpers.unit_format(self.logger, frequency, "Mhz")
if voltage != "N/A":
voltage = self.helpers.unit_format(self.logger, voltage, "mV")
voltage_point_dict[f'point_{point}_frequency'] = frequency
voltage_point_dict[f'point_{point}_voltage'] = voltage
values_dict['voltage_curve'] = voltage_point_dict
2023-10-27 09:39:31 -05:00
if "overdrive" in current_platform_args:
2023-04-19 17:40:03 +02:00
if args.overdrive:
try:
overdrive_level = amdsmi_interface.amdsmi_get_gpu_overdrive_level(args.gpu)
od_unit = '%'
values_dict['overdrive'] = self.helpers.unit_format(self.logger, overdrive_level, od_unit)
2023-04-19 17:40:03 +02:00
except amdsmi_exception.AmdSmiLibraryException as e:
values_dict['overdrive'] = "N/A"
logging.debug("Failed to get gpu overdrive level for gpu %s | %s", gpu_id, e.get_error_info())
try:
mem_overdrive_level = amdsmi_interface.amdsmi_get_gpu_mem_overdrive_level(args.gpu)
od_unit = '%'
values_dict['mem_overdrive'] = self.helpers.unit_format(self.logger, mem_overdrive_level, od_unit)
except amdsmi_exception.AmdSmiLibraryException as e:
values_dict['mem_overdrive'] = "N/A"
logging.debug("Failed to get mem overdrive level for gpu %s | %s", gpu_id, e.get_error_info())
2023-10-27 09:39:31 -05:00
if "perf_level" in current_platform_args:
2023-04-19 17:40:03 +02:00
if args.perf_level:
try:
perf_level = amdsmi_interface.amdsmi_get_gpu_perf_level(args.gpu)
2023-04-19 17:40:03 +02:00
values_dict['perf_level'] = perf_level
except amdsmi_exception.AmdSmiLibraryException as e:
values_dict['perf_level'] = "N/A"
2023-09-26 14:45:44 -05:00
logging.debug("Failed to get perf level for gpu %s | %s", gpu_id, e.get_error_info())
2023-10-27 09:39:31 -05:00
if "xgmi_err" in current_platform_args:
2023-04-19 17:40:03 +02:00
if args.xgmi_err:
try:
xgmi_err_status = amdsmi_interface.amdsmi_gpu_xgmi_error_status(args.gpu)
values_dict['xgmi_err'] = amdsmi_interface.amdsmi_wrapper.amdsmi_xgmi_status_t__enumvalues[xgmi_err_status]
except amdsmi_exception.AmdSmiLibraryException as e:
values_dict['xgmi_err'] = "N/A"
2023-09-26 14:45:44 -05:00
logging.debug("Failed to get xgmi error status for gpu %s | %s", gpu_id, e.get_error_info())
2023-10-27 09:39:31 -05:00
if "energy" in current_platform_args:
2023-04-19 17:40:03 +02:00
if args.energy:
2023-09-24 01:17:54 -05:00
try:
energy_dict = amdsmi_interface.amdsmi_get_energy_count(args.gpu)
energy = round(energy_dict["energy_accumulator"] * energy_dict["counter_resolution"], 3)
2023-09-24 01:17:54 -05:00
energy /= 1000000
energy = round(energy, 3)
energy_unit = 'J'
2023-09-24 01:17:54 -05:00
if self.logger.is_human_readable_format():
energy = f"{energy} {energy_unit}"
if self.logger.is_json_format():
energy = {"value" : energy,
"unit" : energy_unit}
2023-09-24 01:17:54 -05:00
values_dict['energy'] = {"total_energy_consumption" : energy}
except amdsmi_interface.AmdSmiLibraryException as e:
values_dict['energy'] = "N/A"
logging.debug("Failed to get energy usage for gpu %s | %s", args.gpu, e.get_error_info())
2023-10-27 09:39:31 -05:00
if "mem_usage" in current_platform_args:
2023-09-22 06:19:38 -05:00
if args.mem_usage:
memory_usage = {'total_vram': "N/A",
'used_vram': "N/A",
'free_vram': "N/A",
'total_visible_vram': "N/A",
'used_visible_vram': "N/A",
'free_visible_vram': "N/A",
'total_gtt': "N/A",
'used_gtt': "N/A",
'free_gtt': "N/A"}
# Total VRAM
try:
total_vram = amdsmi_interface.amdsmi_get_gpu_memory_total(args.gpu, amdsmi_interface.AmdSmiMemoryType.VRAM)
memory_usage['total_vram'] = total_vram // (1024*1024)
except amdsmi_exception.AmdSmiLibraryException as e:
2023-09-26 14:45:44 -05:00
logging.debug("Failed to get total VRAM memory for gpu %s | %s", gpu_id, e.get_error_info())
2023-03-28 15:32:17 -05:00
2023-09-22 06:19:38 -05:00
try:
total_visible_vram = amdsmi_interface.amdsmi_get_gpu_memory_total(args.gpu, amdsmi_interface.AmdSmiMemoryType.VIS_VRAM)
memory_usage['total_visible_vram'] = total_visible_vram // (1024*1024)
except amdsmi_exception.AmdSmiLibraryException as e:
2023-09-26 14:45:44 -05:00
logging.debug("Failed to get total VIS VRAM memory for gpu %s | %s", gpu_id, e.get_error_info())
2023-03-28 15:32:17 -05:00
2023-09-22 06:19:38 -05:00
try:
total_gtt = amdsmi_interface.amdsmi_get_gpu_memory_total(args.gpu, amdsmi_interface.AmdSmiMemoryType.GTT)
memory_usage['total_gtt'] = total_gtt // (1024*1024)
except amdsmi_exception.AmdSmiLibraryException as e:
2023-09-26 14:45:44 -05:00
logging.debug("Failed to get total GTT memory for gpu %s | %s", gpu_id, e.get_error_info())
2023-03-28 15:32:17 -05:00
2023-09-22 06:19:38 -05:00
# Used VRAM
try:
used_vram = amdsmi_interface.amdsmi_get_gpu_memory_usage(args.gpu, amdsmi_interface.AmdSmiMemoryType.VRAM)
memory_usage['used_vram'] = used_vram // (1024*1024)
2023-09-22 06:19:38 -05:00
except amdsmi_exception.AmdSmiLibraryException as e:
2023-09-26 14:45:44 -05:00
logging.debug("Failed to get used VRAM memory for gpu %s | %s", gpu_id, e.get_error_info())
2023-09-22 06:19:38 -05:00
try:
used_visible_vram = amdsmi_interface.amdsmi_get_gpu_memory_usage(args.gpu, amdsmi_interface.AmdSmiMemoryType.VIS_VRAM)
memory_usage['used_visible_vram'] = used_visible_vram // (1024*1024)
except amdsmi_exception.AmdSmiLibraryException as e:
2023-09-26 14:45:44 -05:00
logging.debug("Failed to get used VIS VRAM memory for gpu %s | %s", gpu_id, e.get_error_info())
2023-09-22 06:19:38 -05:00
try:
used_gtt = amdsmi_interface.amdsmi_get_gpu_memory_usage(args.gpu, amdsmi_interface.AmdSmiMemoryType.GTT)
memory_usage['used_gtt'] = used_gtt // (1024*1024)
except amdsmi_exception.AmdSmiLibraryException as e:
2023-09-26 14:45:44 -05:00
logging.debug("Failed to get used GTT memory for gpu %s | %s", gpu_id, e.get_error_info())
2023-09-22 06:19:38 -05:00
# Free VRAM
if memory_usage['total_vram'] != "N/A" and memory_usage['used_vram'] != "N/A":
memory_usage['free_vram'] = memory_usage['total_vram'] - memory_usage['used_vram']
2023-03-06 06:20:21 -06:00
2023-09-22 06:19:38 -05:00
if memory_usage['total_visible_vram'] != "N/A" and memory_usage['used_visible_vram'] != "N/A":
memory_usage['free_visible_vram'] = memory_usage['total_visible_vram'] - memory_usage['used_visible_vram']
if memory_usage['total_gtt'] != "N/A" and memory_usage['used_gtt'] != "N/A":
memory_usage['free_gtt'] = memory_usage['total_gtt'] - memory_usage['used_gtt']
memory_unit = 'MB'
for key, value in memory_usage.items():
if value != "N/A":
if self.logger.is_human_readable_format():
memory_usage[key] = f"{value} {memory_unit}"
if self.logger.is_json_format():
memory_usage[key] = {"value" : value,
"unit" : memory_unit}
2023-09-22 06:19:38 -05:00
values_dict['mem_usage'] = memory_usage
2024-05-21 20:30:16 -05:00
if "throttle" in current_platform_args:
if args.throttle:
throttle_status = {
# Current values - counter/accumulated
2024-05-21 20:30:16 -05:00
'accumulation_counter': "N/A",
'prochot_accumulated': "N/A",
'ppt_accumulated': "N/A",
'socket_thermal_accumulated': "N/A",
'vr_thermal_accumulated': "N/A",
'hbm_thermal_accumulated': "N/A",
# violation status values - active/not active
'prochot_violation_status': "N/A",
'ppt_violation_status': "N/A",
'socket_thermal_violation_status': "N/A",
'vr_thermal_violation_status': "N/A",
'hbm_thermal_violation_status': "N/A",
# violation activity values - percent
'prochot_violation_activity': "N/A",
'ppt_violation_activity': "N/A",
'socket_thermal_violation_activity': "N/A",
'vr_thermal_violation_activity': "N/A",
'hbm_thermal_violation_activity': "N/A"
2024-05-21 20:30:16 -05:00
}
try:
violation_status = amdsmi_interface.amdsmi_get_violation_status(args.gpu)
throttle_status['accumulation_counter'] = violation_status['acc_counter']
throttle_status['prochot_accumulated'] = violation_status['acc_prochot_thrm']
throttle_status['ppt_accumulated'] = violation_status['acc_ppt_pwr']
throttle_status['socket_thermal_accumulated'] = violation_status['acc_socket_thrm']
throttle_status['vr_thermal_accumulated'] = violation_status['acc_vr_thrm']
throttle_status['hbm_thermal_accumulated'] = violation_status['acc_hbm_thrm']
throttle_status['prochot_violation_status'] = violation_status['active_prochot_thrm']
throttle_status['ppt_violation_status'] = violation_status['active_ppt_pwr']
throttle_status['socket_thermal_violation_status'] = violation_status['active_socket_thrm']
throttle_status['vr_thermal_violation_status'] = violation_status['active_vr_thrm']
throttle_status['hbm_thermal_violation_status'] = violation_status['active_hbm_thrm']
2024-05-21 20:30:16 -05:00
throttle_status['prochot_violation_activity'] = violation_status['per_prochot_thrm']
throttle_status['ppt_violation_activity'] = violation_status['per_ppt_pwr']
throttle_status['socket_thermal_violation_activity'] = violation_status['per_socket_thrm']
throttle_status['vr_thermal_violation_activity'] = violation_status['per_vr_thrm']
throttle_status['hbm_thermal_violation_activity'] = violation_status['per_hbm_thrm']
2024-05-21 20:30:16 -05:00
except amdsmi_exception.AmdSmiLibraryException as e:
values_dict['throttle'] = throttle_status
logging.debug("Failed to get violation status' for gpu %s | %s", gpu_id, e.get_error_info())
for key, value in throttle_status.items():
if "_status" in key:
2024-09-30 13:23:45 -05:00
if value is True:
throttle_status[key] = "ACTIVE"
2024-09-30 13:23:45 -05:00
elif value is False:
throttle_status[key] = "NOT ACTIVE"
2024-05-21 20:30:16 -05:00
continue
if "_activity" not in key:
2024-05-21 20:30:16 -05:00
continue
activity_unit = '%'
if self.logger.is_human_readable_format():
if isinstance(value, list):
for index, activity in enumerate(value):
if activity != "N/A":
throttle_status[key][index] = f"{activity} {activity_unit}"
# Convert list to a string for human readable format
throttle_status[key] = '[' + ", ".join(throttle_status[key]) + ']'
elif value != "N/A":
throttle_status[key] = f"{value} {activity_unit}"
if self.logger.is_json_format():
if isinstance(value, list):
for index, activity in enumerate(value):
if activity != "N/A":
throttle_status[key][index] = {"value" : activity,
"unit" : activity_unit}
elif value != "N/A":
throttle_status[key] = {"value" : value,
"unit" : activity_unit}
values_dict['throttle'] = throttle_status
2023-03-06 06:20:21 -06:00
2023-04-21 08:02:53 -05:00
# Store timestamp first if watching_output is enabled
if watching_output:
self.logger.store_output(args.gpu, 'timestamp', int(time.time()))
2023-03-06 06:20:21 -06:00
self.logger.store_output(args.gpu, 'values', values_dict)
if multiple_devices:
self.logger.store_multiple_device_output()
return # Skip printing when there are multiple devices
2023-04-21 08:02:53 -05:00
self.logger.print_output(watching_output=watching_output)
2023-03-06 06:20:21 -06:00
if watching_output: # End of single gpu add to watch_output
2023-04-21 08:02:53 -05:00
self.logger.store_watch_output(multiple_device_enabled=False)
2023-03-06 06:20:21 -06:00
def metric_cpu(self, args, multiple_devices=False, cpu=None, cpu_power_metrics=None, cpu_prochot=None,
               cpu_freq_metrics=None, cpu_c0_res=None, cpu_lclk_dpm_level=None,
               cpu_pwr_svi_telemtry_rails=None, cpu_io_bandwidth=None, cpu_xgmi_bandwidth=None,
               cpu_metrics_ver=None, cpu_metrics_table=None, cpu_socket_energy=None,
               cpu_ddr_bandwidth=None, cpu_temp=None, cpu_dimm_temp_range_rate=None,
               cpu_dimm_pow_consumption=None, cpu_dimm_thermal_sensor=None):
    """Get Metric information for target cpu

    Every requested metric group becomes one section of a result dict. A value
    that the library fails to provide (AmdSmiLibraryException) is stored as
    "N/A" and the failure is logged at debug level; the other sections are
    still collected.

    Args:
        args (Namespace): Namespace containing the parsed CLI args
        multiple_devices (bool, optional): True if checking for multiple devices. Defaults to False.
        cpu (cpu_handle, optional): device_handle for target device. Defaults to None.
        cpu_power_metrics (bool, optional): Value override for args.cpu_power_metrics. Defaults to None
        cpu_prochot (bool, optional): Value override for args.cpu_prochot. Defaults to None.
        cpu_freq_metrics (bool, optional): Value override for args.cpu_freq_metrics. Defaults to None.
        cpu_c0_res (bool, optional): Value override for args.cpu_c0_res. Defaults to None
        cpu_lclk_dpm_level (list, optional): Value override for args.cpu_lclk_dpm_level. Defaults to None
        cpu_pwr_svi_telemtry_rails (list, optional): value override for args.cpu_pwr_svi_telemtry_rails. Defaults to None
        cpu_io_bandwidth (list, optional): value override for args.cpu_io_bandwidth. Defaults to None
        cpu_xgmi_bandwidth (list, optional): value override for args.cpu_xgmi_bandwidth. Defaults to None
        cpu_metrics_ver (bool, optional): Value override for args.cpu_metrics_ver. Defaults to None
        cpu_metrics_table (bool, optional): Value override for args.cpu_metrics_table. Defaults to None
        cpu_socket_energy (bool, optional): Value override for args.cpu_socket_energy. Defaults to None
        cpu_ddr_bandwidth (bool, optional): Value override for args.cpu_ddr_bandwidth. Defaults to None
        cpu_temp (bool, optional): Value override for args.cpu_temp. Defaults to None
        cpu_dimm_temp_range_rate (list, optional): Dimm address. Value override for args.cpu_dimm_temp_range_rate. Defaults to None
        cpu_dimm_pow_consumption (list, optional): Dimm address. Value override for args.cpu_dimm_pow_consumption. Defaults to None
        cpu_dimm_thermal_sensor (list, optional): Dimm address. Value override for args.cpu_dimm_thermal_sensor. Defaults to None

    Returns:
        None: Print output via AMDSMILogger to destination
    """
    # Copy every truthy keyword override onto args; falsy overrides leave args untouched.
    overrides = (
        ("cpu", cpu),
        ("cpu_power_metrics", cpu_power_metrics),
        ("cpu_prochot", cpu_prochot),
        ("cpu_freq_metrics", cpu_freq_metrics),
        ("cpu_c0_res", cpu_c0_res),
        ("cpu_lclk_dpm_level", cpu_lclk_dpm_level),
        ("cpu_pwr_svi_telemtry_rails", cpu_pwr_svi_telemtry_rails),
        ("cpu_io_bandwidth", cpu_io_bandwidth),
        ("cpu_xgmi_bandwidth", cpu_xgmi_bandwidth),
        ("cpu_metrics_ver", cpu_metrics_ver),
        ("cpu_metrics_table", cpu_metrics_table),
        ("cpu_socket_energy", cpu_socket_energy),
        ("cpu_ddr_bandwidth", cpu_ddr_bandwidth),
        ("cpu_temp", cpu_temp),
        ("cpu_dimm_temp_range_rate", cpu_dimm_temp_range_rate),
        ("cpu_dimm_pow_consumption", cpu_dimm_pow_consumption),
        ("cpu_dimm_thermal_sensor", cpu_dimm_thermal_sensor),
    )
    for name, value in overrides:
        if value:
            setattr(args, name, value)

    # Store cpu args that are applicable to the current platform
    curr_platform_cpu_args = ["cpu_power_metrics", "cpu_prochot", "cpu_freq_metrics",
                              "cpu_c0_res", "cpu_lclk_dpm_level", "cpu_pwr_svi_telemtry_rails",
                              "cpu_io_bandwidth", "cpu_xgmi_bandwidth", "cpu_metrics_ver",
                              "cpu_metrics_table", "cpu_socket_energy", "cpu_ddr_bandwidth",
                              "cpu_temp", "cpu_dimm_temp_range_rate", "cpu_dimm_pow_consumption",
                              "cpu_dimm_thermal_sensor"]
    # These flags are indexed below (e.g. args.cpu_lclk_dpm_level[0][0]) to obtain
    # call parameters, so they cannot simply be defaulted to True.
    parameterized_cpu_args = ("cpu_lclk_dpm_level", "cpu_io_bandwidth", "cpu_xgmi_bandwidth",
                              "cpu_dimm_temp_range_rate", "cpu_dimm_pow_consumption",
                              "cpu_dimm_thermal_sensor")
    curr_platform_cpu_values = [getattr(args, arg) for arg in curr_platform_cpu_args]

    # Handle No CPU passed (fall back as this should be defined in metric())
    if args.cpu is None:
        args.cpu = self.cpu_handles

    # Nothing requested explicitly: enable every metric that needs no parameters.
    if not any(curr_platform_cpu_values):
        for arg in curr_platform_cpu_args:
            if arg not in parameterized_cpu_args:
                setattr(args, arg, True)

    # handle_cpus is given this method as its callback; when it reports True the
    # request has already been handled there and nothing is left to do here.
    handled_multiple_cpus, device_handle = self.helpers.handle_cpus(args,
                                                                    self.logger,
                                                                    self.metric_cpu)
    if handled_multiple_cpus:
        return  # This function is recursive
    args.cpu = device_handle

    # get cpu id for logging
    cpu_id = self.helpers.get_cpu_id_from_device_handle(args.cpu)
    logging.debug("Metric Arg information for CPU %s on %s", cpu_id, self.helpers.os_info())

    static_dict = {}

    def _collect(section, key, what, getter, *getter_args, per_cpu=True):
        """Store getter(*getter_args) in static_dict[section][key], or "N/A" if the library call fails."""
        try:
            static_dict[section][key] = getter(*getter_args)
        except amdsmi_exception.AmdSmiLibraryException as e:
            static_dict[section][key] = "N/A"
            if per_cpu:
                logging.debug("Failed to get %s for cpu %s | %s", what, cpu_id, e.get_error_info())
            else:
                logging.debug("Failed to get %s | %s", what, e.get_error_info())

    if args.cpu_power_metrics:
        static_dict["power_metrics"] = {}
        _collect("power_metrics", "socket power", "socket power",
                 amdsmi_interface.amdsmi_get_cpu_socket_power, args.cpu)
        _collect("power_metrics", "socket power limit", "socket power limit",
                 amdsmi_interface.amdsmi_get_cpu_socket_power_cap, args.cpu)
        _collect("power_metrics", "socket max power limit", "max socket power limit",
                 amdsmi_interface.amdsmi_get_cpu_socket_power_cap_max, args.cpu)

    if args.cpu_prochot:
        static_dict["prochot"] = {}
        _collect("prochot", "prochot_status", "prochot status",
                 amdsmi_interface.amdsmi_get_cpu_prochot_status, args.cpu)

    if args.cpu_freq_metrics:
        static_dict["freq_metrics"] = {}
        _collect("freq_metrics", "fclkmemclk", "current fclkmemclk freq",
                 amdsmi_interface.amdsmi_get_cpu_fclk_mclk, args.cpu)
        _collect("freq_metrics", "cclkfreqlimit", "current cclk freq",
                 amdsmi_interface.amdsmi_get_cpu_cclk_limit, args.cpu)
        _collect("freq_metrics", "soc_current_active_freq_limit", "socket current freq limit",
                 amdsmi_interface.amdsmi_get_cpu_socket_current_active_freq_limit, args.cpu)
        _collect("freq_metrics", "soc_freq_range", "socket freq range",
                 amdsmi_interface.amdsmi_get_cpu_socket_freq_range, args.cpu)

    if args.cpu_c0_res:
        static_dict["c0_residency"] = {}
        _collect("c0_residency", "residency", "C0 residency",
                 amdsmi_interface.amdsmi_get_cpu_socket_c0_residency, args.cpu)

    if args.cpu_lclk_dpm_level:
        static_dict["socket_dpm"] = {}
        _collect("socket_dpm", "dpml_level_range", "socket dpm level range",
                 amdsmi_interface.amdsmi_get_cpu_socket_lclk_dpm_level,
                 args.cpu, args.cpu_lclk_dpm_level[0][0])

    if args.cpu_pwr_svi_telemtry_rails:
        static_dict["svi_telemetry_all_rails"] = {}
        # On failure the "N/A" marker belongs to this section; it used to be written
        # into static_dict["c0_residency"], which may not even exist at this point.
        _collect("svi_telemetry_all_rails", "power", "svi telemetry all rails",
                 amdsmi_interface.amdsmi_get_cpu_pwr_svi_telemetry_all_rails, args.cpu)

    if args.cpu_io_bandwidth:
        static_dict["io_bandwidth"] = {}
        _collect("io_bandwidth", "band_width", "io bandwidth",
                 amdsmi_interface.amdsmi_get_cpu_current_io_bandwidth,
                 args.cpu, int(args.cpu_io_bandwidth[0][0]), args.cpu_io_bandwidth[0][1])

    if args.cpu_xgmi_bandwidth:
        static_dict["xgmi_bandwidth"] = {}
        _collect("xgmi_bandwidth", "band_width", "xgmi bandwidth",
                 amdsmi_interface.amdsmi_get_cpu_current_xgmi_bw,
                 args.cpu, int(args.cpu_xgmi_bandwidth[0][0]), args.cpu_xgmi_bandwidth[0][1])

    if args.cpu_metrics_ver:
        static_dict["metric_version"] = {}
        _collect("metric_version", "version", "metrics table version",
                 amdsmi_interface.amdsmi_get_hsmp_metrics_table_version, args.cpu)

    if args.cpu_metrics_table:
        static_dict["metrics_table"] = {}
        # Family and model are queried without a cpu handle, so their failure logs carry no cpu id.
        _collect("metrics_table", "cpu_family", "cpu family",
                 amdsmi_interface.amdsmi_get_cpu_family, per_cpu=False)
        _collect("metrics_table", "cpu_model", "cpu model",
                 amdsmi_interface.amdsmi_get_cpu_model, per_cpu=False)
        _collect("metrics_table", "response", "metrics table",
                 amdsmi_interface.amdsmi_get_hsmp_metrics_table, args.cpu)

    if args.cpu_socket_energy:
        static_dict["socket_energy"] = {}
        _collect("socket_energy", "response", "socket energy",
                 amdsmi_interface.amdsmi_get_cpu_socket_energy, args.cpu)

    if args.cpu_ddr_bandwidth:
        static_dict["ddr_bandwidth"] = {}
        _collect("ddr_bandwidth", "response", "ddr bandwidth",
                 amdsmi_interface.amdsmi_get_cpu_ddr_bw, args.cpu)

    if args.cpu_temp:
        static_dict["cpu_temp"] = {}
        _collect("cpu_temp", "response", "cpu temperature",
                 amdsmi_interface.amdsmi_get_cpu_socket_temperature, args.cpu)

    if args.cpu_dimm_temp_range_rate:
        static_dict["dimm_temp_range_rate"] = {}
        _collect("dimm_temp_range_rate", "response", "dimm temperature range and refresh rate",
                 amdsmi_interface.amdsmi_get_cpu_dimm_temp_range_and_refresh_rate,
                 args.cpu, args.cpu_dimm_temp_range_rate[0][0])

    if args.cpu_dimm_pow_consumption:
        static_dict["dimm_pow_consumption"] = {}
        _collect("dimm_pow_consumption", "response", "dimm power consumption",
                 amdsmi_interface.amdsmi_get_cpu_dimm_power_consumption,
                 args.cpu, args.cpu_dimm_pow_consumption[0][0])

    if args.cpu_dimm_thermal_sensor:
        static_dict["dimm_thermal_sensor"] = {}
        _collect("dimm_thermal_sensor", "response", "dimm thermal sensor",
                 amdsmi_interface.amdsmi_get_cpu_dimm_thermal_sensor,
                 args.cpu, args.cpu_dimm_thermal_sensor[0][0])

    multiple_devices_csv_override = False
    self.logger.store_cpu_output(args.cpu, 'values', static_dict)
    if multiple_devices:
        self.logger.store_multiple_device_output()
        return  # Skip printing when there are multiple devices
    self.logger.print_output(multiple_device_enabled=multiple_devices_csv_override)
def metric_core(self, args, multiple_devices=False, core=None, core_boost_limit=None,
                core_curr_active_freq_core_limit=None, core_energy=None):
    """Get Metric information for target core

    Every requested metric becomes one section of a result dict. A value that
    the library fails to provide (AmdSmiLibraryException) is stored as "N/A"
    and the failure is logged at debug level.

    Args:
        args (Namespace): Namespace containing the parsed CLI args
        multiple_devices (bool, optional): True if checking for multiple devices. Defaults to False.
        core (device_handle, optional): device_handle for target core. Defaults to None.
        core_boost_limit (bool, optional): Value override for args.core_boost_limit. Defaults to None
        core_curr_active_freq_core_limit (bool, optional): Value override for args.core_curr_active_freq_core_limit. Defaults to None
        core_energy (bool, optional): Value override for args.core_energy. Defaults to None

    Returns:
        None: Print output via AMDSMILogger to destination
    """
    # Copy every truthy keyword override onto args; falsy overrides leave args untouched.
    overrides = (
        ("core", core),
        ("core_boost_limit", core_boost_limit),
        ("core_curr_active_freq_core_limit", core_curr_active_freq_core_limit),
        ("core_energy", core_energy),
    )
    for name, value in overrides:
        if value:
            setattr(args, name, value)

    # Store core args that are applicable to the current platform
    curr_platform_core_args = ["core_boost_limit", "core_curr_active_freq_core_limit", "core_energy"]
    curr_platform_core_values = [getattr(args, arg) for arg in curr_platform_core_args]

    # Handle No cores passed
    if args.core is None:
        args.core = self.core_handles

    # Nothing requested explicitly: enable every core metric.
    if not any(curr_platform_core_values):
        for arg in curr_platform_core_args:
            setattr(args, arg, True)

    # handle_cores is given this method as its callback; when it reports True the
    # request has already been handled there and nothing is left to do here.
    handled_multiple_cores, device_handle = self.helpers.handle_cores(args,
                                                                      self.logger,
                                                                      self.metric_core)
    if handled_multiple_cores:
        return  # This function is recursive
    args.core = device_handle

    # get core id for logging
    core_id = self.helpers.get_core_id_from_device_handle(args.core)
    # This is the metric path; the message previously said "Static" (copied from elsewhere).
    logging.debug("Metric Arg information for Core %s on %s", core_id, self.helpers.os_info())

    static_dict = {}

    def _collect(section, what, getter):
        """Store getter(args.core) in static_dict[section]["value"], or "N/A" if the library call fails."""
        try:
            static_dict[section]["value"] = getter(args.core)
        except amdsmi_exception.AmdSmiLibraryException as e:
            static_dict[section]["value"] = "N/A"
            logging.debug("Failed to get %s for core %s | %s", what, core_id, e.get_error_info())

    if args.core_boost_limit:
        static_dict["boost_limit"] = {}
        _collect("boost_limit", "core boost limit",
                 amdsmi_interface.amdsmi_get_cpu_core_boostlimit)

    if args.core_curr_active_freq_core_limit:
        static_dict["curr_active_freq_core_limit"] = {}
        _collect("curr_active_freq_core_limit", "current active frequency core",
                 amdsmi_interface.amdsmi_get_cpu_core_current_freq_limit)

    if args.core_energy:
        static_dict["core_energy"] = {}
        _collect("core_energy", "core energy",
                 amdsmi_interface.amdsmi_get_cpu_core_energy)

    multiple_devices_csv_override = False
    self.logger.store_core_output(args.core, 'values', static_dict)
    if multiple_devices:
        self.logger.store_multiple_device_output()
        return  # Skip printing when there are multiple devices
    self.logger.print_output(multiple_device_enabled=multiple_devices_csv_override)
def metric(self, args, multiple_devices=False, watching_output=False, gpu=None,
           usage=None, watch=None, watch_time=None, iterations=None, power=None,
           clock=None, temperature=None, ecc=None, ecc_blocks=None, pcie=None,
           fan=None, voltage_curve=None, overdrive=None, perf_level=None,
           xgmi_err=None, energy=None, mem_usage=None, schedule=None,
           guard=None, guest_data=None, fb_usage=None, xgmi=None,
           cpu=None, cpu_power_metrics=None, cpu_prochot=None, cpu_freq_metrics=None,
           cpu_c0_res=None, cpu_lclk_dpm_level=None, cpu_pwr_svi_telemtry_rails=None,
           cpu_io_bandwidth=None, cpu_xgmi_bandwidth=None, cpu_metrics_ver=None,
           cpu_metrics_table=None, cpu_socket_energy=None, cpu_ddr_bandwidth=None,
           cpu_temp=None, cpu_dimm_temp_range_rate=None, cpu_dimm_pow_consumption=None,
           cpu_dimm_thermal_sensor=None,
           core=None, core_boost_limit=None, core_curr_active_freq_core_limit=None,
           core_energy=None, throttle=None):
    """Get Metric information for the target gpu(s), cpu(s) and core(s)

    Dispatches to metric_cpu, metric_core and metric_gpu depending on which
    drivers self.helpers reports as initialized and on which devices/arguments
    were selected. When both drivers are initialized, CPU output is produced
    first, then core output, then GPU output.

    Args:
        args (Namespace): Namespace containing the parsed CLI args
        multiple_devices (bool, optional): True if checking for multiple devices. Defaults to False.
        watching_output (bool, optional): True if watch argument has been set. Defaults to False.
        gpu (device_handle, optional): device_handle for target device. Defaults to None.
        usage (bool, optional): Value override for args.usage. Defaults to None.
        watch (Positive int, optional): Value override for args.watch. Defaults to None.
        watch_time (Positive int, optional): Value override for args.watch_time. Defaults to None.
        iterations (Positive int, optional): Value override for args.iterations. Defaults to None.
        power (bool, optional): Value override for args.power. Defaults to None.
        clock (bool, optional): Value override for args.clock. Defaults to None.
        temperature (bool, optional): Value override for args.temperature. Defaults to None.
        ecc (bool, optional): Value override for args.ecc. Defaults to None.
        ecc_blocks (bool, optional): Value override for args.ecc_blocks. Defaults to None.
        pcie (bool, optional): Value override for args.pcie. Defaults to None.
        fan (bool, optional): Value override for args.fan. Defaults to None.
        voltage_curve (bool, optional): Value override for args.voltage_curve. Defaults to None.
        overdrive (bool, optional): Value override for args.overdrive. Defaults to None.
        perf_level (bool, optional): Value override for args.perf_level. Defaults to None.
        xgmi_err (bool, optional): Value override for args.xgmi_err. Defaults to None.
        energy (bool, optional): Value override for args.energy. Defaults to None.
        mem_usage (bool, optional): Value override for args.mem_usage. Defaults to None.
        schedule (bool, optional): Value override for args.schedule. Defaults to None.
        guard (bool, optional): Value override for args.guard. Defaults to None.
        guest_data (bool, optional): Value override for args.guest_data. Defaults to None.
        fb_usage (bool, optional): Value override for args.fb_usage. Defaults to None.
        xgmi (bool, optional): Value override for args.xgmi. Defaults to None.
        cpu (cpu_handle, optional): device_handle for target device. Defaults to None.
        cpu_power_metrics (bool, optional): Value override for args.cpu_power_metrics. Defaults to None
        cpu_prochot (bool, optional): Value override for args.cpu_prochot. Defaults to None.
        cpu_freq_metrics (bool, optional): Value override for args.cpu_freq_metrics. Defaults to None.
        cpu_c0_res (bool, optional): Value override for args.cpu_c0_res. Defaults to None
        cpu_lclk_dpm_level (list, optional): Value override for args.cpu_lclk_dpm_level. Defaults to None
        cpu_pwr_svi_telemtry_rails (list, optional): value override for args.cpu_pwr_svi_telemtry_rails. Defaults to None
        cpu_io_bandwidth (list, optional): value override for args.cpu_io_bandwidth. Defaults to None
        cpu_xgmi_bandwidth (list, optional): value override for args.cpu_xgmi_bandwidth. Defaults to None
        cpu_metrics_ver (bool, optional): Value override for args.cpu_metrics_ver. Defaults to None
        cpu_metrics_table (bool, optional): Value override for args.cpu_metrics_table. Defaults to None
        cpu_socket_energy (bool, optional): Value override for args.cpu_socket_energy. Defaults to None
        cpu_ddr_bandwidth (bool, optional): Value override for args.cpu_ddr_bandwidth. Defaults to None
        cpu_temp (bool, optional): Value override for args.cpu_temp. Defaults to None
        cpu_dimm_temp_range_rate (list, optional): Dimm address. Value override for args.cpu_dimm_temp_range_rate. Defaults to None
        cpu_dimm_pow_consumption (list, optional): Dimm address. Value override for args.cpu_dimm_pow_consumption. Defaults to None
        cpu_dimm_thermal_sensor (list, optional): Dimm address. Value override for args.cpu_dimm_thermal_sensor. Defaults to None
        core (device_handle, optional): device_handle for target core. Defaults to None.
        core_boost_limit (bool, optional): Value override for args.core_boost_limit. Defaults to None
        core_curr_active_freq_core_limit (bool, optional): Value override for args.core_curr_active_freq_core_limit. Defaults to None
        core_energy (bool, optional): Value override for args.core_energy. Defaults to None
        throttle (bool, optional): Value override for args.throttle. Defaults to None.

    Raises:
        IndexError: Index error if gpu list is empty
            (NOTE(review): not raised in this method itself; presumably comes from metric_gpu - confirm)

    Returns:
        None: Print output via AMDSMILogger to destination
    """
    # TODO Move watch logic into here and make it driver agnostic or enable it for CPU arguments

    # Mutually exclusive args
    if gpu:
        args.gpu = gpu
    if cpu:
        args.cpu = cpu
    if core:
        args.core = core

    def _any_set(names):
        """Return True if args carries a truthy value for at least one of the names."""
        return any(getattr(args, name, False) for name in names)

    # Check if a GPU argument has been set
    gpu_args_enabled = _any_set((
        "usage", "watch", "watch_time", "iterations", "power", "clock",
        "temperature", "ecc", "ecc_blocks", "pcie", "fan", "voltage_curve",
        "overdrive", "perf_level", "xgmi_err", "energy", "mem_usage", "schedule",
        "guard", "guest_data", "fb_usage", "xgmi", "throttle"))

    # Check if a CPU argument has been set
    cpu_args_enabled = _any_set((
        "cpu_power_metrics", "cpu_prochot", "cpu_freq_metrics", "cpu_c0_res",
        "cpu_lclk_dpm_level", "cpu_pwr_svi_telemtry_rails", "cpu_io_bandwidth",
        "cpu_xgmi_bandwidth", "cpu_metrics_ver", "cpu_metrics_table",
        "cpu_socket_energy", "cpu_ddr_bandwidth", "cpu_temp", "cpu_dimm_temp_range_rate",
        "cpu_dimm_pow_consumption", "cpu_dimm_thermal_sensor"))

    # Check if a Core argument has been set
    core_args_enabled = _any_set((
        "core_boost_limit", "core_curr_active_freq_core_limit", "core_energy"))

    def _run_cpu():
        """Produce the CPU metrics output."""
        self.metric_cpu(args, multiple_devices, cpu, cpu_power_metrics, cpu_prochot,
                        cpu_freq_metrics, cpu_c0_res, cpu_lclk_dpm_level,
                        cpu_pwr_svi_telemtry_rails, cpu_io_bandwidth, cpu_xgmi_bandwidth,
                        cpu_metrics_ver, cpu_metrics_table, cpu_socket_energy,
                        cpu_ddr_bandwidth, cpu_temp, cpu_dimm_temp_range_rate,
                        cpu_dimm_pow_consumption, cpu_dimm_thermal_sensor)

    def _run_core():
        """Reset the logger state, then produce the core metrics output."""
        self.logger.output = {}
        self.logger.clear_multiple_devices_ouput()
        self.metric_core(args, multiple_devices, core, core_boost_limit,
                         core_curr_active_freq_core_limit, core_energy)

    def _run_gpu(reset_output):
        """Produce the GPU metrics output; reset_output also clears logger.output first."""
        if reset_output:
            self.logger.output = {}
        self.logger.clear_multiple_devices_ouput()
        # One positional argument list for every caller: the GPU-only path used to
        # omit guard/guest_data/fb_usage/xgmi, which shifted throttle into the
        # position the combined path uses for guard.
        self.metric_gpu(args, multiple_devices, watching_output, gpu,
                        usage, watch, watch_time, iterations, power,
                        clock, temperature, ecc, ecc_blocks, pcie,
                        fan, voltage_curve, overdrive, perf_level,
                        xgmi_err, energy, mem_usage, schedule,
                        guard, guest_data, fb_usage, xgmi, throttle)

    # Handle CPU and GPU driver initialization cases
    if self.helpers.is_amd_hsmp_initialized() and self.helpers.is_amdgpu_initialized():
        logging.debug("gpu_args_enabled: %s, cpu_args_enabled: %s, core_args_enabled: %s",
                      gpu_args_enabled, cpu_args_enabled, core_args_enabled)
        logging.debug("args.gpu: %s, args.cpu: %s, args.core: %s", args.gpu, args.cpu, args.core)

        # If a GPU or CPU argument is provided only print out the specified device.
        if args.cpu is None and args.gpu is None and args.core is None:
            # If no args are set, print out all CPU, GPU, and Core metrics info
            if not gpu_args_enabled and not cpu_args_enabled and not core_args_enabled:
                args.cpu = self.cpu_handles
                args.gpu = self.device_handles
                args.core = self.core_handles

        # Handle cases where the user has only specified an argument and no specific device
        if args.gpu is None and gpu_args_enabled:
            args.gpu = self.device_handles
        if args.cpu is None and cpu_args_enabled:
            args.cpu = self.cpu_handles
        if args.core is None and core_args_enabled:
            args.core = self.core_handles

        # Print out CPU first
        if args.cpu:
            _run_cpu()
        if args.core:
            _run_core()
        if args.gpu:
            _run_gpu(reset_output=True)
    elif self.helpers.is_amd_hsmp_initialized():  # Only CPU is initialized
        if args.cpu is None and args.core is None:
            # If no args are set, print out all CPU and Core metrics info
            if not cpu_args_enabled and not core_args_enabled:
                args.cpu = self.cpu_handles
                args.core = self.core_handles

        if args.cpu is None and cpu_args_enabled:
            args.cpu = self.cpu_handles
        if args.core is None and core_args_enabled:
            args.core = self.core_handles

        if args.cpu:
            _run_cpu()
        if args.core:
            _run_core()
    elif self.helpers.is_amdgpu_initialized():  # Only GPU is initialized
        if args.gpu is None:
            args.gpu = self.device_handles
        _run_gpu(reset_output=False)
2023-03-06 06:20:21 -06:00
def process(self, args, multiple_devices=False, watching_output=False,
            gpu=None, general=None, engine=None, pid=None, name=None,
            watch=None, watch_time=None, iterations=None):
    """Get Process Information from the target GPU

    Args:
        args (Namespace): Namespace containing the parsed CLI args
        multiple_devices (bool, optional): True if checking for multiple devices. Defaults to False.
        watching_output (bool, optional): True if watch argument has been set. Defaults to False.
        gpu (device_handle, optional): device_handle for target device. Defaults to None.
        general (bool, optional): Value override for args.general. Defaults to None.
        engine (bool, optional): Value override for args.engine. Defaults to None.
        pid (Positive int, optional): Value override for args.pid. Defaults to None.
        name (str, optional): Value override for args.name. Defaults to None.
        watch (Positive int, optional): Value override for args.watch. Defaults to None.
        watch_time (Positive int, optional): Value override for args.watch_time. Defaults to None.
        iterations (Positive int, optional): Value override for args.iterations. Defaults to None.

    Raises:
        IndexError: Index error if gpu list is empty
        amdsmi_exception.AmdSmiLibraryException: Re-raised (after a debug log)
            when the process list of the target gpu cannot be read

    Returns:
        None: Print output via AMDSMILogger to destination
    """
    # Set args.* to passed in arguments; a falsy override leaves args untouched
    overrides = {
        'gpu': gpu,
        'general': general,
        'engine': engine,
        'pid': pid,
        'name': name,
        'watch': watch,
        'watch_time': watch_time,
        'iterations': iterations,
    }
    for attribute, value in overrides.items():
        if value:
            setattr(args, attribute, value)

    # Handle No GPU passed
    if args.gpu is None:
        args.gpu = self.device_handles

    # Handle watch logic, will only enter this block once
    if args.watch:
        self.helpers.handle_watch(args=args, subcommand=self.process, logger=self.logger)
        return

    # Handle multiple GPUs
    if isinstance(args.gpu, list):
        if not args.gpu:
            raise IndexError("args.gpu should not be an empty list")
        if len(args.gpu) > 1:
            # Copy the gpu list: every recursive call rebinds args.gpu to a single handle
            stored_gpus = list(args.gpu)

            # Store output from multiple devices
            for device_handle in stored_gpus:
                self.process(args, multiple_devices=True, watching_output=watching_output, gpu=device_handle)

            # Reload original gpus
            args.gpu = stored_gpus

            # Print multiple device output
            self.logger.print_output(multiple_device_enabled=True, watching_output=watching_output)

            # Add output to total watch output and clear multiple device output
            if watching_output:
                self.logger.store_watch_output(multiple_device_enabled=True)
                # Flush the watching output
                self.logger.print_output(multiple_device_enabled=True, watching_output=watching_output)
            return
        args.gpu = args.gpu[0]

    # Get gpu_id for logging
    gpu_id = self.helpers.get_gpu_id_from_device_handle(args.gpu)

    # Populate initial processes
    try:
        process_list = amdsmi_interface.amdsmi_get_gpu_process_list(args.gpu)
    except amdsmi_exception.AmdSmiLibraryException as e:
        logging.debug("Failed to get process list for gpu %s | %s", gpu_id, e.get_error_info())
        raise

    human_readable = self.logger.is_human_readable_format()
    engine_usage_unit = "ns"
    # Human readable values already carry their unit once converted, so no suffix is appended
    memory_usage_unit = "" if human_readable else "B"

    filtered_process_values = []
    for process_info in process_list:
        # Rename the library keys to the names exposed by the CLI
        process_info['mem_usage'] = process_info.pop('mem')
        process_info['usage'] = process_info.pop('engine_usage')

        if human_readable:
            process_info['mem_usage'] = self.helpers.convert_bytes_to_readable(process_info['mem_usage'])
            for usage_metric in process_info['memory_usage']:
                process_info['memory_usage'][usage_metric] = self.helpers.convert_bytes_to_readable(
                    process_info['memory_usage'][usage_metric])

        process_info['mem_usage'] = self.helpers.unit_format(self.logger,
                                                             process_info['mem_usage'],
                                                             memory_usage_unit)
        for usage_metric in process_info['usage']:
            process_info['usage'][usage_metric] = self.helpers.unit_format(self.logger,
                                                                           process_info['usage'][usage_metric],
                                                                           engine_usage_unit)
        for usage_metric in process_info['memory_usage']:
            process_info['memory_usage'][usage_metric] = self.helpers.unit_format(self.logger,
                                                                                  process_info['memory_usage'][usage_metric],
                                                                                  memory_usage_unit)

        filtered_process_values.append({'process_info': process_info})

    if not filtered_process_values:
        logging.debug("Failed to detect any process on gpu %s", gpu_id)
        filtered_process_values.append({'process_info': "N/A"})

    # Arguments will filter the populated processes
    # General and Engine to expose process_info values
    if args.general or args.engine:
        for entry in filtered_process_values:
            details = entry['process_info']
            if details == "N/A":
                continue
            del details['memory_usage']
            if not args.engine:
                del details['usage']  # Used in engine
            elif not args.general:
                del details['mem_usage']  # Used in general

    # Filter out non specified pids
    if args.pid:
        wanted_pid = str(args.pid)
        filtered_process_values = [
            entry for entry in filtered_process_values
            if entry['process_info'] != "N/A"
            and str(entry['process_info']['pid']) == wanted_pid
        ]

    # Filter out non specified process names (case-insensitive exact match)
    if args.name:
        wanted_name = str(args.name).lower()
        filtered_process_values = [
            entry for entry in filtered_process_values
            if entry['process_info'] != "N/A"
            and str(entry['process_info']['name']).lower() == wanted_name
        ]

    # If the name or pid args filter processes out then insert an N/A placeholder
    if not filtered_process_values:
        filtered_process_values.append({'process_info': "N/A"})

    logging.debug("Process Info for GPU %s | %s", gpu_id, filtered_process_values)

    for entry in filtered_process_values:
        if entry['process_info'] == "N/A":
            entry['process_info'] = "No running processes detected"

    if self.logger.is_json_format():
        if watching_output:
            self.logger.store_output(args.gpu, 'timestamp', int(time.time()))
        self.logger.store_output(args.gpu, 'process_list', filtered_process_values)

    if self.logger.is_human_readable_format():
        if watching_output:
            self.logger.store_output(args.gpu, 'timestamp', int(time.time()))
        # When we print out process_info we remove the index
        # The removal is needed only for human readable process format to align with Host
        for index, entry in enumerate(filtered_process_values):
            self.logger.store_output(args.gpu, f'process_info_{index}', entry['process_info'])

    multiple_devices_csv_override = False
    if self.logger.is_csv_format():
        # CSV emits one row per process, so every process is stored as its own device entry
        multiple_devices_csv_override = True
        for entry in filtered_process_values:
            if watching_output:
                self.logger.store_output(args.gpu, 'timestamp', int(time.time()))
            self.logger.store_output(args.gpu, 'process_info', entry['process_info'])
            self.logger.store_multiple_device_output()

    if multiple_devices:
        self.logger.store_multiple_device_output()
        return  # Skip printing when there are multiple devices

    multiple_devices = multiple_devices or multiple_devices_csv_override
    self.logger.print_output(multiple_device_enabled=multiple_devices, watching_output=watching_output)

    if watching_output:  # End of single gpu add to watch_output
        self.logger.store_watch_output(multiple_device_enabled=multiple_devices)
2023-03-06 06:20:21 -06:00
def profile(self, args):
    """Report that the profile subcommand is not applicable to linux baremetal.

    Args:
        args (Namespace): Namespace containing the parsed CLI args (unused)

    Returns:
        None: A fixed notice is printed to stdout
    """
    notice = 'Not applicable to linux baremetal'
    print(notice)
2023-03-06 06:20:21 -06:00
def event(self, args, gpu=None):
    """ Get event information for target gpus

    One listener thread is started per target gpu; the call blocks reading
    stdin until the user enters 'q', then signals the listeners to stop and
    waits for them.

    Args:
        args (Namespace): argparser args to pass to subcommand
        gpu (device_handle, optional): device_handle for target device, used
            only when args.gpu is not set. Defaults to None.

    Raises:
        EOFError: Propagated from input() when stdin is closed; the listener
            threads are still stopped and joined first

    Return:
        stdout event information for target gpus
    """
    # args.gpu keeps precedence; fall back to the gpu argument, then to every device
    if not args.gpu:
        args.gpu = gpu if gpu else self.device_handles
    if not isinstance(args.gpu, list):
        args.gpu = [args.gpu]

    print('EVENT LISTENING:\n')
    print('Press q and hit ENTER when you want to stop.')

    self.stop = False
    threads = []
    try:
        # The listener receives the index into args.gpu, not the handle itself
        for device_index in range(len(args.gpu)):
            listener = threading.Thread(target=self._event_thread, args=(self, device_index))
            listener.start()
            threads.append(listener)

        while input() != 'q':
            pass
        print("Escape Sequence Detected; Exiting")
    finally:
        # Always release the listeners, even when input() raises (EOF, Ctrl-C),
        # so no non-daemon thread is left running.
        # NOTE(review): assumes _event_thread polls self.stop - confirm
        self.stop = True
        for thread in threads:
            thread.join()
2023-03-06 06:20:21 -06:00
2023-03-28 15:32:17 -05:00
def topology(self, args, multiple_devices=False, gpu=None, access=None,
             weight=None, hops=None, link_type=None, numa_bw=None,
             coherent=None, atomics=None, dma=None, bi_dir=None):
    """ Get topology information for target gpus

    When none of the metric flags is set, all of them are enabled.
    JSON output emits one entry per source gpu with a 'links' list describing
    every destination gpu; human readable output prints one table per enabled
    metric followed by a legend; CSV output flattens the per-gpu dictionaries.

    params:
        args - argparser args to pass to subcommand
        multiple_devices (bool) - True if checking for multiple devices
        gpu (device_handle) - device_handle for target device
        access (bool) - Value override for args.access
        weight (bool) - Value override for args.weight
        hops (bool) - Value override for args.hops
        link_type (bool) - Value override for args.link_type
        numa_bw (bool) - Value override for args.numa_bw
        coherent (bool) - Value override for args.coherent
        atomics (bool) - Value override for args.atomics
        dma (bool) - Value override for args.dma
        bi_dir (bool) - Value override for args.bi_dir
    return:
        Nothing
    """
    not_available = "N/A"
    query_self = object()  # marker: the table has no shortcut value for src == dest
    metric_flags = ('access', 'weight', 'hops', 'link_type', 'numa_bw',
                    'coherent', 'atomics', 'dma', 'bi_dir')

    # Set args.* to passed in arguments
    overrides = {
        'gpu': gpu, 'access': access, 'weight': weight, 'hops': hops,
        'link_type': link_type, 'numa_bw': numa_bw, 'coherent': coherent,
        'atomics': atomics, 'dma': dma, 'bi_dir': bi_dir,
    }
    for attribute, value in overrides.items():
        if value:
            setattr(args, attribute, value)

    # Handle No GPU passed
    if args.gpu is None:
        args.gpu = self.device_handles
    if not isinstance(args.gpu, list):
        args.gpu = [args.gpu]

    # Handle all args being false
    if not any([getattr(args, flag) for flag in metric_flags]):
        for flag in metric_flags:
            setattr(args, flag, True)

    def gpu_id(handle):
        # Logical gpu index used for output keys and log messages
        return self.helpers.get_gpu_id_from_device_handle(handle)

    def log_failure(what, src_gpu, dest_gpu, error):
        # Shared debug message for a failed src -> dest library query
        logging.debug("Failed to get %s for %s to %s | %s",
                      what, gpu_id(src_gpu), gpu_id(dest_gpu), error.get_error_info())

    def guarded(what, fetch, src_gpu, dest_gpu, fallback=not_available):
        # Run one library query; on a library error log it and return the fallback
        try:
            return fetch(src_gpu, dest_gpu)
        except amdsmi_exception.AmdSmiLibraryException as e:
            log_failure(what, src_gpu, dest_gpu, e)
            return fallback

    def fetch_access(src_gpu, dest_gpu):
        accessible = amdsmi_interface.amdsmi_is_P2P_accessible(src_gpu, dest_gpu)
        return "ENABLED" if accessible else "DISABLED"

    def fetch_weight(src_gpu, dest_gpu):
        return amdsmi_interface.amdsmi_topo_get_link_weight(src_gpu, dest_gpu)

    def fetch_link_info(src_gpu, dest_gpu):
        # Dictionary carrying both the link 'type' and the number of 'hops'
        return amdsmi_interface.amdsmi_topo_get_link_type(src_gpu, dest_gpu)

    def fetch_bandwidth(src_gpu, dest_gpu):
        bw_dict = amdsmi_interface.amdsmi_get_minmax_bandwidth_between_processors(src_gpu, dest_gpu)
        return f"{bw_dict['min_bandwidth']}-{bw_dict['max_bandwidth']}"

    def fetch_cap(src_gpu, dest_gpu):
        return amdsmi_interface.amdsmi_topo_get_p2p_status(src_gpu, dest_gpu)['cap']

    def link_type_label(raw_type):
        # Anything that is not a recognised link type is reported as N/A so
        # that no raw enum value leaks into the output and no cell is missing
        wrapper = amdsmi_interface.amdsmi_wrapper
        if isinstance(raw_type, int):
            if raw_type == wrapper.AMDSMI_IOLINK_TYPE_UNDEFINED:
                return "UNKNOWN"
            if raw_type == wrapper.AMDSMI_IOLINK_TYPE_PCIEXPRESS:
                return "PCIE"
            if raw_type == wrapper.AMDSMI_IOLINK_TYPE_XGMI:
                return "XGMI"
        return not_available

    def coherent_label(cap):
        value = cap['is_iolink_coherent']
        return "C" if value == 1 else "NC" if value == 0 else not_available

    def atomics_label(cap):
        has_32 = cap['is_iolink_atomics_32bit'] == 1
        has_64 = cap['is_iolink_atomics_64bit'] == 1
        if has_32 and has_64:
            return "64,32"
        if has_32:
            return "32"
        if has_64:
            return "64"
        return not_available

    def true_false_label(value):
        return "T" if value == 1 else "F" if value == 0 else not_available

    def numa_bw_cell(src_gpu, dest_gpu):
        # Bandwidth is only reported for XGMI links
        try:
            raw_type = fetch_link_info(src_gpu, dest_gpu)['type']
            # NOTE(review): 2 is presumably AMDSMI_IOLINK_TYPE_XGMI - confirm
            if isinstance(raw_type, int) and raw_type != 2:
                return not_available
        except amdsmi_exception.AmdSmiLibraryException as e:
            # A failed link type query does not prevent the bandwidth query
            log_failure("link type", src_gpu, dest_gpu, e)
        try:
            return fetch_bandwidth(src_gpu, dest_gpu)
        except amdsmi_exception.AmdSmiLibraryException as e:
            log_failure("min max bandwidth", src_gpu, dest_gpu, e)
            # This table shows the library error text instead of N/A
            return e.get_error_info()

    def json_link(src_gpu, dest_gpu):
        # Describe the src -> dest link for JSON output:
        #   "gpu"         GPU index of the destination
        #   "bdf"         BDF identification of the destination
        #   "weight"      0 for the current node, otherwise the link weight
        #   "link_status" "ENABLED" / "DISABLED", correlated to access
        #   "link_type"   "SELF", "PCIE", "XGMI", "UNKNOWN" or "N/A"
        #   "num_hops"    number of hops between the devices
        #   "bandwidth"   "<min>-<max>" bandwidth, "N/A" for self or on failure
        #   "coherent", "atomics", "dma", "bi_dir"  P2P link capabilities
        link_name = "SELF"
        link_weight = 0
        num_hops = 0
        bandwidth = not_available
        link_coherent = link_atomics = link_dma = link_bi_dir = "SELF"
        if src_gpu != dest_gpu:
            link_info = guarded("link type", fetch_link_info, src_gpu, dest_gpu, fallback=None)
            if link_info is None:
                link_name = num_hops = not_available
            else:
                link_name = link_type_label(link_info['type'])
                num_hops = link_info['hops']
            link_weight = guarded("link weight", fetch_weight, src_gpu, dest_gpu)
            bandwidth = guarded("min max bandwidth", fetch_bandwidth, src_gpu, dest_gpu)
            cap = guarded("link status", fetch_cap, src_gpu, dest_gpu, fallback=None)
            if cap is None:
                # A peer link whose capabilities are unknown must not read as SELF
                link_coherent = link_atomics = link_dma = link_bi_dir = not_available
            else:
                link_coherent = coherent_label(cap)
                link_atomics = atomics_label(cap)
                link_dma = true_false_label(cap['is_iolink_dma'])
                link_bi_dir = true_false_label(cap['is_iolink_bi_directional'])
        link_status = guarded("link status", fetch_access, src_gpu, dest_gpu)

        dest_gpu_links = {
            "gpu": gpu_id(dest_gpu),
            "bdf": amdsmi_interface.amdsmi_get_gpu_device_bdf(dest_gpu),
            "weight": link_weight,
            "link_status": link_status,
            "link_type": link_name,
            "num_hops": num_hops,
            "bandwidth": bandwidth,
            "coherent": link_coherent,
            "atomics": link_atomics,
            "dma": link_dma,
            "bi_dir": link_bi_dir,
        }
        # Drop every field whose metric was not requested
        field_flags = (
            ('link_status', args.access), ('weight', args.weight),
            ('link_type', args.link_type), ('num_hops', args.hops),
            ('bandwidth', args.numa_bw), ('coherent', args.coherent),
            ('atomics', args.atomics), ('dma', args.dma), ('bi_dir', args.bi_dir),
        )
        for field, requested in field_flags:
            if not requested:
                del dest_gpu_links[field]
        return dest_gpu_links

    # Clear the table header
    self.logger.table_header = ''.rjust(12)

    # Populate the possible gpus
    topo_values = []
    for src_gpu in args.gpu:
        src_gpu_bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(src_gpu)
        topo_values.append({"gpu": gpu_id(src_gpu), "bdf": src_gpu_bdf})
        self.logger.table_header += src_gpu_bdf.rjust(13)

    # JSON output is complete once every source gpu has its list of links
    if args.gpu and self.logger.is_json_format():
        for src_gpu_index, src_gpu in enumerate(args.gpu):
            topo_values[src_gpu_index]['links'] = [json_link(src_gpu, dest_gpu)
                                                   for dest_gpu in args.gpu]
        self.logger.multiple_device_output = topo_values
        self.logger.print_output(multiple_device_enabled=True, tabular=True)
        return

    human_readable = self.logger.is_human_readable_format()

    # (args flag, topo_values key, table title, value for src == dest, cell query)
    tables = (
        ('access', 'link_accessibility', "ACCESS TABLE", query_self,
         lambda src, dest: guarded("link status", fetch_access, src, dest)),
        ('weight', 'weight', "WEIGHT TABLE", 0,
         lambda src, dest: guarded("link weight", fetch_weight, src, dest)),
        ('hops', 'hops', "HOPS TABLE", 0,
         lambda src, dest: guarded("link hops",
                                   lambda s, d: fetch_link_info(s, d)['hops'], src, dest)),
        ('link_type', 'link_type', "LINK TYPE TABLE", "SELF",
         lambda src, dest: guarded("link type",
                                   lambda s, d: link_type_label(fetch_link_info(s, d)['type']),
                                   src, dest)),
        ('numa_bw', 'numa_bandwidth', "NUMA BW TABLE", not_available, numa_bw_cell),
        ('coherent', 'coherent', "CACHE COHERANCY TABLE", "SELF",
         lambda src, dest: guarded("link coherent",
                                   lambda s, d: coherent_label(fetch_cap(s, d)), src, dest)),
        ('atomics', 'atomics', "ATOMICS TABLE", "SELF",
         lambda src, dest: guarded("link atomics",
                                   lambda s, d: atomics_label(fetch_cap(s, d)), src, dest)),
        ('dma', 'dma', "DMA TABLE", "SELF",
         lambda src, dest: guarded("link dma",
                                   lambda s, d: true_false_label(fetch_cap(s, d)['is_iolink_dma']),
                                   src, dest)),
        ('bi_dir', 'bi_dir', "BI-DIRECTIONAL TABLE", "SELF",
         lambda src, dest: guarded("link bi-directional",
                                   lambda s, d: true_false_label(
                                       fetch_cap(s, d)['is_iolink_bi_directional']),
                                   src, dest)),
    )

    for flag, topo_key, title, self_value, cell in tables:
        if not getattr(args, flag):
            continue
        tabular_output = []
        for src_gpu_index, src_gpu in enumerate(args.gpu):
            src_gpu_bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(src_gpu)
            if human_readable:
                tabular_output_dict = {'gpu': f"{src_gpu_bdf} "}
            else:
                tabular_output_dict = {'gpu': src_gpu_bdf}
            src_gpu_cells = {}
            for dest_gpu in args.gpu:
                dest_gpu_key = f'gpu_{gpu_id(dest_gpu)}'
                if self_value is not query_self and src_gpu == dest_gpu:
                    src_gpu_cells[dest_gpu_key] = self_value
                else:
                    src_gpu_cells[dest_gpu_key] = cell(src_gpu, dest_gpu)
            topo_values[src_gpu_index][topo_key] = src_gpu_cells
            tabular_output_dict.update(src_gpu_cells)
            tabular_output.append(tabular_output_dict)
        if human_readable:
            self.logger.multiple_device_output = tabular_output
            self.logger.table_title = title
            self.logger.print_output(multiple_device_enabled=True, tabular=True)

    if human_readable:
        # Populate the legend output
        legend_parts = [
            "\n\nLegend:",
            " SELF = Current GPU",
            " ENABLED / DISABLED = Link is enabled or disabled",
            " N/A = Not supported",
            " T/F = True / False",
            " C/NC = Coherant / Non-Coherant io links",
            " 64,32 = 64 bit and 32 bit atomic support",
            " <BW from>-<BW to>"
        ]
        legend_output = "\n".join(legend_parts)
        if self.logger.destination == 'stdout':
            print(legend_output)
        else:
            with self.logger.destination.open('a', encoding="utf-8") as output_file:
                output_file.write(legend_output + '\n')

    self.logger.multiple_device_output = topo_values

    if self.logger.is_csv_format():
        self.logger.multiple_device_output = [
            self.logger.flatten_dict(elem, topology_override=True)
            for elem in self.logger.multiple_device_output
        ]

    if not human_readable:
        self.logger.print_output(multiple_device_enabled=True)
2023-03-06 06:20:21 -06:00
def set_core(self, args, multiple_devices=False, core=None, core_boost_limit=None):
    """Issue set commands to target core(s)

    Args:
        args (Namespace): Namespace containing the parsed CLI args
        multiple_devices (bool, optional): True if checking for multiple devices. Defaults to False.
        core (device_handle, optional): device_handle for target device. Defaults to None.
        core_boost_limit (list, optional): Value override for args.core_boost_limit;
            element [0][0] is the value handed to the library. Defaults to None.

    Raises:
        ValueError: Value error if no core value is provided
        AmdSmiRequiredCommandException: if no set option was requested

    Return:
        Nothing
    """
    # Apply keyword overrides on top of the parsed CLI arguments
    if core:
        args.core = core
    if core_boost_limit:
        args.core_boost_limit = core_boost_limit

    # Identity check: only a genuinely missing target is an error
    if args.core is None:
        raise ValueError('No Core provided, specific Core targets(S) are needed')

    # Handle multiple cores
    handled_multiple_cores, device_handle = self.helpers.handle_cores(args, self.logger, self.set_core)
    if handled_multiple_cores:
        return  # This function is recursive

    # Error if no subcommand args are passed
    if not args.core_boost_limit:
        command = " ".join(sys.argv[1:])
        raise AmdSmiRequiredCommandException(command, self.logger.format)

    args.core = device_handle

    # Build core string for errors
    try:
        core_id = self.helpers.get_core_id_from_device_handle(args.core)
    except IndexError:
        core_id = f'ID Unavailable for {args.core}'

    # Library failures are reported in the output rather than raised
    static_dict = {"set_core_boost_limit": {}}
    try:
        amdsmi_interface.amdsmi_set_cpu_core_boostlimit(args.core, args.core_boost_limit[0][0])
        static_dict["set_core_boost_limit"]["Response"] = "Set Operation successful"
    except amdsmi_exception.AmdSmiLibraryException as e:
        static_dict["set_core_boost_limit"]["Response"] = f"Error occured for Core {core_id} - {e.get_error_info()}"
        logging.debug("Failed to set core boost limit for cpu %s | %s", core_id, e.get_error_info())

    self.logger.store_core_output(args.core, 'values', static_dict)
    if multiple_devices:
        self.logger.store_multiple_device_output()
        return  # Skip printing when there are multiple devices
    self.logger.print_output(multiple_device_enabled=False)
def set_cpu(self, args, multiple_devices=False, cpu=None, cpu_pwr_limit=None,
            cpu_xgmi_link_width=None, cpu_lclk_dpm_level=None, cpu_pwr_eff_mode=None,
            cpu_gmi3_link_width=None, cpu_pcie_link_rate=None, cpu_df_pstate_range=None,
            cpu_enable_apb=None, cpu_disable_apb=None, soc_boost_limit=None):
    """Issue set commands to target cpu(s)

    Every requested option is attempted independently; a library failure for
    one option is recorded in the output and logged at debug level instead of
    being raised, so the remaining options still run.

    Args:
        args (Namespace): Namespace containing the parsed CLI args
        multiple_devices (bool, optional): True if checking for multiple devices. Defaults to False.
        cpu (cpu_handle, optional): device_handle for target device. Defaults to None.
        cpu_pwr_limit (list, optional): Value override for args.cpu_pwr_limit; [0][0] is used. Defaults to None.
        cpu_xgmi_link_width (list, optional): Value override for args.cpu_xgmi_link_width; [0][0] and [0][1] are used. Defaults to None.
        cpu_lclk_dpm_level (list, optional): Value override for args.cpu_lclk_dpm_level; [0][0]..[0][2] are used. Defaults to None.
        cpu_pwr_eff_mode (list, optional): Value override for args.cpu_pwr_eff_mode; [0][0] is used. Defaults to None.
        cpu_gmi3_link_width (list, optional): Value override for args.cpu_gmi3_link_width; [0][0] and [0][1] are used. Defaults to None.
        cpu_pcie_link_rate (list, optional): Value override for args.cpu_pcie_link_rate; [0][0] is used. Defaults to None.
        cpu_df_pstate_range (list, optional): Value override for args.cpu_df_pstate_range; [0][0] and [0][1] are used. Defaults to None.
        cpu_enable_apb (bool, optional): Value override for args.cpu_enable_apb. Defaults to None.
        cpu_disable_apb (list, optional): Value override for args.cpu_disable_apb; [0][0] is used. Defaults to None.
        soc_boost_limit (list, optional): Value override for args.soc_boost_limit; [0][0] is used. Defaults to None.

    Raises:
        ValueError: Value error if no cpu value is provided
        AmdSmiRequiredCommandException: if no set option was requested

    Return:
        Nothing
    """
    # Apply keyword overrides on top of the parsed CLI arguments (falsy overrides are ignored)
    overrides = {
        'cpu': cpu,
        'cpu_pwr_limit': cpu_pwr_limit,
        'cpu_xgmi_link_width': cpu_xgmi_link_width,
        'cpu_lclk_dpm_level': cpu_lclk_dpm_level,
        'cpu_pwr_eff_mode': cpu_pwr_eff_mode,
        'cpu_gmi3_link_width': cpu_gmi3_link_width,
        'cpu_pcie_link_rate': cpu_pcie_link_rate,
        'cpu_df_pstate_range': cpu_df_pstate_range,
        'cpu_enable_apb': cpu_enable_apb,
        'cpu_disable_apb': cpu_disable_apb,
        'soc_boost_limit': soc_boost_limit,
    }
    for attr_name, override in overrides.items():
        if override:
            setattr(args, attr_name, override)

    if args.cpu is None:
        raise ValueError('No CPU provided, specific CPU targets(S) are needed')

    # Handle multiple CPU's
    handled_multiple_cpus, device_handle = self.helpers.handle_cpus(args, self.logger, self.set_cpu)
    if handled_multiple_cpus:
        return  # This function is recursive
    args.cpu = device_handle

    # Error if no subcommand args are passed
    if not any((args.cpu_pwr_limit, args.cpu_xgmi_link_width, args.cpu_lclk_dpm_level,
                args.cpu_pwr_eff_mode, args.cpu_gmi3_link_width, args.cpu_pcie_link_rate,
                args.cpu_df_pstate_range, args.cpu_enable_apb, args.cpu_disable_apb,
                args.soc_boost_limit)):
        command = " ".join(sys.argv[1:])
        raise AmdSmiRequiredCommandException(command, self.logger.format)

    # Build CPU string for errors
    try:
        cpu_id = self.helpers.get_cpu_id_from_device_handle(args.cpu)
    except IndexError:
        cpu_id = f'ID Unavailable for {args.cpu}'

    static_dict = {}

    def _attempt(section, field, action, setter, success="Set Operation successful",
                 failure=None, keep_result=False):
        """Run one setter and record its outcome under static_dict[section][field].

        action is the verb phrase used in the debug log ("Failed to <action> for cpu ...").
        failure, when given, replaces the default error text stored on failure.
        keep_result stores the setter's return value instead of the success text.
        """
        static_dict[section] = {}
        try:
            result = setter()
        except amdsmi_exception.AmdSmiLibraryException as e:
            if failure is None:
                outcome = f"Error occured for CPU {cpu_id} - {e.get_error_info()}"
            else:
                outcome = failure
            static_dict[section][field] = outcome
            logging.debug("Failed to %s for cpu %s | %s", action, cpu_id, e.get_error_info())
        else:
            static_dict[section][field] = result if keep_result else success

    # NOTE: section order and the mixed "Response"/"response" key casing are part of
    # the emitted output and are intentionally left as they were.
    if args.cpu_pwr_limit:
        _attempt("set_pwr_limit", "Response", "set power limit",
                 lambda: amdsmi_interface.amdsmi_set_cpu_socket_power_cap(
                     args.cpu, args.cpu_pwr_limit[0][0]))
    if args.cpu_xgmi_link_width:
        _attempt("set_xgmi_link_width", "Response", "set xgmi link width",
                 lambda: amdsmi_interface.amdsmi_set_cpu_xgmi_width(
                     args.cpu, args.cpu_xgmi_link_width[0][0],
                     args.cpu_xgmi_link_width[0][1]))
    if args.cpu_lclk_dpm_level:
        _attempt("set_lclk_dpm_level", "Response", "set lclk dpm level",
                 lambda: amdsmi_interface.amdsmi_set_cpu_socket_lclk_dpm_level(
                     args.cpu, args.cpu_lclk_dpm_level[0][0],
                     args.cpu_lclk_dpm_level[0][1],
                     args.cpu_lclk_dpm_level[0][2]))
    if args.cpu_pwr_eff_mode:
        _attempt("set_pwr_eff_mode", "Response", "set power efficiency mode",
                 lambda: amdsmi_interface.amdsmi_set_cpu_pwr_efficiency_mode(
                     args.cpu, args.cpu_pwr_eff_mode[0][0]))
    if args.cpu_gmi3_link_width:
        _attempt("set_gmi3_link_width", "response", "set gmi3 link width",
                 lambda: amdsmi_interface.amdsmi_set_cpu_gmi3_link_width_range(
                     args.cpu, args.cpu_gmi3_link_width[0][0],
                     args.cpu_gmi3_link_width[0][1]))
    if args.cpu_pcie_link_rate:
        # The library call's return value is reported as the previous mode
        _attempt("set_pcie_link_rate", "prev_mode", "set pcie link rate",
                 lambda: amdsmi_interface.amdsmi_set_cpu_pcie_link_rate(
                     args.cpu, args.cpu_pcie_link_rate[0][0]),
                 keep_result=True)
    if args.cpu_df_pstate_range:
        _attempt("set_df_pstate_range", "response", "set df pstate range",
                 lambda: amdsmi_interface.amdsmi_set_cpu_df_pstate_range(
                     args.cpu, args.cpu_df_pstate_range[0][0],
                     args.cpu_df_pstate_range[0][1]))
    if args.cpu_enable_apb:
        _attempt("apbenable", "state", "enable APB",
                 lambda: amdsmi_interface.amdsmi_cpu_apb_enable(args.cpu),
                 success="Enabled DF - Pstate performance boost algorithm",
                 failure="N/A")
    if args.cpu_disable_apb:
        # Logged as a *disable* failure (was previously mislabelled as "enable")
        _attempt("apbdisable", "state", "disable APB",
                 lambda: amdsmi_interface.amdsmi_cpu_apb_disable(
                     args.cpu, args.cpu_disable_apb[0][0]),
                 success="Disabled DF - Pstate performance boost algorithm",
                 failure="N/A")
    if args.soc_boost_limit:
        _attempt("set_soc_boost_limit", "Response", "set socket boost limit",
                 lambda: amdsmi_interface.amdsmi_set_cpu_socket_boostlimit(
                     args.cpu, args.soc_boost_limit[0][0]))

    self.logger.store_cpu_output(args.cpu, 'values', static_dict)
    if multiple_devices:
        self.logger.store_multiple_device_output()
        return  # Skip printing when there are multiple devices
    self.logger.print_output(multiple_device_enabled=False)
def set_gpu(self, args, multiple_devices=False, gpu=None, fan=None, perf_level=None,
            profile=None, perf_determinism=None, compute_partition=None,
            memory_partition=None, power_cap=None, soc_pstate=None, xgmi_plpd=None,
            process_isolation=None, clk_limit=None, clk_level=None):
    """Issue set commands to target gpu(s)

    Args:
        args (Namespace): Namespace containing the parsed CLI args
        multiple_devices (bool, optional): True if checking for multiple devices. Defaults to False.
        gpu (device_handle, optional): device_handle for target device. Defaults to None.
        fan (int, optional): Value override for args.fan. Defaults to None.
        perf_level (str, optional): Value override for args.perf_level; used as a key into
            amdsmi_interface.AmdSmiDevPerfLevel. Defaults to None.
        profile (bool, optional): Value override for args.profile. Defaults to None.
        perf_determinism (int, optional): Value override for args.perf_determinism. Defaults to None.
        compute_partition (str, optional): Value override for args.compute_partition; used as a key
            into amdsmi_interface.AmdSmiComputePartitionType. Defaults to None.
        memory_partition (str, optional): Value override for args.memory_partition; used as a key
            into amdsmi_interface.AmdSmiMemoryPartitionType. Defaults to None.
        power_cap (int, optional): Value override for args.power_cap. Defaults to None.
        soc_pstate (int, optional): Value override for args.soc_pstate. Defaults to None.
        xgmi_plpd (int, optional): Value override for args.xgmi_plpd. Defaults to None.
        process_isolation (int, optional): Value override for args.process_isolation. Defaults to None.
        clk_limit (tuple, optional): Value override for args.clk_limit; a named tuple exposing
            clk_type, lim_type and val. Defaults to None.
        clk_level (tuple, optional): Value override for args.clk_level; a named tuple exposing
            clk_type and perf_levels. Defaults to None.

    Raises:
        AmdSmiRequiredCommandException: if no set option was requested
        PermissionError: if the library reports AMDSMI_STATUS_NO_PERM for a set call
        ValueError: if a set call (or a query a set call depends on) fails
        IndexError: if a clock limit would put min above max or max below min

    Return:
        Nothing
    """
    def _set_failure(err, message):
        """Translate a library error into the exception this command reports."""
        if err.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM:
            return PermissionError('Command requires elevation')
        return ValueError(message)

    def _report_and_reset(key, message):
        """Store one result line, print it immediately and clear the multi-device buffer."""
        self.logger.store_output(args.gpu, key, message)
        self.logger.print_output()
        self.logger.clear_multiple_devices_ouput()

    # Set args.* to passed in arguments.
    # Integer-valued overrides are tested against None so that a legitimate 0
    # (e.g. process_isolation=0 to disable) is not silently dropped.
    if gpu:
        args.gpu = gpu
    if fan is not None:
        args.fan = fan
    if perf_level:
        args.perf_level = perf_level
    if profile:
        args.profile = profile
    if perf_determinism is not None:
        args.perf_determinism = perf_determinism
    if compute_partition:
        args.compute_partition = compute_partition
    if memory_partition:
        args.memory_partition = memory_partition
    if power_cap is not None:
        args.power_cap = power_cap
    if soc_pstate is not None:
        args.soc_pstate = soc_pstate
    if xgmi_plpd is not None:
        args.xgmi_plpd = xgmi_plpd
    if process_isolation is not None:
        args.process_isolation = process_isolation
    if clk_limit:
        args.clk_limit = clk_limit
    if clk_level:
        args.clk_level = clk_level

    # Handle No GPU passed
    if args.gpu is None:
        args.gpu = self.device_handles

    # Handle multiple GPUs
    handled_multiple_gpus, device_handle = self.helpers.handle_gpus(args, self.logger, self.set_gpu)
    if handled_multiple_gpus:
        return  # This function is recursive
    args.gpu = device_handle

    # Error if no subcommand args are passed
    baremetal = self.helpers.is_baremetal()
    if baremetal:
        requested = [args.fan is not None,
                     args.perf_level,
                     args.profile,
                     args.compute_partition,
                     args.memory_partition,
                     args.perf_determinism is not None,
                     args.power_cap is not None,
                     args.soc_pstate is not None,
                     args.xgmi_plpd is not None,
                     args.clk_level is not None,
                     args.clk_limit is not None,
                     args.process_isolation is not None]
    else:
        # Only these two options are honoured when not running on bare metal
        requested = [args.process_isolation is not None,
                     args.clk_limit is not None]
    if not any(requested):
        command = " ".join(sys.argv[1:])
        raise AmdSmiRequiredCommandException(command, self.logger.format)

    # Build GPU string for errors
    try:
        gpu_bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(args.gpu)
    except amdsmi_exception.AmdSmiLibraryException:
        gpu_bdf = f'BDF Unavailable for {args.gpu}'
    try:
        gpu_id = self.helpers.get_gpu_id_from_device_handle(args.gpu)
    except IndexError:
        gpu_id = f'ID Unavailable for {args.gpu}'
    gpu_string = f"GPU ID: {gpu_id} BDF:{gpu_bdf}"

    # Handle args
    if baremetal:
        if isinstance(args.fan, int):
            try:
                amdsmi_interface.amdsmi_set_gpu_fan_speed(args.gpu, 0, args.fan)
            except amdsmi_exception.AmdSmiLibraryException as e:
                raise _set_failure(e, f"Unable to set fan speed {args.fan} on {gpu_string}") from e
            self.logger.store_output(args.gpu, 'fan', f"Successfully set fan speed {args.fan}")

        if args.perf_level:
            perf_level = amdsmi_interface.AmdSmiDevPerfLevel[args.perf_level]
            try:
                amdsmi_interface.amdsmi_set_gpu_perf_level(args.gpu, perf_level)
            except amdsmi_exception.AmdSmiLibraryException as e:
                raise _set_failure(e, f"Unable to set performance level {args.perf_level} on {gpu_string}") from e
            self.logger.store_output(args.gpu, 'perflevel', f"Successfully set performance level {args.perf_level}")

        if args.profile:
            self.logger.store_output(args.gpu, 'profile', "Not Yet Implemented")

        if isinstance(args.perf_determinism, int):
            try:
                amdsmi_interface.amdsmi_set_gpu_perf_determinism_mode(args.gpu, args.perf_determinism)
            except amdsmi_exception.AmdSmiLibraryException as e:
                raise _set_failure(e, f"Unable to set performance determinism and clock frequency to {args.perf_determinism} on {gpu_string}") from e
            self.logger.store_output(args.gpu, 'perfdeterminism', f"Successfully enabled performance determinism and set GFX clock frequency to {args.perf_determinism}")

        if args.compute_partition:
            compute_partition = amdsmi_interface.AmdSmiComputePartitionType[args.compute_partition]
            try:
                amdsmi_interface.amdsmi_set_gpu_compute_partition(args.gpu, compute_partition)
            except amdsmi_exception.AmdSmiLibraryException as e:
                raise _set_failure(e, f"Unable to set compute partition to {args.compute_partition} on {gpu_string}") from e
            self.logger.store_output(args.gpu, 'computepartition', f"Successfully set compute partition to {args.compute_partition}")

        if args.memory_partition:
            # NOTE(review): this lock is created per call, so it cannot serialize
            # against other callers; it is kept for parity and now released on
            # every exit path via the context manager.
            lock = multiprocessing.Lock()
            with lock:
                ####################################################################
                # Get current and available memory partition modes                 #
                # Info used if AMDSMI_STATUS_INVAL is caught & to set progress bar #
                ####################################################################
                try:
                    current_partition = amdsmi_interface.amdsmi_get_gpu_memory_partition(args.gpu)
                except amdsmi_exception.AmdSmiLibraryException as e:
                    current_partition = "N/A"
                    logging.debug("Failed to get current memory partition for GPU %s | %s", gpu_id, e.get_error_info())

                mem_caps_str = "N/A"
                try:
                    partition_dict = amdsmi_interface.amdsmi_get_gpu_accelerator_partition_profile(args.gpu)
                    temp_mem_caps = partition_dict['partition_profile']['memory_caps']
                    nps_flags = temp_mem_caps.amdsmi_nps_flags_t
                    if nps_flags is None:
                        # Decode the bit mask: bit 0 -> NPS1, bit 1 -> NPS2, bit 2 -> NPS4, bit 3 -> NPS8
                        cap_mask = temp_mem_caps.nps_cap_mask
                        mem_caps_list = [name for bit, name in enumerate(("NPS1", "NPS2", "NPS4", "NPS8"))
                                         if cap_mask & (1 << bit)]
                    else:
                        mem_caps_list = [name for name, flag in (("NPS1", nps_flags.nps1_cap),
                                                                 ("NPS2", nps_flags.nps2_cap),
                                                                 ("NPS4", nps_flags.nps4_cap),
                                                                 ("NPS8", nps_flags.nps8_cap))
                                         if flag == 1]
                    # Both decodings are rendered identically ("NPS1, NPS2"); empty means unknown
                    mem_caps_str = ", ".join(mem_caps_list) or "N/A"
                except amdsmi_exception.AmdSmiLibraryException as e:
                    logging.debug("Failed to get accelerator partition profile for GPU %s | %s", gpu_id, e.get_error_info())

                ###############################################################
                # memory partition set starts here                            #
                ###############################################################
                # Only show progress bar if
                #   1) Device can set memory partition modes
                #   2) Requested mode is a valid mode to set
                #   3) Current is not already the requested mode
                # otherwise function will return fast
                show_progress_bar = (str(current_partition) != "N/A"
                                     and str(args.memory_partition) in mem_caps_str
                                     and str(current_partition) != str(args.memory_partition))

                progress_procs = []
                k140secs = 140
                string_out = f"Updating memory partition for gpu {gpu_id}"
                retries_left = 1  # one retry after a driver restart error
                self.helpers.increment_set_count()
                if self.helpers.get_set_count() == 1:  # only show reload warning on 1st set
                    self.helpers.confirm_changing_memory_partition_gpu_reload_warning()

                while retries_left >= 0:
                    retries_left -= 1
                    try:
                        if show_progress_bar:
                            progress = multiprocessing.Process(target=self.helpers.showProgressbar,
                                                               args=(string_out, k140secs,))
                            progress_procs.append(progress)
                            progress.start()
                        requested_partition = amdsmi_interface.AmdSmiMemoryPartitionType[args.memory_partition]
                        amdsmi_interface.amdsmi_set_gpu_memory_partition(args.gpu, requested_partition)
                        for proc in progress_procs:
                            proc.terminate()
                        print("")
                        break  # successful case
                    except amdsmi_exception.AmdSmiLibraryException as e:
                        with open(os.devnull, 'w') as devnull:  # redirect to /dev/null (crossplatform)
                            print("\n\n", end='\r', flush=True, file=devnull)
                        for proc in progress_procs:
                            proc.terminate()
                        status = amdsmi_interface.amdsmi_wrapper
                        error_code = e.get_error_code()
                        if error_code == status.AMDSMI_STATUS_NO_PERM:
                            raise PermissionError('Command requires elevation') from e
                        if error_code == status.AMDSMI_STATUS_INVAL:
                            out = f"[AMDSMI_STATUS_INVAL] Unable to set memory partition to {args.memory_partition} on {gpu_string}"
                            print(f"Valid Memory partition Modes: {mem_caps_str}\n")
                            _report_and_reset('memory_partition', out)
                            return
                        if error_code == status.AMDSMI_STATUS_NOT_SUPPORTED:
                            out = f"[AMDSMI_STATUS_NOT_SUPPORTED] Device does not support setting memory partition to {args.memory_partition} on {gpu_string}"
                            _report_and_reset('memory_partition', out)
                            return
                        if error_code == status.AMDSMI_STATUS_AMDGPU_RESTART_ERR:
                            # Try again on a failure -> work around for not being able to close libdrm
                            string_out = f"Trying again - Updating memory partition for gpu {gpu_id}"
                            for proc in progress_procs:
                                proc.terminate()
                                proc.join()
                            if retries_left < 0:
                                out = f"[AMDSMI_STATUS_AMDGPU_RESTART_ERR] Could not successfully restart driver after applying {args.memory_partition} on {gpu_string}"
                                _report_and_reset('memory_partition', out)
                                return
                            continue
                        with open(os.devnull, 'w') as devnull:  # redirect to /dev/null (crossplatform)
                            print("\n\n", end='\r', flush=True, file=devnull)
                        out = f"Unable to set memory partition to {args.memory_partition} on {gpu_string}"
                        print(out)
                        _report_and_reset('memorypartition', out)
                        return
                    except Exception as e:
                        for proc in progress_procs:
                            proc.terminate()
                        out = f"Generic error found | Unable to set memory partition to {args.memory_partition} on {gpu_string}"
                        print(out)
                        raise ValueError(out) from e

                _report_and_reset('memory_partition', f"Successfully set memory partition to {args.memory_partition}")
                # A memory partition change ends the command; later options are not processed
                return

        if isinstance(args.power_cap, int):
            try:
                power_cap_info = amdsmi_interface.amdsmi_get_power_cap_info(args.gpu)
                logging.debug("Power cap info for gpu %s | %s", gpu_id, power_cap_info)
                min_power_cap = self.helpers.convert_SI_unit(power_cap_info["min_power_cap"], AMDSMIHelpers.SI_Unit.MICRO)
                max_power_cap = self.helpers.convert_SI_unit(power_cap_info["max_power_cap"], AMDSMIHelpers.SI_Unit.MICRO)
                current_power_cap = self.helpers.convert_SI_unit(power_cap_info["power_cap"], AMDSMIHelpers.SI_Unit.MICRO)
            except amdsmi_exception.AmdSmiLibraryException as e:
                raise ValueError(f"Unable to get power cap info from {gpu_string}") from e

            if args.power_cap == current_power_cap:
                self.logger.store_output(args.gpu, 'powercap', f"Power cap is already set to {args.power_cap}")
            elif min_power_cap <= args.power_cap <= max_power_cap:
                try:
                    new_power_cap = self.helpers.convert_SI_unit(args.power_cap, AMDSMIHelpers.SI_Unit.BASE,
                                                                 AMDSMIHelpers.SI_Unit.MICRO)
                    amdsmi_interface.amdsmi_set_power_cap(args.gpu, 0, new_power_cap)
                except amdsmi_exception.AmdSmiLibraryException as e:
                    raise _set_failure(e, f"Unable to set power cap to {args.power_cap} on {gpu_string}") from e
                self.logger.store_output(args.gpu, 'powercap', f"Successfully set power cap to {args.power_cap}")
            else:
                # setting power cap to 0 will return the current power cap so the technical minimum value is 1
                if min_power_cap == 0:
                    min_power_cap = 1
                self.logger.store_output(args.gpu, 'powercap', f"Power cap must be between {min_power_cap} and {max_power_cap}")

        if isinstance(args.soc_pstate, int):
            try:
                amdsmi_interface.amdsmi_set_soc_pstate(args.gpu, args.soc_pstate)
            except amdsmi_exception.AmdSmiLibraryException as e:
                raise _set_failure(e, f"Unable to set dpm soc pstate policy to {args.soc_pstate} on {gpu_string}") from e
            self.logger.store_output(args.gpu, 'socpstate', f"Successfully soc pstate dpm policy to id {args.soc_pstate}")

        if isinstance(args.xgmi_plpd, int):
            try:
                amdsmi_interface.amdsmi_set_xgmi_plpd(args.gpu, args.xgmi_plpd)
            except amdsmi_exception.AmdSmiLibraryException as e:
                raise _set_failure(e, f"Unable to set XGMI policy to {args.xgmi_plpd} on {gpu_string}") from e
            self.logger.store_output(args.gpu, 'xgmiplpd', f"Successfully set per-link power down policy to id {args.xgmi_plpd}")

        if isinstance(args.clk_level, tuple):
            clk_type = args.clk_level.clk_type
            perf_levels = args.clk_level.perf_levels
            clk_types = amdsmi_interface.AmdSmiClkType
            clk_type_conversion = {"sclk": clk_types.SYS,
                                   "mclk": clk_types.MEM,
                                   "pcie": clk_types.PCIE,
                                   "fclk": clk_types.DF,
                                   "socclk": clk_types.SOC}.get(clk_type, "N/A")
            # The supported level count is required to validate the request, so a
            # failed query is reported instead of falling through to an unbound name.
            try:
                frequencies = amdsmi_interface.amdsmi_get_clk_freq(args.gpu, clk_type_conversion)
                num_supported = frequencies['num_supported']
            except amdsmi_exception.AmdSmiLibraryException as e:
                raise ValueError(f"Unable to get supported {clk_type} performance levels from {gpu_string}") from e

            # convert perf_levels given to freq bit mask (bit N set == level N enabled)
            freq_mask = 0
            invalid_levels = []
            for level in perf_levels:
                if 0 <= level < num_supported:
                    freq_mask |= 1 << level  # OR keeps repeated levels from corrupting the mask
                else:
                    # cancel this instance since the level should not be possible
                    invalid_levels.append(level)

            if not invalid_levels:
                perf_levels_str = str(perf_levels).strip('[]')
                if clk_type.lower() == "pcie":
                    try:
                        amdsmi_interface.amdsmi_set_gpu_pci_bandwidth(args.gpu, freq_mask)
                    except amdsmi_exception.AmdSmiLibraryException as e:
                        raise _set_failure(e, f"Unable to set pcie bandwidth to perf level(s) {perf_levels_str} on {gpu_string}") from e
                else:
                    try:
                        amdsmi_interface.amdsmi_set_clk_freq(args.gpu, clk_type, freq_mask)
                    except amdsmi_exception.AmdSmiLibraryException as e:
                        raise _set_failure(e, f"Unable to set {clk_type} to perf level(s) {perf_levels_str} on {gpu_string}") from e
                self.logger.store_output(args.gpu, 'clk_level', f"Successfully changed {clk_type} perf level(s) to {perf_levels_str}")
            else:
                invalid_levels_str = str(invalid_levels).strip('[]')
                self.logger.store_output(args.gpu, 'clk_level', f"level(s) {invalid_levels_str} is/are greater than performance levels supported for device")

    # The remaining options are handled on bare metal and non-bare-metal systems alike
    if isinstance(args.clk_limit, tuple):
        clk_type = args.clk_limit.clk_type
        lim_type = args.clk_limit.lim_type
        val = args.clk_limit.val
        val_changed = True  # Assume Clock limit value is changed
        # Validate the value against the extremum; a failed query only skips validation
        try:
            # Parser only allows two options sclk or mclk
            if clk_type == "sclk":
                amdsmi_clk_type = amdsmi_interface.AmdSmiClkType.GFX
            elif clk_type == "mclk":
                amdsmi_clk_type = amdsmi_interface.AmdSmiClkType.MEM
            clk_tuple = amdsmi_interface.amdsmi_get_clock_info(args.gpu, amdsmi_clk_type)
            if lim_type == "min":
                if val > clk_tuple['max_clk']:
                    raise IndexError("cannot set min value greater than max")
                if val == clk_tuple['min_clk']:
                    val_changed = False  # Clock limit value did not change
            if lim_type == "max":
                if val < clk_tuple['min_clk']:
                    raise IndexError("cannot set max value less than min")
                if val == clk_tuple['max_clk']:
                    val_changed = False  # Clock limit value did not change
        except amdsmi_exception.AmdSmiLibraryException as e:
            logging.debug("Failed to get clock extremum info for gpu %s | %s", gpu_id, e.get_error_info())

        # Set the value
        if val_changed:
            try:
                amdsmi_interface.amdsmi_set_gpu_clk_limit(args.gpu, clk_type, lim_type, val)
            except amdsmi_exception.AmdSmiLibraryException as e:
                raise _set_failure(e, f"Unable to set {args.clk_limit.lim_type} of {args.clk_limit.clk_type} to {args.clk_limit.val} on {gpu_string}") from e
            self.logger.store_output(args.gpu, 'clk_limit', f"Successfully changed {args.clk_limit.lim_type} of {args.clk_limit.clk_type} to {args.clk_limit.val}")
        else:
            self.logger.store_output(args.gpu, 'clk_limit', f"Clock limit is already set to {args.clk_limit.val}")

    if isinstance(args.process_isolation, int):
        status_string = "Enabled" if args.process_isolation else "Disabled"
        result = f"Requested process isolation to {status_string}"  # This should not print out
        try:
            current_status = amdsmi_interface.amdsmi_get_gpu_process_isolation(args.gpu)
            if current_status == args.process_isolation:
                result = f"Process isolation is already {status_string}"
            else:
                amdsmi_interface.amdsmi_set_gpu_process_isolation(args.gpu, args.process_isolation)
                result = f"Successfully set process isolation to {status_string}"
        except amdsmi_exception.AmdSmiLibraryException as e:
            raise _set_failure(e, f"Unable to set process isolation to {status_string} on {gpu_string}") from e
        self.logger.store_output(args.gpu, 'process_isolation', result)

    if multiple_devices:
        self.logger.store_multiple_device_output()
        return  # Skip printing when there are multiple devices
    self.logger.print_output()
2023-03-06 06:20:21 -06:00
def set_value(self, args, multiple_devices=False, gpu=None, fan=None, perf_level=None,
              profile=None, perf_determinism=None, compute_partition=None,
              memory_partition=None, power_cap=None,
              cpu=None, cpu_pwr_limit=None, cpu_xgmi_link_width=None, cpu_lclk_dpm_level=None,
              cpu_pwr_eff_mode=None, cpu_gmi3_link_width=None, cpu_pcie_link_rate=None,
              cpu_df_pstate_range=None, cpu_enable_apb=None, cpu_disable_apb=None,
              soc_boost_limit=None, core=None, core_boost_limit=None, soc_pstate=None, xgmi_plpd=None,
              process_isolation=None, clk_limit=None, clk_level=None):
    """Issue set commands to the target gpu(s), cpu(s), or core(s)

    Validates that options from exactly one device family (GPU, CPU, or core)
    were supplied, then forwards the call to set_gpu, set_cpu, and/or set_core
    depending on which drivers are initialized and which targets are present.
    Only gpu, cpu, and core are written onto args here; every other override is
    passed through untouched to the applicable set_* function.

    Args:
        args (Namespace): Namespace containing the parsed CLI args
        multiple_devices (bool, optional): True if checking for multiple devices. Defaults to False.
        gpu (device_handle, optional): device_handle for target device. Defaults to None.
        fan (int, optional): Value override for args.fan. Defaults to None.
        perf_level (amdsmi_interface.AmdSmiDevPerfLevel, optional): Value override for args.perf_level. Defaults to None.
        profile (bool, optional): Value override for args.profile. Defaults to None.
        perf_determinism (int, optional): Value override for args.perf_determinism. Defaults to None.
        compute_partition (amdsmi_interface.AmdSmiComputePartitionType, optional): Value override for args.compute_partition. Defaults to None.
        memory_partition (amdsmi_interface.AmdSmiMemoryPartitionType, optional): Value override for args.memory_partition. Defaults to None.
        power_cap (int, optional): Value override for args.power_cap. Defaults to None.
        cpu (cpu_handle, optional): device_handle for target device. Defaults to None.
        cpu_pwr_limit (int, optional): Value override for args.cpu_pwr_limit. Defaults to None.
        cpu_xgmi_link_width (List[int], optional): Value override for args.cpu_xgmi_link_width. Defaults to None.
        cpu_lclk_dpm_level (List[int], optional): Value override for args.cpu_lclk_dpm_level. Defaults to None.
        cpu_pwr_eff_mode (int, optional): Value override for args.cpu_pwr_eff_mode. Defaults to None.
        cpu_gmi3_link_width (List[int], optional): Value override for args.cpu_gmi3_link_width. Defaults to None.
        cpu_pcie_link_rate (int, optional): Value override for args.cpu_pcie_link_rate. Defaults to None.
        cpu_df_pstate_range (List[int], optional): Value override for args.cpu_df_pstate_range. Defaults to None.
        cpu_enable_apb (bool, optional): Value override for args.cpu_enable_apb. Defaults to None.
        cpu_disable_apb (int, optional): Value override for args.cpu_disable_apb. Defaults to None.
        soc_boost_limit (int, optional): Value override for args.soc_boost_limit. Defaults to None.
        core (device_handle, optional): device_handle for target core. Defaults to None.
        core_boost_limit (int, optional): Value override for args.core_boost_limit. Defaults to None
        soc_pstate (int, optional): Value override for args.soc_pstate. Defaults to None.
        xgmi_plpd (int, optional): Value override for args.xgmi_plpd. Defaults to None.
        process_isolation (int, optional): Value override for args.process_isolation. Defaults to None.
        clk_limit (optional): Forwarded unchanged to set_gpu. Defaults to None.
        clk_level (optional): Forwarded unchanged to set_gpu. Defaults to None.

    Raises:
        AmdSmiRequiredCommandException: On bare metal, when no GPU target and no
            GPU/CPU/core option was supplied; off bare metal, when
            args.clean_local_data is falsy.
        ValueError: When no option family is set, when more than one family is
            set, or when the required device target is missing.

    Return:
        Nothing
    """
    # These are the only args checked at this point, the other args will be passed
    # in through the applicable function set_gpu, set_cpu, or set_core function
    if gpu:
        args.gpu = gpu
    if cpu:
        args.cpu = cpu
    if core:
        args.core = core

    def any_supplied(names, ignore_false=False):
        """Return True if any named attribute exists on args and holds a supplied value."""
        for name in names:
            value = getattr(args, name, None)
            # Identity checks on purpose: 0 is a legitimate value and must not be
            # mistaken for "unset" (0 == False, so a membership test would drop it).
            if value is None or (ignore_false and value is False):
                continue
            return True
        return False

    # Check if a GPU argument has been set
    gpu_args_enabled = any_supplied(
        ["fan", "perf_level", "profile", "perf_determinism", "compute_partition",
         "memory_partition", "power_cap", "soc_pstate", "xgmi_plpd",
         "process_isolation", "clk_limit", "clk_level"])

    # Check if a CPU argument has been set (a literal False also counts as unset here)
    cpu_args_enabled = any_supplied(
        ["cpu_pwr_limit", "cpu_xgmi_link_width", "cpu_lclk_dpm_level", "cpu_pwr_eff_mode",
         "cpu_gmi3_link_width", "cpu_pcie_link_rate", "cpu_df_pstate_range",
         "cpu_enable_apb", "cpu_disable_apb", "soc_boost_limit"],
        ignore_false=True)

    # Check if a Core argument has been set
    core_args_enabled = any_supplied(["core_boost_limit"])

    # Error if no subcommand args are passed
    if self.helpers.is_baremetal():
        # Reuse the flags above so zero-valued options, clk_level, and CPU/core-only
        # invocations are recognised instead of being rejected as "nothing requested".
        if not (args.gpu or gpu_args_enabled or cpu_args_enabled or core_args_enabled):
            command = " ".join(sys.argv[1:])
            raise AmdSmiRequiredCommandException(command, self.logger.format)
    elif not args.clean_local_data:
        # NOTE(review): clean_local_data is a `reset` option; this branch looks copied
        # from reset(). Behaviour is kept as-is - confirm which option is intended here.
        command = " ".join(sys.argv[1:])
        raise AmdSmiRequiredCommandException(command, self.logger.format)

    # Only allow one device's arguments to be set at a time
    families_enabled = sum([gpu_args_enabled, cpu_args_enabled, core_args_enabled])
    if families_enabled == 0:
        raise ValueError('No GPU, CPU, or CORE arguments provided, specific arguments are needed')
    if families_enabled == 3:
        raise ValueError('Cannot set GPU, CPU, and CORE arguments at the same time')
    if families_enabled == 2:
        raise ValueError('Cannot set GPU, CPU, or CORE arguments at the same time')

    # Handle CPU and GPU intialization cases
    if self.helpers.is_amd_hsmp_initialized() and self.helpers.is_amdgpu_initialized():
        # With both drivers present an explicit target is mandatory
        if args.cpu is None and args.gpu is None and args.core is None:
            raise ValueError('No GPU, CPU, or CORE provided, specific target(s) are needed')
        if args.cpu:
            self.set_cpu(args, multiple_devices, cpu, cpu_pwr_limit,
                         cpu_xgmi_link_width, cpu_lclk_dpm_level, cpu_pwr_eff_mode,
                         cpu_gmi3_link_width, cpu_pcie_link_rate, cpu_df_pstate_range,
                         cpu_enable_apb, cpu_disable_apb, soc_boost_limit)
        if args.core:
            # Start from a clean logger so the previous handler's output is not repeated
            self.logger.output = {}
            self.logger.clear_multiple_devices_ouput()
            self.set_core(args, multiple_devices, core, core_boost_limit)
        if args.gpu:
            self.logger.output = {}
            self.logger.clear_multiple_devices_ouput()
            self.set_gpu(args, multiple_devices, gpu, fan, perf_level,
                         profile, perf_determinism, compute_partition,
                         memory_partition, power_cap, soc_pstate, xgmi_plpd,
                         process_isolation, clk_limit, clk_level)
    elif self.helpers.is_amd_hsmp_initialized():  # Only CPU is initialized
        if args.cpu is None and args.core is None:
            raise ValueError('No CPU or CORE provided, specific target(s) are needed')
        if args.cpu:
            self.set_cpu(args, multiple_devices, cpu, cpu_pwr_limit,
                         cpu_xgmi_link_width, cpu_lclk_dpm_level, cpu_pwr_eff_mode,
                         cpu_gmi3_link_width, cpu_pcie_link_rate, cpu_df_pstate_range,
                         cpu_enable_apb, cpu_disable_apb, soc_boost_limit)
        if args.core:
            self.logger.output = {}
            self.logger.clear_multiple_devices_ouput()
            self.set_core(args, multiple_devices, core, core_boost_limit)
    elif self.helpers.is_amdgpu_initialized():  # Only GPU is initialized
        # No explicit GPU means "apply to every known device"
        if args.gpu is None:
            args.gpu = self.device_handles
        self.logger.clear_multiple_devices_ouput()
        self.set_gpu(args, multiple_devices, gpu, fan, perf_level,
                     profile, perf_determinism, compute_partition,
                     memory_partition, power_cap, soc_pstate, xgmi_plpd,
                     process_isolation, clk_limit, clk_level)
2023-03-06 06:20:21 -06:00
def reset(self, args, multiple_devices=False, gpu=None, gpureset=None,
          clocks=None, fans=None, profile=None, xgmierr=None, perf_determinism=None,
          power_cap=None, clean_local_data=None):
    """Issue reset commands to target gpu(s)

    Each selected reset is attempted independently and its outcome is stored on
    the logger under its own key. On bare metal every reset type is honoured;
    otherwise only clean_local_data is acted upon.

    Args:
        args (Namespace): Namespace containing the parsed CLI args
        multiple_devices (bool, optional): True if checking for multiple devices. Defaults to False.
        gpu (device_handle, optional): device_handle for target device. Defaults to None.
        gpureset (bool, optional): Value override for args.gpureset. Defaults to None.
        clocks (bool, optional): Value override for args.clocks. Defaults to None.
        fans (bool, optional): Value override for args.fans. Defaults to None.
        profile (bool, optional): Value override for args.profile. Defaults to None.
        xgmierr (bool, optional): Value override for args.xgmierr. Defaults to None.
        perf_determinism (bool, optional): Value override for args.perf_determinism. Defaults to None.
        power_cap (bool, optional): Value override for args.power_cap. Defaults to None.
        clean_local_data (bool, optional): Value override for args.clean_local_data. Defaults to None.

    Raises:
        AmdSmiRequiredCommandException: No applicable reset option was selected.
        PermissionError: The library reported AMDSMI_STATUS_NO_PERM for a reset call.
        ValueError: Power cap info could not be read, or resetting the power cap /
            cleaning local data failed for a reason other than permissions.
        IndexError: Presumably raised by helpers.handle_gpus for an empty gpu list
            (not visible in this method - confirm).

    Return:
        Nothing
    """
    # Set args.* to passed in arguments; falsy overrides leave args untouched
    overrides = {
        'gpu': gpu,
        'gpureset': gpureset,
        'clocks': clocks,
        'fans': fans,
        'profile': profile,
        'xgmierr': xgmierr,
        'perf_determinism': perf_determinism,
        'power_cap': power_cap,
        'clean_local_data': clean_local_data,
    }
    for name, value in overrides.items():
        if value:
            setattr(args, name, value)

    # Handle No GPU passed
    if args.gpu is None:
        args.gpu = self.device_handles

    # Handle multiple GPUs
    handled_multiple_gpus, device_handle = self.helpers.handle_gpus(args, self.logger, self.reset)
    if handled_multiple_gpus:
        return  # This function is recursive
    args.gpu = device_handle

    # Get gpu_id for logging
    gpu_id = self.helpers.get_gpu_id_from_device_handle(args.gpu)
    baremetal = self.helpers.is_baremetal()

    # Error if no subcommand args are passed
    if baremetal:
        requested = [args.gpureset, args.clocks, args.fans, args.profile, args.xgmierr,
                     args.perf_determinism, args.power_cap, args.clean_local_data]
    else:
        requested = [args.clean_local_data]
    if not any(requested):
        command = " ".join(sys.argv[1:])
        raise AmdSmiRequiredCommandException(command, self.logger.format)

    def attempt(action, success, failure_log, failure="N/A"):
        """Run one library call; return `success`, or `failure` after logging the error.

        A NO_PERM status is escalated to PermissionError instead of being reported
        as a failed reset.
        """
        try:
            action()
        except amdsmi_exception.AmdSmiLibraryException as e:
            if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM:
                raise PermissionError('Command requires elevation') from e
            logging.debug(failure_log, gpu_id, e.get_error_info())
            return failure
        return success

    def perf_level_auto():
        """Set the performance level back to AUTO (shared by several resets)."""
        amdsmi_interface.amdsmi_set_gpu_perf_level(args.gpu, amdsmi_interface.AmdSmiDevPerfLevel.AUTO)

    if baremetal:
        if args.gpureset:
            if self.helpers.is_amd_device(args.gpu):
                # The failure reason is logged at debug level like every other reset
                result = attempt(lambda: amdsmi_interface.amdsmi_reset_gpu(args.gpu),
                                 'Successfully reset GPU',
                                 "Failed to reset gpu %s | %s",
                                 failure="Failed to reset GPU")
            else:
                result = 'Unable to reset non-amd GPU'
            self.logger.store_output(args.gpu, 'gpu_reset', result)

        if args.clocks:
            # 'clocks' and 'performance' both issue the AUTO perf-level call
            reset_clocks_results = {
                'overdrive': attempt(
                    lambda: amdsmi_interface.amdsmi_set_gpu_overdrive_level(args.gpu, 0),
                    'Overdrive set to 0',
                    "Failed to reset overdrive on gpu %s | %s"),
                'clocks': attempt(
                    perf_level_auto,
                    'Successfully reset clocks',
                    "Failed to reset perf level on gpu %s | %s"),
                'performance': attempt(
                    perf_level_auto,
                    'Performance level reset to auto',
                    "Failed to reset perf level on gpu %s | %s"),
            }
            self.logger.store_output(args.gpu, 'reset_clocks', reset_clocks_results)

        if args.fans:
            result = attempt(lambda: amdsmi_interface.amdsmi_reset_gpu_fan(args.gpu, 0),
                             'Successfully reset fan speed to driver control',
                             "Failed to reset fans on gpu %s | %s")
            self.logger.store_output(args.gpu, 'reset_fans', result)

        if args.profile:
            reset_profile_results = {'power_profile': 'N/A',
                                     'performance_level': 'N/A'}
            reset_profile_results['power_profile'] = attempt(
                lambda: amdsmi_interface.amdsmi_set_gpu_power_profile(
                    args.gpu, 0, amdsmi_interface.AmdSmiPowerProfilePresetMasks.BOOTUP_DEFAULT),
                'Successfully reset Power Profile',
                "Failed to reset power profile on gpu %s | %s")
            reset_profile_results['performance_level'] = attempt(
                perf_level_auto,
                'Successfully reset Performance Level',
                "Failed to reset perf level on gpu %s | %s")
            self.logger.store_output(args.gpu, 'reset_profile', reset_profile_results)

        if args.xgmierr:
            result = attempt(lambda: amdsmi_interface.amdsmi_reset_gpu_xgmi_error(args.gpu),
                             'Successfully reset XGMI Error count',
                             "Failed to reset xgmi error count on gpu %s | %s")
            self.logger.store_output(args.gpu, 'reset_xgmi_err', result)

        if args.perf_determinism:
            # Determinism is switched off by returning the perf level to AUTO
            result = attempt(perf_level_auto,
                             'Successfully disabled performance determinism',
                             "Failed to set perf level on gpu %s | %s")
            self.logger.store_output(args.gpu, 'reset_perf_determinism', result)

        if args.power_cap:
            try:
                power_cap_info = amdsmi_interface.amdsmi_get_power_cap_info(args.gpu)
                logging.debug(f"Power cap info for gpu {gpu_id} | {power_cap_info}")
                # Library values are converted from the MICRO scale for comparison/display
                default_power_cap_in_w = self.helpers.convert_SI_unit(
                    power_cap_info["default_power_cap"], AMDSMIHelpers.SI_Unit.MICRO)
                current_power_cap_in_w = self.helpers.convert_SI_unit(
                    power_cap_info["power_cap"], AMDSMIHelpers.SI_Unit.MICRO)
            except amdsmi_exception.AmdSmiLibraryException as e:
                raise ValueError(f"Unable to get power cap info from {gpu_id}") from e

            if current_power_cap_in_w == default_power_cap_in_w:
                self.logger.store_output(args.gpu, 'powercap', f"Power cap is already set to {default_power_cap_in_w}")
            else:
                try:
                    # Convert back to the MICRO scale before handing the value to the library
                    default_power_cap_in_uw = self.helpers.convert_SI_unit(default_power_cap_in_w,
                                                                           AMDSMIHelpers.SI_Unit.BASE,
                                                                           AMDSMIHelpers.SI_Unit.MICRO)
                    amdsmi_interface.amdsmi_set_power_cap(args.gpu, 0, default_power_cap_in_uw)
                except amdsmi_exception.AmdSmiLibraryException as e:
                    if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM:
                        raise PermissionError('Command requires elevation') from e
                    raise ValueError(f"Unable to reset power cap to {default_power_cap_in_w} on GPU {gpu_id}") from e
                self.logger.store_output(args.gpu, 'powercap', f"Successfully set power cap to {default_power_cap_in_w}")

    # Not gated on bare metal: it is the only option the non-baremetal guard accepts
    if args.clean_local_data:
        try:
            amdsmi_interface.amdsmi_clean_gpu_local_data(args.gpu)
            result = 'Successfully clean GPU local data'
        except amdsmi_exception.AmdSmiLibraryException as e:
            if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM:
                raise PermissionError('Command requires elevation') from e
            raise ValueError(f"Unable to clean local data on GPU {gpu_id}") from e
        self.logger.store_output(args.gpu, 'clean_local_data', result)

    if multiple_devices:
        self.logger.store_multiple_device_output()
        return  # Skip printing when there are multiple devices
    self.logger.print_output()
2023-12-06 16:57:26 -06:00
def monitor(self, args, multiple_devices=False, watching_output=False, gpu=None,
watch=None, watch_time=None, iterations=None, power_usage=None,
temperature=None, gfx_util=None, mem_util=None, encoder=None, decoder=None,
2024-05-21 20:30:16 -05:00
ecc=None, vram_usage=None, pcie=None, process=None, violation=None):
2023-12-06 16:57:26 -06:00
""" Populate a table with each GPU as an index to rows of targeted data
Args:
args (Namespace): Namespace containing the parsed CLI args
multiple_devices (bool, optional): True if checking for multiple devices. Defaults to False.
gpu (device_handle, optional): device_handle for target device. Defaults to None.
watch (bool, optional): Value override for args.watch. Defaults to None.
watch_time (int, optional): Value override for args.watch_time. Defaults to None.
iterations (int, optional): Value override for args.iterations. Defaults to None.
power_usage (bool, optional): Value override for args.power_usage. Defaults to None.
temperature (bool, optional): Value override for args.temperature. Defaults to None.
gfx (bool, optional): Value override for args.gfx. Defaults to None.
mem (bool, optional): Value override for args.mem. Defaults to None.
encoder (bool, optional): Value override for args.encoder. Defaults to None.
decoder (bool, optional): Value override for args.decoder. Defaults to None.
ecc (bool, optional): Value override for args.ecc. Defaults to None.
vram_usage (bool, optional): Value override for args.vram_usage. Defaults to None.
pcie (bool, optional): Value override for args.pcie. Defaults to None.
process (bool, optional): Value override for args.process. Defaults to None.
2024-05-21 20:30:16 -05:00
violation (bool, optional): Value override for args.violation. Defaults to None.
2023-12-06 16:57:26 -06:00
Raises:
ValueError: Value error if no gpu value is provided
IndexError: Index error if gpu list is empty
Return:
Nothing
"""
# Set args.* to passed in arguments
if gpu:
args.gpu = gpu
if watch:
args.watch = watch
if watch_time:
args.watch_time = watch_time
if iterations:
args.iterations = iterations
# monitor args
if power_usage:
args.power_usage = power_usage
if temperature:
args.temperature = temperature
if gfx_util:
args.gfx = gfx_util
if mem_util:
args.mem = mem_util
if encoder:
args.encoder = encoder
if decoder:
args.decoder = decoder
if ecc:
args.ecc = ecc
if vram_usage:
args.vram_usage = vram_usage
if pcie:
args.pcie = pcie
if process:
args.process = process
2024-05-21 20:30:16 -05:00
if violation:
args.violation = violation
2023-12-06 16:57:26 -06:00
# Handle No GPU passed
if args.gpu == None:
args.gpu = self.device_handles
# If all arguments are False, the print all values
# Don't include process in this logic as it's an optional edge case
2023-12-06 16:57:26 -06:00
if not any([args.power_usage, args.temperature, args.gfx, args.mem,
2024-06-06 16:17:49 -05:00
args.encoder, args.decoder, args.ecc,
2024-05-21 20:30:16 -05:00
args.vram_usage, args.pcie, args.violation]):
2023-12-06 16:57:26 -06:00
args.power_usage = args.temperature = args.gfx = args.mem = \
2024-06-06 16:17:49 -05:00
args.encoder = args.decoder = args.ecc = \
2024-05-21 20:30:16 -05:00
args.vram_usage = args.pcie = args.violation = True
2023-12-06 16:57:26 -06:00
# Handle watch logic, will only enter this block once
if args.watch:
self.helpers.handle_watch(args=args, subcommand=self.monitor, logger=self.logger)
return
# Handle multiple GPUs
if isinstance(args.gpu, list):
if len(args.gpu) > 1:
# Deepcopy gpus as recursion will destroy the gpu list
stored_gpus = []
for gpu in args.gpu:
stored_gpus.append(gpu)
# Store output from multiple devices without printing to console
2023-12-06 16:57:26 -06:00
for device_handle in args.gpu:
self.monitor(args, multiple_devices=True, watching_output=watching_output, gpu=device_handle)
# Reload original gpus
args.gpu = stored_gpus
dual_csv_output = False
if args.process:
if self.logger.is_csv_format():
dual_csv_output = True
# Flush the output
self.logger.print_output(multiple_device_enabled=True,
watching_output=watching_output,
tabular=True,
dual_csv_output=dual_csv_output)
2023-12-06 16:57:26 -06:00
# Add output to total watch output and clear multiple device output
if watching_output:
self.logger.store_watch_output(multiple_device_enabled=True)
return
elif len(args.gpu) == 1:
args.gpu = args.gpu[0]
else:
raise IndexError("args.gpu should not be an empty list")
monitor_values = {}
# Get gpu_id for logging
gpu_id = self.helpers.get_gpu_id_from_device_handle(args.gpu)
# Reset the table header and store the timestamp if watch output is enabled
self.logger.table_header = 'GPU'
2023-12-06 16:57:26 -06:00
if watching_output:
self.logger.store_output(args.gpu, 'timestamp', int(time.time()))
self.logger.table_header = 'TIMESTAMP'.rjust(10) + ' ' + self.logger.table_header
2023-12-06 16:57:26 -06:00
if args.loglevel == "DEBUG":
try:
# Get GPU Metrics table version
gpu_metric_version_info = amdsmi_interface.amdsmi_get_gpu_metrics_header_info(args.gpu)
gpu_metric_version_str = json.dumps(gpu_metric_version_info, indent=4)
logging.debug("GPU Metrics table Version for GPU %s | %s", gpu_id, gpu_metric_version_str)
except amdsmi_exception.AmdSmiLibraryException as e:
logging.debug("Unable to load GPU Metrics table version for %s | %s", gpu_id, e.err_info)
try:
# Get GPU Metrics table
gpu_metric_debug_info = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)
gpu_metric_str = json.dumps(gpu_metric_debug_info, indent=4)
logging.debug("GPU Metrics table for GPU %s | %s", gpu_id, str(gpu_metric_str))
except amdsmi_exception.AmdSmiLibraryException as e:
logging.debug("Unable to load GPU Metrics table for %s | %s", gpu_id, e.err_info)
# Store the pcie_bw values due to possible increase in bandwidth due to repeated gpu_metrics calls
if args.pcie:
try:
pcie_info = amdsmi_interface.amdsmi_get_pcie_info(args.gpu)['pcie_metric']
except amdsmi_exception.AmdSmiLibraryException as e:
pcie_info = "N/A"
logging.debug("Failed to get pci bandwidth on gpu %s | %s", gpu_id, e.get_error_info())
# Resume regular ordering of values
2023-12-06 16:57:26 -06:00
if args.power_usage:
try:
gpu_metrics_info = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)
2024-01-30 20:15:11 -06:00
if gpu_metrics_info['current_socket_power'] != "N/A":
monitor_values['power_usage'] = gpu_metrics_info['current_socket_power']
else: # Fallback to average_socket_power for older gpu_metrics versions
monitor_values['power_usage'] = gpu_metrics_info['average_socket_power']
power_unit = 'W'
2023-12-06 16:57:26 -06:00
if self.logger.is_human_readable_format() and monitor_values['power_usage'] != "N/A":
monitor_values['power_usage'] = f"{monitor_values['power_usage']} {power_unit}"
if self.logger.is_json_format() and monitor_values['power_usage'] != "N/A":
monitor_values['power_usage'] = {"value" : monitor_values['power_usage'],
"unit" : power_unit}
2023-12-06 16:57:26 -06:00
except amdsmi_exception.AmdSmiLibraryException as e:
monitor_values['power_usage'] = "N/A"
logging.debug("Failed to get power usage on gpu %s | %s", gpu_id, e.get_error_info())
self.logger.table_header += 'POWER'.rjust(7)
if args.temperature:
try:
temperature = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['temperature_hotspot']
monitor_values['hotspot_temperature'] = temperature
except amdsmi_exception.AmdSmiLibraryException as e:
monitor_values['hotspot_temperature'] = "N/A"
logging.debug("Failed to get hotspot temperature on gpu %s | %s", gpu_id, e.get_error_info())
try:
temperature = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['temperature_mem']
2023-12-06 16:57:26 -06:00
monitor_values['memory_temperature'] = temperature
except amdsmi_exception.AmdSmiLibraryException as e:
monitor_values['memory_temperature'] = "N/A"
logging.debug("Failed to get memory temperature on gpu %s | %s", gpu_id, e.get_error_info())
temp_unit_human_readable = '\N{DEGREE SIGN}C'
temp_unit_json = 'C'
if monitor_values['hotspot_temperature'] != "N/A":
if self.logger.is_human_readable_format():
monitor_values['hotspot_temperature'] = f"{monitor_values['hotspot_temperature']} {temp_unit_human_readable}"
if self.logger.is_json_format():
monitor_values['hotspot_temperature'] = {"value" : monitor_values['hotspot_temperature'],
"unit" : temp_unit_json}
if monitor_values['memory_temperature'] != "N/A":
if self.logger.is_human_readable_format():
monitor_values['memory_temperature'] = f"{monitor_values['memory_temperature']} {temp_unit_human_readable}"
if self.logger.is_json_format():
monitor_values['memory_temperature'] = {"value" : monitor_values['memory_temperature'],
"unit" : temp_unit_json}
2023-12-06 16:57:26 -06:00
self.logger.table_header += 'GPU_TEMP'.rjust(10)
self.logger.table_header += 'MEM_TEMP'.rjust(10)
if args.gfx:
try:
gfx_util = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['average_gfx_activity']
2023-12-06 16:57:26 -06:00
monitor_values['gfx'] = gfx_util
activity_unit = '%'
if gfx_util != "N/A":
if self.logger.is_human_readable_format():
monitor_values['gfx'] = f"{monitor_values['gfx']} {activity_unit}"
if self.logger.is_json_format():
monitor_values['gfx'] = {"value" : monitor_values['gfx'],
"unit" : activity_unit}
2023-12-06 16:57:26 -06:00
except amdsmi_exception.AmdSmiLibraryException as e:
monitor_values['gfx'] = "N/A"
logging.debug("Failed to get gfx utilization on gpu %s | %s", gpu_id, e.get_error_info())
self.logger.table_header += 'GFX_UTIL'.rjust(10)
try:
gfx_clock = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['current_gfxclk']
monitor_values['gfx_clock'] = gfx_clock
freq_unit = 'MHz'
if gfx_clock != "N/A":
if self.logger.is_human_readable_format():
monitor_values['gfx_clock'] = f"{monitor_values['gfx_clock']} {freq_unit}"
if self.logger.is_json_format():
monitor_values['gfx_clock'] = {"value" : monitor_values['gfx_clock'],
"unit" : freq_unit}
2023-12-06 16:57:26 -06:00
except amdsmi_exception.AmdSmiLibraryException as e:
monitor_values['gfx_clock'] = "N/A"
logging.debug("Failed to get gfx clock on gpu %s | %s", gpu_id, e.get_error_info())
self.logger.table_header += 'GFX_CLOCK'.rjust(11)
if args.mem:
try:
mem_util = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['average_umc_activity']
2023-12-06 16:57:26 -06:00
monitor_values['mem'] = mem_util
activity_unit = '%'
if mem_util != "N/A":
if self.logger.is_human_readable_format():
monitor_values['mem'] = f"{monitor_values['mem']} {activity_unit}"
if self.logger.is_json_format():
monitor_values['mem'] = {"value" : monitor_values['mem'],
"unit" : activity_unit}
2023-12-06 16:57:26 -06:00
except amdsmi_exception.AmdSmiLibraryException as e:
monitor_values['mem'] = "N/A"
logging.debug("Failed to get mem utilization on gpu %s | %s", gpu_id, e.get_error_info())
self.logger.table_header += 'MEM_UTIL'.rjust(10)
try:
mem_clock = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['current_uclk']
monitor_values['mem_clock'] = mem_clock
freq_unit = 'MHz'
if mem_clock != "N/A":
if self.logger.is_human_readable_format():
monitor_values['mem_clock'] = f"{monitor_values['mem_clock']} {freq_unit}"
if self.logger.is_json_format():
monitor_values['mem_clock'] = {"value" : monitor_values['mem_clock'],
"unit" : freq_unit}
2023-12-06 16:57:26 -06:00
except amdsmi_exception.AmdSmiLibraryException as e:
monitor_values['mem_clock'] = "N/A"
logging.debug("Failed to get mem clock on gpu %s | %s", gpu_id, e.get_error_info())
self.logger.table_header += 'MEM_CLOCK'.rjust(11)
if args.encoder:
# TODO: The encoding utilization is in progress for Navi. Note: MI3x ASICs only support decoding.
2023-12-06 16:57:26 -06:00
try:
# Get List of vcn activity values
encoder_util = "N/A" # Not yet implemented
2023-12-06 16:57:26 -06:00
encoding_activity_avg = []
for value in encoder_util:
2024-01-30 20:15:11 -06:00
if isinstance(value, int):
2023-12-06 16:57:26 -06:00
encoding_activity_avg.append(value)
2024-01-30 20:15:11 -06:00
2023-12-06 16:57:26 -06:00
# Averaging the possible encoding activity values
if encoding_activity_avg:
encoding_activity_avg = sum(encoding_activity_avg) / len(encoding_activity_avg)
else:
encoding_activity_avg = "N/A"
2024-01-30 20:15:11 -06:00
2023-12-06 16:57:26 -06:00
monitor_values['encoder'] = encoding_activity_avg
activity_unit = '%'
if monitor_values['encoder'] != "N/A":
if self.logger.is_human_readable_format():
monitor_values['encoder'] = f"{monitor_values['encoder']} {activity_unit}"
if self.logger.is_json_format():
monitor_values['encoder'] = {"value" : monitor_values['encoder'],
"unit" : activity_unit}
2023-12-06 16:57:26 -06:00
except amdsmi_exception.AmdSmiLibraryException as e:
monitor_values['encoder'] = "N/A"
logging.debug("Failed to get encoder utilization on gpu %s | %s", gpu_id, e.get_error_info())
self.logger.table_header += 'ENC_UTIL'.rjust(10)
if args.decoder:
2023-12-06 16:57:26 -06:00
try:
# Get List of vcn activity values
# Note: MI3x ASICs only support decoding, so the vcn_activity is used for decoding activity.
decoder_util = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['vcn_activity']
decoding_activity_avg = []
for value in decoder_util:
if isinstance(value, int):
decoding_activity_avg.append(value)
# Averaging the possible decoding activity values
if decoding_activity_avg:
decoding_activity_avg = sum(decoding_activity_avg) / len(decoding_activity_avg)
else:
decoding_activity_avg = "N/A"
monitor_values['decoder'] = decoding_activity_avg
activity_unit = '%'
if monitor_values['decoder'] != "N/A":
if self.logger.is_human_readable_format():
monitor_values['decoder'] = f"{monitor_values['decoder']} {activity_unit}"
if self.logger.is_json_format():
monitor_values['decoder'] = {"value" : monitor_values['decoder'],
"unit" : activity_unit}
2023-12-06 16:57:26 -06:00
except amdsmi_exception.AmdSmiLibraryException as e:
monitor_values['decoder'] = "N/A"
logging.debug("Failed to get decoder utilization on gpu %s | %s", gpu_id, e.get_error_info())
self.logger.table_header += 'DEC_UTIL'.rjust(10)
if args.encoder or args.decoder:
try:
vclock = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['current_vclk0']
monitor_values['vclock'] = vclock
freq_unit = 'MHz'
if vclock != "N/A":
if self.logger.is_human_readable_format():
monitor_values['vclock'] = f"{monitor_values['vclock']} {freq_unit}"
if self.logger.is_json_format():
monitor_values['vclock'] = {"value" : monitor_values['vclock'],
"unit" : freq_unit}
except amdsmi_exception.AmdSmiLibraryException as e:
monitor_values['vclock'] = "N/A"
logging.debug("Failed to get dclock on gpu %s | %s", gpu_id, e.get_error_info())
self.logger.table_header += 'VCLOCK'.rjust(8)
2023-12-06 16:57:26 -06:00
try:
dclock = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['current_dclk0']
monitor_values['dclock'] = dclock
freq_unit = 'MHz'
if dclock != "N/A":
if self.logger.is_human_readable_format():
monitor_values['dclock'] = f"{monitor_values['dclock']} {freq_unit}"
if self.logger.is_json_format():
monitor_values['dclock'] = {"value" : monitor_values['dclock'],
"unit" : freq_unit}
2023-12-06 16:57:26 -06:00
except amdsmi_exception.AmdSmiLibraryException as e:
monitor_values['dclock'] = "N/A"
logging.debug("Failed to get vclock on gpu %s | %s", gpu_id, e.get_error_info())
self.logger.table_header += 'DCLOCK'.rjust(8)
2023-12-06 16:57:26 -06:00
if args.ecc:
try:
ecc = amdsmi_interface.amdsmi_get_gpu_total_ecc_count(args.gpu)
monitor_values['single_bit_ecc'] = ecc['correctable_count']
monitor_values['double_bit_ecc'] = ecc['uncorrectable_count']
except amdsmi_exception.AmdSmiLibraryException as e:
monitor_values['ecc'] = "N/A"
logging.debug("Failed to get ecc on gpu %s | %s", gpu_id, e.get_error_info())
self.logger.table_header += 'SINGLE_ECC'.rjust(12)
self.logger.table_header += 'DOUBLE_ECC'.rjust(12)
try:
pcie_metric = amdsmi_interface.amdsmi_get_pcie_info(args.gpu)['pcie_metric']
logging.debug("PCIE Metric for %s | %s", gpu_id, pcie_metric)
monitor_values['pcie_replay'] = pcie_metric['pcie_replay_count']
2023-12-06 16:57:26 -06:00
except amdsmi_exception.AmdSmiLibraryException as e:
monitor_values['pcie_replay'] = "N/A"
logging.debug("Failed to get gpu_metrics pcie replay counter on gpu %s | %s", gpu_id, e.get_error_info())
if monitor_values['pcie_replay'] == "N/A":
try:
pcie_replay = amdsmi_interface.amdsmi_get_gpu_pci_replay_counter(args.gpu)
monitor_values['pcie_replay'] = pcie_replay
except amdsmi_exception.AmdSmiLibraryException as e:
logging.debug("Failed to get sysfs pcie replay counter on gpu %s | %s", gpu_id, e.get_error_info())
2023-12-06 16:57:26 -06:00
self.logger.table_header += 'PCIE_REPLAY'.rjust(13)
if args.vram_usage:
try:
vram_usage = amdsmi_interface.amdsmi_get_gpu_vram_usage(args.gpu)
monitor_values['vram_used'] = vram_usage['vram_used']
monitor_values['vram_total'] = vram_usage['vram_total']
vram_usage_unit = "MB"
2023-12-06 16:57:26 -06:00
if self.logger.is_human_readable_format():
monitor_values['vram_used'] = f"{monitor_values['vram_used']} {vram_usage_unit}"
monitor_values['vram_total'] = f"{monitor_values['vram_total']} {vram_usage_unit}"
if self.logger.is_json_format():
monitor_values['vram_used'] = {"value" : monitor_values['vram_used'],
"unit" : vram_usage_unit}
monitor_values['vram_total'] = {"value" : monitor_values['vram_total'],
"unit" : vram_usage_unit}
2023-12-06 16:57:26 -06:00
except amdsmi_exception.AmdSmiLibraryException as e:
monitor_values['vram_used'] = "N/A"
monitor_values['vram_total'] = "N/A"
logging.debug("Failed to get vram memory usage on gpu %s | %s", gpu_id, e.get_error_info())
self.logger.table_header += 'VRAM_USED'.rjust(11)
self.logger.table_header += 'VRAM_TOTAL'.rjust(12)
if args.pcie:
if pcie_info != "N/A":
pcie_bw_unit = 'Mb/s'
monitor_values['pcie_bw'] = self.helpers.unit_format(self.logger, pcie_info['pcie_bandwidth'], pcie_bw_unit)
else:
monitor_values['pcie_bw'] = pcie_info
2023-12-06 16:57:26 -06:00
self.logger.table_header += 'PCIE_BW'.rjust(12)
2023-12-06 16:57:26 -06:00
2024-05-21 20:30:16 -05:00
if args.violation:
violation_status = {
"pviol": "N/A",
"tviol": "N/A",
"phot_tviol": "N/A",
"vr_tviol": "N/A",
"hbm_tviol": "N/A",
}
try:
violations = amdsmi_interface.amdsmi_get_violation_status(args.gpu)
violation_status['pviol'] = violations['per_ppt_pwr']
violation_status['tviol'] = violations['per_socket_thrm']
violation_status['phot_tviol'] = violations['per_prochot_thrm']
violation_status['vr_tviol'] = violations['per_vr_thrm']
violation_status['hbm_tviol'] = violations['per_hbm_thrm']
except amdsmi_exception.AmdSmiLibraryException as e:
monitor_values['pviol'] = violation_status['pviol']
monitor_values['tviol'] = violation_status['tviol']
monitor_values['phot_tviol'] = violation_status['phot_tviol']
monitor_values['vr_tviol'] = violation_status['vr_tviol']
monitor_values['hbm_tviol'] = violation_status['hbm_tviol']
logging.debug("Failed to get violation status on gpu %s | %s", gpu_id, e.get_error_info())
violation_status_unit = "%"
kTVIOL_MAX_WIDTH = 10
kPVIOL_MAX_WIDTH = 10
kPHOT_MAX_WIDTH = 12
kVR_MAX_WIDTH = 10
kHBM_MAX_WIDTH = 11
for key, value in violation_status.items():
monitor_values[key] = self.helpers.unit_format(self.logger, violation_status[key], violation_status_unit)
if self.logger.is_human_readable_format():
monitor_values['pviol'] = monitor_values['pviol'].rjust(kPVIOL_MAX_WIDTH, ' ')
monitor_values['tviol'] = monitor_values['tviol'].rjust(kTVIOL_MAX_WIDTH, ' ')
monitor_values['phot_tviol'] = monitor_values['phot_tviol'].rjust(kPHOT_MAX_WIDTH, ' ')
monitor_values['vr_tviol'] = monitor_values['vr_tviol'].rjust(kVR_MAX_WIDTH, ' ')
monitor_values['hbm_tviol'] = monitor_values['hbm_tviol'].rjust(kHBM_MAX_WIDTH, ' ')
self.logger.table_header += 'PVIOL'.rjust(kPVIOL_MAX_WIDTH, ' ')
self.logger.table_header += 'TVIOL'.rjust(kTVIOL_MAX_WIDTH, ' ')
self.logger.table_header += 'PHOT_TVIOL'.rjust(kPHOT_MAX_WIDTH, ' ')
self.logger.table_header += 'VR_TVIOL'.rjust(kVR_MAX_WIDTH, ' ')
self.logger.table_header += 'HBM_TVIOL'.rjust(kHBM_MAX_WIDTH, ' ')
2023-12-06 16:57:26 -06:00
self.logger.store_output(args.gpu, 'values', monitor_values)
# initialize dual_csv_format; applicable to process only
dual_csv_output = False
# Store process list separately
if args.process:
# Populate initial processes
try:
process_list = amdsmi_interface.amdsmi_get_gpu_process_list(args.gpu)
except amdsmi_exception.AmdSmiLibraryException as e:
if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM:
raise PermissionError('Command requires elevation') from e
logging.debug("Failed to get process list for gpu %s | %s", gpu_id, e.get_error_info())
raise e
# Clean processes dictionary
filtered_process_values = []
for process_info in process_list:
process_info['mem_usage'] = process_info.pop('mem')
process_info['usage'] = process_info.pop('engine_usage')
engine_usage_unit = "ns"
memory_usage_unit = "B"
if self.logger.is_human_readable_format():
process_info['mem_usage'] = self.helpers.convert_bytes_to_readable(process_info['mem_usage'])
for usage_metric in process_info['memory_usage']:
process_info["memory_usage"][usage_metric] = self.helpers.convert_bytes_to_readable(process_info["memory_usage"][usage_metric])
memory_usage_unit = ""
process_info['mem_usage'] = self.helpers.unit_format(self.logger,
process_info['mem_usage'],
memory_usage_unit)
for usage_metric in process_info['usage']:
process_info['usage'][usage_metric] = self.helpers.unit_format(self.logger,
process_info['usage'][usage_metric],
engine_usage_unit)
for usage_metric in process_info['memory_usage']:
process_info['memory_usage'][usage_metric] = self.helpers.unit_format(self.logger,
process_info['memory_usage'][usage_metric],
memory_usage_unit)
filtered_process_values.append({'process_info': process_info})
# If no processes are populated then we populate an N/A placeholder
if not filtered_process_values:
logging.debug("Monitor - Failed to detect any process on gpu %s", gpu_id)
filtered_process_values.append({'process_info': "N/A"})
for index, process in enumerate(filtered_process_values):
if process['process_info'] == "N/A":
filtered_process_values[index]['process_info'] = "No running processes detected"
# Build the process table's title and header
self.logger.secondary_table_title = "PROCESS INFO"
self.logger.secondary_table_header = 'GPU'.rjust(3) + "NAME".rjust(22) + "PID".rjust(9) + "GTT_MEM".rjust(10) + \
"CPU_MEM".rjust(10) + "VRAM_MEM".rjust(10) + "MEM_USAGE".rjust(11) + \
"GFX".rjust(8) + "ENC".rjust(8)
if watching_output:
self.logger.secondary_table_header = 'TIMESTAMP'.rjust(10) + ' ' + self.logger.secondary_table_header
logging.debug(f"Monitor - Process Info for GPU {gpu_id} | {filtered_process_values}")
if self.logger.is_json_format():
self.logger.store_output(args.gpu, 'process_list', filtered_process_values)
if self.logger.is_human_readable_format():
# Print out process in flattened format
# The logger detects if process list is present and pulls it out and prints
# that table with timestamp, gpu, and prints headers separately
self.logger.store_output(args.gpu, 'process_list', filtered_process_values)
if self.logger.is_csv_format():
dual_csv_output = True
# The logger detects if process list is present and pulls it out and prints
# that table with timestamp, gpu, and prints headers separately
self.logger.store_output(args.gpu, 'process_list', filtered_process_values)
# Now handling the single gpu case only
2023-12-06 16:57:26 -06:00
if multiple_devices:
self.logger.store_multiple_device_output()
return
2023-12-06 16:57:26 -06:00
if watching_output and not self.logger.destination == "stdout": # End of single gpu add to watch_output
2023-12-06 16:57:26 -06:00
self.logger.store_watch_output(multiple_device_enabled=False)
self.logger.print_output(multiple_device_enabled=False, watching_output=watching_output, tabular=True, dual_csv_output=dual_csv_output)
2023-03-06 06:20:21 -06:00
def rocm_smi(self, args):
    """ Placeholder entry point for the legacy rocm-smi commands

        params:
            args - argparser args to pass to subcommand (not read here)

        return:
            Nothing
    """
    placeholder_message = "Placeholder for rocm-smi legacy commands"
    print(placeholder_message)
2023-03-28 15:32:17 -05:00
2024-02-22 23:08:37 -06:00
def xgmi(self, args, multiple_devices=False, gpu=None, metric=None):
    """ Get XGMI link metrics between the target gpus

        params:
            args - argparser args to pass to subcommand
            multiple_devices (bool) - True if checking for multiple devices
                (not read by this command)
            gpu (device_handle) - device_handle for target device
            metric (bool) - Value override for args.metric

        return:
            Nothing
    """
    def is_number(value):
        # Unavailable values are carried around as the string "N/A" in this
        # file; only real numbers may take part in the arithmetic below.
        return isinstance(value, (int, float)) and not isinstance(value, bool)

    def link_counter(metrics, key, index):
        # Accumulated data (KB) for one link, or "N/A" when the entry is
        # missing or is not a number.
        try:
            value = metrics[key][index]
        except (KeyError, IndexError, TypeError):
            return "N/A"
        return value if is_number(value) else "N/A"

    # Not supported with partitions
    # Set args.* to passed in arguments
    if gpu:
        args.gpu = gpu
    if metric:
        args.metric = metric

    # Handle No GPU passed
    if args.gpu is None:
        args.gpu = self.device_handles
    if not isinstance(args.gpu, list):
        args.gpu = [args.gpu]

    # Handle all args being false
    if not args.metric:
        args.metric = True

    # Clear the table header
    self.logger.table_header = ''.rjust(7)

    # Populate the possible gpus and their bdfs
    xgmi_values = []
    for device in args.gpu:
        logging.debug("check1 device_handle: %s", device)
        gpu_id = self.helpers.get_gpu_id_from_device_handle(device)
        gpu_bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(device)
        xgmi_values.append({"gpu" : gpu_id,
                            "bdf" : gpu_bdf})
        # Populate header with just bdfs
        self.logger.table_header += gpu_bdf.rjust(13)

    if args.metric:
        # prepend link metrics header to the table header
        link_metrics_header = " " + "bdf".ljust(13) + \
            "bit_rate".ljust(9) + "max_bandwidth".ljust(14) + \
            "link_type".ljust(10)
        self.logger.table_header = link_metrics_header + self.logger.table_header.strip()

        # Populate dictionary according to format
        for xgmi_dict in xgmi_values:
            src_gpu_id = xgmi_dict['gpu']
            src_gpu_bdf = xgmi_dict['bdf']
            src_gpu = amdsmi_interface.amdsmi_get_processor_handle_from_bdf(src_gpu_bdf)
            logging.debug("check2 device_handle: %s", src_gpu)

            # This should be the same order as the check1
            link_metrics = {
                "bit_rate" : "N/A",
                "max_bandwidth" : "N/A",
                "link_type" : "N/A",
                "links" : []
            }
            xgmi_dict['link_metrics'] = link_metrics

            bitrate = "N/A"
            max_bandwidth = "N/A"
            try:
                pcie_static = amdsmi_interface.amdsmi_get_pcie_info(src_gpu)['pcie_static']
                max_speed = pcie_static['max_pcie_speed']
                max_width = pcie_static['max_pcie_width']
                # Guard the arithmetic: "%" and "*" on an "N/A" string would
                # either raise or silently build a garbage string.
                if is_number(max_speed):
                    if max_speed % 1000 != 0:
                        bitrate = round(max_speed / 1000, 1)
                    else:
                        bitrate = round(max_speed / 1000)
                    if is_number(max_width):
                        max_bandwidth = bitrate * max_width
            except amdsmi_exception.AmdSmiLibraryException as e:
                logging.debug("Failed to get bitrate and bandwidth for GPU %s | %s", src_gpu_id,
                              e.get_error_info())

            # Populate bitrate and max_bandwidth with units logic
            bw_unit = 'Gb/s'
            if self.logger.is_human_readable_format():
                link_metrics['bit_rate'] = f"{bitrate} {bw_unit}"
                link_metrics['max_bandwidth'] = f"{max_bandwidth} {bw_unit}"
            elif self.logger.is_json_format():
                link_metrics['bit_rate'] = {"value" : bitrate,
                                            "unit" : bw_unit}
                link_metrics['max_bandwidth'] = {"value" : max_bandwidth,
                                                 "unit" : bw_unit}
            elif self.logger.is_csv_format():
                link_metrics['bit_rate'] = bitrate
                link_metrics['max_bandwidth'] = max_bandwidth

            # Populate link metrics
            for dest_gpu in args.gpu:
                dest_gpu_id = self.helpers.get_gpu_id_from_device_handle(dest_gpu)
                dest_gpu_bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(dest_gpu)
                dest_link_dict = {
                    "gpu" : dest_gpu_id,
                    "bdf" : dest_gpu_bdf,
                    "read" : "N/A",
                    "write" : "N/A"
                }

                # Don't make a call to check link status for the same gpu
                if dest_gpu_bdf == src_gpu_bdf:
                    link_metrics['links'].append(dest_link_dict)
                    continue

                try:
                    # Get the read write relative to the source gpu
                    metrics_info = amdsmi_interface.amdsmi_get_gpu_metrics_info(src_gpu)
                except amdsmi_exception.AmdSmiLibraryException as e:
                    metrics_info = {}
                    logging.debug("Failed to get read data for %s to %s | %s",
                                  self.helpers.get_gpu_id_from_device_handle(src_gpu),
                                  self.helpers.get_gpu_id_from_device_handle(dest_gpu),
                                  e.get_error_info())
                read = link_counter(metrics_info, 'xgmi_read_data_acc', dest_gpu_id)
                write = link_counter(metrics_info, 'xgmi_write_data_acc', dest_gpu_id)

                data_unit = 'KB'
                if self.logger.is_human_readable_format():
                    # Only convert real counters; "N/A" * 1024 would repeat
                    # the string 1024 times instead of failing.
                    if read != "N/A":
                        dest_link_dict['read'] = self.helpers.convert_bytes_to_readable(read * 1024, True)
                    if write != "N/A":
                        dest_link_dict['write'] = self.helpers.convert_bytes_to_readable(write * 1024, True)
                elif self.logger.is_json_format():
                    dest_link_dict['read'] = {"value" : read,
                                              "unit" : data_unit}
                    dest_link_dict['write'] = {"value" : write,
                                               "unit" : data_unit}
                elif self.logger.is_csv_format():
                    dest_link_dict['read'] = read
                    dest_link_dict['write'] = write

                try:
                    link_type = amdsmi_interface.amdsmi_topo_get_link_type(src_gpu, dest_gpu)['type']
                    # Once any link of this source is XGMI the row stays XGMI
                    if link_metrics['link_type'] != "XGMI" and isinstance(link_type, int):
                        if link_type == amdsmi_interface.amdsmi_wrapper.AMDSMI_IOLINK_TYPE_UNDEFINED:
                            link_metrics['link_type'] = "UNKNOWN"
                        elif link_type == amdsmi_interface.amdsmi_wrapper.AMDSMI_IOLINK_TYPE_PCIEXPRESS:
                            link_metrics['link_type'] = "PCIE"
                        elif link_type == amdsmi_interface.amdsmi_wrapper.AMDSMI_IOLINK_TYPE_XGMI:
                            link_metrics['link_type'] = "XGMI"
                except amdsmi_exception.AmdSmiLibraryException as e:
                    logging.debug("Failed to get link type for %s to %s | %s",
                                  self.helpers.get_gpu_id_from_device_handle(src_gpu),
                                  self.helpers.get_gpu_id_from_device_handle(dest_gpu),
                                  e.get_error_info())

                link_metrics['links'].append(dest_link_dict)

        # Handle printing for tabular format
        if self.logger.is_human_readable_format():
            # Populate tabular output: one GPU row followed by its
            # Read and Write rows
            tabular_output = []
            for xgmi_dict in xgmi_values:
                link_metrics = xgmi_dict['link_metrics']
                tabular_output.append({
                    "gpu#" : f"GPU{xgmi_dict['gpu']}",
                    "bdf" : xgmi_dict['bdf'],
                    "bit_rate" : link_metrics['bit_rate'],
                    "max_bandwidth" : link_metrics['max_bandwidth'],
                    "link_type" : link_metrics['link_type'],
                })

                read_output_dict = {"RW" : " Read"}
                write_output_dict = {"RW" : " Write"}
                for link in link_metrics['links']:
                    read_output_dict[f"bdf_{link['gpu']}"] = link["read"]
                    write_output_dict[f"bdf_{link['gpu']}"] = link["write"]
                tabular_output.append(read_output_dict)
                tabular_output.append(write_output_dict)

            # Print out the tabular output
            self.logger.multiple_device_output = tabular_output
            self.logger.table_title = "LINK METRIC TABLE"
            self.logger.print_output(multiple_device_enabled=True, tabular=True)

    self.logger.multiple_device_output = xgmi_values

    if self.logger.is_csv_format():
        self.logger.multiple_device_output = [
            self.logger.flatten_dict(elem, topology_override=True)
            for elem in self.logger.multiple_device_output
        ]

    if not self.logger.is_human_readable_format():
        self.logger.print_output(multiple_device_enabled=True)
def partition(self, args, multiple_devices=False, gpu=None, current=None, memory=None, accelerator=None):
    """ Display partition information for the target GPU

        param:
            args - argparser args to pass to subcommand
            multiple_devices (bool) - True if checking for multiple devices
                (not read by this command)
            gpu (device_handle) - device_handle for target device
            current - boolean which dictates whether the current partition information is shown
            memory - boolean which dictates whether the memory partition information is shown
            accelerator - boolean which dictates whether the accelerator partition information is shown

        returns:
            nothing
    """
    def format_memory_caps(memory_caps):
        # Render the supported NPS modes as "NPS1, NPS2, ..." ("N/A" when
        # none are set). Reads the capability bitmask when the flags member
        # is None, otherwise the individual npsX_cap flags.
        nps_flags = memory_caps.amdsmi_nps_flags_t
        if nps_flags is None:
            mask = memory_caps.nps_cap_mask
            supported = [name for bit, name in ((1, "NPS1"), (2, "NPS2"),
                                                (4, "NPS4"), (8, "NPS8"))
                         if mask & bit == bit]
        else:
            supported = []
            if nps_flags.nps1_cap == 1:
                supported.append("NPS1")
            if nps_flags.nps2_cap == 1:
                supported.append("NPS2")
            if nps_flags.nps4_cap == 1:
                supported.append("NPS4")
            if nps_flags.nps8_cap == 1:
                supported.append("NPS8")
        return ", ".join(supported) or "N/A"

    if gpu:
        args.gpu = gpu
    if args.gpu is None:
        args.gpu = self.device_handles
    if not isinstance(args.gpu, list):
        args.gpu = [args.gpu]
    if current:
        args.current = current
    if memory:
        args.memory = memory
    if accelerator:
        args.accelerator = accelerator

    # if no args are present, then everything should be displayed
    if not (args.current or args.memory or args.accelerator):
        args.current = True
        args.memory = True
        args.accelerator = True

    if args.current:
        self.logger.table_header = "GPU_ID".ljust(13) + \
            "MEMORY".ljust(8) + \
            "ACCELERATOR_TYPE".ljust(18) + \
            "ACCELERATOR_PROFILE_INDEX".ljust(27) + \
            "PARTITION_ID".ljust(14)

        tabular_output = []
        for device in args.gpu:
            gpu_id = self.helpers.get_gpu_id_from_device_handle(device)
            try:
                partition_dict = amdsmi_interface.amdsmi_get_gpu_accelerator_partition_profile(device)
                profile_type = partition_dict['partition_profile']['profile_type']
                profile_index = partition_dict['partition_profile']['profile_index']
                partition_id = partition_dict['partition_id']
            except amdsmi_exception.AmdSmiLibraryException as e:
                profile_type = "N/A"
                profile_index = "N/A"
                partition_id = "N/A"
                logging.debug("Failed to get accelerator partition profile for GPU %s | %s", gpu_id, e.get_error_info())

            try:
                current_mem_cap = amdsmi_interface.amdsmi_get_gpu_memory_partition(device)
            except amdsmi_exception.AmdSmiLibraryException as e:
                current_mem_cap = "N/A"
                logging.debug("Failed to get current memory partition capabilties for GPU %s | %s", gpu_id, e.get_error_info())

            # A profile type of 0 is displayed as not available
            if profile_type == 0:
                profile_type = "N/A"

            tabular_output.append({"gpu_id": gpu_id,
                                   "memory": current_mem_cap,
                                   "accelerator_type": profile_type,
                                   "accelerator_profile_index": profile_index,
                                   "partition_id": partition_id})

        self.logger.multiple_device_output = tabular_output
        self.logger.table_title = "CURRENT_PARTITION"
        self.logger.print_output(multiple_device_enabled=True, tabular=True)
        self.logger.clear_multiple_devices_ouput()

    if args.memory:
        for device in args.gpu:
            gpu_id = self.helpers.get_gpu_id_from_device_handle(device)
            try:
                memory_partition = amdsmi_interface.amdsmi_get_gpu_memory_partition(device) # this info likely actually comes from different apis than used here
            except amdsmi_exception.AmdSmiLibraryException as e:
                memory_partition = "N/A"
                logging.debug("Failed to get current memory partition for GPU %s | %s", gpu_id, e.get_error_info())

            try:
                partition_dict = amdsmi_interface.amdsmi_get_gpu_accelerator_partition_profile(device)
                mem_caps_str = format_memory_caps(partition_dict['partition_profile']['memory_caps'])
            except amdsmi_exception.AmdSmiLibraryException as e:
                mem_caps_str = "N/A"
                logging.debug("Failed to get accelerator partition profile for GPU %s | %s", gpu_id, e.get_error_info())

            memory_dict = {'caps': mem_caps_str, 'current': memory_partition}
            # Store per device so each GPU ends up in the combined output
            self.logger.store_output(device, 'memory_partition', memory_dict)
            self.logger.store_multiple_device_output()

        self.logger.print_output(multiple_device_enabled=True)
        self.logger.clear_multiple_devices_ouput()

    if args.accelerator:
        self.logger.table_header = "GPU_ID".ljust(13) + \
            "PROFILE_INDEX".ljust(15) + \
            "MEMORY_PARTITION_CAPS".ljust(23) + \
            "ACCELERATOR_TYPE".ljust(18) + \
            "PARTITION_ID".ljust(14) + \
            "NUM_PARTITIONS".ljust(16) + \
            "NUM_RESOURCES".ljust(15) + \
            "RESOURCE_INDEX".ljust(16) + \
            "RESOURCE_TYPE".ljust(15) + \
            "RESOURCE_INSTANCES".ljust(20) + \
            "RESOURCES_SHARED".ljust(18)

        tabular_output = []
        for device in args.gpu:
            gpu_id = self.helpers.get_gpu_id_from_device_handle(device)
            try:
                partition_dict = amdsmi_interface.amdsmi_get_gpu_accelerator_partition_profile(device)
                profile = partition_dict['partition_profile']
                profile_type = profile['profile_type']
                profile_index = profile['profile_index']
                partition_id = partition_dict['partition_id']
                num_resources = profile['num_resources']
                resources = profile['resources']
                mem_caps_str = format_memory_caps(profile['memory_caps'])
            except amdsmi_exception.AmdSmiLibraryException as e:
                profile_type = "N/A"
                profile_index = "N/A"
                partition_id = "N/A"
                num_resources = "N/A"
                resources = "N/A"
                mem_caps_str = "N/A"
                logging.debug("Failed to get accelerator partition profile for GPU %s | %s", gpu_id, e.get_error_info())

            # A profile type of 0 is displayed as not available
            if profile_type == 0:
                profile_type = "N/A"

            # num_partitions is a fixed 0 and the four resource_* columns all
            # carry the same `resources` value, exactly as before.
            tabular_output.append({"gpu_id": gpu_id,
                                   "profile_index": profile_index,
                                   "memory_partition_caps": mem_caps_str,
                                   "accelerator_type": profile_type,
                                   "partition_id": partition_id,
                                   "num_partitions": 0,
                                   "num_resources": num_resources,
                                   "resource_index": resources,
                                   "resource_type": resources,
                                   "resource_instances": resources,
                                   "resources_shared": resources})

        self.logger.multiple_device_output = tabular_output
        self.logger.table_title = "ACCELERATOR_PARTITION_PROFILES"
        self.logger.print_output(multiple_device_enabled=True, tabular=True)
        self.logger.clear_multiple_devices_ouput()
2024-02-22 23:08:37 -06:00
2023-03-28 15:32:17 -05:00
def _event_thread(self, commands, i):
devices = commands.device_handles
if len(devices) == 0:
print("No GPUs on machine")
return
device = devices[i]
listener = amdsmi_interface.AmdSmiEventReader(device,
amdsmi_interface.AmdSmiEvtNotificationType)
2023-03-28 15:32:17 -05:00
values_dict = {}
while not self.stop:
2023-03-28 15:32:17 -05:00
try:
events = listener.read(2000)
2023-03-28 15:32:17 -05:00
for event in events:
values_dict["event"] = event["event"]
2024-11-14 14:39:59 -06:00
# parse message as it's own dictionary
message_list = event["message"].split(" ")
message_dict = {}
for item in message_list:
if not item == "":
item_list = item.split(": ")
message_dict.update({item_list[0]: item_list[1]})
values_dict["message"] = message_dict
2023-03-28 15:32:17 -05:00
commands.logger.store_output(device, 'values', values_dict)
commands.logger.print_output()
except amdsmi_exception.AmdSmiLibraryException as e:
2023-09-24 16:12:30 -05:00
if e.err_code != amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_DATA:
2023-03-28 15:32:17 -05:00
print(e)
except Exception as e:
print(e)
listener.stop()