SWDEV-413122 - Initial Monitor subcommand
Change-Id: Iaeaef77efeaa4289b19f1f676dcae6245f0e0c9e
This commit is contained in:
@@ -68,6 +68,7 @@ if __name__ == "__main__":
|
||||
amd_smi_commands.topology,
|
||||
amd_smi_commands.set_value,
|
||||
amd_smi_commands.reset,
|
||||
amd_smi_commands.monitor,
|
||||
amd_smi_commands.rocm_smi)
|
||||
try:
|
||||
try:
|
||||
|
||||
@@ -2254,6 +2254,355 @@ class AMDSMICommands():
|
||||
self.logger.print_output()
|
||||
|
||||
|
||||
def monitor(self, args, multiple_devices=False, watching_output=False, gpu=None,
            watch=None, watch_time=None, iterations=None, power_usage=None,
            temperature=None, gfx_util=None, mem_util=None, encoder=None, decoder=None,
            throttle_status=None, ecc=None, vram_usage=None, pcie=None):
    """ Populate a table with each GPU as an index to rows of targeted data

    Args:
        args (Namespace): Namespace containing the parsed CLI args
        multiple_devices (bool, optional): True if checking for multiple devices. Defaults to False.
        watching_output (bool, optional): True when this call is one iteration of watch output;
            a timestamp column is added and the result is stored as watch output. Defaults to False.
        gpu (device_handle, optional): device_handle for target device. Defaults to None.
        watch (bool, optional): Value override for args.watch. Defaults to None.
        watch_time (int, optional): Value override for args.watch_time. Defaults to None.
        iterations (int, optional): Value override for args.iterations. Defaults to None.
        power_usage (bool, optional): Value override for args.power_usage. Defaults to None.
        temperature (bool, optional): Value override for args.temperature. Defaults to None.
        gfx_util (bool, optional): Value override for args.gfx. Defaults to None.
        mem_util (bool, optional): Value override for args.mem. Defaults to None.
        encoder (bool, optional): Value override for args.encoder. Defaults to None.
        decoder (bool, optional): Value override for args.decoder. Defaults to None.
        throttle_status (bool, optional): Value override for args.throttle_status. Defaults to None.
        ecc (bool, optional): Value override for args.ecc. Defaults to None.
        vram_usage (bool, optional): Value override for args.vram_usage. Defaults to None.
        pcie (bool, optional): Value override for args.pcie. Defaults to None.

    Raises:
        IndexError: Index error if gpu list is empty

    Return:
        Nothing
    """
    # Set args.* to passed in arguments; only truthy overrides replace the parsed values
    overrides = {
        'gpu': gpu,
        'watch': watch,
        'watch_time': watch_time,
        'iterations': iterations,
        'power_usage': power_usage,
        'temperature': temperature,
        'gfx': gfx_util,
        'mem': mem_util,
        'encoder': encoder,
        'decoder': decoder,
        'throttle_status': throttle_status,
        'ecc': ecc,
        'vram_usage': vram_usage,
        'pcie': pcie,
    }
    for arg_name, override in overrides.items():
        if override:
            setattr(args, arg_name, override)

    # Handle No GPU passed
    if args.gpu is None:
        args.gpu = self.device_handles

    # If all arguments are False, then print all values
    metric_flags = ('power_usage', 'temperature', 'gfx', 'mem', 'encoder', 'decoder',
                    'throttle_status', 'ecc', 'vram_usage', 'pcie')
    if not any(getattr(args, flag) for flag in metric_flags):
        for flag in metric_flags:
            setattr(args, flag, True)

    # Handle watch logic, will only enter this block once
    if args.watch:
        self.helpers.handle_watch(args=args, subcommand=self.monitor, logger=self.logger)
        return

    # Handle multiple GPUs
    if isinstance(args.gpu, list):
        if len(args.gpu) > 1:
            # Copy the gpu list as each recursive call overwrites args.gpu
            stored_gpus = list(args.gpu)

            # Store output from multiple devices
            for device_handle in stored_gpus:
                self.monitor(args, multiple_devices=True, watching_output=watching_output, gpu=device_handle)

            # Reload original gpus
            args.gpu = stored_gpus

            # Print multiple device output
            self.logger.print_output(multiple_device_enabled=True, watching_output=watching_output, tabular=True)

            # Add output to total watch output and clear multiple device output
            if watching_output:
                self.logger.store_watch_output(multiple_device_enabled=True)

                # Flush the watching output
                self.logger.print_output(multiple_device_enabled=True, watching_output=watching_output, tabular=True)

            return
        elif len(args.gpu) == 1:
            args.gpu = args.gpu[0]
        else:
            raise IndexError("args.gpu should not be an empty list")

    monitor_values = {}

    # Get gpu_id for logging
    gpu_id = self.helpers.get_gpu_id_from_device_handle(args.gpu)

    # Units are only appended to values for human readable output
    human_readable = self.logger.is_human_readable_format()

    # Clear the table header; TODO make this a function
    self.logger.table_header = ''

    # Store timestamp for watch output
    if watching_output:
        self.logger.store_output(args.gpu, 'timestamp', int(time.time()))
        self.logger.table_header += 'TIMESTAMP'.rjust(10)

    self.logger.table_header += 'GPU'.rjust(3)

    if args.power_usage:
        try:
            gpu_metrics_info = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)
            power_usage = gpu_metrics_info['current_socket_power']
            # Values at or above 0xFFFFFFFF are treated as "not available";
            # fall back to the average reading before giving up
            if power_usage >= 0xFFFFFFFF:
                power_usage = gpu_metrics_info['average_socket_power']
                if power_usage >= 0xFFFFFFFF:
                    power_usage = "N/A"
            monitor_values['power_usage'] = power_usage
            if human_readable and monitor_values['power_usage'] != "N/A":
                monitor_values['power_usage'] = f"{monitor_values['power_usage']} W"
        except amdsmi_exception.AmdSmiLibraryException as e:
            monitor_values['power_usage'] = "N/A"
            logging.debug("Failed to get power usage on gpu %s | %s", gpu_id, e.get_error_info())

        self.logger.table_header += 'POWER'.rjust(7)

    if args.temperature:
        try:
            temperature = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['temperature_hotspot']
            monitor_values['hotspot_temperature'] = temperature
        except amdsmi_exception.AmdSmiLibraryException as e:
            monitor_values['hotspot_temperature'] = "N/A"
            logging.debug("Failed to get hotspot temperature on gpu %s | %s", gpu_id, e.get_error_info())

        try:
            temperature = amdsmi_interface.amdsmi_get_gpu_metrics_temp_mem(args.gpu)
            monitor_values['memory_temperature'] = temperature
        except amdsmi_exception.AmdSmiLibraryException as e:
            monitor_values['memory_temperature'] = "N/A"
            logging.debug("Failed to get memory temperature on gpu %s | %s", gpu_id, e.get_error_info())

        if human_readable:
            for temperature_key in ('hotspot_temperature', 'memory_temperature'):
                if monitor_values[temperature_key] != "N/A":
                    monitor_values[temperature_key] = f"{monitor_values[temperature_key]} \N{DEGREE SIGN}C"

        self.logger.table_header += 'GPU_TEMP'.rjust(10)
        self.logger.table_header += 'MEM_TEMP'.rjust(10)

    if args.gfx:
        try:
            gfx_util = amdsmi_interface.amdsmi_get_gpu_metrics_avg_gfx_activity(args.gpu)
            monitor_values['gfx'] = gfx_util
            if human_readable:
                monitor_values['gfx'] = f"{monitor_values['gfx']} %"
        except amdsmi_exception.AmdSmiLibraryException as e:
            monitor_values['gfx'] = "N/A"
            logging.debug("Failed to get gfx utilization on gpu %s | %s", gpu_id, e.get_error_info())

        self.logger.table_header += 'GFX_UTIL'.rjust(10)

        try:
            gfx_clock = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['current_gfxclk']
            monitor_values['gfx_clock'] = gfx_clock
            if human_readable:
                monitor_values['gfx_clock'] = f"{monitor_values['gfx_clock']} MHz"
        except amdsmi_exception.AmdSmiLibraryException as e:
            monitor_values['gfx_clock'] = "N/A"
            logging.debug("Failed to get gfx clock on gpu %s | %s", gpu_id, e.get_error_info())

        self.logger.table_header += 'GFX_CLOCK'.rjust(11)

    if args.mem:
        try:
            mem_util = amdsmi_interface.amdsmi_get_gpu_metrics_avg_umc_activity(args.gpu)
            monitor_values['mem'] = mem_util
            if human_readable:
                monitor_values['mem'] = f"{monitor_values['mem']} %"
        except amdsmi_exception.AmdSmiLibraryException as e:
            monitor_values['mem'] = "N/A"
            logging.debug("Failed to get mem utilization on gpu %s | %s", gpu_id, e.get_error_info())

        self.logger.table_header += 'MEM_UTIL'.rjust(10)

        try:
            mem_clock = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['current_uclk']
            monitor_values['mem_clock'] = mem_clock
            if human_readable:
                monitor_values['mem_clock'] = f"{monitor_values['mem_clock']} MHz"
        except amdsmi_exception.AmdSmiLibraryException as e:
            monitor_values['mem_clock'] = "N/A"
            logging.debug("Failed to get mem clock on gpu %s | %s", gpu_id, e.get_error_info())

        self.logger.table_header += 'MEM_CLOCK'.rjust(11)

    if args.encoder:
        try:
            # Get List of vcn activity values
            encoder_util = amdsmi_interface.amdsmi_get_gpu_metrics_vcn_activity(args.gpu)
            # Each encoder chiplet's value should be a percent; larger values are discarded
            valid_activity = [value for value in encoder_util if value < 150]

            # Averaging the possible encoding activity values
            if valid_activity:
                encoding_activity_avg = sum(valid_activity) / len(valid_activity)
            else:
                encoding_activity_avg = "N/A"

            monitor_values['encoder'] = encoding_activity_avg
            if human_readable and monitor_values['encoder'] != "N/A":
                monitor_values['encoder'] = f"{monitor_values['encoder']} %"
        except amdsmi_exception.AmdSmiLibraryException as e:
            monitor_values['encoder'] = "N/A"
            logging.debug("Failed to get encoder utilization on gpu %s | %s", gpu_id, e.get_error_info())

        self.logger.table_header += 'ENC_UTIL'.rjust(10)

        try:
            encoder_clock = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['current_vclk0']
            monitor_values['encoder_clock'] = encoder_clock
            if human_readable:
                monitor_values['encoder_clock'] = f"{monitor_values['encoder_clock']} MHz"
        except amdsmi_exception.AmdSmiLibraryException as e:
            monitor_values['encoder_clock'] = "N/A"
            logging.debug("Failed to get encoder clock on gpu %s | %s", gpu_id, e.get_error_info())

        self.logger.table_header += 'ENC_CLOCK'.rjust(11)

    if args.decoder:
        # Decoder utilization is not yet implemented; report a placeholder
        # so the DEC_UTIL column stays populated
        monitor_values['decoder'] = "N/A"

        self.logger.table_header += 'DEC_UTIL'.rjust(10)

        try:
            decoder_clock = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['current_dclk0']
            monitor_values['decoder_clock'] = decoder_clock
            if human_readable:
                monitor_values['decoder_clock'] = f"{monitor_values['decoder_clock']} MHz"
        except amdsmi_exception.AmdSmiLibraryException as e:
            monitor_values['decoder_clock'] = "N/A"
            logging.debug("Failed to get decoder clock on gpu %s | %s", gpu_id, e.get_error_info())

        self.logger.table_header += 'DEC_CLOCK'.rjust(11)

    if args.throttle_status:
        try:
            throttle_status = amdsmi_interface.amdsmi_get_gpu_metrics_throttle_status(args.gpu)
            monitor_values['throttle_status'] = "THROTTLED" if throttle_status else "UNTHROTTLED"
        except amdsmi_exception.AmdSmiLibraryException as e:
            monitor_values['throttle_status'] = "N/A"
            logging.debug("Failed to get throttle status on gpu %s | %s", gpu_id, e.get_error_info())

        self.logger.table_header += 'THROTTLE'.rjust(13)

    if args.ecc:
        try:
            ecc = amdsmi_interface.amdsmi_get_gpu_total_ecc_count(args.gpu)
            monitor_values['single_bit_ecc'] = ecc['correctable_count']
            monitor_values['double_bit_ecc'] = ecc['uncorrectable_count']
        except amdsmi_exception.AmdSmiLibraryException as e:
            # Fill both ECC columns so the row stays aligned with the
            # SINGLE_ECC / DOUBLE_ECC headers added below
            monitor_values['single_bit_ecc'] = "N/A"
            monitor_values['double_bit_ecc'] = "N/A"
            logging.debug("Failed to get ecc on gpu %s | %s", gpu_id, e.get_error_info())

        self.logger.table_header += 'SINGLE_ECC'.rjust(12)
        self.logger.table_header += 'DOUBLE_ECC'.rjust(12)

        try:
            pcie_replay = amdsmi_interface.amdsmi_get_gpu_pci_replay_counter(args.gpu)
            monitor_values['pcie_replay'] = pcie_replay
        except amdsmi_exception.AmdSmiLibraryException as e:
            monitor_values['pcie_replay'] = "N/A"
            logging.debug("Failed to get pcie replay counter on gpu %s | %s", gpu_id, e.get_error_info())

        self.logger.table_header += 'PCIE_REPLAY'.rjust(13)

    if args.vram_usage:
        try:
            vram_usage = amdsmi_interface.amdsmi_get_gpu_vram_usage(args.gpu)
            monitor_values['vram_used'] = vram_usage['vram_used']
            monitor_values['vram_total'] = vram_usage['vram_total']
            if human_readable:
                monitor_values['vram_used'] = f"{monitor_values['vram_used']} MB"
                monitor_values['vram_total'] = f"{monitor_values['vram_total']} MB"
        except amdsmi_exception.AmdSmiLibraryException as e:
            monitor_values['vram_used'] = "N/A"
            monitor_values['vram_total'] = "N/A"
            logging.debug("Failed to get vram memory usage on gpu %s | %s", gpu_id, e.get_error_info())

        self.logger.table_header += 'VRAM_USED'.rjust(11)
        self.logger.table_header += 'VRAM_TOTAL'.rjust(12)

    if args.pcie:
        try:
            pcie_bw = amdsmi_interface.amdsmi_get_gpu_pci_throughput(args.gpu)
            # Throughput is the packet count multiplied by the max packet size
            sent = pcie_bw['sent'] * pcie_bw['max_pkt_sz']
            received = pcie_bw['received'] * pcie_bw['max_pkt_sz']

            if human_readable:
                # Scale down by 1024 twice and always carry the unit, so an idle
                # link prints "0 MB/s" rather than a bare 0
                sent = f"{sent // 1024 // 1024} MB/s"
                received = f"{received // 1024 // 1024} MB/s"

            monitor_values['pcie_tx'] = sent
            monitor_values['pcie_rx'] = received
        except amdsmi_exception.AmdSmiLibraryException as e:
            monitor_values['pcie_tx'] = "N/A"
            monitor_values['pcie_rx'] = "N/A"
            logging.debug("Failed to get pci throughput on gpu %s | %s", gpu_id, e.get_error_info())

        self.logger.table_header += 'PCIE_TX'.rjust(10)
        self.logger.table_header += 'PCIE_RX'.rjust(10)

    self.logger.store_output(args.gpu, 'values', monitor_values)

    if multiple_devices:
        self.logger.store_multiple_device_output()
        return # Skip printing when there are multiple devices

    self.logger.print_output(watching_output=watching_output, tabular=True)

    if watching_output: # End of single gpu add to watch_output
        self.logger.store_watch_output(multiple_device_enabled=False)
|
||||
|
||||
|
||||
def rocm_smi(self, args):
    """ Placeholder subcommand for rocm-smi legacy commands

    Args:
        args (Namespace): Namespace containing the parsed CLI args; not used here

    Return:
        Nothing
    """
    print("Placeholder for rocm-smi legacy commands")
|
||||
|
||||
|
||||
@@ -24,6 +24,7 @@ import csv
|
||||
import json
|
||||
import re
|
||||
import time
|
||||
from typing import Dict
|
||||
import yaml
|
||||
from enum import Enum
|
||||
|
||||
@@ -37,6 +38,8 @@ class AMDSMILogger():
|
||||
self.watch_output = []
|
||||
self.format = format # csv, json, or human_readable
|
||||
self.destination = destination # stdout, path to a file (append)
|
||||
self.table_header = ""
|
||||
self.added_table_header = False
|
||||
self.helpers = AMDSMIHelpers()
|
||||
|
||||
|
||||
@@ -95,7 +98,32 @@ class AMDSMILogger():
|
||||
return output_dict
|
||||
|
||||
|
||||
def _convert_json_to_human_readable(self, json_object):
|
||||
def _convert_json_to_human_readable(self, json_object: Dict[str, any], tabular=False):
|
||||
# TODO make dynamic
|
||||
if tabular:
|
||||
table_values = ''
|
||||
for key, value in json_object.items():
|
||||
value = str(value)
|
||||
if key == 'gpu':
|
||||
table_values += value.rjust(3)
|
||||
elif key == 'power_usage':
|
||||
table_values += value.rjust(7)
|
||||
elif key in ('gfx_clock', 'mem_clock', 'encoder_clock', 'decoder_clock'):
|
||||
table_values += value.rjust(11)
|
||||
elif key == 'throttle_status':
|
||||
table_values += value.rjust(13)
|
||||
elif key == 'pcie_replay':
|
||||
table_values += value.rjust(13)
|
||||
elif key == 'vram_total':
|
||||
table_values += value.rjust(12)
|
||||
elif key == 'vram_used':
|
||||
table_values += value.rjust(11)
|
||||
elif 'ecc' in key:
|
||||
table_values += value.rjust(12)
|
||||
else:
|
||||
table_values += value.rjust(10)
|
||||
return table_values.rstrip()
|
||||
|
||||
# First Capitalize all keys in the json object
|
||||
capitalized_json = self._capitalize_keys(json_object)
|
||||
json_string = json.dumps(capitalized_json, indent=4)
|
||||
@@ -266,7 +294,7 @@ class AMDSMILogger():
|
||||
self.output = {}
|
||||
|
||||
|
||||
def print_output(self, multiple_device_enabled=False, watching_output=False):
|
||||
def print_output(self, multiple_device_enabled=False, watching_output=False, tabular=False):
|
||||
""" Print current output according to format and then destination
|
||||
params:
|
||||
multiple_device_enabled (bool) - True if printing output from
|
||||
@@ -280,10 +308,11 @@ class AMDSMILogger():
|
||||
watching_output=watching_output)
|
||||
elif self.is_csv_format():
|
||||
self._print_csv_output(multiple_device_enabled=multiple_device_enabled,
|
||||
watching_output=watching_output)
|
||||
watching_output=watching_output)
|
||||
elif self.is_human_readable_format():
|
||||
self._print_human_readable_output(multiple_device_enabled=multiple_device_enabled,
|
||||
watching_output=watching_output)
|
||||
watching_output=watching_output,
|
||||
tabular=tabular)
|
||||
|
||||
|
||||
def _print_json_output(self, multiple_device_enabled=False, watching_output=False):
|
||||
@@ -360,14 +389,18 @@ class AMDSMILogger():
|
||||
writer.writerows(stored_csv_output)
|
||||
|
||||
|
||||
def _print_human_readable_output(self, multiple_device_enabled=False, watching_output=False):
|
||||
def _print_human_readable_output(self, multiple_device_enabled=False, watching_output=False, tabular=False):
|
||||
human_readable_output = ''
|
||||
if tabular and not self.added_table_header:
|
||||
human_readable_output += self.table_header + '\n'
|
||||
self.added_table_header = True
|
||||
|
||||
if multiple_device_enabled:
|
||||
human_readable_output = ''
|
||||
for output in self.multiple_device_output:
|
||||
human_readable_output += self._convert_json_to_human_readable(output)
|
||||
human_readable_output += self._convert_json_to_human_readable(output, tabular=tabular)
|
||||
human_readable_output += '\n'
|
||||
else:
|
||||
human_readable_output = self._convert_json_to_human_readable(self.output)
|
||||
human_readable_output += self._convert_json_to_human_readable(self.output, tabular=tabular)
|
||||
|
||||
if self.destination == 'stdout':
|
||||
try:
|
||||
@@ -380,9 +413,13 @@ class AMDSMILogger():
|
||||
if watching_output:
|
||||
with self.destination.open('w') as output_file:
|
||||
human_readable_output = ''
|
||||
if tabular:
|
||||
human_readable_output += self.table_header + '\n'
|
||||
for output in self.watch_output:
|
||||
human_readable_output += self._convert_json_to_human_readable(output)
|
||||
human_readable_output += self._convert_json_to_human_readable(output, tabular=tabular)
|
||||
output_file.write(human_readable_output + '\n')
|
||||
else:
|
||||
with self.destination.open('a') as output_file:
|
||||
if tabular:
|
||||
human_readable_output += self.table_header + '\n'
|
||||
output_file.write(human_readable_output + '\n')
|
||||
|
||||
@@ -66,7 +66,8 @@ class AMDSMIParser(argparse.ArgumentParser):
|
||||
argparse (ArgumentParser): argparse.ArgumentParser
|
||||
"""
|
||||
def __init__(self, version, list, static, firmware, bad_pages, metric,
|
||||
process, profile, event, topology, set_value, reset, rocmsmi):
|
||||
process, profile, event, topology, set_value, reset, monitor,
|
||||
rocmsmi):
|
||||
|
||||
# Helper variables
|
||||
self.helpers = AMDSMIHelpers()
|
||||
@@ -105,6 +106,7 @@ class AMDSMIParser(argparse.ArgumentParser):
|
||||
self._add_topology_parser(self.subparsers, topology)
|
||||
self._add_set_value_parser(self.subparsers, set_value)
|
||||
self._add_reset_parser(self.subparsers, reset)
|
||||
self._add_monitor_parser(self.subparsers, monitor)
|
||||
self._add_rocm_smi_parser(self.subparsers, rocmsmi)
|
||||
|
||||
|
||||
@@ -813,6 +815,54 @@ class AMDSMIParser(argparse.ArgumentParser):
|
||||
reset_parser.add_argument('-o', '--power-cap', action='store_true', required=False, help=reset_power_cap_help)
|
||||
|
||||
|
||||
def _add_monitor_parser(self, subparsers, func):
|
||||
if not(self.helpers.is_baremetal() and self.helpers.is_linux()):
|
||||
# This subparser is only applicable to Baremetal Linux
|
||||
return
|
||||
|
||||
# Subparser help text
|
||||
monitor_help = "Monitor metrics for target devices"
|
||||
monitor_subcommand_help = "Monitor a target device for the specified arguments.\
|
||||
\nIf no arguments are provided, all arguments will be enabled.\
|
||||
\nUse the watch arguments to run continuously"
|
||||
monitor_optionals_title = "Monitor Arguments"
|
||||
|
||||
# Help text for Arguments only on Guest and BM platforms
|
||||
power_usage_help = "Monitor power usage in Watts"
|
||||
temperature_help = "Monitor temperature in Celsius"
|
||||
gfx_util_help = "Monitor graphics utilization (%%) and clock (MHz)"
|
||||
mem_util_help = "Monitor memory utilization (%%) and clock (MHz)"
|
||||
encoder_util_help = "Monitor encoder utilization (%%) and clock (MHz)"
|
||||
decoder_util_help = "Monitor decoder utilization (%%) and clock (MHz)"
|
||||
throttle_help = "Monitor thermal throttle status"
|
||||
ecc_help = "Monitor ECC single bit, ECC double bit, and PCIe replay error counts"
|
||||
mem_usage_help = "Monitor memory usage in MB"
|
||||
pcie_throughput_help = "Monitor PCIe Tx/Rx in MB/s"
|
||||
|
||||
# Create monitor subparser
|
||||
monitor_parser = subparsers.add_parser('monitor', help=monitor_help, description=monitor_subcommand_help)
|
||||
monitor_parser._optionals.title = monitor_optionals_title
|
||||
monitor_parser.formatter_class=lambda prog: AMDSMISubparserHelpFormatter(prog)
|
||||
monitor_parser.set_defaults(func=func)
|
||||
|
||||
# Add Universal Arguments
|
||||
self._add_command_modifiers(monitor_parser)
|
||||
self._add_device_arguments(monitor_parser, required=False)
|
||||
self._add_watch_arguments(monitor_parser)
|
||||
|
||||
# Add monitor arguments
|
||||
monitor_parser.add_argument('-p', '--power-usage', action='store_true', required=False, help=power_usage_help)
|
||||
monitor_parser.add_argument('-t', '--temperature', action='store_true', required=False, help=temperature_help)
|
||||
monitor_parser.add_argument('-u', '--gfx', action='store_true', required=False, help=gfx_util_help)
|
||||
monitor_parser.add_argument('-m', '--mem', action='store_true', required=False, help=mem_util_help)
|
||||
monitor_parser.add_argument('-n', '--encoder', action='store_true', required=False, help=encoder_util_help)
|
||||
monitor_parser.add_argument('-d', '--decoder', action='store_true', required=False, help=decoder_util_help)
|
||||
monitor_parser.add_argument('-s', '--throttle-status', action='store_true', required=False, help=throttle_help)
|
||||
monitor_parser.add_argument('-e', '--ecc', action='store_true', required=False, help=ecc_help)
|
||||
monitor_parser.add_argument('-v', '--vram-usage', action='store_true', required=False, help=mem_usage_help)
|
||||
monitor_parser.add_argument('-r', '--pcie', action='store_true', required=False, help=pcie_throughput_help)
|
||||
|
||||
|
||||
def _add_rocm_smi_parser(self, subparsers, func):
|
||||
return
|
||||
# Subparser help text
|
||||
|
||||
@@ -3244,7 +3244,7 @@ def amdsmi_get_gpu_metrics_temp_hotspot(
|
||||
processor_handle, amdsmi_wrapper.amdsmi_processor_handle
|
||||
)
|
||||
|
||||
hotspot_value = ctypes.c_int16()
|
||||
hotspot_value = ctypes.c_uint16()
|
||||
_check_res(
|
||||
amdsmi_wrapper.amdsmi_get_gpu_metrics_temp_hotspot(
|
||||
processor_handle, ctypes.byref(hotspot_value)
|
||||
@@ -3265,7 +3265,7 @@ def amdsmi_get_gpu_metrics_temp_mem(
|
||||
processor_handle, amdsmi_wrapper.amdsmi_processor_handle
|
||||
)
|
||||
|
||||
mem_value = ctypes.c_int16()
|
||||
mem_value = ctypes.c_uint16()
|
||||
_check_res(
|
||||
amdsmi_wrapper.amdsmi_get_gpu_metrics_temp_mem(
|
||||
processor_handle, ctypes.byref(mem_value)
|
||||
@@ -3286,7 +3286,7 @@ def amdsmi_get_gpu_metrics_temp_vrsoc(
|
||||
processor_handle, amdsmi_wrapper.amdsmi_processor_handle
|
||||
)
|
||||
|
||||
vrsoc_value = ctypes.c_int16()
|
||||
vrsoc_value = ctypes.c_uint16()
|
||||
_check_res(
|
||||
amdsmi_wrapper.amdsmi_get_gpu_metrics_temp_vrsoc(
|
||||
processor_handle, ctypes.byref(vrsoc_value)
|
||||
@@ -3754,7 +3754,7 @@ def amdsmi_get_gpu_metrics_vcn_activity(
|
||||
)
|
||||
)
|
||||
|
||||
return [vcn_activity.value for vcn_activity in vcn_activity_value]
|
||||
return vcn_activity_value
|
||||
|
||||
|
||||
def amdsmi_get_gpu_metrics_xgmi_read_data(
|
||||
@@ -3811,7 +3811,9 @@ def amdsmi_get_gpu_metrics_curr_gfxclk(
|
||||
)
|
||||
)
|
||||
|
||||
return [curr_gfxclk.value for curr_gfxclk in current_gfxclk_value]
|
||||
print([curr_gfxclk for curr_gfxclk in current_gfxclk_value])
|
||||
|
||||
return [curr_gfxclk for curr_gfxclk in current_gfxclk_value]
|
||||
|
||||
|
||||
def amdsmi_get_gpu_metrics_curr_socclk(
|
||||
@@ -3879,7 +3881,7 @@ def amdsmi_get_gpu_metrics_temp_edge(
|
||||
processor_handle, amdsmi_wrapper.amdsmi_processor_handle
|
||||
)
|
||||
|
||||
edge_value = ctypes.c_int16()
|
||||
edge_value = ctypes.c_uint16()
|
||||
|
||||
_check_res(
|
||||
amdsmi_wrapper.amdsmi_get_gpu_metrics_temp_edge(
|
||||
@@ -3901,7 +3903,7 @@ def amdsmi_get_gpu_metrics_temp_vrgfx(
|
||||
processor_handle, amdsmi_wrapper.amdsmi_processor_handle
|
||||
)
|
||||
|
||||
vrgfx_value = ctypes.c_int16()
|
||||
vrgfx_value = ctypes.c_uint16()
|
||||
|
||||
_check_res(
|
||||
amdsmi_wrapper.amdsmi_get_gpu_metrics_temp_vrgfx(
|
||||
@@ -3923,7 +3925,7 @@ def amdsmi_get_gpu_metrics_temp_vrmem(
|
||||
processor_handle, amdsmi_wrapper.amdsmi_processor_handle
|
||||
)
|
||||
|
||||
vrmem_value = ctypes.c_int16()
|
||||
vrmem_value = ctypes.c_uint16()
|
||||
|
||||
_check_res(
|
||||
amdsmi_wrapper.amdsmi_get_gpu_metrics_temp_vrmem(
|
||||
|
||||
Reference in New Issue
Block a user