diff --git a/projects/amdsmi/CHANGELOG.md b/projects/amdsmi/CHANGELOG.md index d6f7a33d65..b2d5930386 100644 --- a/projects/amdsmi/CHANGELOG.md +++ b/projects/amdsmi/CHANGELOG.md @@ -117,12 +117,12 @@ $ amd-smi - **Separated driver reload from `amdsmi_set_gpu_memory_partition()` / `amdsmi_set_gpu_memory_partition_mode()` and CLI (`sudo amd-smi set -M `)** - Providing new API (`amdsmi_gpu_driver_reload()`) and CLI (`sudo amd-smi reset -r` or `sudo amd-smi reset --reload-driver`) once user is ready to reload driver. We understand - the automatic reload could be at an inconvienient time. This is why we now provide this + the automatic reload could be at an inconvenient time. This is why we now provide this functionality in separate API/CLI commands to use when the time is right. - It is important to understand, the memory (NPS) partition change requires: - 1) Memory partition change request (`amdsmi_set_gpu_memory_partition()` / `amdsmi_set_gpu_memory_partition_mode()`) or CLI (`sudo amd-smi set -M `) - 2) Driver reload (`amdsmi_gpu_driver_reload()` / `sudo amd-smi reset -r` or `sudo amd-smi reset --reload-driver`) \[\*\] - \[\*\] Driver reload requires all GPU activity on all devices to be stopped. + 1) Memory partition change request (`amdsmi_set_gpu_memory_partition()` / `amdsmi_set_gpu_memory_partition_mode()`) or CLI (`sudo amd-smi set -M `) + 2) Driver reload (`amdsmi_gpu_driver_reload()` / `sudo amd-smi reset -r` or `sudo amd-smi reset --reload-driver`) \[\*\] + ***Driver reload requires all GPU activity on all devices to be stopped.*** - **Modified `amd-smi` CLI `monitor` and `metric` for violations**. - Disabled `amd-smi monitor --violation` on guests. @@ -164,9 +164,6 @@ $ amd-smi - `AMDSMI_EVT_NOTIF_PROCESS_START` - `AMDSMI_EVT_NOTIF_PROCESS_END` -- **Updated `amdsmi_get_clock_info` in `amdsmi_interface.py`**. - - The `clk_deep_sleep` field now returns the sleep integer value. - - **Added Power Cap to `amd-smi monitor`**. 
- `amd-smi monitor -p` will display the power cap along with power. @@ -357,6 +354,8 @@ $ amd-smi - **Removed duplicated GPU IDs when receiving events using the `amd-smi event` command**. +- **Fixed `amd-smi monitor` decoder utilization (`DEC%`) not showing up on MI3x ASICs**. + ### Upcoming changes - N/A @@ -800,7 +799,7 @@ Updated `amdsmi_get_gpu_metrics_info()` and structure `amdsmi_gpu_metrics_t` to ### Changed -- **AMDSMI Library Version number to reflect changes in backwards compatability**. +- **AMDSMI Library Version number to reflect changes in backwards compatibility**. - Removed Year from AMDSMI Library version number. - Version changed from 25.2.0.0 (Year.Major.Minor.Patch) to 25.2.0 (Major.Minor.Patch) - Removed year in all version references @@ -852,7 +851,7 @@ Functions affected by struct change are: - **Added violation status output for Graphics Clock Below Host Limit to our CLI: `amdsmi_get_violation_status()`, `amd-smi metric --throttle`, and `amd-smi monitor --violation`**. ***Only available for MI300+ ASICs.*** Users can retrieve violation status' through either our Python or C++ APIs. - Additionally, we have added capability to view these outputs conviently through `amd-smi metric --throttle` and `amd-smi monitor --violation`. + Additionally, we have added capability to view these outputs conveniently through `amd-smi metric --throttle` and `amd-smi monitor --violation`. Example outputs are listed below (below is for reference, output is subject to change): ```console @@ -963,7 +962,7 @@ Functions affected by struct change are: ... 
``` -- **Changed amd-smi partition --accelerator & `amdsmi_get_gpu_accelerator_partition_profile_config()` detect users running without root/sudo privledges** +- **Changed amd-smi partition --accelerator & `amdsmi_get_gpu_accelerator_partition_profile_config()` detect users running without root/sudo permissions** - Updated `amdsmi_get_gpu_accelerator_partition_profile_config()` to return `AMDSMI_STATUS_NO_PERM` immediately if users run without root/sudo permissions. - Updated `amd-smi partition --accelerator` to provide a warning for users without root/sudo permissions (see example below, ***output subject to change***). diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_cli.py b/projects/amdsmi/amdsmi_cli/amdsmi_cli.py index 94a6ee2222..5c0708ce1f 100755 --- a/projects/amdsmi/amdsmi_cli/amdsmi_cli.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_cli.py @@ -75,12 +75,64 @@ def _print_error(e, destination): f = open(destination, "w", encoding="utf-8") f.write(e) f.close() - print("Error occured. Result written to " + str(destination) + " file") + print("Error occurred. Result written to " + str(destination) + " file") + +def configure_logging_and_execute(args, amd_smi_commands): + """ + Configures logging based on the provided arguments and executes the subcommand. + + Args: + args: Parsed command-line arguments. + amd_smi_commands: Instance of AMDSMICommands. 
+ """ + # Remove previous log handlers + for handler in logging.root.handlers[:]: + logging.root.removeHandler(handler) + + # To enable debug logs in AMD SMI library: + # set RSMI_LOGGING = 1 for logging to files + # set RSMI_LOGGING = 2 for logging to stdout + # set RSMI_LOGGING = 3 for logging to stdout and files + # set RSMI_LOGGING = 0 to disable logging + # Files will be located in /var/log/amd_smi_lib/AMD-SMI-lib.log* + + # log string with the following format: + # loglevel | YYYY-MM-DD HH:MM:SS.ms | filename:line | message + logging_dict = { + 'DEBUG': logging.DEBUG, + 'INFO': logging.INFO, + 'WARNING': logging.WARNING, + 'ERROR': logging.ERROR, + 'CRITICAL': logging.CRITICAL + } + + time = '%(asctime)s.%(msecs)03d' + datefmt = '%Y-%m-%d %H:%M:%S' + logging.basicConfig(format='%(levelname)s | ' + time + ' | %(filename)s:%(lineno)d | %(message)s', + level=logging_dict[args.loglevel], datefmt=datefmt) + + # Disable traceback for non-debug log levels + if args.loglevel == "DEBUG": + sys.tracebacklimit = 10 + else: + sys.tracebacklimit = -1 + + logging.debug(args) + + # Execute subcommands + try: + args.func(args) + except amdsmi_cli_exceptions.AmdSmiException as e: + _print_error(str(e), amd_smi_commands.logger.destination) + except amdsmi_exception.AmdSmiLibraryException as e: + exc = amdsmi_cli_exceptions.AmdSmiLibraryErrorException(amd_smi_commands.logger.format, e.get_error_code()) + _print_error(str(exc), amd_smi_commands.logger.destination) if __name__ == "__main__": # Disable traceback before possible init errors in AMDSMICommands and AMDSMIParser - if "DEBUG" in sys.argv: + copy_argv = str(sys.argv.copy()).upper() + if "DEBUG" in copy_argv: sys.tracebacklimit = 10 else: sys.tracebacklimit = -1 @@ -107,57 +159,31 @@ if __name__ == "__main__": sys_argv=sys.argv, helpers=amd_smi_helpers) try: - try: - argcomplete.autocomplete(amd_smi_parser) - except NameError: - logging.debug("argcomplete module not found. 
Autocomplete will not work.") + argcomplete.autocomplete(amd_smi_parser) + except NameError: + logging.debug("argcomplete module not found. Autocomplete will not work.") - # Store possible subcommands & aliases for later errors - valid_commands = amd_smi_parser.possible_commands - valid_commands += ['--help', '-h'] + # Store possible subcommands & aliases for later errors + valid_commands = amd_smi_parser.possible_commands + valid_commands += ['--help', '-h'] - sys.argv = [arg.lower() if arg.startswith('--') or not arg.startswith('-') - else arg for arg in sys.argv] - if len(sys.argv) == 1: - args = amd_smi_parser.parse_args(args=['default']) - elif sys.argv[1] in valid_commands: - args = amd_smi_parser.parse_args(args=None) - else: - raise amdsmi_cli_exceptions.AmdSmiInvalidSubcommandException(sys.argv[1],amd_smi_commands.logger.destination) + sys.argv = [arg.lower() if arg.startswith('--') or not arg.startswith('-') + else arg for arg in sys.argv] + if len(sys.argv) == 1: + args = amd_smi_parser.parse_args(args=['default']) + elif sys.tracebacklimit == 10 and (sys.argv[1] == '--loglevel'): + args = amd_smi_parser.parse_args(args=['default', '--loglevel'] + sys.argv[2:]) + elif sys.argv[1] in valid_commands: + args = amd_smi_parser.parse_args(args=None) + else: + raise amdsmi_cli_exceptions.AmdSmiInvalidSubcommandException(sys.argv[1],amd_smi_commands.logger.destination) - # Handle command modifiers before subcommand execution - # human readable is the default output format - if hasattr(args, 'json') and args.json: - amd_smi_commands.logger.format = amd_smi_commands.logger.LoggerFormat.json.value - if hasattr(args, 'csv') and args.csv: - amd_smi_commands.logger.format = amd_smi_commands.logger.LoggerFormat.csv.value - if hasattr(args, 'file') and args.file: - amd_smi_commands.logger.destination = args.file - - # Remove previous log handlers - for handler in logging.root.handlers[:]: - logging.root.removeHandler(handler) - - logging_dict = {'DEBUG' : logging.DEBUG, 
- 'INFO' : logging.INFO, - 'WARNING': logging.WARNING, - 'ERROR': logging.ERROR, - 'CRITICAL': logging.CRITICAL} - # To enable debug logs on rocm-smi library set RSMI_LOGGING = 1 in environment - logging.basicConfig(format='%(levelname)s: %(message)s', level=logging_dict[args.loglevel]) - - # Disable traceback for non-debug log levels - if args.loglevel == "DEBUG": - sys.tracebacklimit = 10 - else: - sys.tracebacklimit = -1 - - logging.debug(args) - - # Execute subcommands - args.func(args) - except amdsmi_cli_exceptions.AmdSmiException as e: - _print_error(str(e), amd_smi_commands.logger.destination) - except amdsmi_exception.AmdSmiLibraryException as e: - exc = amdsmi_cli_exceptions.AmdSmiLibraryErrorException(amd_smi_commands.logger.format, e.get_error_code()) - _print_error(str(exc), amd_smi_commands.logger.destination) + # Handle command modifiers before subcommand execution + # human readable is the default output format + if hasattr(args, 'json') and args.json: + amd_smi_commands.logger.format = amd_smi_commands.logger.LoggerFormat.json.value + if hasattr(args, 'csv') and args.csv: + amd_smi_commands.logger.format = amd_smi_commands.logger.LoggerFormat.csv.value + if hasattr(args, 'file') and args.file: + amd_smi_commands.logger.destination = args.file + configure_logging_and_execute(args, amd_smi_commands) diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py index 858d03b73e..f0ead15102 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py @@ -27,6 +27,7 @@ import os import sys import threading import time +import copy from _version import __version__ from amdsmi_cli_exceptions import AmdSmiInvalidParameterException, AmdSmiRequiredCommandException, AmdSmiInvalidCommandException @@ -1430,7 +1431,29 @@ class AMDSMICommands(): def build_xcp_dict(self, key, violation_status, num_partition): - return {f"xcp_{i}": violation_status[key][i] for i in 
range(num_partition)}
+        """Return violation_status[key] for display; booleans under "active_" keys become "ACTIVE"/"NOT ACTIVE".
+
+        A non-list value is returned as is. A list is treated as rows of per-XCP values ("per_"/"acc_" rows pass through
+        untouched) and returned as {"xcp_<i>": violation_status[key][i]} for i in range(num_partition); updates are in place.
+        """
+        if not isinstance(violation_status[key], list):
+            if "active_" in key:
+                if violation_status[key] is True:
+                    violation_status[key] = "ACTIVE"
+                elif violation_status[key] is False:
+                    violation_status[key] = "NOT ACTIVE"
+            ret = violation_status[key]
+        else:
+            for row in violation_status[key]:
+                for index, element in enumerate(row):
+                    if "active_" in key:
+                        # Assign by position: row.index() matches by equality, and True == 1 / False == 0.
+                        if element is True:
+                            row[index] = "ACTIVE"
+                        elif element is False:
+                            row[index] = "NOT ACTIVE"
+            ret = {f"xcp_{i}": violation_status[key][i] for i in range(num_partition)}
+        return ret
 
     def metric_gpu(self, args, multiple_devices=False, watching_output=False, gpu=None,
                 usage=None, watch=None, watch_time=None, iterations=None, power=None,
@@ -1469,7 +1492,7 @@ class AMDSMICommands():
             guest_data (bool, optional): Value override for args.guest_data. Defaults to None.
             fb_usage (bool, optional): Value override for args.fb_usage. Defaults to None.
             xgmi (bool, optional): Value override for args.xgmi. Defaults to None.
-            throttle (bool, optional): Value override for args.violation. Defaults to None.
+            throttle (bool, optional): Value override for args.throttle. Defaults to None.
Raises: IndexError: Index error if gpu list is empty @@ -1506,6 +1529,8 @@ class AMDSMICommands(): args.clock = clock if temperature: args.temperature = temperature + if voltage: + args.voltage = voltage if pcie: args.pcie = pcie if ecc: @@ -1532,10 +1557,11 @@ class AMDSMICommands(): args.energy = energy if throttle: args.violation = throttle + args.throttle = throttle current_platform_args += ["fan", "voltage_curve", "overdrive", "perf_level", "xgmi_err", "energy", "throttle"] current_platform_values += [args.fan, args.voltage_curve, args.overdrive, - args.perf_level, args.xgmi_err, args.energy, args.violation, + args.perf_level, args.xgmi_err, args.energy, args.throttle, ] if self.helpers.is_hypervisor(): @@ -1636,88 +1662,22 @@ class AMDSMICommands(): gpu_metric = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu) except amdsmi_exception.AmdSmiLibraryException as e: logging.debug("#3 - Unable to load GPU Metrics table for %s | %s", gpu_id, e.get_error_info()) - gpu_metric = { - "temperature_edge": "N/A", - "temperature_hotspot": "N/A", - "temperature_mem": "N/A", - "temperature_vrgfx": "N/A", - "temperature_vrsoc": "N/A", - "temperature_vrmem": "N/A", - "average_gfx_activity": "N/A", - "average_umc_activity": "N/A", - "average_mm_activity": "N/A", - "average_socket_power": "N/A", - "energy_accumulator": "N/A", - "system_clock_counter": "N/A", - "average_gfxclk_frequency": "N/A", - "average_socclk_frequency": "N/A", - "average_uclk_frequency": "N/A", - "average_vclk0_frequency": "N/A", - "average_dclk0_frequency": "N/A", - "average_vclk1_frequency": "N/A", - "average_dclk1_frequency": "N/A", - "current_gfxclk": "N/A", - "current_socclk": "N/A", - "current_uclk": "N/A", - "current_vclk0": "N/A", - "current_dclk0": "N/A", - "current_vclk1": "N/A", - "current_dclk1": "N/A", - "throttle_status": "N/A", - "current_fan_speed": "N/A", - "pcie_link_width": "N/A", - "pcie_link_speed": "N/A", - "gfx_activity_acc": "N/A", - "mem_activity_acc": "N/A", - 
"temperature_hbm": "N/A", - "firmware_timestamp": "N/A", - "voltage_soc": "N/A", - "voltage_gfx": "N/A", - "voltage_mem": "N/A", - "indep_throttle_status": "N/A", - "current_socket_power": "N/A", - "vcn_activity": "N/A", - "gfxclk_lock_status": "N/A", - "xgmi_link_width": "N/A", - "xgmi_link_speed": "N/A", - "pcie_bandwidth_acc": "N/A", - "pcie_bandwidth_inst": "N/A", - "pcie_l0_to_recov_count_acc": "N/A", - "pcie_replay_count_acc": "N/A", - "pcie_replay_rover_count_acc": "N/A", - "xgmi_read_data_acc": "N/A", - "xgmi_write_data_acc": "N/A", - "current_gfxclks": "N/A", - "current_socclks": "N/A", - "current_vclk0s": "N/A", - "current_dclk0s": "N/A", - "jpeg_activity": "N/A", - "pcie_nak_sent_count_acc": "N/A", - "pcie_nak_rcvd_count_acc": "N/A", - "accumulation_counter": "N/A", - "prochot_residency_acc": "N/A", - "ppt_residency_acc": "N/A", - "socket_thm_residency_acc": "N/A", - "vr_thm_residency_acc": "N/A", - "hbm_thm_residency_acc": "N/A", - "num_partition": "N/A", - "xcp_stats.gfx_busy_inst": "N/A", - "xcp_stats.jpeg_busy": "N/A", - "xcp_stats.vcn_busy": "N/A", - "xcp_stats.gfx_busy_acc": "N/A", - "xcp_stats.gfx_below_host_limit_acc": "N/A", - "xcp_stats.gfx_below_host_limit_ppt_acc": "N/A", - "xcp_stats.gfx_below_host_limit_thm_acc": "N/A", - "xcp_stats.gfx_low_utilization_acc": "N/A", - "xcp_stats.gfx_below_host_limit_total_acc": "N/A", - "xcp_stats.gfx_below_host_limit_ppt_per": "N/A", - "xcp_stats.gfx_below_host_limit_thm_per": "N/A", - "xcp_stats.gfx_low_utilization_per": "N/A", - "xcp_stats.gfx_below_host_limit_total_per": "N/A", - "pcie_lc_perf_other_end_recovery": "N/A", - "vram_max_bandwidth": "N/A", - "xgmi_link_status": "N/A", - } + gpu_metric = amdsmi_interface._NA_amdsmi_get_gpu_metrics_info() + + # Workaround for XCP (partition) metrics not providing num_partition in v1.0 + # Confirmed with driver team that we can default to 1 if num_partition is not defined. + # Pending partitions exist, ie. partition_id > 0. See logic below. 
+ try: + partition_id = amdsmi_interface.amdsmi_get_gpu_kfd_info(args.gpu)['current_partition_id'] + except amdsmi_exception.AmdSmiLibraryException as e: + logging.debug("Failed to get current partition id for gpu %s | %s", gpu_id, e.get_error_info()) + partition_id = "N/A" + + num_partition = gpu_metric['num_partition'] + if num_partition == "N/A" and isinstance(partition_id, int) and partition_id > 0: + num_partition = 1 # Workaround for XCP metrics not providing num_partition in v1.0 + logging.debug(f"num_partition is N/A and partition_id: {partition_id} (greater > 0).\nModified num_partition: {num_partition} to adjust for XCP metrics.") + if self.logger.is_json_format(): values_dict['gpu'] = int(gpu_id) # Populate the pcie_dict first due to multiple gpu metrics calls incorrectly increasing bandwidth @@ -1821,7 +1781,6 @@ class AMDSMICommands(): # TODO: move vcn_activity and jpeg_activity into amdsmi_get_gpu_activity engine_usage['vcn_activity'] = gpu_metric['vcn_activity'] engine_usage['jpeg_activity'] = gpu_metric['jpeg_activity'] - num_partition = gpu_metric['num_partition'] engine_usage['gfx_busy_inst'] = "N/A" engine_usage['jpeg_busy'] = "N/A" engine_usage['vcn_busy'] = "N/A" @@ -2560,7 +2519,7 @@ class AMDSMICommands(): values_dict['mem_usage'] = memory_usage if "throttle" in current_platform_args: - if args.violation: + if args.throttle: throttle_status = { # Current values - counter/accumulated 'accumulation_counter': "N/A", @@ -2571,9 +2530,9 @@ class AMDSMICommands(): 'hbm_thermal_accumulated': "N/A", 'gfx_clk_below_host_limit_accumulated': "N/A", # deprecated 'gfx_clk_below_host_limit_power_accumulated': "N/A", - 'gfx_clk_below_host_limit_thermal_violation_accumulated': "N/A", - 'gfx_clk_below_host_limit_violation_accumulated': "N/A", - 'low_utilization_violation_accumulated': "N/A", + 'gfx_clk_below_host_limit_thermal_accumulated': "N/A", + 'total_gfx_clk_below_host_limit_accumulated': "N/A", + 'low_utilization_accumulated': "N/A", # violation status 
values - active/not active 'prochot_violation_status': "N/A", @@ -2581,9 +2540,10 @@ class AMDSMICommands(): 'socket_thermal_violation_status': "N/A", 'vr_thermal_violation_status': "N/A", 'hbm_thermal_violation_status': "N/A", + 'gfx_clk_below_host_limit_violation_status': "N/A", # deprecated 'gfx_clk_below_host_limit_power_violation_status': "N/A", 'gfx_clk_below_host_limit_thermal_violation_status': "N/A", - 'gfx_clk_below_host_limit_violation_status': "N/A", + 'total_gfx_clk_below_host_limit_violation_status': "N/A", 'low_utilization_violation_status': "N/A", # violation activity values - percent @@ -2592,12 +2552,12 @@ class AMDSMICommands(): 'socket_thermal_violation_activity': "N/A", 'vr_thermal_violation_activity': "N/A", 'hbm_thermal_violation_activity': "N/A", + 'gfx_clk_below_host_limit_violation_activity': "N/A", # deprecated 'gfx_clk_below_host_limit_power_violation_activity': "N/A", 'gfx_clk_below_host_limit_thermal_violation_activity': "N/A", - 'gfx_clk_below_host_limit_violation_activity': "N/A", + 'total_gfx_clk_below_host_limit_violation_activity': "N/A", 'low_utilization_violation_activity': "N/A", } - num_partition = gpu_metric['num_partition'] try: violation_status = amdsmi_interface.amdsmi_get_violation_status(args.gpu) @@ -2609,18 +2569,18 @@ class AMDSMICommands(): throttle_status['hbm_thermal_accumulated'] = violation_status['acc_hbm_thrm'] throttle_status['gfx_clk_below_host_limit_accumulated'] = violation_status['acc_gfx_clk_below_host_limit'] #deprecated throttle_status['gfx_clk_below_host_limit_power_accumulated'] = self.build_xcp_dict('acc_gfx_clk_below_host_limit_pwr', violation_status, num_partition) - throttle_status['gfx_clk_below_host_limit_thermal_violation_accumulated'] = self.build_xcp_dict('acc_gfx_clk_below_host_limit_thm', violation_status, num_partition) - throttle_status['gfx_clk_below_host_limit_violation_accumulated'] = self.build_xcp_dict('acc_gfx_clk_below_host_limit_total', violation_status, num_partition) - 
throttle_status['low_utilization_violation_accumulated'] = self.build_xcp_dict('acc_low_utilization', violation_status, num_partition) - throttle_status['prochot_violation_status'] = violation_status['active_prochot_thrm'] - throttle_status['ppt_violation_status'] = violation_status['active_ppt_pwr'] - throttle_status['socket_thermal_violation_status'] = violation_status['active_socket_thrm'] - throttle_status['vr_thermal_violation_status'] = violation_status['active_vr_thrm'] - throttle_status['hbm_thermal_violation_status'] = violation_status['active_hbm_thrm'] - throttle_status['gfx_clk_below_host_limit_violation_status'] = violation_status['active_gfx_clk_below_host_limit'] # deprecated + throttle_status['gfx_clk_below_host_limit_thermal_accumulated'] = self.build_xcp_dict('acc_gfx_clk_below_host_limit_thrm', violation_status, num_partition) + throttle_status['total_gfx_clk_below_host_limit_accumulated'] = self.build_xcp_dict('acc_gfx_clk_below_host_limit_total', violation_status, num_partition) + throttle_status['low_utilization_accumulated'] = self.build_xcp_dict('acc_low_utilization', violation_status, num_partition) + throttle_status['prochot_violation_status'] = self.build_xcp_dict('active_prochot_thrm', violation_status, num_partition) + throttle_status['ppt_violation_status'] = self.build_xcp_dict('active_ppt_pwr', violation_status, num_partition) + throttle_status['socket_thermal_violation_status'] = self.build_xcp_dict('active_socket_thrm', violation_status, num_partition) + throttle_status['vr_thermal_violation_status'] = self.build_xcp_dict('active_vr_thrm', violation_status, num_partition) + throttle_status['hbm_thermal_violation_status'] = self.build_xcp_dict('active_hbm_thrm', violation_status, num_partition) + throttle_status['gfx_clk_below_host_limit_violation_status'] = self.build_xcp_dict('active_gfx_clk_below_host_limit', violation_status, num_partition) # deprecated throttle_status['gfx_clk_below_host_limit_power_violation_status'] = 
self.build_xcp_dict('active_gfx_clk_below_host_limit_pwr', violation_status, num_partition) - throttle_status['gfx_clk_below_host_limit_thermal_violation_status'] = self.build_xcp_dict('active_gfx_clk_below_host_limit_thm', violation_status, num_partition) - throttle_status['gfx_clk_below_host_limit_violation_status'] = self.build_xcp_dict('active_gfx_clk_below_host_limit_total', violation_status, num_partition) + throttle_status['gfx_clk_below_host_limit_thermal_violation_status'] = self.build_xcp_dict('active_gfx_clk_below_host_limit_thrm', violation_status, num_partition) + throttle_status['total_gfx_clk_below_host_limit_violation_status'] = self.build_xcp_dict('active_gfx_clk_below_host_limit_total', violation_status, num_partition) throttle_status['low_utilization_violation_status'] = self.build_xcp_dict('active_low_utilization', violation_status, num_partition) throttle_status['prochot_violation_activity'] = violation_status['per_prochot_thrm'] throttle_status['ppt_violation_activity'] = violation_status['per_ppt_pwr'] @@ -2629,20 +2589,15 @@ class AMDSMICommands(): throttle_status['hbm_thermal_violation_activity'] = violation_status['per_hbm_thrm'] throttle_status['gfx_clk_below_host_limit_violation_activity'] = violation_status['per_gfx_clk_below_host_limit'] # deprecated throttle_status['gfx_clk_below_host_limit_power_violation_activity'] = self.build_xcp_dict('per_gfx_clk_below_host_limit_pwr', violation_status, num_partition) - throttle_status['gfx_clk_below_host_limit_thermal_violation_activity'] = self.build_xcp_dict('per_gfx_clk_below_host_limit_thm', violation_status, num_partition) - throttle_status['gfx_clk_below_host_limit_violation_activity'] = self.build_xcp_dict('per_low_utilization', violation_status, num_partition) - throttle_status['low_utilization_violation_activity'] = self.build_xcp_dict('per_gfx_clk_below_host_limit_total', violation_status, num_partition) + throttle_status['gfx_clk_below_host_limit_thermal_violation_activity'] = 
self.build_xcp_dict('per_gfx_clk_below_host_limit_thrm', violation_status, num_partition)
+                        throttle_status['total_gfx_clk_below_host_limit_violation_activity'] = self.build_xcp_dict('per_gfx_clk_below_host_limit_total', violation_status, num_partition)
+                        throttle_status['low_utilization_violation_activity'] = self.build_xcp_dict('per_low_utilization', violation_status, num_partition)
                     except amdsmi_exception.AmdSmiLibraryException as e:
                         values_dict['throttle'] = throttle_status
                         logging.debug("Failed to get violation status' for gpu %s | %s", gpu_id, e.get_error_info())
 
                     for key, value in throttle_status.items():
-                        if "_status" in key:
-                            if value is True:
-                                throttle_status[key] = "ACTIVE"
-                            elif value is False:
-                                throttle_status[key] = "NOT ACTIVE"
 
                         activity_unit = ''
                         if "_activity" in key:
@@ -2651,21 +2606,18 @@ class AMDSMICommands():
                         if self.logger.is_human_readable_format():
                             if isinstance(value, (list, dict)):
                                 for k, v in value.items():
-                                  for index, activity in enumerate(v):
-                                      if activity != "N/A":
-                                          value[k][index] = f"{activity} {activity_unit}"
-                                  value[k] = '[' + ", ".join(value[k]) + ']'
+                                    for index, activity in enumerate(v):
+                                        value[k][index] = self.helpers.unit_format(self.logger, activity, activity_unit)
+                                    value[k] = '[' + ", ".join(value[k]) + ']'
                             elif value != "N/A":
-                                throttle_status[key] = f"{value} {activity_unit}"
+                                throttle_status[key] = self.helpers.unit_format(self.logger, value, activity_unit)  # write back; rebinding `value` alone would drop the formatted result
                         if self.logger.is_json_format():
-                            if isinstance(value, list):
-                                for index, activity in enumerate(value):
-                                    if activity != "N/A":
-                                        throttle_status[key][index] = {"value" : activity,
-                                                                       "unit" : activity_unit}
+                            if isinstance(value, (list, dict)):
+                                for k, v in value.items():
+                                    for index, activity in enumerate(v):
+                                        value[k][index] = self.helpers.unit_format(self.logger, activity, activity_unit)
                             elif value != "N/A":
-                                throttle_status[key] = {"value" : value,
-                                                        "unit" : activity_unit}
+                                throttle_status[key] = self.helpers.unit_format(self.logger, value, activity_unit)
values_dict['throttle'] = throttle_status # Store timestamp first if watching_output is enabled @@ -5525,7 +5477,6 @@ class AMDSMICommands(): self.logger.clear_multiple_devices_output() return - def monitor(self, args, multiple_devices=False, watching_output=False, gpu=None, watch=None, watch_time=None, iterations=None, power_usage=None, temperature=None, gfx_util=None, mem_util=None, encoder=None, @@ -5691,9 +5642,29 @@ class AMDSMICommands(): gpu_metric_debug_info = json.dumps(gpu_metrics_info, indent=4) logging.debug("GPU Metrics table for GPU %s | %s", gpu_id, gpu_metric_debug_info) except amdsmi_exception.AmdSmiLibraryException as e: - gpu_metrics_info = {} # Empty dict to avoid NameError + gpu_metrics_info = amdsmi_interface._NA_amdsmi_get_gpu_metrics_info() logging.debug("Unable to load GPU Metrics table for %s | %s", gpu_id, e.get_error_info()) + # Workaround for XCP (partition) metrics not providing num_partition in v1.0 + # Confirmed with driver team that we can default to 1 if num_partition is not defined. + # Pending partitions exist, ie. partition_id > 0. See logic below. + try: + partition_id = amdsmi_interface.amdsmi_get_gpu_kfd_info(args.gpu)['current_partition_id'] + except amdsmi_exception.AmdSmiLibraryException as e: + logging.debug("Failed to get current partition id for gpu %s | %s", gpu_id, e.get_error_info()) + partition_id = "N/A" + + num_partition = gpu_metrics_info['num_partition'] + if num_partition == "N/A": + num_partition = partition_id + + num_xcp = num_partition # used later for XCP metrics + self.logger.table_header += 'XCP'.rjust(5, ' ') + self.logger.store_output(args.gpu, 'xcp', partition_id) # Starting with partition_id. + # Outputs which have xcp details + # will update this value via num_xcp. + # This value will help map to primary device. 
+ # Store the pcie_bw values due to possible increase in bandwidth due to repeated gpu_metrics calls if args.pcie: try: @@ -5725,10 +5696,11 @@ class AMDSMICommands(): self.logger.table_header += 'POWER'.rjust(7) if args.power_usage and not args.default_output: - # Get Max Power Cap + # Get Current Power Cap try: power_cap_info = amdsmi_interface.amdsmi_get_power_cap_info(args.gpu) - monitor_values['max_power'] = power_cap_info['max_power_cap'] + monitor_values['max_power'] = power_cap_info['power_cap'] # Get current power cap (`power_cap`) socket is set to + # `max_power_cap`, is the maximum value it can be set to monitor_values['max_power'] = self.helpers.convert_SI_unit(monitor_values['max_power'], AMDSMIHelpers.SI_Unit.MICRO) if self.logger.is_human_readable_format() and monitor_values['max_power'] != "N/A": @@ -5785,7 +5757,7 @@ class AMDSMICommands(): monitor_values['gfx_clk'] = f"{monitor_values['gfx_clk']} {freq_unit}" if self.logger.is_json_format(): monitor_values['gfx_clk'] = {"value" : monitor_values['gfx_clk'], - "unit" : freq_unit} + "unit" : freq_unit} except (KeyError, amdsmi_exception.AmdSmiLibraryException) as e: monitor_values['gfx_clk'] = "N/A" @@ -5795,13 +5767,13 @@ class AMDSMICommands(): try: gfx_util = gpu_metrics_info['average_gfx_activity'] - monitor_values['gfx'] = round(gfx_util) activity_unit = '%' if gfx_util != "N/A": - if self.logger.is_human_readable_format(): - monitor_values['gfx'] = f"{monitor_values['gfx']} {activity_unit}" - if self.logger.is_json_format(): - monitor_values['gfx'] = {"value" : monitor_values['gfx'], + monitor_values['gfx'] = gfx_util + if self.logger.is_human_readable_format(): + monitor_values['gfx'] = f"{monitor_values['gfx']} {activity_unit}" + if self.logger.is_json_format(): + monitor_values['gfx'] = {"value" : monitor_values['gfx'], "unit" : activity_unit} except (KeyError, amdsmi_exception.AmdSmiLibraryException) as e: monitor_values['gfx'] = "N/A" @@ -5812,14 +5784,14 @@ class AMDSMICommands(): if 
args.mem: try: mem_util = gpu_metrics_info['average_umc_activity'] - monitor_values['mem'] = round(mem_util) activity_unit = '%' if mem_util != "N/A": - if self.logger.is_human_readable_format(): - monitor_values['mem'] = f"{monitor_values['mem']} {activity_unit}" - if self.logger.is_json_format(): - monitor_values['mem'] = {"value" : monitor_values['mem'], - "unit" : activity_unit} + monitor_values['mem'] = mem_util + if self.logger.is_human_readable_format(): + monitor_values['mem'] = f"{monitor_values['mem']} {activity_unit}" + if self.logger.is_json_format(): + monitor_values['mem'] = {"value" : monitor_values['mem'], + "unit" : activity_unit} except (KeyError, amdsmi_exception.AmdSmiLibraryException) as e: monitor_values['mem'] = "N/A" logging.debug("Failed to get mem utilization on gpu %s | %s", gpu_id, e) @@ -5878,19 +5850,13 @@ class AMDSMICommands(): if args.decoder: try: # Get List of vcn activity values - # Note: MI3x ASICs only support decoding, so the vcn_activity is used for decoding activity. + # Note: MI3x ASICs only support decoding, so the vcn_activity/vcn_busy + # is used for decoding activity. 
decoder_util = gpu_metrics_info['vcn_activity'] - decoding_activity_avg = [] - for value in decoder_util: - if isinstance(value, int): - decoding_activity_avg.append(value) - - # Averaging the possible decoding activity values - if decoding_activity_avg: - decoding_activity_avg = round(sum(decoding_activity_avg) / len(decoding_activity_avg)) - else: - decoding_activity_avg = "N/A" - + if (gpu_metrics_info['vcn_activity'][0] == "N/A" and + gpu_metrics_info['xcp_stats.vcn_busy'][partition_id][0] != "N/A"): + decoder_util = gpu_metrics_info['xcp_stats.vcn_busy'][partition_id] + decoding_activity_avg = self.helpers.average_flattened_ints(decoder_util, context="decoder_util") monitor_values['decoder'] = decoding_activity_avg activity_unit = '%' @@ -6050,6 +6016,10 @@ class AMDSMICommands(): "vr_tviol": "N/A", "hbm_tviol": "N/A", "gfx_clkviol": "N/A", + "gfxclk_pviol": "N/A", + "gfxclk_tviol": "N/A", + "gfxclk_totalviol": "N/A", + "low_utilviol": "N/A" } try: violations = amdsmi_interface.amdsmi_get_violation_status(args.gpu) @@ -6060,6 +6030,10 @@ class AMDSMICommands(): violation_status['vr_tviol'] = violations['per_vr_thrm'] violation_status['hbm_tviol'] = violations['per_hbm_thrm'] violation_status['gfx_clkviol'] = violations['per_gfx_clk_below_host_limit'] + violation_status['gfxclk_pviol'] = violations['per_gfx_clk_below_host_limit_pwr'] + violation_status['gfxclk_tviol'] = violations['per_gfx_clk_below_host_limit_thrm'] + violation_status['gfxclk_totalviol'] = violations['per_gfx_clk_below_host_limit_total'] + violation_status['low_utilviol'] = violations['per_low_utilization'] except amdsmi_exception.AmdSmiLibraryException as e: monitor_values['pviol'] = violation_status['pviol'] monitor_values['tviol'] = violation_status['tviol'] @@ -6068,6 +6042,10 @@ class AMDSMICommands(): monitor_values['vr_tviol'] = violation_status['vr_tviol'] monitor_values['hbm_tviol'] = violation_status['hbm_tviol'] monitor_values['gfx_clkviol'] = violation_status['gfx_clkviol'] + 
monitor_values['gfxclk_pviol'] = violation_status['gfxclk_pviol'] + monitor_values['gfxclk_tviol'] = violation_status['gfxclk_tviol'] + monitor_values['gfxclk_totalviol'] = violation_status['gfxclk_totalviol'] + monitor_values['low_utilviol'] = violation_status['low_utilviol'] logging.debug("Failed to get violation status on gpu %s | %s", gpu_id, e.get_error_info()) violation_status_unit = "%" kPVIOL_MAX_WIDTH = 7 @@ -6077,23 +6055,32 @@ class AMDSMICommands(): kVR_MAX_WIDTH = 10 kHBM_MAX_WIDTH = 11 kGFXC_MAX_WIDTH = 13 + kGFXC_PVIOL_MAX_WIDTH = 58 + kGFXC_TVIOL_MAX_WIDTH = kGFXC_PVIOL_MAX_WIDTH + kGFXC_TOTALVIOL_MAX_WIDTH = kGFXC_PVIOL_MAX_WIDTH + kLOW_UTILVIOL_MAX_WIDTH = kGFXC_PVIOL_MAX_WIDTH for key, value in violation_status.items(): - if value != "N/A": - if key == "tviol_active": - monitor_values[key] = value + if not isinstance(value, list): + if value != "N/A": + if key == 'tviol_active' or key == 'xcp': + monitor_values[key] = value + else: + monitor_values[key] = self.helpers.unit_format(self.logger, violation_status[key], violation_status_unit) else: - monitor_values[key] = self.helpers.unit_format(self.logger, violation_status[key], violation_status_unit) + monitor_values[key] = violation_status[key] else: - monitor_values[key] = violation_status[key] + if num_partition != "N/A": + # these are one after another, in order to display each in sub-sections + new_xcp_dict = {} + for current_xcp in range(num_partition): + new_xcp_dict[f"xcp_{current_xcp}"] = self.helpers.unit_format(self.logger, value[current_xcp], "%") + monitor_values[key] = new_xcp_dict + else: + monitor_values[key] = value[0] if value else "N/A" + # save deep copy of monitor values, used later to grab xcp specific values + monitor_values_deepcopy = copy.deepcopy(monitor_values) - if self.logger.is_human_readable_format(): - monitor_values['pviol'] = monitor_values['pviol'].rjust(kPVIOL_MAX_WIDTH, ' ') - monitor_values['tviol'] = monitor_values['tviol'].rjust(kTVIOL_MAX_WIDTH, ' ') - 
monitor_values['phot_tviol'] = monitor_values['phot_tviol'].rjust(kPHOT_MAX_WIDTH, ' ') - monitor_values['vr_tviol'] = monitor_values['vr_tviol'].rjust(kVR_MAX_WIDTH, ' ') - monitor_values['hbm_tviol'] = monitor_values['hbm_tviol'].rjust(kHBM_MAX_WIDTH, ' ') - monitor_values['gfx_clkviol'] = monitor_values['gfx_clkviol'].rjust(kGFXC_MAX_WIDTH, ' ') self.logger.table_header += 'PVIOL'.rjust(kPVIOL_MAX_WIDTH, ' ') self.logger.table_header += 'TVIOL'.rjust(kTVIOL_MAX_WIDTH, ' ') self.logger.table_header += 'TVIOL_ACTIVE'.rjust(kTVIOL_ACTIVE_MAX_WIDTH, ' ') @@ -6101,9 +6088,69 @@ class AMDSMICommands(): self.logger.table_header += 'VR_TVIOL'.rjust(kVR_MAX_WIDTH, ' ') self.logger.table_header += 'HBM_TVIOL'.rjust(kHBM_MAX_WIDTH, ' ') self.logger.table_header += 'GFX_CLKVIOL'.rjust(kGFXC_MAX_WIDTH, ' ') + self.logger.table_header += 'GFXCLK_PVIOL'.rjust(kGFXC_PVIOL_MAX_WIDTH, ' ') + self.logger.table_header += 'GFXCLK_TVIOL'.rjust(kGFXC_TVIOL_MAX_WIDTH, ' ') + self.logger.table_header += 'GFXCLK_TOTALVIOL'.rjust(kGFXC_TOTALVIOL_MAX_WIDTH, ' ') + self.logger.table_header += 'LOW_UTILVIOL'.rjust(kLOW_UTILVIOL_MAX_WIDTH, ' ') - self.logger.store_output(args.gpu, 'values', monitor_values) + # Print/capture by XCPs + if num_partition != "N/A" and partition_id == 0: + current_xcp = 0 + while (current_xcp in range(num_partition) or current_xcp == 0): + if not multiple_devices and watching_output and current_xcp == 0: + # Need to clear output for single device, otherwise while watching output + # XCP detail will continue stacking on top of each other + self.logger.clear_multiple_devices_output() + if watching_output: + self.logger.store_output(args.gpu, 'timestamp', int(time.time())) + + self.logger.store_output(args.gpu, 'xcp', current_xcp) + if current_xcp != 0: # set all other values without XCP stats to N/A + monitor_values['pviol'] = "N/A" + monitor_values['tviol'] = "N/A" + monitor_values['tviol_active'] = "N/A" + monitor_values['phot_tviol'] = "N/A" + 
monitor_values['vr_tviol'] = "N/A" + monitor_values['hbm_tviol'] = "N/A" + monitor_values['gfx_clkviol'] = "N/A" + for k, _ in monitor_values.items(): # change other keys to "N/A" since we should have all applicable XCP stats + # eg. amd-smi monitor -p -t -V should only show XCP info for violations + # below primary device + if k != 'xcp' and k not in ['gfxclk_pviol', 'gfxclk_tviol', 'gfxclk_totalviol', 'low_utilviol']: + monitor_values[k] = "N/A" + + if isinstance(monitor_values_deepcopy['gfxclk_pviol'], dict): + monitor_values['gfxclk_pviol'] = monitor_values_deepcopy['gfxclk_pviol'][f"xcp_{current_xcp}"] + if isinstance(monitor_values_deepcopy['gfxclk_tviol'], dict): + monitor_values['gfxclk_tviol'] = monitor_values_deepcopy['gfxclk_tviol'][f"xcp_{current_xcp}"] + if isinstance(monitor_values_deepcopy['gfxclk_totalviol'], dict): + monitor_values['gfxclk_totalviol'] = monitor_values_deepcopy['gfxclk_totalviol'][f"xcp_{current_xcp}"] + if isinstance(monitor_values_deepcopy['low_utilviol'], dict): + monitor_values['low_utilviol'] = monitor_values_deepcopy['low_utilviol'][f"xcp_{current_xcp}"] + + if self.logger.is_human_readable_format(): + monitor_values['pviol'] = monitor_values['pviol'].rjust(kPVIOL_MAX_WIDTH, ' ') + monitor_values['tviol'] = monitor_values['tviol'].rjust(kTVIOL_MAX_WIDTH, ' ') + monitor_values['phot_tviol'] = monitor_values['phot_tviol'].rjust(kPHOT_MAX_WIDTH, ' ') + monitor_values['vr_tviol'] = monitor_values['vr_tviol'].rjust(kVR_MAX_WIDTH, ' ') + monitor_values['hbm_tviol'] = monitor_values['hbm_tviol'].rjust(kHBM_MAX_WIDTH, ' ') + monitor_values['gfx_clkviol'] = monitor_values['gfx_clkviol'].rjust(kGFXC_MAX_WIDTH, ' ') + monitor_values['gfxclk_pviol'] = str(monitor_values['gfxclk_pviol']).rjust(kGFXC_PVIOL_MAX_WIDTH, ' ').strip().replace('\'', '') + monitor_values['gfxclk_tviol'] = str(monitor_values['gfxclk_tviol']).rjust(kGFXC_TVIOL_MAX_WIDTH, ' ').strip().replace('\'', '') + monitor_values['gfxclk_totalviol'] = 
str(monitor_values['gfxclk_totalviol']).rjust(kGFXC_TOTALVIOL_MAX_WIDTH, ' ').strip().replace('\'', '') + monitor_values['low_utilviol'] = str(monitor_values['low_utilviol']).rjust(kLOW_UTILVIOL_MAX_WIDTH, ' ').strip().replace('\'', '') + self.logger.store_output(args.gpu, 'values', monitor_values) + self.logger.store_multiple_device_output() + current_xcp += 1 + else: + self.logger.store_output(args.gpu, 'xcp', num_xcp) + self.logger.store_output(args.gpu, 'values', monitor_values) + self.logger.store_multiple_device_output() + + # Store typical output for all commands (XCP data will be handled separately, eg. violation status) + if not args.violation: + self.logger.store_output(args.gpu, 'values', monitor_values) # intialize dual_csv_format; applicable to process only dual_csv_output = False @@ -6207,7 +6254,7 @@ class AMDSMICommands(): self.logger.store_watch_output(multiple_device_enabled=False) - self.logger.print_output(multiple_device_enabled=False, watching_output=watching_output, tabular=True, dual_csv_output=dual_csv_output) + self.logger.print_output(multiple_device_enabled=True, watching_output=watching_output, tabular=True, dual_csv_output=dual_csv_output) def xgmi(self, args, multiple_devices=False, gpu=None, metric=None, xgmi_link_status=None): @@ -6947,7 +6994,7 @@ class AMDSMICommands(): try: gpu_metrics = amdsmi_interface.amdsmi_get_gpu_metrics_info(processor) except amdsmi_exception.AmdSmiLibraryException as e: - gpu_metrics = "N/A" + gpu_metrics = amdsmi_interface._NA_amdsmi_get_gpu_metrics_info() # partition info try: @@ -6999,9 +7046,7 @@ class AMDSMICommands(): # mem utilization, GPU utilization, power usage, and temperature from gpu_metrics if gpu_metrics != "N/A": mem_util = gpu_metrics['average_umc_activity'] - mem_util = round(mem_util) gfx_util = gpu_metrics['average_gfx_activity'] - gfx_util = round(gfx_util) if gpu_metrics['current_socket_power'] != "N/A": current_power = gpu_metrics['current_socket_power'] else: diff --git 
a/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py b/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py index 8764009e56..aaa9053886 100755 --- a/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py @@ -1014,13 +1014,28 @@ class AMDSMIHelpers(): return: str or dict : formatted output """ - if value == "N/A": - return "N/A" - if logger.is_json_format(): - return {"value": value, "unit": unit} - if logger.is_human_readable_format(): - return f"{value} {unit}".rstrip() - return f"{value}" + if isinstance(value, list): + formatted_values = [] + for val in value: + if isinstance(val, str) and val == "N/A": + formatted_values.append("N/A") + else: + formatted_values.append(self.unit_format(logger, val, unit)) + return formatted_values + else: + if value == "N/A": + return "N/A" + if logger.is_json_format(): + if unit: + return {"value": value, "unit": unit} + else: + return value + if logger.is_human_readable_format(): + if unit: + return f"{value} {unit}".rstrip() + else: + return f"{value}".rstrip() + return f"{value}" def unit_unformat(self, logger, formatted_value): """ @@ -1483,3 +1498,22 @@ class AMDSMIHelpers(): ranges[cpu] = f"{start_setbit}-{end_setbit}" return ranges + + @staticmethod + def average_flattened_ints(data, context="data"): + """Calculate the average of flattened integers from a list or tuple + Args: + data (list or tuple): Data to calculate the average from + context (str, optional): Context for logging. Defaults to "data". 
+ Returns: + float or str: Average of integers if available, otherwise "N/A" + """ + # Type validation - ensure data is list or tuple + # Note: Data can be nested list of lists and will filter out N/A values + if not isinstance(data, (list, tuple)): + logging.debug(f"Invalid data type for {context}: expected list/tuple, got {type(data)}") + return "N/A" + + # Flatten nested lists and filter integers + flat = [v for value in data for v in (value if isinstance(value, list) else [value]) if isinstance(v, int)] + return round(sum(flat) / len(flat)) if flat else "N/A" diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_logger.py b/projects/amdsmi/amdsmi_cli/amdsmi_logger.py index 3e5bc01f2e..49aa0f53e3 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_logger.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_logger.py @@ -157,6 +157,9 @@ class AMDSMILogger(): elif key == 'gpu': stored_gpu = string_value table_values += string_value.rjust(3) + elif key == 'xcp': + stored_gpu = string_value + table_values += string_value.rjust(5) elif key == 'timestamp': stored_timestamp = string_value table_values += string_value.rjust(10) + ' ' @@ -170,6 +173,8 @@ class AMDSMILogger(): table_values += string_value.rjust(7) elif key in ('gfx_clk'): table_values += string_value.rjust(10) + elif key in ('vram_usage'): + table_values += string_value.rjust(16) elif key in ('mem_clock', 'vram_used'): table_values += string_value.rjust(11) elif key in ('vram_total', 'vram_free'): @@ -217,6 +222,8 @@ class AMDSMILogger(): table_values += string_value.rjust(11) elif key == "gfx_clkviol": table_values += string_value.rjust(13) + elif key in ("gfxclk_pviol", "gfxclk_tviol", "gfxclk_totalviol", "low_utilviol"): + table_values += string_value.rjust(58) elif key == "process_list": #Add an additional padding between the first instance of GPU and NAME table_values += ' ' diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_parser.py b/projects/amdsmi/amdsmi_cli/amdsmi_parser.py index 2f854c1982..0accb11520 100644 --- 
a/projects/amdsmi/amdsmi_cli/amdsmi_parser.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_parser.py @@ -1014,8 +1014,8 @@ class AMDSMIParser(argparse.ArgumentParser): metric_parser.add_argument('-l', '--perf-level', action='store_true', required=False, help=perf_level_help) metric_parser.add_argument('-x', '--xgmi-err', action='store_true', required=False, help=xgmi_err_help) metric_parser.add_argument('-E', '--energy', action='store_true', required=False, help=energy_help) - metric_parser.add_argument('-v', '--violation', action='store_true', required=False, help=throttle_help) - metric_parser.add_argument('-T', '--throttle', dest='violation', action='store_true', required=False, help=argparse.SUPPRESS) + metric_parser.add_argument('-v', '--violation', dest='throttle', action='store_true', required=False, help=throttle_help) + metric_parser.add_argument('-T', '--throttle', dest='throttle', action='store_true', required=False, help=argparse.SUPPRESS) # Options to only display to Hypervisors if self.helpers.is_hypervisor(): diff --git a/projects/amdsmi/example/amd_smi_drm_example.cc b/projects/amdsmi/example/amd_smi_drm_example.cc index 7850b6a9e0..8e8919a5c4 100644 --- a/projects/amdsmi/example/amd_smi_drm_example.cc +++ b/projects/amdsmi/example/amd_smi_drm_example.cc @@ -872,6 +872,7 @@ int main() { // For each device of the socket, get name and temperature. for (uint32_t device_index = 0; device_index < device_count; device_index++) { std::cout << "Device Index: " << device_index << std::endl; + std::cout << "SMI gpu #: " << gpu_number << std::endl; // Commenting out the code to get CPU socket count and GPU count // Doesn't work on system with no supported CPU sockets @@ -884,6 +885,95 @@ int main() { std::cout << "GPU count: " << gpus << std::endl; #endif +// Commenting out since, not verified to work on all ASICs yet. 
+#if 0 + amdsmi_name_value_t *pm_metrics = {}; + uint32_t num_metrics = 0; + ret = amdsmi_get_gpu_pm_metrics_info(processor_handles[device_index], + &pm_metrics, &num_metrics); + const char* err_str; + amdsmi_status_code_to_string(ret, &err_str); + std::cout << " Output of amdsmi_get_gpu_pm_metrics_info:" << err_str << "\n"; + if (ret == AMDSMI_STATUS_SUCCESS) { + CHK_AMDSMI_RET(ret) + std::cout << "\tNumber of PM metrics: " << num_metrics << std::endl; + for (uint32_t j = 0; j < num_metrics; j++) { + std::cout << "\tPM Metric Name: " << pm_metrics[j].name + << ", Value: " << pm_metrics[j].value << std::endl; + } + } + free(pm_metrics); + + // typedef enum { + // AMDSMI_REG_XGMI, //!< XGMI registers + // AMDSMI_REG_WAFL, //!< WAFL registers + // AMDSMI_REG_PCIE, //!< PCIe registers + // AMDSMI_REG_USR, //!< Usr registers + // AMDSMI_REG_USR1 //!< Usr1 registers + // } amdsmi_reg_type_t; + std::map reg_type_map = { + {AMDSMI_REG_XGMI, "XGMI"}, + {AMDSMI_REG_WAFL, "WAFL"}, + {AMDSMI_REG_PCIE, "PCIE"}, + {AMDSMI_REG_USR, "USR"}, + {AMDSMI_REG_USR1, "USR1"} + }; + + for (uint32_t j = static_cast(AMDSMI_REG_XGMI); + j <= static_cast(AMDSMI_REG_USR1); j++) { + amdsmi_name_value_t *reg_metrics = {}; + amdsmi_reg_type_t reg_type = static_cast(j); + std::string reg_type_str = "N/A"; + ret = amdsmi_get_gpu_reg_table_info(processor_handles[device_index], + reg_type, ®_metrics, &num_metrics); + if (auto it = reg_type_map.find(reg_type); it != reg_type_map.end()) { + reg_type_str = it->second; + } + // Skipping these for now due to some ASICS having issues + if (reg_type == AMDSMI_REG_USR1 || reg_type == AMDSMI_REG_XGMI || + reg_type == AMDSMI_REG_USR) { + std::cout << "\tSkipping " << reg_type_str << " registers for now." 
+ << std::endl; + free(reg_metrics); + continue; + } + + amdsmi_status_code_to_string(ret, &err_str); + std::cout << " Output of amdsmi_get_gpu_reg_table_info(" << gpu_number << ", " + << reg_type_str << "): " << err_str << "\n"; + if (ret == AMDSMI_STATUS_SUCCESS) { + CHK_AMDSMI_RET(ret) + std::cout << "\tNumber of Register metrics: " << num_metrics << std::endl; + for (uint32_t k = 0; k < num_metrics; k++) { + if (reg_metrics == nullptr) { + std::cout << "\tRegister Number: " << k + << ", Type: " << reg_type_str + << ", Register Metric Name: N/A, Value: N/A" << std::endl; + continue; + } + if (reg_metrics[k].name == nullptr) { + std::cout << "\tRegister Number: " << k + << ", Type: " << reg_type_str + << ", Register Metric Name: " + << (reg_metrics[k].name != nullptr ? + reg_metrics[k].name : "N/A") + << ", Value: N/A" << std::endl; + continue; + } + std::cout << "\tRegister Number: " << k + << ", Type: " << reg_type_str + << ", Register Metric Name: " + << (reg_metrics[k].name != nullptr ? + reg_metrics[k].name : "N/A") + << ", Value: " << reg_metrics[k].value << std::endl; + } + } + free(reg_metrics); + std::cout << std::endl; + } + std::cout << std::endl; +#endif + // Get device type. Since the amdsmi is initialized with // AMD_SMI_INIT_AMD_GPUS, the processor_type must be AMDSMI_PROCESSOR_TYPE_AMD_GPU. processor_type_t processor_type = {}; @@ -1909,8 +1999,8 @@ int main() { } } } - gpu_number++; - } + gpu_number++; + } } // Clean up resources allocated at amdsmi_init. 
It will invalidate sockets diff --git a/projects/amdsmi/include/amd_smi/amdsmi.h b/projects/amdsmi/include/amd_smi/amdsmi.h index 00a8d9f502..fdc7c89bb6 100644 --- a/projects/amdsmi/include/amd_smi/amdsmi.h +++ b/projects/amdsmi/include/amd_smi/amdsmi.h @@ -714,17 +714,17 @@ typedef struct { Gfx clock below host limit violation; 1 = active 0 = not active; Max uint8 means unsupported.*/ //GPU metrics 1.8 violations uint64_t acc_gfx_clk_below_host_limit_pwr[AMDSMI_MAX_NUM_XCP][AMDSMI_MAX_NUM_XCC]; //!< New Driver 1.8 fields: Current gfx clock below host limit power count; Max uint64 means unsupported - uint64_t acc_gfx_clk_below_host_limit_thm[AMDSMI_MAX_NUM_XCP][AMDSMI_MAX_NUM_XCC]; //!< New Driver 1.8 fields: Current gfx clock below host limit thermal count; Max uint64 means unsupported + uint64_t acc_gfx_clk_below_host_limit_thrm[AMDSMI_MAX_NUM_XCP][AMDSMI_MAX_NUM_XCC]; //!< New Driver 1.8 fields: Current gfx clock below host limit thermal count; Max uint64 means unsupported uint64_t acc_low_utilization[AMDSMI_MAX_NUM_XCP][AMDSMI_MAX_NUM_XCC]; //!< New Driver 1.8 fields: Current low utilization count; Max uint64 means unsupported uint64_t acc_gfx_clk_below_host_limit_total[AMDSMI_MAX_NUM_XCP][AMDSMI_MAX_NUM_XCC]; //!< New Driver 1.8 fields: Current gfx clock below host limit total count; Max uint64 means unsupported uint64_t per_gfx_clk_below_host_limit_pwr[AMDSMI_MAX_NUM_XCP][AMDSMI_MAX_NUM_XCC]; //!< New Driver 1.8 fields: Gfx clock below host limit power violation % (greater than 0% is a violation); Max uint64 means unsupported - uint64_t per_gfx_clk_below_host_limit_thm[AMDSMI_MAX_NUM_XCP][AMDSMI_MAX_NUM_XCC]; //!< New Driver 1.8 fields: Gfx clock below host limit violation % (greater than 0% is a violation); Max uint64 means unsupported + uint64_t per_gfx_clk_below_host_limit_thrm[AMDSMI_MAX_NUM_XCP][AMDSMI_MAX_NUM_XCC]; //!< New Driver 1.8 fields: Gfx clock below host limit violation % (greater than 0% is a violation); Max uint64 means unsupported uint64_t 
per_low_utilization[AMDSMI_MAX_NUM_XCP][AMDSMI_MAX_NUM_XCC]; //!< New Driver 1.8 fields: Low utilization violation % (greater than 0% is a violation); Max uint64 means unsupported uint64_t per_gfx_clk_below_host_limit_total[AMDSMI_MAX_NUM_XCP][AMDSMI_MAX_NUM_XCC]; //!< New Driver 1.8 fields: Any Gfx clock below host limit violation % (greater than 0% is a violation); Max uint64 means unsupported uint8_t active_gfx_clk_below_host_limit_pwr[AMDSMI_MAX_NUM_XCP][AMDSMI_MAX_NUM_XCC]; //!< New Driver 1.8 fields: Gfx clock below host limit power violation; 1 = active 0 = not active; Max uint8 means unsupported - uint8_t active_gfx_clk_below_host_limit_thm[AMDSMI_MAX_NUM_XCP][AMDSMI_MAX_NUM_XCC]; //!< New Driver 1.8 fields: Gfx clock below host limit thermal violation; 1 = active 0 = not active; Max uint8 means unsupported + uint8_t active_gfx_clk_below_host_limit_thrm[AMDSMI_MAX_NUM_XCP][AMDSMI_MAX_NUM_XCC]; //!< New Driver 1.8 fields: Gfx clock below host limit thermal violation; 1 = active 0 = not active; Max uint8 means unsupported uint8_t active_low_utilization[AMDSMI_MAX_NUM_XCP][AMDSMI_MAX_NUM_XCC]; //!< New Driver 1.8 fields: Low utilization violation; 1 = active 0 = not active; Max uint8 means unsupported uint8_t active_gfx_clk_below_host_limit_total[AMDSMI_MAX_NUM_XCP][AMDSMI_MAX_NUM_XCC];//!< New Driver 1.8 fields: Any Gfx clock host limit violation; 1 = active 0 = not active; Max uint8 means unsupported uint64_t reserved[AMDSMI_MAX_NUM_XCP][AMDSMI_MAX_NUM_XCC]; // reserved for new violation info diff --git a/projects/amdsmi/py-interface/amdsmi_interface.py b/projects/amdsmi/py-interface/amdsmi_interface.py index f289f4fa39..2f35d637f5 100644 --- a/projects/amdsmi/py-interface/amdsmi_interface.py +++ b/projects/amdsmi/py-interface/amdsmi_interface.py @@ -787,6 +787,103 @@ def _notifyTypeToString(notify_type_b): else: return "Unknown" +def _NA_amdsmi_get_gpu_metrics_info() -> Dict[str, str]: + """ + Get 'N/A' metric values for gpu_metric, used for exception 
handling. + + Parameters: + None + + Returns: + Dict[str, str]: A dictionary with keys as metric names and values as 'N/A'. + This is used to indicate that the metric is not available or applicable. + + Raises: + N/A + """ + na_gpu_metrics_info = { + "common_header.structure_size": "N/A", + "common_header.format_revision": "N/A", + "common_header.content_revision": "N/A", + "temperature_edge": "N/A", + "temperature_hotspot": "N/A", + "temperature_mem": "N/A", + "temperature_vrgfx": "N/A", + "temperature_vrsoc": "N/A", + "temperature_vrmem": "N/A", + "average_gfx_activity": "N/A", + "average_umc_activity": "N/A", + "average_mm_activity": "N/A", + "average_socket_power": "N/A", + "energy_accumulator": "N/A", + "system_clock_counter": "N/A", + "average_gfxclk_frequency": "N/A", + "average_socclk_frequency": "N/A", + "average_uclk_frequency": "N/A", + "average_vclk0_frequency": "N/A", + "average_dclk0_frequency": "N/A", + "average_vclk1_frequency": "N/A", + "average_dclk1_frequency": "N/A", + "current_gfxclk": "N/A", + "current_socclk": "N/A", + "current_uclk": "N/A", + "current_vclk0": "N/A", + "current_dclk0": "N/A", + "current_vclk1": "N/A", + "current_dclk1": "N/A", + "throttle_status": "N/A", + "current_fan_speed": "N/A", + "pcie_link_width": "N/A", + "pcie_link_speed": "N/A", + "gfx_activity_acc": "N/A", + "mem_activity_acc": "N/A", + "temperature_hbm": "N/A", + "firmware_timestamp": "N/A", + "voltage_soc": "N/A", + "voltage_gfx": "N/A", + "voltage_mem": "N/A", + "indep_throttle_status": "N/A", + "current_socket_power": "N/A", + "vcn_activity": "N/A", + "gfxclk_lock_status": "N/A", + "xgmi_link_width": "N/A", + "xgmi_link_speed": "N/A", + "pcie_bandwidth_acc": "N/A", + "pcie_bandwidth_inst": "N/A", + "pcie_l0_to_recov_count_acc": "N/A", + "pcie_replay_count_acc": "N/A", + "pcie_replay_rover_count_acc": "N/A", + "xgmi_read_data_acc": "N/A", + "xgmi_write_data_acc": "N/A", + "current_gfxclks": "N/A", + "current_socclks": "N/A", + "current_vclk0s": "N/A", + 
"current_dclk0s": "N/A", + "jpeg_activity": "N/A", + "pcie_nak_sent_count_acc": "N/A", + "pcie_nak_rcvd_count_acc": "N/A", + "accumulation_counter": "N/A", + "prochot_residency_acc": "N/A", + "ppt_residency_acc": "N/A", + "socket_thm_residency_acc": "N/A", + "vr_thm_residency_acc": "N/A", + "hbm_thm_residency_acc": "N/A", + "num_partition": "N/A", + "xcp_stats.gfx_busy_inst": "N/A", + "xcp_stats.jpeg_busy": "N/A", + "xcp_stats.vcn_busy": "N/A", + "xcp_stats.gfx_busy_acc": "N/A", + "xcp_stats.gfx_below_host_limit_acc": "N/A", + "xcp_stats.gfx_below_host_limit_ppt_acc": "N/A", + "xcp_stats.gfx_below_host_limit_thm_acc": "N/A", + "xcp_stats.gfx_low_utilization_acc": "N/A", + "xcp_stats.gfx_below_host_limit_total_acc": "N/A", + "pcie_lc_perf_other_end_recovery": "N/A", + "vram_max_bandwidth": "N/A", + "xgmi_link_status": "N/A" + } + return na_gpu_metrics_info + def amdsmi_get_socket_handles() -> List[c_void_p]: """ @@ -2351,9 +2448,9 @@ def amdsmi_get_violation_status( "acc_hbm_thrm": _validate_if_max_uint(violation_status.acc_hbm_thrm, MaxUIntegerTypes.UINT64_T), "acc_gfx_clk_below_host_limit": _validate_if_max_uint(violation_status.acc_gfx_clk_below_host_limit, MaxUIntegerTypes.UINT64_T), "acc_gfx_clk_below_host_limit_pwr": list(violation_status.acc_gfx_clk_below_host_limit_pwr), - "acc_gfx_clk_below_host_limit_thm": list(violation_status.acc_gfx_clk_below_host_limit_thm), - "acc_low_utilization": list(violation_status.acc_low_utilization), + "acc_gfx_clk_below_host_limit_thrm": list(violation_status.acc_gfx_clk_below_host_limit_thrm), "acc_gfx_clk_below_host_limit_total": list(violation_status.acc_gfx_clk_below_host_limit_total), + "acc_low_utilization": list(violation_status.acc_low_utilization), "per_prochot_thrm": _validate_if_max_uint(violation_status.per_prochot_thrm, MaxUIntegerTypes.UINT64_T, isActivity=True), "per_ppt_pwr": _validate_if_max_uint(violation_status.per_ppt_pwr, MaxUIntegerTypes.UINT64_T, isActivity=True), #PVIOL "per_socket_thrm": 
_validate_if_max_uint(violation_status.per_socket_thrm, MaxUIntegerTypes.UINT64_T, isActivity=True), #TVIOL @@ -2361,9 +2458,9 @@ def amdsmi_get_violation_status( "per_hbm_thrm": _validate_if_max_uint(violation_status.per_hbm_thrm, MaxUIntegerTypes.UINT64_T, isActivity=True), "per_gfx_clk_below_host_limit": _validate_if_max_uint(violation_status.per_gfx_clk_below_host_limit, MaxUIntegerTypes.UINT64_T, isActivity=True), "per_gfx_clk_below_host_limit_pwr": list(violation_status.per_gfx_clk_below_host_limit_pwr), - "per_gfx_clk_below_host_limit_thm": list(violation_status.per_gfx_clk_below_host_limit_thm), - "per_low_utilization": list(violation_status.per_low_utilization), + "per_gfx_clk_below_host_limit_thrm": list(violation_status.per_gfx_clk_below_host_limit_thrm), "per_gfx_clk_below_host_limit_total": list(violation_status.per_gfx_clk_below_host_limit_total), + "per_low_utilization": list(violation_status.per_low_utilization), "active_prochot_thrm": _validate_if_max_uint(violation_status.active_prochot_thrm, MaxUIntegerTypes.UINT8_T, isBool=True), "active_ppt_pwr": _validate_if_max_uint(violation_status.active_ppt_pwr, MaxUIntegerTypes.UINT8_T, isBool=True), #PVIOL "active_socket_thrm": _validate_if_max_uint(violation_status.active_socket_thrm, MaxUIntegerTypes.UINT8_T, isBool=True), #TVIOL @@ -2371,9 +2468,9 @@ def amdsmi_get_violation_status( "active_hbm_thrm": _validate_if_max_uint(violation_status.active_hbm_thrm, MaxUIntegerTypes.UINT8_T, isBool=True), "active_gfx_clk_below_host_limit": _validate_if_max_uint(violation_status.active_gfx_clk_below_host_limit, MaxUIntegerTypes.UINT8_T, isBool=True), "active_gfx_clk_below_host_limit_pwr": list(violation_status.active_gfx_clk_below_host_limit_pwr), - "active_gfx_clk_below_host_limit_thm": list(violation_status.active_gfx_clk_below_host_limit_thm), - "active_low_utilization": list(violation_status.active_low_utilization), + "active_gfx_clk_below_host_limit_thrm": 
list(violation_status.active_gfx_clk_below_host_limit_thrm), "active_gfx_clk_below_host_limit_total": list(violation_status.active_gfx_clk_below_host_limit_total), + "active_low_utilization": list(violation_status.active_low_utilization), } # Create 2d array with each XCD's stats @@ -2381,25 +2478,25 @@ def amdsmi_get_violation_status( for xcp_index, xcp_metrics in enumerate(dict_return['acc_gfx_clk_below_host_limit_pwr']): xcp_detail = [] for val in xcp_metrics: - xcp_detail.append(_validate_if_max_uint(val, MaxUIntegerTypes.UINT64_T, isActivity=True)) + xcp_detail.append(_validate_if_max_uint(val, MaxUIntegerTypes.UINT64_T)) dict_return['acc_gfx_clk_below_host_limit_pwr'][xcp_index] = xcp_detail - if 'acc_gfx_clk_below_host_limit_thm' in dict_return: - for xcp_index, xcp_metrics in enumerate(dict_return['acc_gfx_clk_below_host_limit_thm']): + if 'acc_gfx_clk_below_host_limit_thrm' in dict_return: + for xcp_index, xcp_metrics in enumerate(dict_return['acc_gfx_clk_below_host_limit_thrm']): xcp_detail = [] for val in xcp_metrics: - xcp_detail.append(_validate_if_max_uint(val, MaxUIntegerTypes.UINT64_T, isActivity=True)) - dict_return['acc_gfx_clk_below_host_limit_thm'][xcp_index] = xcp_detail + xcp_detail.append(_validate_if_max_uint(val, MaxUIntegerTypes.UINT64_T)) + dict_return['acc_gfx_clk_below_host_limit_thrm'][xcp_index] = xcp_detail if 'acc_low_utilization' in dict_return: for xcp_index, xcp_metrics in enumerate(dict_return['acc_low_utilization']): xcp_detail = [] for val in xcp_metrics: - xcp_detail.append(_validate_if_max_uint(val, MaxUIntegerTypes.UINT64_T, isActivity=True)) + xcp_detail.append(_validate_if_max_uint(val, MaxUIntegerTypes.UINT64_T)) dict_return['acc_low_utilization'][xcp_index] = xcp_detail if 'acc_gfx_clk_below_host_limit_total' in dict_return: for xcp_index, xcp_metrics in enumerate(dict_return['acc_gfx_clk_below_host_limit_total']): xcp_detail = [] for val in xcp_metrics: - xcp_detail.append(_validate_if_max_uint(val, 
MaxUIntegerTypes.UINT64_T, isActivity=True)) + xcp_detail.append(_validate_if_max_uint(val, MaxUIntegerTypes.UINT64_T)) dict_return['acc_gfx_clk_below_host_limit_total'][xcp_index] = xcp_detail if 'per_gfx_clk_below_host_limit_pwr' in dict_return: @@ -2408,12 +2505,12 @@ def amdsmi_get_violation_status( for val in xcp_metrics: xcp_detail.append(_validate_if_max_uint(val, MaxUIntegerTypes.UINT64_T, isActivity=True)) dict_return['per_gfx_clk_below_host_limit_pwr'][xcp_index] = xcp_detail - if 'per_gfx_clk_below_host_limit_thm' in dict_return: - for xcp_index, xcp_metrics in enumerate(dict_return['per_gfx_clk_below_host_limit_thm']): + if 'per_gfx_clk_below_host_limit_thrm' in dict_return: + for xcp_index, xcp_metrics in enumerate(dict_return['per_gfx_clk_below_host_limit_thrm']): xcp_detail = [] for val in xcp_metrics: xcp_detail.append(_validate_if_max_uint(val, MaxUIntegerTypes.UINT64_T, isActivity=True)) - dict_return['per_gfx_clk_below_host_limit_thm'][xcp_index] = xcp_detail + dict_return['per_gfx_clk_below_host_limit_thrm'][xcp_index] = xcp_detail if 'per_low_utilization' in dict_return: for xcp_index, xcp_metrics in enumerate(dict_return['per_low_utilization']): xcp_detail = [] @@ -2433,12 +2530,12 @@ def amdsmi_get_violation_status( for val in xcp_metrics: xcp_detail.append(_validate_if_max_uint(val, MaxUIntegerTypes.UINT8_T, isBool=True)) dict_return['active_gfx_clk_below_host_limit_pwr'][xcp_index] = xcp_detail - if 'active_gfx_clk_below_host_limit_thm' in dict_return: - for xcp_index, xcp_metrics in enumerate(dict_return['active_gfx_clk_below_host_limit_thm']): + if 'active_gfx_clk_below_host_limit_thrm' in dict_return: + for xcp_index, xcp_metrics in enumerate(dict_return['active_gfx_clk_below_host_limit_thrm']): xcp_detail = [] for val in xcp_metrics: xcp_detail.append(_validate_if_max_uint(val, MaxUIntegerTypes.UINT8_T, isBool=True)) - dict_return['active_gfx_clk_below_host_limit_thm'][xcp_index] = xcp_detail + 
dict_return['active_gfx_clk_below_host_limit_thrm'][xcp_index] = xcp_detail if 'active_low_utilization' in dict_return: for xcp_index, xcp_metrics in enumerate(dict_return['active_low_utilization']): xcp_detail = [] @@ -4614,6 +4711,9 @@ def amdsmi_get_gpu_metrics_info( ) gpu_metrics_output = { + "common_header.structure_size": _validate_if_max_uint(gpu_metrics.common_header.structure_size, MaxUIntegerTypes.UINT16_T), + "common_header.format_revision": _validate_if_max_uint(gpu_metrics.common_header.format_revision, MaxUIntegerTypes.UINT8_T), + "common_header.content_revision": _validate_if_max_uint(gpu_metrics.common_header.content_revision, MaxUIntegerTypes.UINT8_T), "temperature_edge": _validate_if_max_uint(gpu_metrics.temperature_edge, MaxUIntegerTypes.UINT16_T), "temperature_hotspot": _validate_if_max_uint(gpu_metrics.temperature_hotspot, MaxUIntegerTypes.UINT16_T), "temperature_mem": _validate_if_max_uint(gpu_metrics.temperature_mem, MaxUIntegerTypes.UINT16_T), diff --git a/projects/amdsmi/py-interface/amdsmi_wrapper.py b/projects/amdsmi/py-interface/amdsmi_wrapper.py index b512ef4265..172aecfe4b 100644 --- a/projects/amdsmi/py-interface/amdsmi_wrapper.py +++ b/projects/amdsmi/py-interface/amdsmi_wrapper.py @@ -871,15 +871,15 @@ struct_amdsmi_violation_status_t._fields_ = [ ('active_gfx_clk_below_host_limit', ctypes.c_ubyte), ('PADDING_0', ctypes.c_ubyte * 2), ('acc_gfx_clk_below_host_limit_pwr', ctypes.c_uint64 * 8 * 8), - ('acc_gfx_clk_below_host_limit_thm', ctypes.c_uint64 * 8 * 8), + ('acc_gfx_clk_below_host_limit_thrm', ctypes.c_uint64 * 8 * 8), ('acc_low_utilization', ctypes.c_uint64 * 8 * 8), ('acc_gfx_clk_below_host_limit_total', ctypes.c_uint64 * 8 * 8), ('per_gfx_clk_below_host_limit_pwr', ctypes.c_uint64 * 8 * 8), - ('per_gfx_clk_below_host_limit_thm', ctypes.c_uint64 * 8 * 8), + ('per_gfx_clk_below_host_limit_thrm', ctypes.c_uint64 * 8 * 8), ('per_low_utilization', ctypes.c_uint64 * 8 * 8), ('per_gfx_clk_below_host_limit_total', ctypes.c_uint64 * 
8 * 8), ('active_gfx_clk_below_host_limit_pwr', ctypes.c_ubyte * 8 * 8), - ('active_gfx_clk_below_host_limit_thm', ctypes.c_ubyte * 8 * 8), + ('active_gfx_clk_below_host_limit_thrm', ctypes.c_ubyte * 8 * 8), ('active_low_utilization', ctypes.c_ubyte * 8 * 8), ('active_gfx_clk_below_host_limit_total', ctypes.c_ubyte * 8 * 8), ('reserved', ctypes.c_uint64 * 8 * 8), diff --git a/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_gpu_metrics.h b/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_gpu_metrics.h index d9325cf9ff..ea9e1fa5e5 100644 --- a/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_gpu_metrics.h +++ b/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_gpu_metrics.h @@ -705,7 +705,7 @@ struct AMDGpuMetrics_v18_t { uint16_t m_average_gfx_activity; uint16_t m_average_umc_activity; // memory controller - /* VRAM max bandwidthi (in GB/sec) at max memory clock */ + /* VRAM max bandwidth (in GB/sec) at max memory clock */ uint64_t m_mem_max_bandwidth; /* Energy (15.259uJ (2^-16) units) */ diff --git a/projects/amdsmi/src/amd_smi/amd_smi.cc b/projects/amdsmi/src/amd_smi/amd_smi.cc index ca0cc53d45..bbd860ae29 100644 --- a/projects/amdsmi/src/amd_smi/amd_smi.cc +++ b/projects/amdsmi/src/amd_smi/amd_smi.cc @@ -1043,20 +1043,32 @@ amdsmi_status_t amdsmi_get_violation_status(amdsmi_processor_handle processor_ha violation_status->active_hbm_thrm = std::numeric_limits::max(); violation_status->active_gfx_clk_below_host_limit = std::numeric_limits::max(); - fill_2d_array(violation_status->acc_gfx_clk_below_host_limit_pwr, std::numeric_limits::max()); - fill_2d_array(violation_status->acc_gfx_clk_below_host_limit_thm, std::numeric_limits::max()); - fill_2d_array(violation_status->acc_low_utilization, std::numeric_limits::max()); - fill_2d_array(violation_status->acc_gfx_clk_below_host_limit_total, std::numeric_limits::max()); + fill_2d_array(violation_status->acc_gfx_clk_below_host_limit_pwr, + std::numeric_limits::max()); + 
fill_2d_array(violation_status->acc_gfx_clk_below_host_limit_thrm, + std::numeric_limits::max()); + fill_2d_array(violation_status->acc_low_utilization, + std::numeric_limits::max()); + fill_2d_array(violation_status->acc_gfx_clk_below_host_limit_total, + std::numeric_limits::max()); - fill_2d_array(violation_status->per_gfx_clk_below_host_limit_pwr, std::numeric_limits::max()); - fill_2d_array(violation_status->per_gfx_clk_below_host_limit_thm, std::numeric_limits::max()); - fill_2d_array(violation_status->per_low_utilization, std::numeric_limits::max()); - fill_2d_array(violation_status->per_gfx_clk_below_host_limit_total, std::numeric_limits::max()); + fill_2d_array(violation_status->per_gfx_clk_below_host_limit_pwr, + std::numeric_limits::max()); + fill_2d_array(violation_status->per_gfx_clk_below_host_limit_thrm, + std::numeric_limits::max()); + fill_2d_array(violation_status->per_low_utilization, + std::numeric_limits::max()); + fill_2d_array(violation_status->per_gfx_clk_below_host_limit_total, + std::numeric_limits::max()); - fill_2d_array(violation_status->active_gfx_clk_below_host_limit_pwr, std::numeric_limits::max()); - fill_2d_array(violation_status->active_gfx_clk_below_host_limit_thm, std::numeric_limits::max()); - fill_2d_array(violation_status->active_low_utilization, std::numeric_limits::max()); - fill_2d_array(violation_status->active_gfx_clk_below_host_limit_total, std::numeric_limits::max()); + fill_2d_array(violation_status->active_gfx_clk_below_host_limit_pwr, + std::numeric_limits::max()); + fill_2d_array(violation_status->active_gfx_clk_below_host_limit_thrm, + std::numeric_limits::max()); + fill_2d_array(violation_status->active_low_utilization, + std::numeric_limits::max()); + fill_2d_array(violation_status->active_gfx_clk_below_host_limit_total, + std::numeric_limits::max()); const auto p1 = std::chrono::system_clock::now(); auto current_time = std::chrono::duration_cast( @@ -1081,14 +1093,14 @@ amdsmi_status_t 
amdsmi_get_violation_status(amdsmi_processor_handle processor_ha } // default to 0xffffffff as not supported - uint32_t partitition_id = std::numeric_limits::max(); + uint32_t partition_id = std::numeric_limits::max(); auto tmp_partition_id = uint32_t(0); amdsmi_status_t status = rsmi_wrapper(rsmi_dev_partition_id_get, processor_handle, 0, &(tmp_partition_id)); // Do not return early if this value fails // continue to try getting all info if (status == AMDSMI_STATUS_SUCCESS) { - partitition_id = tmp_partition_id; + partition_id = tmp_partition_id; } amdsmi_gpu_metrics_t metric_info_a = {}; @@ -1102,15 +1114,28 @@ amdsmi_status_t amdsmi_get_violation_status(amdsmi_processor_handle processor_ha return status; } - // if all of these values are "undefined" then the feature is not supported on the ASIC + // Note: Both XCP and partition_id will default to 0, if gpu_metrics file is not present. + // This is why we can check elements in kFIRST_ELEMENT == 0 for both XCP and partition_id. + const uint32_t kFIRST_ELEMENT = 0; + + // Check if violation status is supported: + // If all of these values are "undefined" then the feature is not supported on the ASIC if (metric_info_a.accumulation_counter == std::numeric_limits::max() && metric_info_a.prochot_residency_acc == std::numeric_limits::max() && metric_info_a.ppt_residency_acc == std::numeric_limits::max() && metric_info_a.socket_thm_residency_acc == std::numeric_limits::max() && metric_info_a.vr_thm_residency_acc == std::numeric_limits::max() && metric_info_a.hbm_thm_residency_acc == std::numeric_limits::max() - && (metric_info_a.xcp_stats->gfx_below_host_limit_acc[partitition_id] - == std::numeric_limits::max())) { + && metric_info_a.xcp_stats[kFIRST_ELEMENT].gfx_below_host_limit_acc[kFIRST_ELEMENT] + == std::numeric_limits::max() + && metric_info_a.xcp_stats[kFIRST_ELEMENT].gfx_below_host_limit_ppt_acc[kFIRST_ELEMENT] + == std::numeric_limits::max() + && 
metric_info_a.xcp_stats[kFIRST_ELEMENT].gfx_below_host_limit_thm_acc[kFIRST_ELEMENT] + == std::numeric_limits::max() + && metric_info_a.xcp_stats[kFIRST_ELEMENT].gfx_low_utilization_acc[kFIRST_ELEMENT] + == std::numeric_limits::max() + && metric_info_a.xcp_stats[kFIRST_ELEMENT].gfx_below_host_limit_total_acc[kFIRST_ELEMENT] + == std::numeric_limits::max()) { ss << __PRETTY_FUNCTION__ << " | ASIC does not support throttle violations!, " << "returning AMDSMI_STATUS_NOT_SUPPORTED"; @@ -1136,8 +1161,26 @@ amdsmi_status_t amdsmi_get_violation_status(amdsmi_processor_handle processor_ha violation_status->acc_socket_thrm = metric_info_b.socket_thm_residency_acc; violation_status->acc_vr_thrm = metric_info_b.vr_thm_residency_acc; violation_status->acc_hbm_thrm = metric_info_b.hbm_thm_residency_acc; - violation_status->acc_gfx_clk_below_host_limit //deprecated - = metric_info_b.xcp_stats->gfx_below_host_limit_acc[partitition_id]; + violation_status->acc_gfx_clk_below_host_limit // deprecated + = metric_info_b.xcp_stats[partition_id].gfx_below_host_limit_acc[kFIRST_ELEMENT]; + + // Copy XCP accumulators into 2D array + auto copy_xcp_metric = [](const auto& src, auto& dst, auto member_ptr) { + for (size_t i = 0; i < AMDSMI_MAX_NUM_XCP; ++i) { + std::copy( + std::begin(src[i].*member_ptr), + std::end(src[i].*member_ptr), + dst[i]); + } + }; + copy_xcp_metric(metric_info_b.xcp_stats, violation_status->acc_gfx_clk_below_host_limit_pwr, + &amdsmi_gpu_xcp_metrics_t::gfx_below_host_limit_ppt_acc); + copy_xcp_metric(metric_info_b.xcp_stats, violation_status->acc_gfx_clk_below_host_limit_thrm, + &amdsmi_gpu_xcp_metrics_t::gfx_below_host_limit_thm_acc); + copy_xcp_metric(metric_info_b.xcp_stats, violation_status->acc_low_utilization, + &amdsmi_gpu_xcp_metrics_t::gfx_low_utilization_acc); + copy_xcp_metric(metric_info_b.xcp_stats, violation_status->acc_gfx_clk_below_host_limit_total, + &amdsmi_gpu_xcp_metrics_t::gfx_below_host_limit_total_acc); ss << __PRETTY_FUNCTION__ << " | " << 
"[gpu_metrics A] metric_info_a.accumulation_counter: " << std::dec @@ -1152,8 +1195,9 @@ amdsmi_status_t amdsmi_get_violation_status(amdsmi_processor_handle processor_ha << metric_info_a.vr_thm_residency_acc << "\n" << "; metric_info_a.hbm_thm_residency_acc: " << std::dec << metric_info_a.hbm_thm_residency_acc << "\n" - << "; metric_info_b.xcp_stats->gfx_below_host_limit_acc[" << partitition_id << "]: " - << std::dec << metric_info_a.xcp_stats->gfx_below_host_limit_acc[partitition_id] << "\n" + << "; metric_info_a.xcp_stats[" << partition_id << "].gfx_below_host_limit_acc[" + << kFIRST_ELEMENT << "]: " << std::dec // deprecated + << metric_info_a.xcp_stats[partition_id].gfx_below_host_limit_acc[kFIRST_ELEMENT] << "\n" << " [gpu_metrics B] metric_info_b.accumulation_counter: " << std::dec << metric_info_b.accumulation_counter << "\n" << "; metric_info_b.prochot_residency_acc: " << std::dec @@ -1166,46 +1210,11 @@ amdsmi_status_t amdsmi_get_violation_status(amdsmi_processor_handle processor_ha << metric_info_b.vr_thm_residency_acc << "\n" << "; metric_info_b.hbm_thm_residency_acc: " << std::dec << metric_info_b.hbm_thm_residency_acc << "\n" - << "; metric_info_b.xcp_stats->gfx_below_host_limit_acc[" << partitition_id << "]: " //deprecated - << std::dec << metric_info_b.xcp_stats->gfx_below_host_limit_acc[partitition_id] << "\n"; + << "; metric_info_b.xcp_stats[" << partition_id << "].gfx_below_host_limit_acc[" + << kFIRST_ELEMENT << "]: " << std::dec // deprecated + << metric_info_b.xcp_stats[partition_id].gfx_below_host_limit_acc[kFIRST_ELEMENT] << "\n"; LOG_DEBUG(ss); - auto copy_gfx_acc = [](auto priv_it, auto priv_end, auto pub_it, auto gfx_acc_ptr) { - for (; priv_it != priv_end; ++priv_it, ++pub_it) { - std::copy(std::begin((*priv_it).*gfx_acc_ptr), - std::end((*priv_it).*gfx_acc_ptr), - std::begin(*pub_it)); - } - }; - - copy_gfx_acc( - std::begin(metric_info_b.xcp_stats), - std::end(metric_info_b.xcp_stats), - 
std::begin(violation_status->acc_gfx_clk_below_host_limit_pwr), - &amdsmi_gpu_xcp_metrics_t::gfx_below_host_limit_ppt_acc - ); - - copy_gfx_acc( - std::begin(metric_info_b.xcp_stats), - std::end(metric_info_b.xcp_stats), - std::begin(violation_status->acc_gfx_clk_below_host_limit_thm), - &amdsmi_gpu_xcp_metrics_t::gfx_below_host_limit_thm_acc - ); - - copy_gfx_acc( - std::begin(metric_info_b.xcp_stats), - std::end(metric_info_b.xcp_stats), - std::begin(violation_status->acc_low_utilization), - &amdsmi_gpu_xcp_metrics_t::gfx_low_utilization_acc - ); - - copy_gfx_acc( - std::begin(metric_info_b.xcp_stats), - std::end(metric_info_b.xcp_stats), - std::begin(violation_status->acc_gfx_clk_below_host_limit_total), - &amdsmi_gpu_xcp_metrics_t::gfx_below_host_limit_total_acc - ); - if ( (metric_info_b.prochot_residency_acc != std::numeric_limits::max() || metric_info_a.prochot_residency_acc != std::numeric_limits::max()) && (metric_info_b.prochot_residency_acc >= metric_info_a.prochot_residency_acc) @@ -1309,15 +1318,19 @@ amdsmi_status_t amdsmi_get_violation_status(amdsmi_processor_handle processor_ha << violation_status->active_hbm_thrm << "\n"; LOG_DEBUG(ss); } - /* //deprecated - if ((metric_info_b.xcp_stats->gfx_below_host_limit_acc[partitition_id] != std::numeric_limits::max() || - metric_info_a.xcp_stats->gfx_below_host_limit_acc[partitition_id] != std::numeric_limits::max()) && - (metric_info_b.xcp_stats->gfx_below_host_limit_acc[partitition_id] >= metric_info_a.xcp_stats->gfx_below_host_limit_acc[partitition_id]) && + // deprecated - design likely needs to include both [XCP][XCC], like the new metrics + if ((metric_info_b.xcp_stats[partition_id].gfx_below_host_limit_acc[kFIRST_ELEMENT] + != std::numeric_limits::max() || + metric_info_a.xcp_stats[partition_id].gfx_below_host_limit_acc[kFIRST_ELEMENT] + != std::numeric_limits::max()) && + (metric_info_b.xcp_stats[partition_id].gfx_below_host_limit_acc[kFIRST_ELEMENT] + >= 
metric_info_a.xcp_stats[partition_id].gfx_below_host_limit_acc[kFIRST_ELEMENT]) && ((metric_info_b.accumulation_counter - metric_info_a.accumulation_counter) > 0)) { violation_status->per_gfx_clk_below_host_limit = - (((metric_info_b.xcp_stats->gfx_below_host_limit_acc[partitition_id] - - metric_info_a.xcp_stats->gfx_below_host_limit_acc[partitition_id]) * 100) / - (metric_info_b.accumulation_counter - metric_info_a.accumulation_counter)); + (((metric_info_b.xcp_stats[partition_id].gfx_below_host_limit_acc[kFIRST_ELEMENT] - + metric_info_a.xcp_stats[partition_id].gfx_below_host_limit_acc[kFIRST_ELEMENT]) + * 100) / + (metric_info_b.accumulation_counter - metric_info_a.accumulation_counter)); if (violation_status->per_gfx_clk_below_host_limit > 0) { violation_status->active_gfx_clk_below_host_limit = 1; @@ -1327,68 +1340,95 @@ amdsmi_status_t amdsmi_get_violation_status(amdsmi_processor_handle processor_ha ss << __PRETTY_FUNCTION__ << " | " << "ENTERED gfx_below_host_limit_acc | per_gfx_clk_below_host_limit: " << std::dec << violation_status->per_gfx_clk_below_host_limit - << "%; active_ppt_pwr = " << std::dec + << "%; active_ppt_pwr = " << std::boolalpha << violation_status->active_gfx_clk_below_host_limit << "\n"; LOG_DEBUG(ss); } - */ - uint64_t counter_delta = metric_info_b.accumulation_counter - metric_info_a.accumulation_counter; - auto calc_viol_actv_percent = [](auto priv_it1, auto end1, auto priv_it2, auto pub_it, auto act_it, auto viol_ptr, uint64_t counter_delta) { - for (; priv_it1 != end1; ++priv_it1, ++priv_it2, ++pub_it, ++act_it) { - auto& priv_it_arr2 = (*priv_it2).*viol_ptr; - auto& priv_it_arr1 = (*priv_it1).*viol_ptr; - for (size_t i = 0; i < AMDSMI_MAX_NUM_XCC; ++i) { - uint64_t value2 = priv_it_arr2[i]; - uint64_t value1 = priv_it_arr1[i]; - if ((value2 != std::numeric_limits::max() || - value1 != std::numeric_limits::max()) && - (value2 > value1) && (counter_delta > 0)) { - (*pub_it)[i] = ((value2 - value1) * 100) / counter_delta; - 
(*act_it)[i] = (((*pub_it)[i]) > 0) ? 1 : 0; + + // one-shot processing of all XCP violation metrics + // using a lambda function to avoid code duplication + using MetricArrayType = uint64_t[AMDSMI_MAX_NUM_XCC]; + using MetricMemberPtr = MetricArrayType amdsmi_gpu_xcp_metrics_t::*; + + auto process_all_XCP_violation_metrics = [&]( + const std::vector>& metric_members, + std::vector> per_arrays, + std::vector> active_arrays) { + uint64_t counter_delta = static_cast(metric_info_b.accumulation_counter) + - static_cast(metric_info_a.accumulation_counter); + + ss << __PRETTY_FUNCTION__ << " | Processing all XCP metrics with counter_delta: " + << std::dec << counter_delta << "\n"; + LOG_DEBUG(ss); + + for (size_t metric_idx = 0; metric_idx < metric_members.size(); ++metric_idx) { + const auto& member_pair = metric_members[metric_idx]; + const std::string& member_name = member_pair.first; + MetricMemberPtr member_ptr = member_pair.second; + + auto& per_arr = per_arrays[metric_idx].get(); + auto& active_arr = active_arrays[metric_idx].get(); + + ss << " [Metric] " << member_name << "\n"; + for (uint32_t xcp = 0; xcp < AMDSMI_MAX_NUM_XCP; ++xcp) { + const MetricArrayType& arr_a = metric_info_a.xcp_stats[xcp].*member_ptr; + const MetricArrayType& arr_b = metric_info_b.xcp_stats[xcp].*member_ptr; + ss << " xcp: " << xcp << " ("; + for (uint32_t xcc = 0; xcc < AMDSMI_MAX_NUM_XCC; ++xcc) { + uint64_t val_a = arr_a[xcc]; + uint64_t val_b = arr_b[xcc]; + + if (val_b == std::numeric_limits::max() || + val_a == std::numeric_limits::max() || + counter_delta <= 0 || + val_b < val_a) { + per_arr[xcp][xcc] = std::numeric_limits::max(); + active_arr[xcp][xcc] = std::numeric_limits::max(); + ss << "[Invalid] (" << std::dec << per_arr[xcp][xcc] + << ", " << static_cast(active_arr[xcp][xcc]) << ") "; + continue; + } + + uint64_t percent = ((val_b - val_a) * 100) / counter_delta; + per_arr[xcp][xcc] = percent; + active_arr[xcp][xcc] = (percent > 0) ? 
1 : 0; + ss << "[Valid] (" << std::dec << percent << "%, " + << std::boolalpha << static_cast(active_arr[xcp][xcc]) + << ") | val_b: " << std::dec << val_b + << ", val_a: " << std::dec << val_a + << ", counter_delta: " << std::dec << counter_delta << " "; } + ss << ")\n"; } } + LOG_DEBUG(ss); }; - calc_viol_actv_percent( - std::begin(metric_info_a.xcp_stats), - std::end(metric_info_a.xcp_stats), - std::begin(metric_info_b.xcp_stats), - std::begin(violation_status->per_gfx_clk_below_host_limit_pwr), - std::begin(violation_status->active_gfx_clk_below_host_limit_pwr), - &amdsmi_gpu_xcp_metrics_t::gfx_below_host_limit_ppt_acc, - counter_delta - ); + // Prepare metric members and arrays for processing + const std::vector> metric_members = { + {"gfx_below_host_limit_ppt_acc", &amdsmi_gpu_xcp_metrics_t::gfx_below_host_limit_ppt_acc}, + {"gfx_below_host_limit_thm_acc", &amdsmi_gpu_xcp_metrics_t::gfx_below_host_limit_thm_acc}, + {"gfx_low_utilization_acc", &amdsmi_gpu_xcp_metrics_t::gfx_low_utilization_acc}, + {"gfx_below_host_limit_total_acc", + &amdsmi_gpu_xcp_metrics_t::gfx_below_host_limit_total_acc} + }; - calc_viol_actv_percent( - std::begin(metric_info_a.xcp_stats), - std::end(metric_info_a.xcp_stats), - std::begin(metric_info_b.xcp_stats), - std::begin(violation_status->per_gfx_clk_below_host_limit_thm), - std::begin(violation_status->active_gfx_clk_below_host_limit_thm), - &amdsmi_gpu_xcp_metrics_t::gfx_below_host_limit_thm_acc, - counter_delta - ); - - calc_viol_actv_percent( - std::begin(metric_info_a.xcp_stats), - std::end(metric_info_a.xcp_stats), - std::begin(metric_info_b.xcp_stats), - std::begin(violation_status->per_low_utilization), - std::begin(violation_status->active_low_utilization), - &amdsmi_gpu_xcp_metrics_t::gfx_low_utilization_acc, - counter_delta - ); - - calc_viol_actv_percent( - std::begin(metric_info_a.xcp_stats), - std::end(metric_info_a.xcp_stats), - std::begin(metric_info_b.xcp_stats), - 
std::begin(violation_status->per_gfx_clk_below_host_limit_total), - std::begin(violation_status->active_gfx_clk_below_host_limit_total), - &amdsmi_gpu_xcp_metrics_t::gfx_below_host_limit_total_acc, - counter_delta - ); + process_all_XCP_violation_metrics( + metric_members, + { + std::ref(violation_status->per_gfx_clk_below_host_limit_pwr), + std::ref(violation_status->per_gfx_clk_below_host_limit_thrm), + std::ref(violation_status->per_low_utilization), + std::ref(violation_status->per_gfx_clk_below_host_limit_total) + }, + { + std::ref(violation_status->active_gfx_clk_below_host_limit_pwr), + std::ref(violation_status->active_gfx_clk_below_host_limit_thrm), + std::ref(violation_status->active_low_utilization), + std::ref(violation_status->active_gfx_clk_below_host_limit_total) + }); ss << __PRETTY_FUNCTION__ << " | " << "RETURNING AMDSMI_STATUS_SUCCESS | " @@ -1406,20 +1446,20 @@ amdsmi_status_t amdsmi_get_violation_status(amdsmi_processor_handle processor_ha << violation_status->per_vr_thrm << "; violation_status->per_hbm_thrm (%): " << std::dec << violation_status->per_hbm_thrm - << "; violation_status->per_gfx_clk_below_host_limit (%): " << std::dec //deprecated + << "; violation_status->per_gfx_clk_below_host_limit (%): " << std::dec // deprecated << violation_status->per_gfx_clk_below_host_limit - << "; violation_status->active_prochot_thrm (bool): " << std::dec + << "; violation_status->active_prochot_thrm (bool): " << std::boolalpha << static_cast(violation_status->active_prochot_thrm) - << "; violation_status->active_ppt_pwr (bool): " << std::dec + << "; violation_status->active_ppt_pwr (bool): " << std::boolalpha << static_cast(violation_status->active_ppt_pwr) - << "; violation_status->active_socket_thrm (bool): " << std::dec + << "; violation_status->active_socket_thrm (bool): " << std::boolalpha << static_cast(violation_status->active_socket_thrm) - << "; violation_status->active_vr_thrm (bool): " << std::dec + << "; violation_status->active_vr_thrm 
(bool): " << std::boolalpha << static_cast(violation_status->active_vr_thrm) - << "; violation_status->active_hbm_thrm (bool): " << std::dec + << "; violation_status->active_hbm_thrm (bool): " << std::boolalpha << static_cast(violation_status->active_hbm_thrm) - << "; violation_status->active_gfx_clk_below_host_limit (bool): " << std::dec //deprecated - << static_cast(violation_status->active_gfx_clk_below_host_limit) + << "; violation_status->active_gfx_clk_below_host_limit (bool): " // deprecated + << std::boolalpha << static_cast(violation_status->active_gfx_clk_below_host_limit) << "\n"; LOG_INFO(ss);