From 7ea98e06dd475f8ea638ced6f84b0bee24fb1c38 Mon Sep 17 00:00:00 2001 From: "Arif, Maisam" Date: Sat, 12 Apr 2025 01:54:57 -0500 Subject: [PATCH] [SWDEV-511234] Added amdsmi_get_gpu_cper_entries & CLI implementation Added amdsmi_get_gpu_cper_entries() in the python and C APIs Signed-off-by: Maisam Arif Signed-off-by: Oliveira, Daniel Co-authored-by: Saeed, Oosman Co-authored-by: AL Musaffar, Yazen [ROCm/amdsmi commit: d81871ef16bb99c5910fdf60a4ba94b50d947ef4] --- projects/amdsmi/CHANGELOG.md | 183 +++++---- projects/amdsmi/amdsmi_cli/amdsmi_cli.py | 12 +- projects/amdsmi/amdsmi_cli/amdsmi_commands.py | 105 ++++- projects/amdsmi/amdsmi_cli/amdsmi_helpers.py | 95 ++++- projects/amdsmi/amdsmi_cli/amdsmi_logger.py | 21 + projects/amdsmi/amdsmi_cli/amdsmi_parser.py | 80 +++- .../amdsmi/docs/reference/amdsmi-py-api.md | 50 +++ projects/amdsmi/include/amd_smi/amdsmi.h | 125 +++++- projects/amdsmi/py-interface/__init__.py | 1 + .../amdsmi/py-interface/amdsmi_interface.py | 137 ++++++- .../amdsmi/py-interface/amdsmi_wrapper.py | 207 ++++++++-- projects/amdsmi/src/amd_smi/amd_smi.cc | 272 +++++++++++++ .../amdsmi/tests/amd_smi_test/CMakeLists.txt | 5 +- .../amdsmi_get_gpu_cper_entries.cc | 365 ++++++++++++++++++ .../sys/kernel/debug/dri/1/amdgpu_ring_cper | Bin 0 -> 8736 bytes .../sys/kernel/debug/dri/17/amdgpu_ring_cper | Bin 0 -> 3304 bytes .../sys/kernel/debug/dri/25/amdgpu_ring_cper | Bin 0 -> 1504 bytes .../sys/kernel/debug/dri/33/amdgpu_ring_cper | Bin 0 -> 3456 bytes .../sys/kernel/debug/dri/41/amdgpu_ring_cper | Bin 0 -> 376 bytes .../sys/kernel/debug/dri/49/amdgpu_ring_cper | Bin 0 -> 848 bytes .../sys/kernel/debug/dri/57/amdgpu_ring_cper | Bin 0 -> 12 bytes .../sys/kernel/debug/dri/9/amdgpu_ring_cper | Bin 0 -> 3680 bytes projects/amdsmi/tests/amd_smi_test/main.cc | 1 - 23 files changed, 1532 insertions(+), 127 deletions(-) create mode 100644 projects/amdsmi/tests/amd_smi_test/amdsmi_get_gpu_cper_entries.cc create mode 100644 
projects/amdsmi/tests/amd_smi_test/cper/sys/kernel/debug/dri/1/amdgpu_ring_cper create mode 100644 projects/amdsmi/tests/amd_smi_test/cper/sys/kernel/debug/dri/17/amdgpu_ring_cper create mode 100644 projects/amdsmi/tests/amd_smi_test/cper/sys/kernel/debug/dri/25/amdgpu_ring_cper create mode 100644 projects/amdsmi/tests/amd_smi_test/cper/sys/kernel/debug/dri/33/amdgpu_ring_cper create mode 100644 projects/amdsmi/tests/amd_smi_test/cper/sys/kernel/debug/dri/41/amdgpu_ring_cper create mode 100644 projects/amdsmi/tests/amd_smi_test/cper/sys/kernel/debug/dri/49/amdgpu_ring_cper create mode 100644 projects/amdsmi/tests/amd_smi_test/cper/sys/kernel/debug/dri/57/amdgpu_ring_cper create mode 100644 projects/amdsmi/tests/amd_smi_test/cper/sys/kernel/debug/dri/9/amdgpu_ring_cper diff --git a/projects/amdsmi/CHANGELOG.md b/projects/amdsmi/CHANGELOG.md index ff45dc48ae..1bb1d57005 100644 --- a/projects/amdsmi/CHANGELOG.md +++ b/projects/amdsmi/CHANGELOG.md @@ -55,50 +55,96 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/pr ### Added -- N/A +- **Added dumping CPER entries from RAS tool `amdsmi_get_gpu_cper_entries()` to Python & C APIs.** + - CPER entries consist of `amdsmi_cper_hdr_t` + ```shell + typedef struct { + char signature[4]; /* "CPER" */ + uint16_t revision; + uint32_t signature_end; /* 0xFFFFFFFF */ + uint16_t sec_cnt; + amdsmi_cper_sev_t error_severity; + //valid_bits_t valid_bits; + //uint32_t valid_mask; + amdsmi_cper_valid_bits_t cper_valid_bits; + uint32_t record_length; /* Total size of CPER Entry */ + amdsmi_cper_timestamp_t timestamp; + char platform_id[16]; + amdsmi_cper_guid_t partition_id; /* Reserved */ + char creator_id[16]; + amdsmi_cper_guid_t notify_type; /* CMC, MCE, can use amdsmi_cper_notifiy_type_t to decode*/ + char record_id[8]; /* Unique CPER Entry ID */ + uint32_t flags; /* Reserved */ + uint64_t persistence_info; /* Reserved */ + uint8_t reserved[12]; /* Reserved */ + } amdsmi_cper_hdr_t; + ``` + + - Dumping 
CPER entries is also enabled in the CLI interface via `sudo amd-smi ras --cper` + + ```shell + $ sudo amd-smi ras --cper + Dumping CPER file header entries for GPU 0: + "0": { + "error_severity": "non_fatal_corrected", + "notify_type": "CMC", + "timestamp": "2025/04/08 18:23:44", + "signature": "CPER", + "revision": 256, + "signature_end": "0xffffffff", + "sec_cnt": 1, + "record_length": 472, + "platform_id": "0x1002:0x74A2", + "creator_id": "amdgpu", + "record_id": "5:1", + "flags": 0, + "persistence_info": 0 + } + ``` ### Changed - **Changed amd-smi partition --accelerator & `amdsmi_get_gpu_accelerator_partition_profile_config()` detect users running without root/sudo privledges** - Updated `amdsmi_get_gpu_accelerator_partition_profile_config()` to return `AMDSMI_STATUS_NO_PERM` immediately if users run without root/sudo permissions. - Updated `amd-smi partition --accelerator` to provide a warning for users without root/sudo permissions (see example below, ***output subject to change***). -```shell -$ amd-smi partition --accelerator -ACCELERATOR_PARTITION_PROFILES: + ```shell + $ amd-smi partition --accelerator -*************************************************************************** -** WARNING: ** -** ACCELERATOR_PARTITION_PROFILES requires sudo/root permissions to run. ** -** Please run the command with sudo permissions to get accurate results. 
** -*************************************************************************** + ACCELERATOR_PARTITION_PROFILES: -GPU_ID PROFILE_INDEX MEMORY_PARTITION_CAPS ACCELERATOR_TYPE PARTITION_ID NUM_PARTITIONS NUM_RESOURCES RESOURCE_INDEX RESOURCE_TYPE RESOURCE_INSTANCES RESOURCES_SHARED -N/A N/A N/A N/A 0 N/A N/A N/A N/A N/A N/A -N/A N/A N/A N/A 0 N/A N/A N/A N/A N/A N/A -N/A N/A N/A N/A 0 N/A N/A N/A N/A N/A N/A -N/A N/A N/A N/A 0 N/A N/A N/A N/A N/A N/A -N/A N/A N/A N/A 0 N/A N/A N/A N/A N/A N/A -N/A N/A N/A N/A 0 N/A N/A N/A N/A N/A N/A -N/A N/A N/A N/A 0 N/A N/A N/A N/A N/A N/A -N/A N/A N/A N/A 0 N/A N/A N/A N/A N/A N/A + *************************************************************************** + ** WARNING: ** + ** ACCELERATOR_PARTITION_PROFILES requires sudo/root permissions to run. ** + ** Please run the command with sudo permissions to get accurate results. ** + *************************************************************************** -ACCELERATOR_PARTITION_RESOURCES: -RESOURCE_INDEX RESOURCE_TYPE RESOURCE_INSTANCES RESOURCES_SHARED -N/A N/A N/A N/A -N/A N/A N/A N/A -N/A N/A N/A N/A -N/A N/A N/A N/A -N/A N/A N/A N/A -N/A N/A N/A N/A -N/A N/A N/A N/A -N/A N/A N/A N/A + GPU_ID PROFILE_INDEX MEMORY_PARTITION_CAPS ACCELERATOR_TYPE PARTITION_ID NUM_PARTITIONS NUM_RESOURCES RESOURCE_INDEX RESOURCE_TYPE RESOURCE_INSTANCES RESOURCES_SHARED + N/A N/A N/A N/A 0 N/A N/A N/A N/A N/A N/A + N/A N/A N/A N/A 0 N/A N/A N/A N/A N/A N/A + N/A N/A N/A N/A 0 N/A N/A N/A N/A N/A N/A + N/A N/A N/A N/A 0 N/A N/A N/A N/A N/A N/A + N/A N/A N/A N/A 0 N/A N/A N/A N/A N/A N/A + N/A N/A N/A N/A 0 N/A N/A N/A N/A N/A N/A + N/A N/A N/A N/A 0 N/A N/A N/A N/A N/A N/A + N/A N/A N/A N/A 0 N/A N/A N/A N/A N/A N/A + + ACCELERATOR_PARTITION_RESOURCES: + RESOURCE_INDEX RESOURCE_TYPE RESOURCE_INSTANCES RESOURCES_SHARED + N/A N/A N/A N/A + N/A N/A N/A N/A + N/A N/A N/A N/A + N/A N/A N/A N/A + N/A N/A N/A N/A + N/A N/A N/A N/A + N/A N/A N/A N/A + N/A N/A N/A N/A -Legend: - * = Current mode -``` + 
Legend: + * = Current mode + ``` - **Changed `amd-smi partition --current`, `amd-smi partition --accelerator`, and `amdsmi_get_gpu_accelerator_partition_profile()` to display partition ID for each individual partition** - Host will continue to display in the full array format, they do not display the individual partitions as Baremetal/Guest setups. @@ -106,44 +152,46 @@ Legend: reflect each individual partition ID, now provided in `partition_id[0]` location (as seen in other amd-smi CLI commands). This change was needed for BM/Guest setups due to other related partition outputs seen in (`amd-smi list` and `amd-smi static --partition`) and individual logical partition devices displayed. ***See examples below for reference.*** -Previous output: -```shell -$ amd-smi partition --current + Previous output: -CURRENT_PARTITION: -GPU_ID MEMORY ACCELERATOR_TYPE ACCELERATOR_PROFILE_INDEX PARTITION_ID -0 NPS1 CPX 3 0,1,2,3,4,5,6,7 -1 NPS1 CPX 3 N/A -2 NPS1 CPX 3 N/A -3 NPS1 CPX 3 N/A -4 NPS1 CPX 3 N/A -5 NPS1 CPX 3 N/A -6 NPS1 CPX 3 N/A -7 NPS1 CPX 3 N/A -8 NPS1 CPX 3 0,1,2,3,4,5,6,7 -9 NPS1 CPX 3 N/A -10 NPS1 CPX 3 N/A -... -``` + ```shell + $ amd-smi partition --current -New output: -```shell -amd-smi partition --current -CURRENT_PARTITION: -GPU_ID MEMORY ACCELERATOR_TYPE ACCELERATOR_PROFILE_INDEX PARTITION_ID -0 NPS1 CPX 3 0 -1 NPS1 CPX 3 1 -2 NPS1 CPX 3 2 -3 NPS1 CPX 3 3 -4 NPS1 CPX 3 4 -5 NPS1 CPX 3 5 -6 NPS1 CPX 3 6 -7 NPS1 CPX 3 7 -8 NPS1 CPX 3 0 -9 NPS1 CPX 3 1 -10 NPS1 CPX 3 2 -... -``` + CURRENT_PARTITION: + GPU_ID MEMORY ACCELERATOR_TYPE ACCELERATOR_PROFILE_INDEX PARTITION_ID + 0 NPS1 CPX 3 0,1,2,3,4,5,6,7 + 1 NPS1 CPX 3 N/A + 2 NPS1 CPX 3 N/A + 3 NPS1 CPX 3 N/A + 4 NPS1 CPX 3 N/A + 5 NPS1 CPX 3 N/A + 6 NPS1 CPX 3 N/A + 7 NPS1 CPX 3 N/A + 8 NPS1 CPX 3 0,1,2,3,4,5,6,7 + 9 NPS1 CPX 3 N/A + 10 NPS1 CPX 3 N/A + ... 
+ ``` + + New output: + + ```shell + amd-smi partition --current + CURRENT_PARTITION: + GPU_ID MEMORY ACCELERATOR_TYPE ACCELERATOR_PROFILE_INDEX PARTITION_ID + 0 NPS1 CPX 3 0 + 1 NPS1 CPX 3 1 + 2 NPS1 CPX 3 2 + 3 NPS1 CPX 3 3 + 4 NPS1 CPX 3 4 + 5 NPS1 CPX 3 5 + 6 NPS1 CPX 3 6 + 7 NPS1 CPX 3 7 + 8 NPS1 CPX 3 0 + 9 NPS1 CPX 3 1 + 10 NPS1 CPX 3 2 + ... + ``` ### Removed @@ -165,6 +213,7 @@ GPU_ID MEMORY ACCELERATOR_TYPE ACCELERATOR_PROFILE_INDEX PARTITION_ID - N/A + ## amd_smi_lib for ROCm 6.4.0 ### Added diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_cli.py b/projects/amdsmi/amdsmi_cli/amdsmi_cli.py index 8e69c0baa2..796121a53e 100755 --- a/projects/amdsmi/amdsmi_cli/amdsmi_cli.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_cli.py @@ -96,7 +96,8 @@ if __name__ == "__main__": amd_smi_commands.monitor, amd_smi_commands.rocm_smi, amd_smi_commands.xgmi, - amd_smi_commands.partition) + amd_smi_commands.partition, + amd_smi_commands.ras) try: try: argcomplete.autocomplete(amd_smi_parser) @@ -105,7 +106,7 @@ if __name__ == "__main__": valid_commands = ['version', 'list', 'static', 'firmware', 'bad-pages', 'metric', 'process', 'profile', 'event', 'topology', 'set', - 'reset', 'monitor', 'xgmi', 'partition', '--help', '-h'] + 'reset', 'monitor', 'xgmi', 'partition', 'ras', '--help', '-h'] sys.argv = [arg.lower() if arg.startswith('--') or not arg.startswith('-') else arg for arg in sys.argv] @@ -117,11 +118,12 @@ if __name__ == "__main__": raise amdsmi_cli_exceptions.AmdSmiInvalidSubcommandException(sys.argv[1],amd_smi_commands.logger.destination) # Handle command modifiers before subcommand execution - if args.json: + # human readable is the default output format + if hasattr(args, 'json') and args.json: amd_smi_commands.logger.format = amd_smi_commands.logger.LoggerFormat.json.value - if args.csv: + if hasattr(args, 'csv') and args.csv: amd_smi_commands.logger.format = amd_smi_commands.logger.LoggerFormat.csv.value - if args.file: + if hasattr(args, 'file') and args.file: 
amd_smi_commands.logger.destination = args.file # Remove previous log handlers diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py index 48cb58a84e..5ee21b5bad 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py @@ -34,12 +34,12 @@ from amdsmi_helpers import AMDSMIHelpers from amdsmi_logger import AMDSMILogger from amdsmi import amdsmi_exception, amdsmi_interface - class AMDSMICommands(): """This class contains all the commands corresponding to AMDSMIParser Each command function will interact with AMDSMILogger to handle displaying the output to the specified format and destination. """ + def __init__(self, format='human_readable', destination='stdout') -> None: self.helpers = AMDSMIHelpers() self.logger = AMDSMILogger(format=format, destination=destination) @@ -175,6 +175,7 @@ class AMDSMICommands(): elif self.logger.is_json_format() or self.logger.is_csv_format(): self.logger.print_output() + def list(self, args, multiple_devices=False, gpu=None): """List information for target gpu @@ -6160,6 +6161,108 @@ class AMDSMICommands(): with self.logger.destination.open('a', encoding="utf-8") as output_file: output_file.write(legend_output + '\n') + + def ras(self, args, multiple_devices=False, gpu=None, cper=None, + severity=None, folder=None, file_limit=None, follow=None): + """ + Retrieve and process CPER (RAS) entries for a target GPU. + + Expected command (all options only): + amd-smi ras --cper --severity=nonfatal-uncorrected,fatal --folder --file_limit=1000 --follow + + Since no timestamp is provided on the command line, the function starts from a default cursor of 0. + The output file name is auto-generated using the timestamp from the CPER header data (converted from + the header’s "YYYY/MM/DD HH:MM:SS" format), along with the GPU/platform ID and error severity. + """ + # GPU handle logic. 
+ if gpu: + args.gpu = gpu + if cper: + args.cper = cper + if severity: + args.severity = severity + if folder: + args.folder = folder + if file_limit: + args.file_limit = file_limit + if follow: + args.follow = follow + + if args.gpu == None: + args.gpu = self.device_handles + + self.helpers.check_required_groups() + handled_multiple_gpus, device_handle = self.helpers.handle_gpus(args, self.logger, self.ras) + if handled_multiple_gpus: + return + + args.gpu = device_handle + + # Parse severity mask dynamically from the --severity option. + severity_mask = 0 + # drop duplicates of args + logging.debug(args) + for sev in list(set(args.severity)): + if sev == "all": + # Set bits for NON_FATAL_UNCORRECTED (0), FATAL (1), and NON_FATAL_CORRECTED (2) + severity_mask |= ((1 << 0) | (1 << 1) | (1 << 2)) + elif sev == "fatal": + # Set bit corresponding to AMDSMI_CPER_SEV_FATAL (which is 1) + severity_mask |= (1 << 1) + elif sev in ("nonfatal", "nonfatal-uncorrected"): + # Set bit corresponding to AMDSMI_CPER_SEV_NON_FATAL_UNCORRECTED (which is 0) + severity_mask |= (1 << 0) + elif sev in ("nonfatal-corrected", "corrected"): + # Set bit corresponding to AMDSMI_CPER_SEV_NON_FATAL_CORRECTED (which is 2) + severity_mask |= (1 << 2) + + if args.cper: + # Start from cursor 0 (no timestamp argument provided). + cursor = 0 + buffer_size = 1048576 + file_limit = int(args.file_limit) if args.file_limit else 1000 + + # Print exit message only once and only when follow is set + if self.logger.cper_exit_message() and args.follow: + print('Press q and hit ENTER when you want to stop.') + self.logger.set_cper_exit_message(False) + + # Main loop: continuously retrieve CPER entries if --follow is set. 
+ gpu_id = self.helpers.get_gpu_id_from_device_handle(args.gpu) + if args.folder: + print(f'Dumping CPER file header entries for GPU {gpu_id} in folder {args.folder}\n') + else: + print(f'Dumping CPER file header entries for GPU {gpu_id}:\n') + + self.stop = False + while True: + try: + entries, new_cursor, cper_data = amdsmi_interface.amdsmi_get_gpu_cper_entries( + args.gpu, severity_mask, buffer_size, cursor) + logging.debug(f"cper_entries | entries: {entries}") + except amdsmi_exception.AmdSmiLibraryException as e: + if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: + raise PermissionError('Error opening CPER file. This command requires elevation') from e + if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_FILE_NOT_FOUND: + raise FileNotFoundError('Error opening CPER file. This command requires a CPER to be enabled.') from e + if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_FILE_ERROR: + raise FileExistsError('Error opening CPER file. Unable to read CPER File') from e + else: + logging.debug(f"Error retrieving CPER entries: {e}") + break + if entries: + self.helpers.dump_entries(args.folder, entries, cper_data) + if len(entries) == 0 or not args.follow: + break + cursor = new_cursor + time.sleep(5) + user_input = input() + if user_input == 'q': + print("Escape Sequence Detected; Exiting") + self.stop = True + break + + def _event_thread(self, commands, i): devices = commands.device_handles if len(devices) == 0: diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py b/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py index f110c1339d..6f869304ec 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py @@ -19,18 +19,19 @@ # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+import grp +import json import logging import math +import multiprocessing import os -import grp import platform +import re import sys import time -import re -import multiprocessing -import json from enum import Enum +from pathlib import Path from typing import List, Set, Union from amdsmi_init import * @@ -55,7 +56,11 @@ class AMDSMIHelpers(): self._is_linux = False self._is_windows = False + + # Counts and Tracking variables self._count_of_sets_called = 0 + self._count_of_cper_files = 0 + # Check if the system is a virtual OS if self.operating_system.startswith("Linux"): @@ -95,6 +100,7 @@ class AMDSMIHelpers(): except amdsmi_exception.AmdSmiLibraryException as e: logging.debug("Unable to determine virtualization status: " + str(e.get_error_code())) + def increment_set_count(self): self._count_of_sets_called += 1 @@ -103,6 +109,14 @@ class AMDSMIHelpers(): return self._count_of_sets_called + def increment_cper_count(self): + self._count_of_cper_files += 1 + + + def get_cper_count(self): + return self._count_of_cper_files + + def is_virtual_os(self): return self._is_virtual_os @@ -116,6 +130,7 @@ class AMDSMIHelpers(): # Returns True if system is baremetal, if system is hypervisor this should return False return self._is_baremetal + def is_passthrough(self): return self._is_passthrough @@ -197,7 +212,7 @@ class AMDSMIHelpers(): """ cpu_choices = {} cpu_choices_str = "" - #import pdb;pdb.set_trace() + try: cpu_handles = [] # amdsmi_get_cpusocket_handles() returns the cpu socket handles stored for cpu_id @@ -230,6 +245,7 @@ class AMDSMIHelpers(): return (cpu_choices, cpu_choices_str) + def get_core_choices(self): """Return dictionary of possible Core choices and string of the output: Dictionary will be in format: coress[ID]: Device Handle) @@ -705,11 +721,13 @@ class AMDSMIHelpers(): except: return False + def get_perf_levels(self): perf_levels_str = [clock.name for clock in amdsmi_interface.AmdSmiDevPerfLevel] perf_levels_int = list(set(clock.value for clock in 
amdsmi_interface.AmdSmiDevPerfLevel)) return perf_levels_str, perf_levels_int + def get_accelerator_partition_profile_config(self): device_handles = amdsmi_interface.amdsmi_get_processor_handles() accelerator_partition_profiles = {'profile_indices':[], 'profile_types':[], 'memory_caps': []} @@ -726,6 +744,7 @@ class AMDSMIHelpers(): break return accelerator_partition_profiles + def get_accelerator_choices_types_indices(self): return_val = ("N/A", {'profile_indices':[], 'profile_types':[]}) accelerator_partition_profiles = self.get_accelerator_partition_profile_config() @@ -735,6 +754,7 @@ class AMDSMIHelpers(): return_val = (accelerator_choices, accelerator_partition_profiles) return return_val + def get_memory_partition_types(self): memory_partitions_str = [partition.name for partition in amdsmi_interface.AmdSmiMemoryPartitionType] if 'UNKNOWN' in memory_partitions_str: @@ -854,6 +874,7 @@ class AMDSMIHelpers(): else: sys.exit('Confirmation not given. Exiting without setting value') + def confirm_changing_memory_partition_gpu_reload_warning(self, auto_respond=False): """ Print the warning for running outside of specification and prompt user to accept the terms. @@ -879,6 +900,7 @@ class AMDSMIHelpers(): print('Confirmation not given. Exiting without setting value') sys.exit(1) + def is_valid_profile(self, profile): profile_presets = amdsmi_interface.amdsmi_wrapper.amdsmi_power_profile_preset_masks_t__enumvalues if profile in profile_presets: @@ -924,6 +946,7 @@ class AMDSMIHelpers(): return f"{value} {unit}".rstrip() return f"{value}" + class SI_Unit(float, Enum): GIGA = 1000000000 # 10^9 MEGA = 1000000 # 10^6 @@ -937,6 +960,7 @@ class AMDSMIHelpers(): MICRO = 0.000001 # 10^-6 NANO = 0.000000001 # 10^-9 + def convert_SI_unit(self, val: Union[int, float], unit_in: SI_Unit, unit_out = SI_Unit.BASE) -> Union[int, float]: """This function will convert a value into another scientific (SI) unit. 
Defaults unit_out to SI_Unit.BASE @@ -956,6 +980,7 @@ class AMDSMIHelpers(): else: raise TypeError("val must be an int or float") + def get_pci_device_ids(self) -> Set[str]: pci_devices_path = "/sys/bus/pci/devices" pci_devices: set[str] = set() @@ -969,6 +994,7 @@ class AMDSMIHelpers(): continue return pci_devices + def progressbar(self, it, prefix="", size=60, out=sys.stdout, add_newline=False): count = len(it) if (add_newline): @@ -985,12 +1011,14 @@ class AMDSMIHelpers(): show(i+1) print("\n\n", end='\r', flush=True, file=out) + def showProgressbar(self, title="", timeInSeconds=13, add_newline=False): if title != "": title += " " for i in self.progressbar(range(timeInSeconds), title, 40, add_newline=add_newline): time.sleep(1) + def check_required_groups(self): """ Check if the current user is a member of the required groups. @@ -1016,3 +1044,60 @@ class AMDSMIHelpers(): ) % ", ".join(sorted(missing_groups)) print(msg) logging.warning(msg) + + def hexdump(self, data, size, filepath): + """ + Converts binary data to a hex dump string, similar to the hexdump utility. + """ + def to_printable_ascii(byte): + return chr(byte) if 32 <= byte <= 126 else "." + + with open(filepath, 'w') as f: + offset = 0 + while offset < size: + chunk = data[offset:offset + 16] + hex_values = " ".join(f"{byte:02x}" for byte in chunk) + ascii_values = "".join(to_printable_ascii(byte) for byte in chunk) + print(f"{offset:08x} {hex_values:<48} |{ascii_values}|", file=f) + offset += 16 + + def dump_entries(self, folder, entries, cper_data): + if folder: + folder = Path(folder) + folder.mkdir(parents=True, exist_ok=True) # Ensure folder exists + + # Loop through all entries in the dictionary. + for entry_index, entry in enumerate(entries.values()): + # Assume 'entry' is a dictionary with keys: "error_severity" and "notify_type". 
+ error_severity = entry.get("error_severity", "Unknown") + notify_type = entry.get("notify_type", "Unknown") + + if error_severity == "non_fatal_uncorrected": + prefix = "uncorrected" + elif error_severity == "non_fatal_corrected": + prefix = "corrected" + elif error_severity == "fatal": + prefix = "fatal" + if notify_type == "BOOT": + prefix = "boot" + + # Construct a unique file name using the key to avoid overwriting + entry_file = f"{prefix}_{self.get_cper_count()}.json" + output_path = folder / entry_file + + cper_data_file = f"{prefix}_{self.get_cper_count()}.cper" + cper_data_file_path = folder / cper_data_file + self.hexdump(cper_data[entry_index]["bytes"], cper_data[entry_index]["size"], cper_data_file_path) + + try: + with output_path.open("w") as f: + logging.debug(f"Writing entry {self.get_cper_count()}: {entry} to {output_path}") + # Dump the single entry as JSON, handling bytes via the lambda. + f.write(json.dumps(entry, indent=2, + default=lambda o: o.decode('utf-8') if isinstance(o, bytes) else o)) + except Exception as e: + logging.error(f"Failed to write entry {self.get_cper_count()} to {output_path}: {e}") + else: + print(json.dumps(entries, indent=2, + default=lambda o: o.decode('utf-8') if isinstance(o, bytes) else o)) + self.increment_cper_count() \ No newline at end of file diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_logger.py b/projects/amdsmi/amdsmi_cli/amdsmi_logger.py index 99b47e876d..90d8de9323 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_logger.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_logger.py @@ -42,6 +42,7 @@ class AMDSMILogger(): self.secondary_table_header = "" self.warning_message = "" self.helpers = AMDSMIHelpers() + self._cper_exit_message = True class LoggerFormat(Enum): @@ -78,6 +79,26 @@ class AMDSMILogger(): self.multiple_device_output.clear() + def cper_exit_message(self): + """ Store the cper exit message + params: + message (str) - message to store + return: + cper_exit_message (bool) - True if cper exit message 
is set + """ + return self._cper_exit_message + + + def set_cper_exit_message(self, flag:bool): + """ Set the cper exit message + params: + flag (bool) - True if cper exit message is set + return: + Nothing + """ + self._cper_exit_message = flag + + def _capitalize_keys(self, input_dict): output_dict = {} for key in input_dict.keys(): diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_parser.py b/projects/amdsmi/amdsmi_cli/amdsmi_parser.py index 46c19c06b7..ff006cb1a1 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_parser.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_parser.py @@ -69,7 +69,7 @@ class AMDSMIParser(argparse.ArgumentParser): """ def __init__(self, version, list, static, firmware, bad_pages, metric, process, profile, event, topology, set_value, reset, monitor, - rocmsmi, xgmi, partition): + rocmsmi, xgmi, partition, ras): # Helper variables self.helpers = AMDSMIHelpers() @@ -115,7 +115,7 @@ class AMDSMIParser(argparse.ArgumentParser): # Store possible subcommands & aliases for later errors self.possible_commands = ['version', 'list', 'static', 'firmware', 'ucode', 'bad-pages', 'metric', 'process', 'profile', 'event', 'topology', 'set', - 'reset', 'monitor', 'dmon', 'xgmi', 'partition'] + 'reset', 'monitor', 'dmon', 'xgmi', 'partition', 'ras'] # Add all subparsers self._add_version_parser(self.subparsers, version) @@ -134,6 +134,7 @@ class AMDSMIParser(argparse.ArgumentParser): self._add_rocm_smi_parser(self.subparsers, rocmsmi) self._add_xgmi_parser(self.subparsers, xgmi) self._add_partition_parser(self.subparsers, partition) + self._add_ras_parser(self.subparsers, ras) def _not_negative_int(self, int_value, sub_arg=None): @@ -241,6 +242,24 @@ class AMDSMIParser(argparse.ArgumentParser): return AMDSMIFreqArgs + def _check_folder_path(self): + """ Argument action validator: + Returns a path to folder from the folder path provided. + If the path doesn't exist create it. 
+ """ + class CheckOutputFilePath(argparse.Action): + outputformat = self.helpers.get_output_format() + # Checks the values + def __call__(self, parser, args, values, option_string=None): + path = Path(values) + path.mkdir(parents=True, exist_ok=True) + if not path.exists(): + raise amdsmi_cli_exceptions.AmdSmiInvalidFilePathException(path, CheckOutputFilePath.outputformat) + elif path.is_dir(): + setattr(args, self.dest, path) + return CheckOutputFilePath + + def _check_output_file_path(self): """ Argument action validator: Returns a path to a file from the output file path provided. @@ -408,7 +427,7 @@ class AMDSMIParser(argparse.ArgumentParser): return _CoreSelectAction - def _add_command_modifiers(self, subcommand_parser: argparse.ArgumentParser): + def _add_command_modifiers(self, subcommand_parser: argparse.ArgumentParser, logging_only=False): json_help = "Displays output in JSON format (human readable by default)." csv_help = "Displays output in CSV format (human readable by default)." file_help = "Saves output into a file on the provided path (stdout by default)." 
def _add_ras_parser(self, subparsers: argparse._SubParsersAction, func):
    """
    Registers the 'ras' subcommand on the given subparsers object.

    Expected command:
        amd-smi ras --cper --severity=nonfatal-uncorrected,fatal --folder <path> --file_limit=1000 --follow

    Everything is passed through options; there are no positional arguments.
    Device selection arguments are attached as optional, and the command
    modifiers are attached in logging-only mode.

    params:
        subparsers (argparse._SubParsersAction) - collection the 'ras' parser is added to
        func - callback stored as the parser's 'func' default
    return:
        Nothing
    """
    severity_choices = ["nonfatal-uncorrected", "fatal", "nonfatal-corrected", "all"]

    ras_parser = subparsers.add_parser(
        "ras",
        help="Retrieve CPER (RAS) entries from the driver",
        description=(
            "Retrieve and decode CPER (RAS) entries from the kernel driver.\n"
            "Supports filtering by severity, exporting to different formats, and continuous monitoring.\n"
            "This command accepts options only; no positional arguments are required."
        ),
    )
    ras_parser.formatter_class = lambda prog: AMDSMISubparserHelpFormatter(prog)
    ras_parser.set_defaults(func=func)

    # CPER retrieval options
    ras_parser.add_argument("--cper", action="store_true", required=True,
                            help="Trigger CPER data retrieval")
    ras_parser.add_argument("--severity", type=str.lower, nargs='+', default=['all'],
                            help="Set the SEVERITY filters from the following:\n " + ", ".join(severity_choices),
                            choices=severity_choices, metavar='SEVERITY')
    ras_parser.add_argument("--folder", type=str, action=self._check_folder_path(), default=False,
                            help="Folder to dump CPER report files")
    ras_parser.add_argument("--file_limit", type=self._positive_int, action='store', default=1000,
                            help="Maximum number of entries per output file")
    ras_parser.add_argument("--follow", action="store_true", default=False,
                            help="Continuously monitor for new entries")

    # Shared device selection and command modifier arguments
    self._add_device_arguments(ras_parser, required=False)
    self._add_command_modifiers(ras_parser, logging_only=True)
Exceptions that can be thrown by `amdsmi_get_gpu_cper_entries` function:

* `AmdSmiLibraryException`
* `AmdSmiParameterException`

Example:

```python
try:
    devices = amdsmi_get_processor_handles()
    for device in devices:
        entries, new_cursor, cper_data = amdsmi_get_gpu_cper_entries(device, severity_mask, buffer_size, initial_cursor)
        print("CPER entries for device", device)
        for key, entry in entries.items():
            print("Entry", key)
            print("  Error Severity:", entry.get("error_severity", "Unknown"))
            print("  Notify Type:", entry.get("notify_type", "Unknown"))
            print("  Timestamp:", entry.get("timestamp", ""))
            print()
        print("New Cursor Position:", new_cursor)
except AmdSmiException as e:
    print(e)
```
AMDSMI_CPER_NOTIFY_TYPE_PCIE = 0x4dfc1A16CF93C01F, + AMDSMI_CPER_NOTIFY_TYPE_INIT = 0x454a9308CC5263E8, + AMDSMI_CPER_NOTIFY_TYPE_NMI = 0x42c9B7E65BAD89FF, + AMDSMI_CPER_NOTIFY_TYPE_BOOT = 0x409aAB403D61A466, + AMDSMI_CPER_NOTIFY_TYPE_DMAR = 0x4c27C6B3667DD791, + AMDSMI_CPER_NOTIFY_TYPE_SEA = 0x11E4BBE89A78788A, + AMDSMI_CPER_NOTIFY_TYPE_SEI = 0x4E87B0AE5C284C81, + AMDSMI_CPER_NOTIFY_TYPE_PEI = 0x4214520409A9D5AC, + AMDSMI_CPER_NOTIFY_TYPE_CXL_COMPONENT = 0x49A341DF69293BC9, +} amdsmi_cper_notify_type_t; + /** * @brief The current ECC state * @@ -3360,6 +3384,7 @@ amdsmi_get_gpu_bad_page_info(amdsmi_processor_handle processor_handle, uint32_t amdsmi_status_t amdsmi_get_gpu_bad_page_threshold(amdsmi_processor_handle processor_handle, uint32_t *threshold); + /** * @brief Verify the checksum of RAS EEPROM. It is not supported on virtual * machine guest @@ -4645,6 +4670,104 @@ amdsmi_status_t amdsmi_get_gpu_ecc_enabled(amdsmi_processor_handle processor_han amdsmi_status_t amdsmi_get_gpu_total_ecc_count(amdsmi_processor_handle processor_handle, amdsmi_error_count_t *ec); + +#pragma pack(push, 1) +typedef struct { + unsigned char b[16]; +} amdsmi_cper_guid_t; + +typedef struct { + uint8_t seconds; + uint8_t minutes; + uint8_t hours; + uint8_t flag; + uint8_t day; + uint8_t month; + uint8_t year; + uint8_t century; +} amdsmi_cper_timestamp_t; + +typedef struct { + uint32_t platform_id : 1; + uint32_t timestamp : 1; + uint32_t partition_id : 1; + uint32_t reserved : 29; +} valid_bits_t; + +typedef union { + struct valid_bits_ { + uint32_t platform_id : 1; + uint32_t timestamp : 1; + uint32_t partition_id : 1; + uint32_t reserved : 29; + } valid_bits; + uint32_t valid_mask; +} amdsmi_cper_valid_bits_t; + +typedef struct { + char signature[4]; /* "CPER" */ + uint16_t revision; + uint32_t signature_end; /* 0xFFFFFFFF */ + uint16_t sec_cnt; + amdsmi_cper_sev_t error_severity; + + // valid_bits_t valid_bits; + // uint32_t valid_mask; + amdsmi_cper_valid_bits_t cper_valid_bits; 
/**
 *  @brief Retrieve CPER entries cached in the driver.
 *
 *  The user passes buffers to hold the CPER data and the CPER headers. The library
 *  fills the data buffer with the entries that match the severity_mask the user passed.
 *  It also parses each CPER header and stores a pointer to it in the cper_hdrs array, so
 *  the user can read the timestamp and other header information through cper_hdrs.
 *  A cursor is also returned to the user, which can be used to get the next set of CPER entries.
 *
 *  If there is more data than fits in any of the buffers the user passed, the library
 *  returns AMDSMI_STATUS_MORE_DATA. The user can call the API again with the cursor
 *  returned by the previous call to get more data.
 *  If the buffer size is too small to hold even one entry, the library
 *  returns AMDSMI_STATUS_OUT_OF_RESOURCES.
 *
 *  Even after the API has returned AMDSMI_STATUS_MORE_DATA, the next call may still report
 *  entry_count == 0, because the driver cache may not contain any entry of the severity the
 *  user is interested in. The API returns AMDSMI_STATUS_SUCCESS in this case so that the
 *  user can ignore that call.
 *
 *  @ingroup tagECCInfo
 *
 *  @platform{gpu_bm_linux} @platform{host} @platform{guest_1vf}
 *
 *  @param[in] processor_handle Handle to the processor for which CPER entries are to be retrieved.
 *  @param[in] severity_mask The severity mask of the entries to be retrieved.
 *  @param[in,out] cper_data Pointer to a buffer where the CPER data will be stored. The user must
 *  allocate the buffer and set buf_size correctly.
 *  @param[in,out] buf_size Pointer to a variable that specifies the size of cper_data.
 *  On return, it contains the actual size of the data written to cper_data.
 *  @param[in,out] cper_hdrs Array of pointers to the parsed headers of cper_data. The user must
 *  allocate the array of pointers. The library fills the array with pointers to the parsed
 *  headers; the underlying data lives in the cper_data buffer and only the pointers are
 *  stored in this array.
 *  @param[in,out] entry_count Pointer to a variable that specifies the length of the cper_hdrs
 *  array the user allocated. On return, it contains the number of entries written to cper_hdrs.
 *  @param[in,out] cursor Pointer to a variable that will contain the cursor for the next call.
 *
 *  @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail
 */
amdsmi_status_t
amdsmi_get_gpu_cper_entries(amdsmi_processor_handle processor_handle, uint32_t severity_mask, char *cper_data,
    uint64_t *buf_size, amdsmi_cper_hdr_t** cper_hdrs, uint64_t *entry_count, uint64_t *cursor);
class AmdSmiCperNotifyType(Enum):
    """CPER notify types, one member per AMDSMI_CPER_NOTIFY_TYPE_* constant in amdsmi_wrapper.

    _notifyTypeToString() looks a decoded value up in this enum and returns the
    member name, so the member names below are the strings reported to users.
    """
    CMC = amdsmi_wrapper.AMDSMI_CPER_NOTIFY_TYPE_CMC
    CPE = amdsmi_wrapper.AMDSMI_CPER_NOTIFY_TYPE_CPE
    MCE = amdsmi_wrapper.AMDSMI_CPER_NOTIFY_TYPE_MCE
    PCIE = amdsmi_wrapper.AMDSMI_CPER_NOTIFY_TYPE_PCIE
    INIT = amdsmi_wrapper.AMDSMI_CPER_NOTIFY_TYPE_INIT
    NMI = amdsmi_wrapper.AMDSMI_CPER_NOTIFY_TYPE_NMI
    BOOT = amdsmi_wrapper.AMDSMI_CPER_NOTIFY_TYPE_BOOT
    # NOTE(review): mixed-case name, unlike its siblings; renaming it would change
    # the "notify_type" string produced for this value.
    DMAr = amdsmi_wrapper.AMDSMI_CPER_NOTIFY_TYPE_DMAR
    SEA = amdsmi_wrapper.AMDSMI_CPER_NOTIFY_TYPE_SEA
    SEI = amdsmi_wrapper.AMDSMI_CPER_NOTIFY_TYPE_SEI
    PEI = amdsmi_wrapper.AMDSMI_CPER_NOTIFY_TYPE_PEI
    CXL_COMPONENT = amdsmi_wrapper.AMDSMI_CPER_NOTIFY_TYPE_CXL_COMPONENT
def _notifyTypeToString(notify_type_b):
    """Translate the raw notify-type GUID bytes of a CPER header into a readable name.

    Only the first 8 bytes are used. They are read from index 7 down to index 0,
    so index 7 is the most significant byte of the resulting integer. Returns the
    matching AmdSmiCperNotifyType member name, or "Unknown" when no member has
    that value.
    """
    # Hex digits of bytes 7..0, most significant first
    low_qword_hex = "".join(format(octet, '02x') for octet in notify_type_b[7::-1])
    guid_value = int(low_qword_hex, 16)
    if guid_value not in AmdSmiCperNotifyType._value2member_map_:
        return "Unknown"
    # Known value: report the enum member's name
    return AmdSmiCperNotifyType(guid_value).name
def amdsmi_get_gpu_cper_entries(
    processor_handle: amdsmi_wrapper.amdsmi_processor_handle,
    severity_mask: int,
    buffer_size: int = 4*1048576,
    cursor: int = 0
) -> Tuple[Dict[int, Dict[str, Any]], int, List[Dict[str, Any]]]:
    """Retrieve and decode CPER entries for a GPU. Wraps the same named function call.

    Parameters:
        processor_handle: device which to query.
        severity_mask: severity mask of the entries to be retrieved.
        buffer_size: size in bytes of the buffer allocated for the raw CPER data.
        cursor: cursor returned by a previous call (0 to start from the beginning).

    Returns:
        A 3-tuple ``(entries, cursor, cper_data)``:
            entries   - dict mapping the entry index to a dict of decoded header fields.
            cursor    - cursor value to pass to the next call.
            cper_data - list of ``{"bytes": [...], "size": record_length}`` dicts holding
                        the raw record of each entry (bytes are signed, ``c_byte``).

    Raises:
        AmdSmiParameterException: processor_handle has the wrong type.
        AmdSmiLibraryException: the library returned anything other than
            AMDSMI_STATUS_SUCCESS.
    """
    if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle):
        raise AmdSmiParameterException(
            processor_handle, amdsmi_wrapper.amdsmi_processor_handle
        )

    # Capacity of the header-pointer array handed to the library for one call.
    max_entries = 20
    header_type = amdsmi_wrapper.amdsmi_cper_hdr_t

    # Allocate a buffer for CPER data.
    buf = ctypes.create_string_buffer(buffer_size)
    buf_size = ctypes.c_uint64(buffer_size)
    entry_count = ctypes.c_uint64(max_entries)
    cur = ctypes.c_uint64(cursor)
    # Allocate the array of CPER header pointers the library fills in.
    cper_hdrs_array = (ctypes.POINTER(header_type) * max_entries)()
    cper_hdrs = ctypes.cast(cper_hdrs_array, ctypes.POINTER(ctypes.POINTER(header_type)))

    # Call the underlying AMD-SMI API.
    ret = amdsmi_wrapper.amdsmi_get_gpu_cper_entries(
        processor_handle,
        ctypes.c_uint32(severity_mask),
        buf,
        ctypes.byref(buf_size),
        cper_hdrs,
        ctypes.byref(entry_count),
        ctypes.byref(cur)
    )
    # NOTE(review): AMDSMI_STATUS_MORE_DATA is also raised here, so a caller only
    # learns about remaining entries through the exception - confirm this is intended.
    if ret != amdsmi_wrapper.AMDSMI_STATUS_SUCCESS:
        raise AmdSmiLibraryException(ret)

    header_size = ctypes.sizeof(header_type)
    base_address = ctypes.addressof(buf)
    entries = {}
    cper_data = []
    offset = 0
    # Entries are laid out back to back; each one is record_length bytes long.
    for i in range(entry_count.value):
        # Never read a header that does not fit inside the buffer we allocated.
        if offset + header_size > buffer_size:
            break
        entry_address = base_address + offset
        header = header_type.from_address(entry_address)
        record_length = header.record_length
        # A record shorter than its header, or one running past the end of the
        # buffer, is corrupt: stop instead of reading foreign memory or looping
        # on the same entry.
        if record_length < header_size or offset + record_length > buffer_size:
            break

        cper_data.append({
            "bytes": list((record_length * ctypes.c_byte).from_address(entry_address)),
            "size": record_length
        })

        # Extract the timestamp fields.
        year = header.timestamp.year
        # Adjust the year if it's less than 100 (two-digit year).
        if year < 100:
            year += 2000
        formatted_timestamp = (
            f"{year:04d}/"
            f"{header.timestamp.month:02d}/"
            f"{header.timestamp.day:02d} "
            f"{header.timestamp.hours:02d}:"
            f"{header.timestamp.minutes:02d}:"
            f"{header.timestamp.seconds:02d}"
        )
        entries[i] = {
            "error_severity": amdsmi_wrapper.amdsmi_cper_sev_t__enumvalues.get(header.error_severity, "AMDSMI_CPER_SEV_UNUSED").replace("AMDSMI_CPER_SEV_", "").lower(),
            "notify_type": _notifyTypeToString(header.notify_type.b),
            "timestamp": formatted_timestamp,
            "signature": header.signature,
            "revision": header.revision,
            "signature_end": hex(header.signature_end),
            "sec_cnt": header.sec_cnt,
            "record_length": record_length,
            "platform_id": header.platform_id,
            "creator_id": header.creator_id,
            "record_id": header.record_id,
            "flags": header.flags,
            "persistence_info": header.persistence_info,
        }
        # Use the actual record length to advance to the next entry.
        offset += record_length

    return entries, cur.value, cper_data
amdsmi_wrapper.amdsmi_processor_handle, @@ -2938,7 +3061,7 @@ def amdsmi_get_gpu_memory_partition_config(processor_handle: amdsmi_wrapper.amds raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) - + config = amdsmi_wrapper.amdsmi_memory_partition_config_t() _check_res( @@ -3017,10 +3140,10 @@ def amdsmi_get_gpu_accelerator_partition_profile( ) profile_type_ret = amdsmi_wrapper.amdsmi_accelerator_partition_type_t__enumvalues[profile.profile_type].replace("AMDSMI_ACCELERATOR_PARTITION_", "") profile_type_ret = profile_type_ret.replace("INVALID", "N/A") - + length = profile.num_partitions partition_ids = [] - + #partition_id[0] will contain the partition id of each device #BM/Guest will include this logic. Host will only display primary partition ids. kPOSITION_OF_PARTITION_ID = 0 @@ -3079,7 +3202,7 @@ def amdsmi_get_gpu_accelerator_partition_profile_config(processor_handle: amdsmi profile_type_ret = profile_type_ret.replace("INVALID", "N/A") resources = [] - + mem_caps_list = [] if profile.memory_caps.nps_flags.nps1_cap == 1: mem_caps_list.append("NPS1") @@ -3104,7 +3227,7 @@ def amdsmi_get_gpu_accelerator_partition_profile_config(processor_handle: amdsmi logging.debug("\namdsmi_interface.py | amdsmi_get_gpu_accelerator_partition_profile_config | resource_profile_dict = " + str(resource_profile_dict)) resources.append(resource_profile_dict) resource_idx += 1 - + profile_dict = { "profile_type": profile_type_ret, "num_partitions": profile.num_partitions, diff --git a/projects/amdsmi/py-interface/amdsmi_wrapper.py b/projects/amdsmi/py-interface/amdsmi_wrapper.py index e39bd3e881..fd1acf4125 100644 --- a/projects/amdsmi/py-interface/amdsmi_wrapper.py +++ b/projects/amdsmi/py-interface/amdsmi_wrapper.py @@ -324,6 +324,7 @@ amdsmi_status_t__enumvalues = { 54: 'AMDSMI_STATUS_AMDGPU_RESTART_ERR', 55: 'AMDSMI_STATUS_SETTING_UNAVAILABLE', 56: 'AMDSMI_STATUS_CORRUPTED_EEPROM', + 57: 'AMDSMI_STATUS_MORE_DATA', 4294967294: 
'AMDSMI_STATUS_MAP_ERROR', 4294967295: 'AMDSMI_STATUS_UNKNOWN_ERROR', } @@ -369,6 +370,7 @@ AMDSMI_STATUS_ARG_PTR_NULL = 53 AMDSMI_STATUS_AMDGPU_RESTART_ERR = 54 AMDSMI_STATUS_SETTING_UNAVAILABLE = 55 AMDSMI_STATUS_CORRUPTED_EEPROM = 56 +AMDSMI_STATUS_MORE_DATA = 57 AMDSMI_STATUS_MAP_ERROR = 4294967294 AMDSMI_STATUS_UNKNOWN_ERROR = 4294967295 amdsmi_status_t = ctypes.c_uint32 # enum @@ -856,21 +858,6 @@ amdsmi_card_form_factor_t = ctypes.c_uint32 # enum class struct_amdsmi_pcie_info_t(Structure): pass -class struct_pcie_static_(Structure): - pass - -struct_pcie_static_._pack_ = 1 # source:False -struct_pcie_static_._fields_ = [ - ('max_pcie_width', ctypes.c_uint16), - ('PADDING_0', ctypes.c_ubyte * 2), - ('max_pcie_speed', ctypes.c_uint32), - ('pcie_interface_version', ctypes.c_uint32), - ('slot_type', amdsmi_card_form_factor_t), - ('max_pcie_interface_version', ctypes.c_uint32), - ('PADDING_1', ctypes.c_ubyte * 4), - ('reserved', ctypes.c_uint64 * 9), -] - class struct_pcie_metric_(Structure): pass @@ -891,6 +878,21 @@ struct_pcie_metric_._fields_ = [ ('reserved', ctypes.c_uint64 * 12), ] +class struct_pcie_static_(Structure): + pass + +struct_pcie_static_._pack_ = 1 # source:False +struct_pcie_static_._fields_ = [ + ('max_pcie_width', ctypes.c_uint16), + ('PADDING_0', ctypes.c_ubyte * 2), + ('max_pcie_speed', ctypes.c_uint32), + ('pcie_interface_version', ctypes.c_uint32), + ('slot_type', amdsmi_card_form_factor_t), + ('max_pcie_interface_version', ctypes.c_uint32), + ('PADDING_1', ctypes.c_ubyte * 4), + ('reserved', ctypes.c_uint64 * 9), +] + struct_amdsmi_pcie_info_t._pack_ = 1 # source:False struct_amdsmi_pcie_info_t._fields_ = [ ('pcie_static', struct_pcie_static_), @@ -1601,6 +1603,50 @@ CLK_LIMIT_MIN = 0 CLK_LIMIT_MAX = 1 amdsmi_clk_limit_type_t = ctypes.c_uint32 # enum +# values for enumeration 'amdsmi_cper_sev_t' +amdsmi_cper_sev_t__enumvalues = { + 0: 'AMDSMI_CPER_SEV_NON_FATAL_UNCORRECTED', + 1: 'AMDSMI_CPER_SEV_FATAL', + 2: 
'AMDSMI_CPER_SEV_NON_FATAL_CORRECTED', + 3: 'AMDSMI_CPER_SEV_NUM', + 10: 'AMDSMI_CPER_SEV_UNUSED', +} +AMDSMI_CPER_SEV_NON_FATAL_UNCORRECTED = 0 +AMDSMI_CPER_SEV_FATAL = 1 +AMDSMI_CPER_SEV_NON_FATAL_CORRECTED = 2 +AMDSMI_CPER_SEV_NUM = 3 +AMDSMI_CPER_SEV_UNUSED = 10 +amdsmi_cper_sev_t = ctypes.c_uint32 # enum + +# values for enumeration 'amdsmi_cper_notify_type_t' +amdsmi_cper_notify_type_t__enumvalues = { + 4976123370175105969: 'AMDSMI_CPER_NOTIFY_TYPE_CMC', + 5356425115412803478: 'AMDSMI_CPER_NOTIFY_TYPE_CPE', + 5531987820403847166: 'AMDSMI_CPER_NOTIFY_TYPE_MCE', + 5619395120325705759: 'AMDSMI_CPER_NOTIFY_TYPE_PCIE', + 4992964802890589160: 'AMDSMI_CPER_NOTIFY_TYPE_INIT', + 4812579876830546431: 'AMDSMI_CPER_NOTIFY_TYPE_NMI', + 4655221457236894822: 'AMDSMI_CPER_NOTIFY_TYPE_BOOT', + 5487573144795207569: 'AMDSMI_CPER_NOTIFY_TYPE_DMAR', + 1289362001033197706: 'AMDSMI_CPER_NOTIFY_TYPE_SEA', + 5658685719731260545: 'AMDSMI_CPER_NOTIFY_TYPE_SEI', + 4761520883332928940: 'AMDSMI_CPER_NOTIFY_TYPE_PEI', + 5306157213770398665: 'AMDSMI_CPER_NOTIFY_TYPE_CXL_COMPONENT', +} +AMDSMI_CPER_NOTIFY_TYPE_CMC = 4976123370175105969 +AMDSMI_CPER_NOTIFY_TYPE_CPE = 5356425115412803478 +AMDSMI_CPER_NOTIFY_TYPE_MCE = 5531987820403847166 +AMDSMI_CPER_NOTIFY_TYPE_PCIE = 5619395120325705759 +AMDSMI_CPER_NOTIFY_TYPE_INIT = 4992964802890589160 +AMDSMI_CPER_NOTIFY_TYPE_NMI = 4812579876830546431 +AMDSMI_CPER_NOTIFY_TYPE_BOOT = 4655221457236894822 +AMDSMI_CPER_NOTIFY_TYPE_DMAR = 5487573144795207569 +AMDSMI_CPER_NOTIFY_TYPE_SEA = 1289362001033197706 +AMDSMI_CPER_NOTIFY_TYPE_SEI = 5658685719731260545 +AMDSMI_CPER_NOTIFY_TYPE_PEI = 4761520883332928940 +AMDSMI_CPER_NOTIFY_TYPE_CXL_COMPONENT = 5306157213770398665 +amdsmi_cper_notify_type_t = ctypes.c_uint64 # enum + # values for enumeration 'amdsmi_ras_err_state_t' amdsmi_ras_err_state_t__enumvalues = { 0: 'AMDSMI_RAS_ERR_STATE_NONE', @@ -2520,6 +2566,91 @@ amdsmi_get_gpu_ecc_enabled.argtypes = [amdsmi_processor_handle, ctypes.POINTER(c 
amdsmi_get_gpu_total_ecc_count = _libraries['libamd_smi.so'].amdsmi_get_gpu_total_ecc_count amdsmi_get_gpu_total_ecc_count.restype = amdsmi_status_t amdsmi_get_gpu_total_ecc_count.argtypes = [amdsmi_processor_handle, ctypes.POINTER(struct_amdsmi_error_count_t)] +class struct_amdsmi_cper_guid_t(Structure): + pass + +struct_amdsmi_cper_guid_t._pack_ = 1 # source:False +struct_amdsmi_cper_guid_t._fields_ = [ + ('b', ctypes.c_ubyte * 16), +] + +amdsmi_cper_guid_t = struct_amdsmi_cper_guid_t +class struct_amdsmi_cper_timestamp_t(Structure): + pass + +struct_amdsmi_cper_timestamp_t._pack_ = 1 # source:False +struct_amdsmi_cper_timestamp_t._fields_ = [ + ('seconds', ctypes.c_ubyte), + ('minutes', ctypes.c_ubyte), + ('hours', ctypes.c_ubyte), + ('flag', ctypes.c_ubyte), + ('day', ctypes.c_ubyte), + ('month', ctypes.c_ubyte), + ('year', ctypes.c_ubyte), + ('century', ctypes.c_ubyte), +] + +amdsmi_cper_timestamp_t = struct_amdsmi_cper_timestamp_t +class struct_valid_bits_t(Structure): + pass + +struct_valid_bits_t._pack_ = 1 # source:False +struct_valid_bits_t._fields_ = [ + ('platform_id', ctypes.c_uint32, 1), + ('timestamp', ctypes.c_uint32, 1), + ('partition_id', ctypes.c_uint32, 1), + ('reserved', ctypes.c_uint32, 29), +] + +valid_bits_t = struct_valid_bits_t +class union_amdsmi_cper_valid_bits_t(Union): + pass + +class struct_valid_bits_(Structure): + pass + +struct_valid_bits_._pack_ = 1 # source:False +struct_valid_bits_._fields_ = [ + ('platform_id', ctypes.c_uint32, 1), + ('timestamp', ctypes.c_uint32, 1), + ('partition_id', ctypes.c_uint32, 1), + ('reserved', ctypes.c_uint32, 29), +] + +union_amdsmi_cper_valid_bits_t._pack_ = 1 # source:False +union_amdsmi_cper_valid_bits_t._fields_ = [ + ('valid_bits', struct_valid_bits_), + ('valid_mask', ctypes.c_uint32), +] + +amdsmi_cper_valid_bits_t = union_amdsmi_cper_valid_bits_t +class struct_amdsmi_cper_hdr_t(Structure): + pass + +struct_amdsmi_cper_hdr_t._pack_ = 1 # source:False +struct_amdsmi_cper_hdr_t._fields_ = [ + 
('signature', ctypes.c_char * 4), + ('revision', ctypes.c_uint16), + ('signature_end', ctypes.c_uint32), + ('sec_cnt', ctypes.c_uint16), + ('error_severity', amdsmi_cper_sev_t), + ('cper_valid_bits', amdsmi_cper_valid_bits_t), + ('record_length', ctypes.c_uint32), + ('timestamp', amdsmi_cper_timestamp_t), + ('platform_id', ctypes.c_char * 16), + ('partition_id', amdsmi_cper_guid_t), + ('creator_id', ctypes.c_char * 16), + ('notify_type', amdsmi_cper_guid_t), + ('record_id', ctypes.c_char * 8), + ('flags', ctypes.c_uint32), + ('persistence_info', ctypes.c_uint64), + ('reserved', ctypes.c_ubyte * 12), +] + +amdsmi_cper_hdr_t = struct_amdsmi_cper_hdr_t +amdsmi_get_gpu_cper_entries = _libraries['libamd_smi.so'].amdsmi_get_gpu_cper_entries +amdsmi_get_gpu_cper_entries.restype = amdsmi_status_t +amdsmi_get_gpu_cper_entries.argtypes = [amdsmi_processor_handle, uint32_t, ctypes.POINTER(ctypes.c_char), ctypes.POINTER(ctypes.c_uint64), ctypes.POINTER(ctypes.POINTER(struct_amdsmi_cper_hdr_t)), ctypes.POINTER(ctypes.c_uint64), ctypes.POINTER(ctypes.c_uint64)] amdsmi_get_gpu_ecc_status = _libraries['libamd_smi.so'].amdsmi_get_gpu_ecc_status amdsmi_get_gpu_ecc_status.restype = amdsmi_status_t amdsmi_get_gpu_ecc_status.argtypes = [amdsmi_processor_handle, amdsmi_gpu_block_t, ctypes.POINTER(amdsmi_ras_err_state_t)] @@ -2828,7 +2959,16 @@ __all__ = \ 'AMDSMI_COMPUTE_PARTITION_INVALID', 'AMDSMI_COMPUTE_PARTITION_QPX', 'AMDSMI_COMPUTE_PARTITION_SPX', 'AMDSMI_COMPUTE_PARTITION_TPX', 'AMDSMI_CONTAINER_DOCKER', - 'AMDSMI_CONTAINER_LXC', 'AMDSMI_DEV_PERF_LEVEL_AUTO', + 'AMDSMI_CONTAINER_LXC', 'AMDSMI_CPER_NOTIFY_TYPE_BOOT', + 'AMDSMI_CPER_NOTIFY_TYPE_CMC', 'AMDSMI_CPER_NOTIFY_TYPE_CPE', + 'AMDSMI_CPER_NOTIFY_TYPE_CXL_COMPONENT', + 'AMDSMI_CPER_NOTIFY_TYPE_DMAR', 'AMDSMI_CPER_NOTIFY_TYPE_INIT', + 'AMDSMI_CPER_NOTIFY_TYPE_MCE', 'AMDSMI_CPER_NOTIFY_TYPE_NMI', + 'AMDSMI_CPER_NOTIFY_TYPE_PCIE', 'AMDSMI_CPER_NOTIFY_TYPE_PEI', + 'AMDSMI_CPER_NOTIFY_TYPE_SEA', 'AMDSMI_CPER_NOTIFY_TYPE_SEI', + 
'AMDSMI_CPER_SEV_FATAL', 'AMDSMI_CPER_SEV_NON_FATAL_CORRECTED', + 'AMDSMI_CPER_SEV_NON_FATAL_UNCORRECTED', 'AMDSMI_CPER_SEV_NUM', + 'AMDSMI_CPER_SEV_UNUSED', 'AMDSMI_DEV_PERF_LEVEL_AUTO', 'AMDSMI_DEV_PERF_LEVEL_DETERMINISM', 'AMDSMI_DEV_PERF_LEVEL_FIRST', 'AMDSMI_DEV_PERF_LEVEL_HIGH', 'AMDSMI_DEV_PERF_LEVEL_LAST', 'AMDSMI_DEV_PERF_LEVEL_LOW', @@ -2955,9 +3095,9 @@ __all__ = \ 'AMDSMI_STATUS_INSUFFICIENT_SIZE', 'AMDSMI_STATUS_INTERNAL_EXCEPTION', 'AMDSMI_STATUS_INTERRUPT', 'AMDSMI_STATUS_INVAL', 'AMDSMI_STATUS_IO', - 'AMDSMI_STATUS_MAP_ERROR', 'AMDSMI_STATUS_NON_AMD_CPU', - 'AMDSMI_STATUS_NOT_FOUND', 'AMDSMI_STATUS_NOT_INIT', - 'AMDSMI_STATUS_NOT_SUPPORTED', + 'AMDSMI_STATUS_MAP_ERROR', 'AMDSMI_STATUS_MORE_DATA', + 'AMDSMI_STATUS_NON_AMD_CPU', 'AMDSMI_STATUS_NOT_FOUND', + 'AMDSMI_STATUS_NOT_INIT', 'AMDSMI_STATUS_NOT_SUPPORTED', 'AMDSMI_STATUS_NOT_YET_IMPLEMENTED', 'AMDSMI_STATUS_NO_DATA', 'AMDSMI_STATUS_NO_DRV', 'AMDSMI_STATUS_NO_ENERGY_DRV', 'AMDSMI_STATUS_NO_HSMP_DRV', 'AMDSMI_STATUS_NO_HSMP_MSG_SUP', @@ -3022,6 +3162,9 @@ __all__ = \ 'amdsmi_clk_limit_type_t', 'amdsmi_clk_type_t', 'amdsmi_compute_partition_type_t', 'amdsmi_container_types_t', 'amdsmi_counter_command_t', 'amdsmi_counter_value_t', + 'amdsmi_cper_guid_t', 'amdsmi_cper_hdr_t', + 'amdsmi_cper_notify_type_t', 'amdsmi_cper_sev_t', + 'amdsmi_cper_timestamp_t', 'amdsmi_cper_valid_bits_t', 'amdsmi_cpu_apb_disable', 'amdsmi_cpu_apb_enable', 'amdsmi_cpu_info_t', 'amdsmi_cpu_util_t', 'amdsmi_cpusocket_handle', 'amdsmi_ddr_bw_metrics_t', @@ -3074,10 +3217,10 @@ __all__ = \ 'amdsmi_get_gpu_compute_process_gpus', 'amdsmi_get_gpu_compute_process_info', 'amdsmi_get_gpu_compute_process_info_by_pid', - 'amdsmi_get_gpu_device_bdf', 'amdsmi_get_gpu_device_uuid', - 'amdsmi_get_gpu_driver_info', 'amdsmi_get_gpu_ecc_count', - 'amdsmi_get_gpu_ecc_enabled', 'amdsmi_get_gpu_ecc_status', - 'amdsmi_get_gpu_enumeration_info', + 'amdsmi_get_gpu_cper_entries', 'amdsmi_get_gpu_device_bdf', + 'amdsmi_get_gpu_device_uuid', 
'amdsmi_get_gpu_driver_info', + 'amdsmi_get_gpu_ecc_count', 'amdsmi_get_gpu_ecc_enabled', + 'amdsmi_get_gpu_ecc_status', 'amdsmi_get_gpu_enumeration_info', 'amdsmi_get_gpu_event_notification', 'amdsmi_get_gpu_fan_rpms', 'amdsmi_get_gpu_fan_speed', 'amdsmi_get_gpu_fan_speed_max', 'amdsmi_get_gpu_id', 'amdsmi_get_gpu_kfd_info', @@ -3192,11 +3335,13 @@ __all__ = \ 'struct_amdsmi_accelerator_partition_resource_profile_t', 'struct_amdsmi_asic_info_t', 'struct_amdsmi_board_info_t', 'struct_amdsmi_clk_info_t', 'struct_amdsmi_counter_value_t', - 'struct_amdsmi_cpu_info_t', 'struct_amdsmi_cpu_util_t', - 'struct_amdsmi_ddr_bw_metrics_t', 'struct_amdsmi_dimm_power_t', - 'struct_amdsmi_dimm_thermal_t', 'struct_amdsmi_dpm_level_t', - 'struct_amdsmi_dpm_policy_entry_t', 'struct_amdsmi_dpm_policy_t', - 'struct_amdsmi_driver_info_t', 'struct_amdsmi_engine_usage_t', + 'struct_amdsmi_cper_guid_t', 'struct_amdsmi_cper_hdr_t', + 'struct_amdsmi_cper_timestamp_t', 'struct_amdsmi_cpu_info_t', + 'struct_amdsmi_cpu_util_t', 'struct_amdsmi_ddr_bw_metrics_t', + 'struct_amdsmi_dimm_power_t', 'struct_amdsmi_dimm_thermal_t', + 'struct_amdsmi_dpm_level_t', 'struct_amdsmi_dpm_policy_entry_t', + 'struct_amdsmi_dpm_policy_t', 'struct_amdsmi_driver_info_t', + 'struct_amdsmi_engine_usage_t', 'struct_amdsmi_enumeration_info_t', 'struct_amdsmi_error_count_t', 'struct_amdsmi_evt_notification_data_t', 'struct_amdsmi_freq_volt_region_t', 'struct_amdsmi_frequencies_t', @@ -3228,6 +3373,8 @@ __all__ = \ 'struct_engine_usage_', 'struct_fw_info_list_', 'struct_memory_usage_', 'struct_nps_flags_', 'struct_numa_range_', 'struct_pcie_metric_', 'struct_pcie_static_', - 'struct_amdsmi_bdf_t', 'uint32_t', 'uint64_t', 'uint8_t', - 'union_amdsmi_bdf_t', 'union_amdsmi_nps_caps_t'] + 'struct_amdsmi_bdf_t', 'struct_valid_bits_', + 'struct_valid_bits_t', 'uint32_t', 'uint64_t', 'uint8_t', + 'union_amdsmi_bdf_t', 'union_amdsmi_cper_valid_bits_t', + 'union_amdsmi_nps_caps_t', 'valid_bits_t'] diff --git 
a/projects/amdsmi/src/amd_smi/amd_smi.cc b/projects/amdsmi/src/amd_smi/amd_smi.cc index 0f4fba4a22..4cef2ad6c5 100644 --- a/projects/amdsmi/src/amd_smi/amd_smi.cc +++ b/projects/amdsmi/src/amd_smi/amd_smi.cc @@ -21,6 +21,7 @@ */ #include +#include #include #include #include @@ -542,6 +543,7 @@ amdsmi_status_t amdsmi_get_processor_type(amdsmi_processor_handle processor_hand return AMDSMI_STATUS_SUCCESS; } + amdsmi_status_t amdsmi_get_gpu_device_bdf(amdsmi_processor_handle processor_handle, amdsmi_bdf_t *bdf) { @@ -3547,6 +3549,276 @@ amdsmi_get_gpu_total_ecc_count(amdsmi_processor_handle processor_handle, amdsmi_ return AMDSMI_STATUS_SUCCESS; } +namespace { +static std::vector +amdsmi_get_gpu_cper_headers(const char *buffer, size_t buffer_sz) { + + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ + << "[CPER] buffer_sz: " << buffer_sz; + LOG_DEBUG(ss); + + std::vector headers; + if(!buffer) { + ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ + << "[CPER] buffer is null"; + LOG_ERROR(ss); + return headers; + } + static constexpr char cper_signature[] = "CPER"; + static constexpr size_t cper_signature_size = sizeof(cper_signature) - 1; + for(size_t data_idx = 0; + buffer_sz >= cper_signature_size && + data_idx < buffer_sz - cper_signature_size; + ++data_idx) { + + const amdsmi_cper_hdr_t *hdr = reinterpret_cast( + &buffer[data_idx]); + if(hdr->signature[0] != 'C' || hdr->signature[1] != 'P' || + hdr->signature[2] != 'E' || hdr->signature[3] != 'R' ) { + continue; + } + if(hdr->signature_end != 0xFFFFFFFF) { + continue; + } + if(hdr->record_length > buffer_sz) { + continue; + } + ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ + << "[CPER] add header at data_idx: " << data_idx + << ", sig: " << hdr->signature[0] << hdr->signature[1] << hdr->signature[2] << hdr->signature[3]; + LOG_DEBUG(ss); + headers.emplace_back(hdr); + } + return headers; +} + +struct CperFileCtx { + amdsmi_status_t status = AMDSMI_STATUS_FILE_ERROR; + std::unique_ptr buffer; + 
long file_size = 0; +}; + +static auto amdsmi_read_cper_file(const std::string &filepath) { + + std::ostringstream ss; + + CperFileCtx ctx; + ctx.status = AMDSMI_STATUS_FILE_ERROR; + ctx.file_size = 0; + + struct stat file_stats; + if (stat(filepath.c_str(), &file_stats) == 0) { + if (!S_ISREG(file_stats.st_mode)) { + ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] file is not a regular file: " + << filepath << ", errno: " << errno << "): " << strerror(errno); + return ctx; + } + } else { + ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] file does not exist: " + << filepath << ", errno: " << errno << "): " << strerror(errno); + ctx.status = AMDSMI_STATUS_FILE_NOT_FOUND; + return ctx; + } + + ctx.file_size = file_stats.st_size; + ctx.buffer = std::make_unique(ctx.file_size); + int file = open(filepath.c_str(), O_RDONLY); + if (file == -1) { + ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] failed to open file: " + << filepath << ", errno:()" << errno << "): " << strerror(errno); + LOG_ERROR(ss); + return ctx; + } + long bytes_read = read(file, ctx.buffer.get(), ctx.file_size); + if (bytes_read <= 0) { + ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ + << "[CPER] failed to read complete file, read only " + << bytes_read << " of " << ctx.file_size << " bytes"; + LOG_ERROR(ss); + return ctx; + } + close(file); + + ctx.status = AMDSMI_STATUS_SUCCESS; + ctx.file_size = bytes_read; + return ctx; +} +}//namespace + +amdsmi_status_t +amdsmi_get_gpu_cper_entries_by_path( + const std::string &amdgpu_ring_cper_file, + uint32_t severity_mask, + char *cper_data, + uint64_t *buf_size, + amdsmi_cper_hdr_t **cper_hdrs, + uint64_t *entry_count, + uint64_t *cursor) { + + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] begin\n" + << ", amdgpu_ring_cper_file: " << amdgpu_ring_cper_file + << ", severity_mask: " << severity_mask; + LOG_DEBUG(ss); + + if(!cper_data) { + ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << 
"[CPER] cper_data should be a valid memory address\n"; + LOG_ERROR(ss); + if(entry_count) {*entry_count = 0;} + if(buf_size) { *buf_size = 0; } + return AMDSMI_STATUS_OUT_OF_RESOURCES; + } + else if(!buf_size) { + ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] buf_size should be a valid memory address"; + LOG_ERROR(ss); + if(entry_count) {*entry_count = 0;} + if(buf_size) { *buf_size = 0; } + return AMDSMI_STATUS_OUT_OF_RESOURCES; + } + else if(!*buf_size) { + ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] buf_size should be greater than zero"; + LOG_ERROR(ss); + if(entry_count) {*entry_count = 0;} + if(buf_size) { *buf_size = 0; } + return AMDSMI_STATUS_OUT_OF_RESOURCES; + } + else if(!cper_hdrs) { + ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] cper_hdrs should be a valid memory address"; + LOG_ERROR(ss); + if(entry_count) {*entry_count = 0;} + if(buf_size) { *buf_size = 0; } + return AMDSMI_STATUS_OUT_OF_RESOURCES; + } + else if(!entry_count) { + ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] entry_count should be a valid memory address"; + LOG_ERROR(ss); + if(entry_count) {*entry_count = 0;} + if(buf_size) { *buf_size = 0; } + return AMDSMI_STATUS_OUT_OF_RESOURCES; + } + else if(!*entry_count) { + ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] entry_count should be greater than 0"; + LOG_ERROR(ss); + if(entry_count) {*entry_count = 0;} + if(buf_size) { *buf_size = 0; } + return AMDSMI_STATUS_OUT_OF_RESOURCES; + } + else if(!cursor) { + ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] cursor should be a valid memory address"; + LOG_ERROR(ss); + if(entry_count) {*entry_count = 0;} + if(buf_size) { *buf_size = 0; } + return AMDSMI_STATUS_OUT_OF_RESOURCES; + } + + auto ctx = amdsmi_read_cper_file(amdgpu_ring_cper_file); + if(ctx.status != AMDSMI_STATUS_SUCCESS) { + return ctx.status; + } + + auto headers = amdsmi_get_gpu_cper_headers(ctx.buffer.get(), ctx.file_size); + ss << __PRETTY_FUNCTION__ << 
"\n:" << __LINE__ << "[CPER] num headers: " << headers.size(); + LOG_DEBUG(ss); + + uint64_t data_idx = 0; + uint64_t header_idx = 0; + size_t num_headers_copied = 0; + for(const amdsmi_cper_hdr_t *header: headers) { + if(((1 << header->error_severity) & severity_mask) != + (1 << header->error_severity)) { + ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] cper header rejected with severity: 0x" + << std::hex << (1 << header->error_severity) << ", given severity_mask: 0x" + << std::hex << severity_mask << ", record_length:" + << std::dec << header->record_length; + LOG_DEBUG(ss); + continue; + } + else { + ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] cper header accepted with severity: 0x" + << std::hex << (1 << header->error_severity) << ", given severity_mask: 0x" + << std::hex << severity_mask << ", record_length:" + << std::dec << header->record_length; + LOG_DEBUG(ss); + } + if((*buf_size - data_idx) < header->record_length ) { + ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] buffer filled up without copying all cper entries, buf_size: " << std::dec << *buf_size; + LOG_ERROR(ss); + *entry_count = num_headers_copied; + *buf_size = data_idx; + return (data_idx == 0) ? + AMDSMI_STATUS_OUT_OF_RESOURCES : + AMDSMI_STATUS_MORE_DATA; + } + if(num_headers_copied == *entry_count) { + ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] cper_hdrs filled up before finished with copying all header pointers, entry_count: " << std::dec << *entry_count; + LOG_ERROR(ss); + *entry_count = num_headers_copied; + *buf_size = data_idx; + return (data_idx == 0) ? 
+ AMDSMI_STATUS_OUT_OF_RESOURCES : + AMDSMI_STATUS_MORE_DATA; + } + if(*cursor != header_idx) { + ++header_idx; + continue; + } + cper_hdrs[num_headers_copied] = reinterpret_cast(&cper_data[data_idx]); + ++num_headers_copied; + *cursor = ++header_idx; + std::memcpy( + &cper_data[data_idx], + reinterpret_cast(header), + header->record_length); + data_idx += header->record_length; + } + *entry_count = num_headers_copied; + *buf_size = data_idx; + + ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ + << "[CPER] *entry_count: " << (entry_count ? *entry_count : -1) + << ", *cursor: " << (cursor ? *cursor : -1) + << ", *buf_size: " << (buf_size ? *buf_size : -1); + + LOG_DEBUG(ss); + return AMDSMI_STATUS_SUCCESS; +} + +amdsmi_status_t +amdsmi_get_gpu_cper_entries( + amdsmi_processor_handle processor_handle, + uint32_t severity_mask, + char *cper_data, + uint64_t *buf_size, + amdsmi_cper_hdr_t **cper_hdrs, + uint64_t *entry_count, + uint64_t *cursor) { + + AMDSMI_CHECK_INIT(); + if (!amd::smi::is_sudo_user()) { + return AMDSMI_STATUS_NO_PERM; + } + + amd::smi::AMDSmiGPUDevice* gpu_device = nullptr; + amdsmi_status_t status = get_gpu_device_from_handle(processor_handle, &gpu_device); + if (status != AMDSMI_STATUS_SUCCESS) { + return status; + } + std::string path = std::string("/sys/kernel/debug/dri/") + + std::to_string(gpu_device->get_card_from_bdf()) + + "/amdgpu_ring_cper"; + + + return amdsmi_get_gpu_cper_entries_by_path( + path, + severity_mask, + cper_data, + buf_size, + cper_hdrs, + entry_count, + cursor); +} + amdsmi_status_t amdsmi_get_gpu_process_list(amdsmi_processor_handle processor_handle, uint32_t *max_processes, amdsmi_proc_info_t *list) { AMDSMI_CHECK_INIT(); diff --git a/projects/amdsmi/tests/amd_smi_test/CMakeLists.txt b/projects/amdsmi/tests/amd_smi_test/CMakeLists.txt index 6433986a9a..3680af541a 100644 --- a/projects/amdsmi/tests/amd_smi_test/CMakeLists.txt +++ b/projects/amdsmi/tests/amd_smi_test/CMakeLists.txt @@ -54,8 +54,6 @@ 
include_directories(${TEST} ${CMAKE_CURRENT_SOURCE_DIR}/.. ${ROCM_INC_DIR}/..) # Build rules add_executable(${TEST} ${tstSources} ${functionalSources}) - -#AMD_SMI_TARGET? target_link_libraries(${TEST} ${AMD_SMI_TARGET} GTest::gtest_main @@ -63,6 +61,9 @@ target_link_libraries(${TEST} stdc++ pthread) +target_compile_definitions(${TEST} PRIVATE + CPER_SYS_ROOT="${CMAKE_CURRENT_SOURCE_DIR}/cper") + # Install tests install( TARGETS ${TEST} diff --git a/projects/amdsmi/tests/amd_smi_test/amdsmi_get_gpu_cper_entries.cc b/projects/amdsmi/tests/amd_smi_test/amdsmi_get_gpu_cper_entries.cc new file mode 100644 index 0000000000..63365691e1 --- /dev/null +++ b/projects/amdsmi/tests/amd_smi_test/amdsmi_get_gpu_cper_entries.cc @@ -0,0 +1,365 @@ +/* + * Copyright (c) Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include +#include + +#include "amd_smi/amdsmi.h" +#include "rocm_smi/rocm_smi_logger.h" + +extern amdsmi_status_t +amdsmi_get_gpu_cper_entries_by_path( + const std::string &amdgpu_ring_cper_file, + uint32_t severity_mask, + char *cper_data, + uint64_t *buf_size, + amdsmi_cper_hdr_t **cper_hdrs, + uint64_t *entry_count, + uint64_t *cursor); + +class CperEntriesTest : public testing::Test{ + //class object public so that it is accessible + //within the tests that are written +public: + CperEntriesTest() { + setenv("CPER_SYS_ROOT", CPER_SYS_ROOT, 1); + ROCmLogging::Logger::getInstance()-> + updateLogLevel(ROCmLogging::LogLevel::LOG_LEVEL_DEBUG); + } +}; + +TEST_F(CperEntriesTest, TestNullCperData){ + uint32_t gpu_num = 9; + uint32_t severity_mask = amdsmi_cper_sev_t::AMDSMI_CPER_SEV_FATAL; + char *cper_data = nullptr; + uint64_t buf_size = 0; + amdsmi_cper_hdr_t *cper_hdrs = nullptr; + uint64_t entry_count = 0; + uint64_t cursor = 0; + std::string gpu = std::string(CPER_SYS_ROOT) + "/sys/kernel/debug/dri/" + std::to_string(gpu_num) + "/amdgpu_ring_cper"; + amdsmi_status_t err = amdsmi_get_gpu_cper_entries_by_path( + gpu, + severity_mask, + cper_data, + nullptr, + &cper_hdrs, + &entry_count, + &cursor); + ASSERT_EQ(err, AMDSMI_STATUS_OUT_OF_RESOURCES); +} + +TEST_F(CperEntriesTest, TestNullBufferSize){ + uint32_t gpu_num = 9; + uint32_t severity_mask = amdsmi_cper_sev_t::AMDSMI_CPER_SEV_FATAL; + uint64_t buf_size = 0; + auto cper_data = std::make_unique(buf_size); + amdsmi_cper_hdr_t *cper_hdrs = nullptr; + uint64_t entry_count = 0; + uint64_t cursor = 0; + std::string gpu = std::string(CPER_SYS_ROOT) + "/sys/kernel/debug/dri/" + std::to_string(gpu_num) + "/amdgpu_ring_cper"; + amdsmi_status_t err = amdsmi_get_gpu_cper_entries_by_path( + gpu, + severity_mask, + cper_data.get(), + nullptr, + &cper_hdrs, + &entry_count, + &cursor); + ASSERT_EQ(err, AMDSMI_STATUS_OUT_OF_RESOURCES); +} + +TEST_F(CperEntriesTest, TestNullCperHeaders){ + uint32_t gpu_num = 9; + 
uint32_t severity_mask = amdsmi_cper_sev_t::AMDSMI_CPER_SEV_FATAL; + uint64_t buf_size = 4 * (1<<20); //4 MB; + auto cper_data = std::make_unique(buf_size); + amdsmi_cper_hdr_t *cper_hdrs = nullptr; + uint64_t entry_count = 0; + uint64_t cursor = 0; + std::string gpu = std::string(CPER_SYS_ROOT) + "/sys/kernel/debug/dri/" + std::to_string(gpu_num) + "/amdgpu_ring_cper"; + amdsmi_status_t err = amdsmi_get_gpu_cper_entries_by_path( + gpu, + severity_mask, + cper_data.get(), + &buf_size, + &cper_hdrs, + &entry_count, + &cursor); + ASSERT_EQ(err, AMDSMI_STATUS_OUT_OF_RESOURCES); +} + +TEST_F(CperEntriesTest, TestNullCperHeaderEntryCount){ + uint32_t gpu_num = 9; + uint32_t severity_mask = amdsmi_cper_sev_t::AMDSMI_CPER_SEV_FATAL; + uint64_t buf_size = 4 * (1<<20); //4 MB; + auto cper_data = std::make_unique(buf_size); + uint64_t entry_count = 0; + auto cper_hdrs = std::make_unique(entry_count); + uint64_t cursor = 0; + std::string gpu = std::string(CPER_SYS_ROOT) + "/sys/kernel/debug/dri/" + std::to_string(gpu_num) + "/amdgpu_ring_cper"; + amdsmi_status_t err = amdsmi_get_gpu_cper_entries_by_path( + gpu, + severity_mask, + cper_data.get(), + &buf_size, + cper_hdrs.get(), + nullptr, + &cursor); + ASSERT_EQ(err, AMDSMI_STATUS_OUT_OF_RESOURCES); +} + +TEST_F(CperEntriesTest, TestNotEnoughBufferSize){ + uint32_t gpu_num = 9; + uint32_t severity_mask = + AMDSMI_CPER_SEV_NON_FATAL_UNCORRECTED| + AMDSMI_CPER_SEV_NON_FATAL_CORRECTED| + AMDSMI_CPER_SEV_FATAL; + uint64_t buf_size = 1024; + auto cper_data = std::make_unique(buf_size); + uint64_t entry_count = 10; + auto cper_hdrs = std::make_unique(entry_count); + uint64_t cursor = 0; + std::string gpu = std::string(CPER_SYS_ROOT) + "/sys/kernel/debug/dri/" + std::to_string(gpu_num) + "/amdgpu_ring_cper"; + amdsmi_status_t err = amdsmi_get_gpu_cper_entries_by_path( + gpu, + severity_mask, + cper_data.get(), + &buf_size, + cper_hdrs.get(), + &entry_count, + &cursor); + ASSERT_EQ(err, AMDSMI_STATUS_MORE_DATA); + 
ASSERT_EQ(entry_count, 2); +} + +TEST_F(CperEntriesTest, TestNotEnoughHeaderPtrs){ + uint32_t gpu_num = 9; + uint32_t severity_mask = + AMDSMI_CPER_SEV_NON_FATAL_UNCORRECTED| + AMDSMI_CPER_SEV_NON_FATAL_CORRECTED| + AMDSMI_CPER_SEV_FATAL; + uint64_t buf_size = 4 * (1<<20); //4 MB; + auto cper_data = std::make_unique(buf_size); + uint64_t entry_count = 4; + auto cper_hdrs = std::make_unique(entry_count); + uint64_t cursor = 0; + std::string gpu = std::string(CPER_SYS_ROOT) + "/sys/kernel/debug/dri/" + std::to_string(gpu_num) + "/amdgpu_ring_cper"; + amdsmi_status_t err = amdsmi_get_gpu_cper_entries_by_path( + gpu, + severity_mask, + cper_data.get(), + &buf_size, + cper_hdrs.get(), + &entry_count, + &cursor); + ASSERT_EQ(entry_count, 4); + ASSERT_EQ(err, AMDSMI_STATUS_MORE_DATA); +} + +TEST_F(CperEntriesTest, TestGetsAllSeverityErrors){ + uint32_t gpu_num = 9; + uint32_t severity_mask = + (1 << AMDSMI_CPER_SEV_NON_FATAL_UNCORRECTED)| + (1 << AMDSMI_CPER_SEV_NON_FATAL_CORRECTED)| + (1 << AMDSMI_CPER_SEV_FATAL); + uint64_t buf_size = 4 * (1<<20); //4 MB; + auto cper_data = std::make_unique(buf_size); + uint64_t entry_count = 10; + auto cper_hdrs = std::make_unique(entry_count); + uint64_t cursor = 0; + std::string gpu = std::string(CPER_SYS_ROOT) + "/sys/kernel/debug/dri/" + std::to_string(gpu_num) + "/amdgpu_ring_cper"; + amdsmi_status_t err = amdsmi_get_gpu_cper_entries_by_path( + gpu, + severity_mask, + cper_data.get(), + &buf_size, + cper_hdrs.get(), + &entry_count, + &cursor); + ASSERT_EQ(entry_count, 8); + ASSERT_EQ(err, AMDSMI_STATUS_SUCCESS); +} + +TEST_F(CperEntriesTest, TestGetsCorrectableSeverityErrors){ + uint32_t gpu_num = 9; + uint32_t severity_mask = + (1 << AMDSMI_CPER_SEV_NON_FATAL_CORRECTED); + uint64_t buf_size = 4 * (1<<20); //4 MB; + auto cper_data = std::make_unique(buf_size); + uint64_t entry_count = 10; + auto cper_hdrs = std::make_unique(entry_count); + uint64_t cursor = 0; + std::string gpu = std::string(CPER_SYS_ROOT) + 
"/sys/kernel/debug/dri/" + std::to_string(gpu_num) + "/amdgpu_ring_cper"; + amdsmi_status_t err = amdsmi_get_gpu_cper_entries_by_path( + gpu, + severity_mask, + cper_data.get(), + &buf_size, + cper_hdrs.get(), + &entry_count, + &cursor); + ASSERT_EQ(entry_count, 1); + ASSERT_EQ(err, AMDSMI_STATUS_SUCCESS); +} + +TEST_F(CperEntriesTest, TestGetsFatalSeverityErrors){ + uint32_t gpu_num = 9; + uint32_t severity_mask = + (1 << AMDSMI_CPER_SEV_FATAL); + uint64_t buf_size = 4 * (1<<20); //4 MB; + auto cper_data = std::make_unique(buf_size); + uint64_t entry_count = 10; + auto cper_hdrs = std::make_unique(entry_count); + uint64_t cursor = 0; + std::string gpu = std::string(CPER_SYS_ROOT) + "/sys/kernel/debug/dri/" + std::to_string(gpu_num) + "/amdgpu_ring_cper"; + amdsmi_status_t err = amdsmi_get_gpu_cper_entries_by_path( + gpu, + severity_mask, + cper_data.get(), + &buf_size, + cper_hdrs.get(), + &entry_count, + &cursor); + ASSERT_EQ(entry_count, 1); + ASSERT_EQ(err, AMDSMI_STATUS_SUCCESS); +} + +TEST_F(CperEntriesTest, TestGetsUncorrectableSeverityErrors){ + uint32_t gpu_num = 9; + uint32_t severity_mask = + (1 << AMDSMI_CPER_SEV_NON_FATAL_UNCORRECTED); + uint64_t buf_size = 4 * (1<<20); //4 MB; + auto cper_data = std::make_unique(buf_size); + uint64_t entry_count = 10; + auto cper_hdrs = std::make_unique(entry_count); + uint64_t cursor = 0; + std::string gpu = std::string(CPER_SYS_ROOT) + "/sys/kernel/debug/dri/" + std::to_string(gpu_num) + "/amdgpu_ring_cper"; + amdsmi_status_t err = amdsmi_get_gpu_cper_entries_by_path( + gpu, + severity_mask, + cper_data.get(), + &buf_size, + cper_hdrs.get(), + &entry_count, + &cursor); + ASSERT_EQ(entry_count, 6); + ASSERT_EQ(err, AMDSMI_STATUS_SUCCESS); +} + +TEST_F(CperEntriesTest, TestCursor5GetsLast3HeadersGivenTotal8Headers){ + uint32_t gpu_num = 9; + uint32_t severity_mask = + (1 << AMDSMI_CPER_SEV_NON_FATAL_UNCORRECTED)| + (1 << AMDSMI_CPER_SEV_NON_FATAL_CORRECTED)| + (1 << AMDSMI_CPER_SEV_FATAL); + uint64_t buf_size = 4 * 
(1<<20); //4 MB; + auto cper_data = std::make_unique(buf_size); + uint64_t entry_count = 10; + auto cper_hdrs = std::make_unique(entry_count); + uint64_t cursor = 5; + std::string gpu = std::string(CPER_SYS_ROOT) + "/sys/kernel/debug/dri/" + std::to_string(gpu_num) + "/amdgpu_ring_cper"; + amdsmi_status_t err = amdsmi_get_gpu_cper_entries_by_path( + gpu, + severity_mask, + cper_data.get(), + &buf_size, + cper_hdrs.get(), + &entry_count, + &cursor); + ASSERT_EQ(entry_count, 3); + ASSERT_EQ(err, AMDSMI_STATUS_SUCCESS); +} + +TEST_F(CperEntriesTest, TestCursorAdvances){ + uint32_t gpu_num = 9; + uint32_t severity_mask = + (1 << AMDSMI_CPER_SEV_NON_FATAL_UNCORRECTED)| + (1 << AMDSMI_CPER_SEV_NON_FATAL_CORRECTED)| + (1 << AMDSMI_CPER_SEV_FATAL); + uint64_t buf_size = 512;//4 * (1<<20); //4 MB; + auto cper_data = std::make_unique(buf_size); + uint64_t entry_count = 10; + auto cper_hdrs = std::make_unique(entry_count); + + uint64_t buf_size_original = buf_size; + uint64_t entry_count_original = entry_count; + uint64_t cursor_idx = 0; + uint64_t cursor = 0; + while(true) { + std::string gpu = std::string(CPER_SYS_ROOT) + "/sys/kernel/debug/dri/" + std::to_string(gpu_num) + "/amdgpu_ring_cper"; + amdsmi_status_t err = amdsmi_get_gpu_cper_entries_by_path( + gpu, + severity_mask, + cper_data.get(), + &buf_size, + cper_hdrs.get(), + &entry_count, + &cursor); + ASSERT_EQ(entry_count, 1); + ASSERT_EQ(cursor, ++cursor_idx); + ASSERT_TRUE(err == AMDSMI_STATUS_MORE_DATA || err == AMDSMI_STATUS_SUCCESS); + if(err == AMDSMI_STATUS_SUCCESS) { + break; + } + buf_size = buf_size_original; + entry_count = entry_count_original; + } +} + +TEST_F(CperEntriesTest, TestGetsCorrectHeaderCountFromAllDevices) { + //we can get these deviceids by calling: + // ls -alh tests/amd_smi_test/cper/sys/kernel/debug/dri/ + static constexpr int deviceids[] = { 1, 9, 17, 25, 33}; + //we can get the numbers in the expected_num_headers array below by calling: + // hexdump -C 
tests/amd_smi_test/cper/sys/kernel/debug/dri//amdgpu_ring_cper | grep CPER|wc -l + // where is one of the entries in the deviceids array above. + static constexpr int expected_num_headers[] = { 19, 8, 7, 4, 7}; + + for(int device_idx = 0; + device_idx < sizeof(deviceids)/sizeof(deviceids[0]); + ++device_idx) { + + uint32_t gpu_num = deviceids[device_idx]; + uint32_t severity_mask = + (1 << AMDSMI_CPER_SEV_NON_FATAL_UNCORRECTED)| + (1 << AMDSMI_CPER_SEV_NON_FATAL_CORRECTED)| + (1 << AMDSMI_CPER_SEV_FATAL); + uint64_t buf_size = 4 * (1<<20); //4 MB; + auto cper_data = std::make_unique(buf_size); + uint64_t entry_count = 20; + auto cper_hdrs = std::make_unique(entry_count); + uint64_t cursor = 0; + + std::string gpu = std::string(CPER_SYS_ROOT) + "/sys/kernel/debug/dri/" + std::to_string(gpu_num) + "/amdgpu_ring_cper"; + amdsmi_status_t err = amdsmi_get_gpu_cper_entries_by_path( + gpu, + severity_mask, + cper_data.get(), + &buf_size, + cper_hdrs.get(), + &entry_count, + &cursor); + ASSERT_EQ(err, AMDSMI_STATUS_SUCCESS); + ASSERT_EQ(entry_count, expected_num_headers[device_idx]); + ASSERT_EQ(cursor, expected_num_headers[device_idx]); + } +} diff --git a/projects/amdsmi/tests/amd_smi_test/cper/sys/kernel/debug/dri/1/amdgpu_ring_cper b/projects/amdsmi/tests/amd_smi_test/cper/sys/kernel/debug/dri/1/amdgpu_ring_cper new file mode 100644 index 0000000000000000000000000000000000000000..a494b69d66fef1f7343cfefe26de1b5e917189f3 GIT binary patch literal 8736 zcmZQzU|{IrfMDkU*B}PQ|3JXVzz7s)2jU7KLq|)TL7rJs#Gt~^z`)4Lpu*h5(FiDq z4ia-y(hEuvqW|*0zL+!dsL!sB)YT6?g?F1;88T?h|NkGM0+n?F$lUN_c zy>Zc+O!0{1MQbv{9drEAgpxW{ zYL%0&qxBzf{84R?l#^;?#w%ty=|(WW4(D=Gi^Ou0kC|He$g=+C? 
s$S^wpHfYL8a^n@VoHR5d6tBa%oCJ+;kkXzMXQf(xG9E1_=`&9Q0CVHY#m>7T* zLj}(oBW1O39*v9EWQsS0%A%@f0K3uO(bo)JoCTzr0)We2VvS{FXh5+Y=3{A)0S*dx z8W8e~Obi?sfWgh63Y22Bk7Ss(0IZb@Oj3!!6|b`9)QVRl28{(UUt&6XG+yZ&uNDl# zRLZZ$qwzY3@+&(dwc^!;L36Ym9yH~!71iR^bTq#XqWsFkOs)KCHX5&kC|<=_sTHs0 Kqwz|ocm)8$a98O7 literal 0 HcmV?d00001 diff --git a/projects/amdsmi/tests/amd_smi_test/cper/sys/kernel/debug/dri/25/amdgpu_ring_cper b/projects/amdsmi/tests/amd_smi_test/cper/sys/kernel/debug/dri/25/amdgpu_ring_cper new file mode 100644 index 0000000000000000000000000000000000000000..f307556a78cf82df8469bf996599d980d1b1c806 GIT binary patch literal 1504 zcmZQzU|^_Vgka|Y*B}PQ|3JXV02E?ihsrUT3NbLUNQxL#7#bKDSs7HAn>ZQ)<J)V9T?8UhPyheFqP7~@o2b{5bjn~3wM*zaEFFF05YbSpa1{> literal 0 HcmV?d00001 diff --git a/projects/amdsmi/tests/amd_smi_test/cper/sys/kernel/debug/dri/33/amdgpu_ring_cper b/projects/amdsmi/tests/amd_smi_test/cper/sys/kernel/debug/dri/33/amdgpu_ring_cper new file mode 100644 index 0000000000000000000000000000000000000000..194a1f7036c9c187488b91bf88b810443ae64d77 GIT binary patch literal 3456 zcmZQzU|>jKhG6Fa*B}PQ|3JXVzz7s)2jU7KL)}z}fssX0#Gt~^z`)4Lpu*h5(FiDq z4ia-y(hEuvqW|*0zL+!dsL!sB)YT6?g?AfU88T=r`2QcF0+n?F$lUxkwDBJ~scEiHGanYJg@rF=Ykeko}ws2>G`UhPt zv3y+i5^F3YLj#Ic|M_8DI|c?0g&8m&gyy*L8)&d1hy#reFdJ17MLgQatMSN;S7-?$ z4N4dW3U>}%{l@`~C`L90jtjuzi$N76ps&d=4OE~ouyBJ&Y60YU1vvp$P71OKQ>mOZ zAsDZt!HVu&t>3{BXBZ+6N4^1;N>7N;}|O3QMiaG a2Z0Spq;`cR!?XgBHypU9l*7G476Sn3K~Ty7 literal 0 HcmV?d00001 diff --git a/projects/amdsmi/tests/amd_smi_test/cper/sys/kernel/debug/dri/41/amdgpu_ring_cper b/projects/amdsmi/tests/amd_smi_test/cper/sys/kernel/debug/dri/41/amdgpu_ring_cper new file mode 100644 index 0000000000000000000000000000000000000000..686082edec957149c5811173e0fa2fedc00d2559 GIT binary patch literal 376 zcmZQzU|@)YV&?$YAO^<&K)}er2oz%n;tC)`lTDa`kwsF(pu*6=z{tv=!ra8s2q=dR z5_4113rZ29|MI`Sm^1OH&#sQt)ek*|cNL2`I)6#5Wj03}FUF7D*9<3PS?}BP)Xna}!4+pfEZ} z%uPuzC`E{F>^`S^eJ`Ku&b4#CyuR{df`OGGgT{jY{}C!sSto#80ib=1N=ytOYb$ux 
z7%8iL^JrYOCR4m2R2Ee=1K5rJj=l!y;w&J|6aZZI5^F3YLj#KKpb!NK1TeAzNrf56 zl1vO77k&Z_Rs{)8+8`L<0G9RAZJYyUP=i2@SCAh-@mc|7Fp?Rs|MI`Sm^1OH&#sQt z)ek*|cN(KoOawE2QMOQ~}J|lwyij@H%hcYbK1s4Ry F9{~P{TKoV2 literal 0 HcmV?d00001 diff --git a/projects/amdsmi/tests/amd_smi_test/cper/sys/kernel/debug/dri/57/amdgpu_ring_cper b/projects/amdsmi/tests/amd_smi_test/cper/sys/kernel/debug/dri/57/amdgpu_ring_cper new file mode 100644 index 0000000000000000000000000000000000000000..ce58bc9f84b9623e708de4eb8427a57d9f9a160f GIT binary patch literal 12 KcmZQzKmY&$3;+QD literal 0 HcmV?d00001 diff --git a/projects/amdsmi/tests/amd_smi_test/cper/sys/kernel/debug/dri/9/amdgpu_ring_cper b/projects/amdsmi/tests/amd_smi_test/cper/sys/kernel/debug/dri/9/amdgpu_ring_cper new file mode 100644 index 0000000000000000000000000000000000000000..377c797bfb5c388aef06796a53fdd4ffcecebea8 GIT binary patch literal 3680 zcmZQzU|^WR48hI;u0afp|ABy!fe9$i4#YPY85pchgculEBt;A=3=Is7tPCp5O&pDY za_As2HzmEG6d}5?`<(9ey?m}a*UtI!`pS<9CRT8SG8d$3X*OYP;BPik^$1BJnSiCZrk{7T4^1r^AGx4a; zu8!2z4?Tr5%fkMDOQpxKNy06n||UPznWr+=#6`jG|}&IE)MmXttuZmq6_!0J{=;*Z=?k literal 0 HcmV?d00001 diff --git a/projects/amdsmi/tests/amd_smi_test/main.cc b/projects/amdsmi/tests/amd_smi_test/main.cc index dac5f94ecf..e7f0cc1b88 100644 --- a/projects/amdsmi/tests/amd_smi_test/main.cc +++ b/projects/amdsmi/tests/amd_smi_test/main.cc @@ -105,7 +105,6 @@ static void RunGenericTest(TestBase *test) { RunCustomTestEpilog(test); } - // TEST ENTRY TEMPLATE: // TEST(rocrtst, Perf_) { // ;