From 0f75c19e4d4b5b777715e4f4cda40635709f8c22 Mon Sep 17 00:00:00 2001 From: "Park, Peter" Date: Tue, 9 Sep 2025 11:27:15 -0400 Subject: [PATCH] [SWDEV-551318] Add doc about RAS / CPER (#636) * add doc about ras/cper * add sample code examples for CPER and AFID --------- Signed-off-by: Park, Peter Signed-off-by: Arif, Maisam Co-authored-by: Oosman Saeed [ROCm/amdsmi commit: 5e92adc5b3d815321de032c51e885e744b923007] --- projects/amdsmi/docs/conceptual/ras.md | 95 +++++++++++ .../amdsmi/docs/how-to/amdsmi-cli-tool.md | 153 +++++++++++++++++ projects/amdsmi/docs/index.md | 6 +- .../amdsmi/docs/reference/amdsmi-py-api.md | 159 +----------------- projects/amdsmi/docs/sphinx/_toc.yml.in | 4 + .../amdsmi/example/amd_smi_afid_example.py | 49 ++++++ .../amdsmi/example/amd_smi_cper_example.py | 126 ++++++++++++++ 7 files changed, 440 insertions(+), 152 deletions(-) create mode 100644 projects/amdsmi/docs/conceptual/ras.md create mode 100644 projects/amdsmi/example/amd_smi_afid_example.py create mode 100644 projects/amdsmi/example/amd_smi_cper_example.py diff --git a/projects/amdsmi/docs/conceptual/ras.md b/projects/amdsmi/docs/conceptual/ras.md new file mode 100644 index 0000000000..255fd0e04b --- /dev/null +++ b/projects/amdsmi/docs/conceptual/ras.md @@ -0,0 +1,95 @@ +--- +myst: + html_meta: + "description lang=en": "AMD SMI for reliability, availability, serviceability." + "keywords": "system, management, interface, cper, log, error, spec, ecc, afid, fault, ras" +--- + +# Reliability, availability, serviceability (RAS) + +RAS aims to increase the robustness of a system by detecting hardware errors, recording them, and +correcting them where possible. See [Reliability, availability, serviceability (Linux +kernel)](https://docs.kernel.org/admin-guide/RAS/main.html) for more general information. + +## ECC + +ECC (Error-Correcting Code) is a type of memory that automatically detects errors and corrects them where possible. Correctable 1-bit +errors are handled by the ECC logic and logged by the hardware.
Uncorrectable 2-bit errors can be +detected but not reliably fixed; this is a more serious event that must be reported. See [RAS Error +Count sysfs Interface](https://docs.kernel.org/gpu/amdgpu/ras.html#ras-error-count-sysfs-interface) +to learn how AMD SMI accesses error counts. + +While ECC is a mechanism to handle different errors, CPER is the standard format used to report that an error event +occurred. + +## CPER + +At its core, CPER (Common Platform Error Record) is a standard format included in the [UEFI +specification](https://uefi.org/specs/UEFI/2.10/01_Introduction.html) to report errors to the +operating system. It works as a standard error report template that different hardware components +can fill out when something goes wrong. A CPER record consists of a header, one or more section descriptors, +and, for each descriptor, an associated section containing error or informational data. See [CPER +(UEFI Specification)](https://uefi.org/specs/UEFI/2.10/Apx_N_Common_Platform_Error_Record.html) for +more information. + +A CPER record contains vital information for diagnostics, such as: + +- Error source +- Error type +- Error severity + - 0 - Recoverable (also called non-fatal uncorrected) + - 1 - Fatal + - 2 - Corrected + - 3 - Informational +- Timestamp +- Other data + +A CPER record might contain an AFID in its data to help map a complex error to a more actionable service task. + +## AFID + +AFIDs (AMD Field IDs) are unique numerical IDs associated with specific events or errors produced by +AMD Instinct accelerators. An AFID provides a specific identifier for a known condition, which helps +facilitate root cause analysis. Each AFID is associated with category, type, and severity fields. See +[AFID Event List](https://docs.amd.com/r/en-US/AMD_Field_ID_70122_v1.0/AFID-Event-List) for more +information. + +## From concept to action + +AMD SMI provides tools to programmatically monitor and manage these RAS features.
+ +:::::{tab-set} +::::{tab-item} C/C++ +The AMD SMI library provides APIs to query ECC error counts and manage CPER records +(list, decode, and clear). + +See [ECC information](/doxygen/docBin/html/group__tagECCInfo) and [RAS +information](/doxygen/docBin/html/group__tagRasInfo) for available APIs. +:::: + +::::{tab-item} Python +See related APIs: + +- [](#amdsmi_get_gpu_ecc_count) +- [](#amdsmi_get_gpu_ecc_enabled) +- [](#amdsmi_get_gpu_ecc_status) +- [](#amdsmi_get_gpu_total_ecc_count) +- [](#amdsmi_get_gpu_cper_entries) +- [](#amdsmi_get_afids_from_cper) +- [](#amdsmi_get_gpu_ras_feature_info) +- [](#amdsmi_get_gpu_ras_block_features_enabled) +:::: + +::::{tab-item} amd-smi CLI +See [`amd-smi ras --help`](#cmd-ras) for details and available options. +```shell +amd-smi ras --help +``` +:::: +::::: + +## Further reading + +- [AMD Field ID](https://docs.amd.com/r/en-US/AMD_Field_ID_70122_v1.0/Introduction) +- [CPER (UEFI specification)](https://uefi.org/specs/UEFI/2.10/Apx_N_Common_Platform_Error_Record.html) +- [Reliability, availability, serviceability (Linux kernel)](https://docs.kernel.org/admin-guide/RAS/main.html) diff --git a/projects/amdsmi/docs/how-to/amdsmi-cli-tool.md b/projects/amdsmi/docs/how-to/amdsmi-cli-tool.md index 4ef283876a..383e53e07d 100644 --- a/projects/amdsmi/docs/how-to/amdsmi-cli-tool.md +++ b/projects/amdsmi/docs/how-to/amdsmi-cli-tool.md @@ -1078,3 +1078,156 @@ GPU: 0 LEVEL 0: 45 MHz ... 
``` + +### Listing CPER entries using amd-smi + +This example code shows how to list CPER entries for a given GPU into files + +```python +from amdsmi import * +import os + +amdsmi_init() + +def get_severity_mask(severity): + severity_mask = 0 + if severity == "all": + # Set bits for NON_FATAL_UNCORRECTED (0), FATAL (1), and NON_FATAL_CORRECTED (2) + severity_mask |= ((1 << 0) | (1 << 1) | (1 << 2)) + elif severity == "fatal": + # Set bit corresponding to AMDSMI_CPER_SEV_FATAL (which is 1) + severity_mask |= (1 << 1) + elif severity in ("nonfatal", "nonfatal-uncorrected"): + # Set bit corresponding to AMDSMI_CPER_SEV_NON_FATAL_UNCORRECTED (which is 0) + severity_mask |= (1 << 0) + elif severity in ("nonfatal-corrected", "corrected"): + # Set bit corresponding to AMDSMI_CPER_SEV_NON_FATAL_CORRECTED (which is 2) + severity_mask |= (1 << 2) + return severity_mask + +def gpuid(device): + for gpu_index, device_handle in enumerate(amdsmi_interface.amdsmi_get_processor_handles()): + if device.value == device_handle.value: + return gpu_index + +def dump_cper_entry(entry, cper_data, key): + try: + os.mkdir("/tmp/cper_dump", mode=0o777, dir_fd=None) + except FileExistsError: + pass + cper_file = f"/tmp/cper_dump/cper_entry_{key}.bin" + with open(cper_file, "wb") as file: + size = cper_data[key]["size"] + data = cper_data[key]["bytes"] + data = bytes(x % 256 for x in data[:size]) + file.write(data) + print(f" Wrote cper data to file: {cper_file}") + json_file = f"/tmp/cper_dump/cper_entry_{key}.json" + with open(json_file, "wt") as file: + file.write(str(entry)) + +def get_gpu_cper_entries(): + try: + devices = amdsmi_interface.amdsmi_get_processor_handles() + buffer_size = 1024*100 + initial_cursor = 0 + severity = "all" + for device in devices: + while True: + entries, new_cursor, cper_data, status_code = amdsmi_get_gpu_cper_entries( + device, get_severity_mask(severity), buffer_size, initial_cursor) + gpu_id = gpuid(device) + 
print("#############################################################################") + print(f"cper entries for severity: '{severity}', gpu #{gpu_id}, cursor: {initial_cursor}-{new_cursor - 1}") + for key, entry in entries.items(): + print("----------------") + print("Entry", initial_cursor + key) + print(" Error Severity:", entry.get("error_severity", "Unknown")) + print(" Notify Type:", entry.get("notify_type", "Unknown")) + print(" Timestamp:", entry.get("timestamp", "")) + print(f" Cper entry metadata: {entry}") + dump_cper_entry(entry, cper_data, key) + if initial_cursor == new_cursor: + break + initial_cursor=new_cursor + break + except AmdSmiException as e: + print(e) + +get_gpu_cper_entries() + +``` + +Output: + +```shell +cper entries for severity: 'all', gpu #0, cursor: 0-3 +---------------- +Entry 0 + Error Severity: non_fatal_corrected + Notify Type: CMC + Timestamp: 2025/09/07 00:14:22 + Cper entry metadata: {'error_severity': 'non_fatal_corrected', 'notify_type': 'CMC', 'timestamp': '2025/09/07 00:14:22', 'signature': b'CPER', 'revision': 256, 'signature_end': '0xffffffff', 'sec_cnt': 1, 'record_length': 472, 'platform_id': b'0x1002:0x74A2', 'creator_id': b'amdgpu', 'record_id': b'5:1', 'flags': 0, 'persistence_info': 0} + Wrote cper data to file: /tmp/cper_dump/cper_entry_0.bin +---------------- +Entry 1 + Error Severity: non_fatal_corrected + Notify Type: CMC + Timestamp: 2025/09/07 00:14:26 + Cper entry metadata: {'error_severity': 'non_fatal_corrected', 'notify_type': 'CMC', 'timestamp': '2025/09/07 00:14:26', 'signature': b'CPER', 'revision': 256, 'signature_end': '0xffffffff', 'sec_cnt': 1, 'record_length': 472, 'platform_id': b'0x1002:0x74A2', 'creator_id': b'amdgpu', 'record_id': b'5:2', 'flags': 0, 'persistence_info': 0} + Wrote cper data to file: /tmp/cper_dump/cper_entry_1.bin +---------------- +Entry 2 + Error Severity: non_fatal_corrected + Notify Type: CMC + Timestamp: 2025/09/08 06:12:11 + Cper entry metadata: {'error_severity': 
'non_fatal_corrected', 'notify_type': 'CMC', 'timestamp': '2025/09/08 06:12:11', 'signature': b'CPER', 'revision': 256, 'signature_end': '0xffffffff', 'sec_cnt': 1, 'record_length': 472, 'platform_id': b'0x1002:0x74A2', 'creator_id': b'amdgpu', 'record_id': b'5:3', 'flags': 0, 'persistence_info': 0} + Wrote cper data to file: /tmp/cper_dump/cper_entry_2.bin +---------------- +Entry 3 + Error Severity: non_fatal_corrected + Notify Type: CMC + Timestamp: 2025/09/08 06:13:59 + Cper entry metadata: {'error_severity': 'non_fatal_corrected', 'notify_type': 'CMC', 'timestamp': '2025/09/08 06:13:59', 'signature': b'CPER', 'revision': 256, 'signature_end': '0xffffffff', 'sec_cnt': 1, 'record_length': 472, 'platform_id': b'0x1002:0x74A2', 'creator_id': b'amdgpu', 'record_id': b'5:4', 'flags': 0, 'persistence_info': 0} + Wrote cper data to file: /tmp/cper_dump/cper_entry_3.bin +############################################################################# +cper entries for severity: 'all', gpu #0, cursor: 4-3 +``` + +### Listing AFID numbers from CPER files + +This example code shows how to retrieve the AFID numbers from CPER files + +```python +from amdsmi import * +import os + +amdsmi_init() + +def amdsmi_get_afids_from_cper(): + directory_path = "/tmp/cper_dump/" + print(f"Searching for cper file in {directory_path}") + with os.scandir(directory_path) as cper_files: + for cper_file in cper_files: + if cper_file.is_file(): # Check if the entry is a file (not a subdirectory) + if ".bin" in cper_file.path: + print(f"Found {cper_file.path}") + with open(cper_file.path, "rb") as file: + raw = file.read() + afids, num_afids = amdsmi_interface.amdsmi_get_afids_from_cper(raw) + print(f"afids: {afids}") + +amdsmi_get_afids_from_cper() +``` + +Output: + +```shell +sudo python3 afid.py +Searching for cper file in /tmp/cper_dump/ +Found /tmp/cper_dump/cper_entry_0.bin +afids: [17] +Found /tmp/cper_dump/cper_entry_1.bin +afids: [17] +``` + diff --git a/projects/amdsmi/docs/index.md 
b/projects/amdsmi/docs/index.md index 0e671b52c0..3d9879d00a 100644 --- a/projects/amdsmi/docs/index.md +++ b/projects/amdsmi/docs/index.md @@ -46,9 +46,13 @@ AMD SMI is the successor to . * [Go API](./reference/amdsmi-go-api.md) ::: +:::{grid-item-card} Conceptual +* [Reliability, availability, serviceability](./conceptual/ras.md) +::: + :::{grid-item-card} Tutorials * [AMD SMI examples (GitHub)](https://github.com/ROCm/amdsmi/tree/amd-staging/example) -* [ROCm SMI examples (GitHub)](https://github.com/ROCm/rocm_smi_lib/tree/amd-staging/example) +* [AMD SMI CLI walkthrough](https://rocm.blogs.amd.com/software-tools-optimization/amd-smi-overview/README.html) ::: :::: diff --git a/projects/amdsmi/docs/reference/amdsmi-py-api.md b/projects/amdsmi/docs/reference/amdsmi-py-api.md index 42c25e2c5d..17ee4fb1a3 100644 --- a/projects/amdsmi/docs/reference/amdsmi-py-api.md +++ b/projects/amdsmi/docs/reference/amdsmi-py-api.md @@ -1299,64 +1299,14 @@ Exceptions that can be thrown by `amdsmi_get_gpu_cper_entries` function: Example: ```python -from amdsmi import * - -amdsmi_init() - -def get_severity_mask(severity): - severity_mask = 0 - if severity == "all": - # Set bits for NON_FATAL_UNCORRECTED (0), FATAL (1), and NON_FATAL_CORRECTED (2) - severity_mask |= ((1 << 0) | (1 << 1) | (1 << 2)) - elif severity == "fatal": - # Set bit corresponding to AMDSMI_CPER_SEV_FATAL (which is 1) - severity_mask |= (1 << 1) - elif severity in ("nonfatal", "nonfatal-uncorrected"): - # Set bit corresponding to AMDSMI_CPER_SEV_NON_FATAL_UNCORRECTED (which is 0) - severity_mask |= (1 << 0) - elif severity in ("nonfatal-corrected", "corrected"): - # Set bit corresponding to AMDSMI_CPER_SEV_NON_FATAL_CORRECTED (which is 2) - severity_mask |= (1 << 2) - return severity_mask - -def gpuid(device): - for gpu_index, device_handle in enumerate(amdsmi_interface.amdsmi_get_processor_handles()): - if device.value == device_handle.value: - return gpu_index - -try: - devices = 
amdsmi_interface.amdsmi_get_processor_handles() - buffer_size = 1024*100 - initial_cursor = 0 - severity = "all" - for device in devices: + try: entries, new_cursor, cper_data, status_code = amdsmi_get_gpu_cper_entries( - device, get_severity_mask(severity), buffer_size, initial_cursor) - gpu_id = gpuid(device) - print(f"cper entries for '{severity}' severity on gpu #{gpu_id}:") - for key, entry in entries.items(): - print("Entry", key) - print(" Error Severity:", entry.get("error_severity", "Unknown")) - print(" Notify Type:", entry.get("notify_type", "Unknown")) - print(" Timestamp:", entry.get("timestamp", "")) -except AmdSmiException as e: - print(e) + device, severity_mask, buffer_size, initial_cursor) + except AmdSmiException as e: + print(e) ``` -Output: - -```shell -cper entries for 'nonfatal-corrected' severity on gpu #0: -cper entries for 'nonfatal-corrected' severity on gpu #1: -Entry 0 - Error Severity: non_fatal_corrected - Notify Type: CMC - Timestamp: 2025/08/13 19:28:31 -Entry 1 - Error Severity: non_fatal_corrected - Notify Type: CMC - Timestamp: 2025/08/13 19:36:38 -``` +Refer to amd_smi_cper_example.py for a complete example ### amdsmi_get_afids_from_cper @@ -1386,108 +1336,15 @@ Exceptions that can be thrown by `amdsmi_get_gpu_cper_entries` function: Example: ```python -from amdsmi import * -import os - -amdsmi_init() - -def get_severity_mask(severity): - severity_mask = 0 - if severity == "all": - # Set bits for NON_FATAL_UNCORRECTED (0), FATAL (1), and NON_FATAL_CORRECTED (2) - severity_mask |= ((1 << 0) | (1 << 1) | (1 << 2)) - elif severity == "fatal": - # Set bit corresponding to AMDSMI_CPER_SEV_FATAL (which is 1) - severity_mask |= (1 << 1) - elif severity in ("nonfatal", "nonfatal-uncorrected"): - # Set bit corresponding to AMDSMI_CPER_SEV_NON_FATAL_UNCORRECTED (which is 0) - severity_mask |= (1 << 0) - elif severity in ("nonfatal-corrected", "corrected"): - # Set bit corresponding to AMDSMI_CPER_SEV_NON_FATAL_CORRECTED (which is 2) - 
severity_mask |= (1 << 2) - return severity_mask - -def gpuid(device): - for gpu_index, device_handle in enumerate(amdsmi_interface.amdsmi_get_processor_handles()): - if device.value == device_handle.value: - return gpu_index - -def dump_cper_entry(entry, cper_data, key): try: - os.mkdir("/tmp/cper_dump", mode=0o777, dir_fd=None) - except FileExistsError: - pass - cper_file = f"/tmp/cper_dump/cper_entry_{key}.bin" - with open(cper_file, "wb") as file: - size = cper_data[key]["size"] - data = cper_data[key]["bytes"] - data = bytes(x % 256 for x in data[:size]) - file.write(data) - print(f" Wrote cper data to file: {cper_file}") - json_file = f"/tmp/cper_dump/cper_entry_{key}.json" - with open(json_file, "wt") as file: - file.write(str(entry)) - -def get_gpu_cper_entries(): - try: - devices = amdsmi_interface.amdsmi_get_processor_handles() - buffer_size = 1024*100 - initial_cursor = 0 - severity = "all" - for device in devices: - entries, new_cursor, cper_data, status_code = amdsmi_get_gpu_cper_entries( - device, get_severity_mask(severity), buffer_size, initial_cursor) - gpu_id = gpuid(device) - print("###################") - print(f"cper entries for '{severity}' severity on gpu #{gpu_id}:") - for key, entry in entries.items(): - print("----------------") - print("Entry", key) - print(" Error Severity:", entry.get("error_severity", "Unknown")) - print(" Notify Type:", entry.get("notify_type", "Unknown")) - print(" Timestamp:", entry.get("timestamp", "")) - print(f" Cper entry metadata: {entry}") - dump_cper_entry(entry, cper_data, key) + with open(cper_file.path, "rb") as file: + afids, num_afids = amdsmi_interface.amdsmi_get_afids_from_cper(file.read()) except AmdSmiException as e: print(e) - -get_gpu_cper_entries() ``` -Output: +Refer to amd_smi_afid_example.py for a complete example -``` shell -################### -cper entries for 'all' severity on gpu #0: -################### -cper entries for 'all' severity on gpu #1: -################### -cper entries for 
'all' severity on gpu #2: -################### -cper entries for 'all' severity on gpu #3: -################### -cper entries for 'all' severity on gpu #4: -################### -cper entries for 'all' severity on gpu #5: -################### -cper entries for 'all' severity on gpu #6: -################### -cper entries for 'all' severity on gpu #7: ----------------- -Entry 0 - Error Severity: non_fatal_corrected - Notify Type: CMC - Timestamp: 2025/08/13 20:07:56 - Cper entry metadata: {'error_severity': 'non_fatal_corrected', 'notify_type': 'CMC', 'timestamp': '2025/08/13 20:07:56', 'signature': b'CPER', 'revision': 256, 'signature_end': '0xffffffff', 'sec_cnt': 1, 'record_length': 472, 'platform_id': b'0xcafe:0xbeef', 'creator_id': b'amdgpu', 'record_id': b'0:1', 'flags': 0, 'persistence_info': 0} - Wrote cper data to file: /tmp/cper_dump/cper_entry_0.bin ----------------- -Entry 1 - Error Severity: non_fatal_corrected - Notify Type: CMC - Timestamp: 2025/08/13 20:14:58 - Cper entry metadata: {'error_severity': 'non_fatal_corrected', 'notify_type': 'CMC', 'timestamp': '2025/08/13 20:14:58', 'signature': b'CPER', 'revision': 256, 'signature_end': '0xffffffff', 'sec_cnt': 1, 'record_length': 472, 'platform_id': b'0xcafe:0xbeef', 'creator_id': b'amdgpu', 'record_id': b'0:2', 'flags': 0, 'persistence_info': 0} - Wrote cper data to file: /tmp/cper_dump/cper_entry_1.bin -``` ### amdsmi_get_gpu_ras_feature_info diff --git a/projects/amdsmi/docs/sphinx/_toc.yml.in b/projects/amdsmi/docs/sphinx/_toc.yml.in index 909f23f575..7e7eaebc40 100644 --- a/projects/amdsmi/docs/sphinx/_toc.yml.in +++ b/projects/amdsmi/docs/sphinx/_toc.yml.in @@ -44,6 +44,10 @@ subtrees: - file: reference/changelog.md title: Changelog +- caption: Conceptual + entries: + - file: conceptual/ras.md + - caption: Tutorials entries: - url: https://github.com/ROCm/amdsmi/tree/amd-staging/example diff --git a/projects/amdsmi/example/amd_smi_afid_example.py b/projects/amdsmi/example/amd_smi_afid_example.py 
new file mode 100644 index 0000000000..2d303c4ecc --- /dev/null +++ b/projects/amdsmi/example/amd_smi_afid_example.py @@ -0,0 +1,49 @@ +# Copyright (C) Advanced Micro Devices. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +from amdsmi import * +import os + +amdsmi_init() + +def amdsmi_get_afids_from_cper(): + directory_path = "/tmp/cper_dump/" + print(f"Searching for cper file in {directory_path}") + with os.scandir(directory_path) as cper_files: + for cper_file in cper_files: + if cper_file.is_file(): # Check if the entry is a file (not a subdirectory) + if ".bin" in cper_file.path: + print(f"Found {cper_file.path}") + with open(cper_file.path, "rb") as file: + raw = file.read() + afids, num_afids = amdsmi_interface.amdsmi_get_afids_from_cper(raw) + print(f"afids: {afids}") + +amdsmi_get_afids_from_cper() + +""" +Sample output: + +sudo python3 afid.py +Searching for cper file in /tmp/cper_dump/ +Found /tmp/cper_dump/cper_entry_0.bin +afids: [17] +Found /tmp/cper_dump/cper_entry_1.bin +afids: [17] +""" \ No newline at end of file diff --git a/projects/amdsmi/example/amd_smi_cper_example.py b/projects/amdsmi/example/amd_smi_cper_example.py new file mode 100644 index 0000000000..22b27f94a3 --- /dev/null +++ b/projects/amdsmi/example/amd_smi_cper_example.py @@ -0,0 +1,126 @@ +# Copyright (C) Advanced Micro Devices. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +from amdsmi import * +import os + +amdsmi_init() + +def get_severity_mask(severity): + severity_mask = 0 + if severity == "all": + # Set bits for NON_FATAL_UNCORRECTED (0), FATAL (1), and NON_FATAL_CORRECTED (2) + severity_mask |= ((1 << 0) | (1 << 1) | (1 << 2)) + elif severity == "fatal": + # Set bit corresponding to AMDSMI_CPER_SEV_FATAL (which is 1) + severity_mask |= (1 << 1) + elif severity in ("nonfatal", "nonfatal-uncorrected"): + # Set bit corresponding to AMDSMI_CPER_SEV_NON_FATAL_UNCORRECTED (which is 0) + severity_mask |= (1 << 0) + elif severity in ("nonfatal-corrected", "corrected"): + # Set bit corresponding to AMDSMI_CPER_SEV_NON_FATAL_CORRECTED (which is 2) + severity_mask |= (1 << 2) + return severity_mask + +def gpuid(device): + for gpu_index, device_handle in enumerate(amdsmi_interface.amdsmi_get_processor_handles()): + if device.value == device_handle.value: + return gpu_index + +def dump_cper_entry(entry, cper_data, key): + try: + os.mkdir("/tmp/cper_dump", mode=0o777, dir_fd=None) + except FileExistsError: + pass + cper_file = f"/tmp/cper_dump/cper_entry_{key}.bin" + with open(cper_file, "wb") as file: + size = cper_data[key]["size"] + data = cper_data[key]["bytes"] + data = bytes(x % 256 for x in data[:size]) + file.write(data) + print(f" Wrote cper data to file: {cper_file}") + json_file = f"/tmp/cper_dump/cper_entry_{key}.json" + with open(json_file, "wt") as file: + file.write(str(entry)) + +def get_gpu_cper_entries(): + try: + devices = amdsmi_interface.amdsmi_get_processor_handles() + buffer_size = 1024*100 + initial_cursor = 0 + severity = "all" + for device in devices: + while True: + entries, new_cursor, cper_data, status_code = amdsmi_get_gpu_cper_entries( + device, 
get_severity_mask(severity), buffer_size, initial_cursor) + gpu_id = gpuid(device) + print("#############################################################################") + print(f"cper entries for severity: '{severity}', gpu #{gpu_id}, cursor: {initial_cursor}-{new_cursor - 1}") + for key, entry in entries.items(): + print("----------------") + print("Entry", initial_cursor + key) + print(" Error Severity:", entry.get("error_severity", "Unknown")) + print(" Notify Type:", entry.get("notify_type", "Unknown")) + print(" Timestamp:", entry.get("timestamp", "")) + print(f" Cper entry metadata: {entry}") + dump_cper_entry(entry, cper_data, key) + if initial_cursor == new_cursor: + break + initial_cursor = new_cursor + break + except AmdSmiException as e: + print(e) + +get_gpu_cper_entries() + +""" +Sample output: + +cper entries for severity: 'all', gpu #0, cursor: 0-3 +---------------- +Entry 0 + Error Severity: non_fatal_corrected + Notify Type: CMC + Timestamp: 2025/09/07 00:14:22 + Cper entry metadata: {'error_severity': 'non_fatal_corrected', 'notify_type': 'CMC', 'timestamp': '2025/09/07 00:14:22', 'signature': b'CPER', 'revision': 256, 'signature_end': '0xffffffff', 'sec_cnt': 1, 'record_length': 472, 'platform_id': b'0x1002:0x74A2', 'creator_id': b'amdgpu', 'record_id': b'5:1', 'flags': 0, 'persistence_info': 0} + Wrote cper data to file: /tmp/cper_dump/cper_entry_0.bin +---------------- +Entry 1 + Error Severity: non_fatal_corrected + Notify Type: CMC + Timestamp: 2025/09/07 00:14:26 + Cper entry metadata: {'error_severity': 'non_fatal_corrected', 'notify_type': 'CMC', 'timestamp': '2025/09/07 00:14:26', 'signature': b'CPER', 'revision': 256, 'signature_end': '0xffffffff', 'sec_cnt': 1, 'record_length': 472, 'platform_id': b'0x1002:0x74A2', 'creator_id': b'amdgpu', 'record_id': b'5:2', 'flags': 0, 'persistence_info': 0} + Wrote cper data to file: /tmp/cper_dump/cper_entry_1.bin +---------------- +Entry 2 + Error Severity: non_fatal_corrected + Notify Type: 
CMC + Timestamp: 2025/09/08 06:12:11 + Cper entry metadata: {'error_severity': 'non_fatal_corrected', 'notify_type': 'CMC', 'timestamp': '2025/09/08 06:12:11', 'signature': b'CPER', 'revision': 256, 'signature_end': '0xffffffff', 'sec_cnt': 1, 'record_length': 472, 'platform_id': b'0x1002:0x74A2', 'creator_id': b'amdgpu', 'record_id': b'5:3', 'flags': 0, 'persistence_info': 0} + Wrote cper data to file: /tmp/cper_dump/cper_entry_2.bin +---------------- +Entry 3 + Error Severity: non_fatal_corrected + Notify Type: CMC + Timestamp: 2025/09/08 06:13:59 + Cper entry metadata: {'error_severity': 'non_fatal_corrected', 'notify_type': 'CMC', 'timestamp': '2025/09/08 06:13:59', 'signature': b'CPER', 'revision': 256, 'signature_end': '0xffffffff', 'sec_cnt': 1, 'record_length': 472, 'platform_id': b'0x1002:0x74A2', 'creator_id': b'amdgpu', 'record_id': b'5:4', 'flags': 0, 'persistence_info': 0} + Wrote cper data to file: /tmp/cper_dump/cper_entry_3.bin +############################################################################# +cper entries for severity: 'all', gpu #0, cursor: 4-3 +""" \ No newline at end of file