[SWDEV-546239] amd-smi ras cper - no data created (#614)

* Update amd-smi doc with examples of CPER and AFID API usage.

---------

Signed-off-by: Oosman Saeed <oossaeed@amd.com>
Signed-off-by: Maisam Arif <Maisam.Arif@amd.com>

[ROCm/amdsmi commit: fd5e37a07e]
Этот коммит содержится в:
Saeed, Oosman
2025-08-20 11:27:41 -05:00
коммит произвёл GitHub
родитель d32bae0e8f
Коммит 3779562abb
2 изменённых файлов: 213 добавлений и 59 удалений
+8 -1
Просмотреть файл
@@ -1178,7 +1178,7 @@ class AMDSMIHelpers():
if not getattr(self, "_cper_display_initialized", False):
# Warning if no folder was specified elsewhere
if not getattr(self, "_cper_warning_printed", False):
print(f"WARNING:No cper files will be dumped unless --folder=<folder_name> is specified.")
print(f"WARNING:No cper files will be dumped unless --folder=<folder_name> is specified and cper entries exist.")
self._cper_warning_printed = True
self._print_header(folder)
@@ -1442,11 +1442,13 @@ class AMDSMIHelpers():
logger.set_cper_exit_message(False)
self.stop = False
num_entries = 0
while True:
try:
entries, new_cursor, cper_data, status_code = amdsmi_interface.amdsmi_get_gpu_cper_entries(
device_handle, severity_mask, buffer_size, args.cursor[gpu_idx])
logging.debug(f"cper_entries | entries: {entries}")
num_entries = num_entries + len(entries)
except amdsmi_exception.AmdSmiLibraryException as e:
if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM:
raise PermissionError('Error opening CPER file. This command requires elevation') from e
@@ -1466,6 +1468,11 @@ class AMDSMIHelpers():
break
else:
self.display_cper_files_generated(entries, device_handle, args.folder)
if num_entries == 0 and not args.follow:
if args.folder:
self.dump_cper_entries(args.folder, entries, cper_data, device_handle, args.file_limit)
else:
self.display_cper_files_generated(entries, device_handle, args.folder)
def get_bitmask_ranges(self, bitmask_dict):
ranges = {}
+205 -58
Просмотреть файл
@@ -1260,11 +1260,20 @@ Description: Dump CPER entries for a given GPU in a file using from CPER header
Input parameters:
* `processor_handle` device which to query
* `severity_mask` the severity mask of the entries to be retrieved
* `buffer_size` pointer to a variable that specifies the size of the cper_data
* `cursor` pointer to a variable that will contain the cursor for the next call
* `severity_mask` the severity mask of the entries to be retrieved:
1:'nonfatal-uncorrected',
2: 'fatal',
4: 'nonfatal-corrected', 'corrected',
7: 'all'
* `buffer_size` number of bytes that will be used to create a buffer for copying cper entries into; default is 1048576 bytes
* `cursor` the zero-based index at which to start retrieving cper entries; default value is 0. For example, if there are 10 cper entries available, a cursor value of 8 retrieves only the last two cper entries
Output: Dictionary with fields, updated cursor, and a dictionary of the cper_data
Output: Dictionary with fields, updated cursor, and a dictionary of the cper_data, status_code
status_code:
AMDSMI_STATUS_SUCCESS: If all entries were retrieved successfully
AMDSMI_STATUS_MORE_DATA: If some of the entries were retrieved and:
* A subsequent call to the API with the updated cursor will fetch the next batch of entries, or
* Increasing the input buffer_size will allow more entries to be fetched with the same cursor
Field | Description
---|---
@@ -1290,75 +1299,194 @@ Exceptions that can be thrown by `amdsmi_get_gpu_cper_entries` function:
Example:
```python
for device in devices:
entries, new_cursor, cper_data = amdsmi_get_gpu_cper_entries(device, severity_mask, buffer_size, initial_cursor)
print("CPER entries for device", device)
for key, entry in entries.items():
print("Entry", key)
print(" Error Severity:", entry.get("error_severity", "Unknown"))
print(" Notify Type:", entry.get("notify_type", "Unknown"))
print(" Timestamp:", entry.get("timestamp", ""))
print()
print("New Cursor Position:", new_cursor)
from amdsmi import *
amdsmi_init()
def get_severity_mask(severity):
    """Translate a severity name into the CPER severity bitmask.

    Bit 0 stands for AMDSMI_CPER_SEV_NON_FATAL_UNCORRECTED, bit 1 for
    AMDSMI_CPER_SEV_FATAL and bit 2 for AMDSMI_CPER_SEV_NON_FATAL_CORRECTED.

    Args:
        severity: One of "all", "fatal", "nonfatal", "nonfatal-uncorrected",
            "nonfatal-corrected" or "corrected".

    Returns:
        The integer bitmask for the name, or 0 when the name is not recognized.
    """
    nonfatal_uncorrected_bit = 1 << 0
    fatal_bit = 1 << 1
    nonfatal_corrected_bit = 1 << 2

    if severity == "all":
        return nonfatal_uncorrected_bit | fatal_bit | nonfatal_corrected_bit
    if severity == "fatal":
        return fatal_bit
    if severity in ("nonfatal", "nonfatal-uncorrected"):
        return nonfatal_uncorrected_bit
    if severity in ("nonfatal-corrected", "corrected"):
        return nonfatal_corrected_bit
    # Unknown names select no severity at all.
    return 0
def gpuid(device):
    """Return the enumeration index of `device` among the processor handles.

    Handles are compared by their `.value` attribute. Returns None when no
    handle matches.
    """
    handles = amdsmi_interface.amdsmi_get_processor_handles()
    matching_indices = (
        index
        for index, handle in enumerate(handles)
        if device.value == handle.value
    )
    return next(matching_indices, None)
try:
devices = amdsmi_interface.amdsmi_get_processor_handles()
buffer_size = 1024*100
initial_cursor = 0
severity = "all"
for device in devices:
entries, new_cursor, cper_data, status_code = amdsmi_get_gpu_cper_entries(
device, get_severity_mask(severity), buffer_size, initial_cursor)
gpu_id = gpuid(device)
print(f"cper entries for '{severity}' severity on gpu #{gpu_id}:")
for key, entry in entries.items():
print("Entry", key)
print(" Error Severity:", entry.get("error_severity", "Unknown"))
print(" Notify Type:", entry.get("notify_type", "Unknown"))
print(" Timestamp:", entry.get("timestamp", ""))
except AmdSmiException as e:
print(e)
```
Output:
```shell
cper entries for 'nonfatal-corrected' severity on gpu #0:
cper entries for 'nonfatal-corrected' severity on gpu #1:
Entry 0
Error Severity: non_fatal_corrected
Notify Type: CMC
Timestamp: 2025/08/13 19:28:31
Entry 1
Error Severity: non_fatal_corrected
Notify Type: CMC
Timestamp: 2025/08/13 19:36:38
```
### amdsmi_get_afids_from_cper
Description: Get the AFIDs from CPER buffer
Input parameters:
* `processor_handle` device which to query
* `severity_mask` the severity mask of the entries to be retrieved
* `buffer_size` pointer to a variable that specifies the size of the cper_data
* `cursor` pointer to a variable that will contain the cursor for the next call
* `cper_afid_data`: Either
- raw bytes or bytearray of a single CPER record, or
- a list of dicts each with keys "bytes" (List[int]) and "size" (int).
Output: Dictionary with fields, updated cursor, a dictionary of the cper_data, and API status_code
Field | Description
---|---
`error_severity` | The severity of the CPER error ex: `non_fatal_uncorrected`, `fatal`, `non_fatal_corrected`. |
`notify_type` | The notification type associated with the CPER entry. |
`timestamp` | The time when the CPER entry was recorded, formatted as `YYYY/MM/DD HH:MM:SS`. |
`signature` | A 4-byte signature identifying the entry, typically `CPER`. |
`revision` | The revision number of the CPER record format. |
`signature_end` | A marker value (typically `0xFFFFFFFF`) confirming the integrity of the signature. |
`sec_cnt` | The count of sections included in the CPER entry. |
`record_length` | The total length in bytes of the CPER entry. |
`platform_id` | A character array identifying the GPU or platform. |
`creator_id` | A character array indicating the creator of the CPER entry. |
`record_id` | A unique identifier for the CPER entry. |
`flags` | Reserved flags related to the CPER entry. |
`persistence_info` | Reserved information related to persistence. |
Output: Tuple[List[int], int]: A tuple containing:
- A list of extracted AFIDs.
- The total count of AFIDs.
* `status_code` | Upon successful retrieval of data, status_code will be AMDSMI_STATUS_SUCCESS (0), or AMDSMI_STATUS_MORE_DATA (39) if more data can be retrieved by a subsequent call to the `amdsmi_get_gpu_cper_entries` function. In the latter case, the input parameter `cursor` should be set to the updated `cursor` that was returned from the previous call.
Exceptions that can be thrown by `amdsmi_get_gpu_cper_entries` function:
* `AmdSmiParameterException`
* `AmdSmiLibraryException` with these possible error codes:
AMDSMI_STATUS_INVAL
AMDSMI_STATUS_UNEXPECTED_SIZE
AMDSMI_STATUS_UNEXPECTED_DATA
AMDSMI_STATUS_NOT_SUPPORTED
* `AmdSmiParameterException`
Example:
```python
for device in devices:
entries, new_cursor, cper_data, status_code = amdsmi_get_gpu_cper_entries(device, severity_mask, buffer_size, initial_cursor)
print("CPER entries for device", device)
for key, entry in entries.items():
print("Entry", key)
print(" Error Severity:", entry.get("error_severity", "Unknown"))
print(" Notify Type:", entry.get("notify_type", "Unknown"))
print(" Timestamp:", entry.get("timestamp", ""))
print()
print("New Cursor Position:", new_cursor)
except AmdSmiException as e:
print(e)
from amdsmi import *
import os
amdsmi_init()
def get_severity_mask(severity):
    """Translate a severity name into the CPER severity bitmask.

    Bit 0 stands for AMDSMI_CPER_SEV_NON_FATAL_UNCORRECTED, bit 1 for
    AMDSMI_CPER_SEV_FATAL and bit 2 for AMDSMI_CPER_SEV_NON_FATAL_CORRECTED.

    Args:
        severity: One of "all", "fatal", "nonfatal", "nonfatal-uncorrected",
            "nonfatal-corrected" or "corrected".

    Returns:
        The integer bitmask for the name, or 0 when the name is not recognized.
    """
    nonfatal_uncorrected_bit = 1 << 0
    fatal_bit = 1 << 1
    nonfatal_corrected_bit = 1 << 2

    if severity == "all":
        return nonfatal_uncorrected_bit | fatal_bit | nonfatal_corrected_bit
    if severity == "fatal":
        return fatal_bit
    if severity in ("nonfatal", "nonfatal-uncorrected"):
        return nonfatal_uncorrected_bit
    if severity in ("nonfatal-corrected", "corrected"):
        return nonfatal_corrected_bit
    # Unknown names select no severity at all.
    return 0
def gpuid(device):
    """Return the enumeration index of `device` among the processor handles.

    Handles are compared by their `.value` attribute. Returns None when no
    handle matches.
    """
    handles = amdsmi_interface.amdsmi_get_processor_handles()
    matching_indices = (
        index
        for index, handle in enumerate(handles)
        if device.value == handle.value
    )
    return next(matching_indices, None)
def dump_cper_entry(entry, cper_data, key):
    """Write one CPER record and its metadata under /tmp/cper_dump.

    Two files are produced: `cper_entry_<key>.bin` with the raw record bytes
    and `cper_entry_<key>.json` with the entry metadata.

    Args:
        entry: Metadata dictionary of the CPER entry.
        cper_data: Mapping from entry key to a dict with "bytes" (a sequence
            of ints) and "size" (number of leading ints that are valid).
        key: Key of the entry inside `cper_data`; also used in the file names.

    Raises:
        KeyError: If `key`, "bytes" or "size" is missing from `cper_data`.
        OSError: If the directory or either file cannot be created.
    """
    import json  # local import: the surrounding example only imports os

    dump_dir = "/tmp/cper_dump"
    os.makedirs(dump_dir, mode=0o777, exist_ok=True)

    # Build the payload before opening the file so that a bad key does not
    # leave an empty .bin file behind.
    record = cper_data[key]
    # Each value is reduced modulo 256 so it fits in a single byte.
    payload = bytes(x % 256 for x in record["bytes"][:record["size"]])

    cper_file = f"{dump_dir}/cper_entry_{key}.bin"
    with open(cper_file, "wb") as file:
        file.write(payload)
    print(f" Wrote cper data to file: {cper_file}")

    def _to_jsonable(value):
        # Metadata fields such as the signature may be bytes, which the json
        # module cannot serialize directly.
        if isinstance(value, (bytes, bytearray)):
            return value.decode("utf-8", errors="replace")
        return str(value)

    json_file = f"{dump_dir}/cper_entry_{key}.json"
    with open(json_file, "wt", encoding="utf-8") as file:
        # Emit real JSON; str(entry) would write a Python repr instead.
        json.dump(entry, file, indent=2, default=_to_jsonable)
def get_gpu_cper_entries():
    """Print the CPER entries of every processor handle and dump each to disk.

    For each device the entries of "all" severities are fetched starting at
    cursor 0, summarized on stdout and handed to `dump_cper_entry`. Any
    AmdSmiException is printed rather than propagated.
    """
    buffer_size = 1024*100
    initial_cursor = 0
    severity = "all"
    try:
        for device in amdsmi_interface.amdsmi_get_processor_handles():
            # The updated cursor and status code are returned but not used
            # by this example.
            entries, _new_cursor, cper_data, _status_code = amdsmi_get_gpu_cper_entries(
                device, get_severity_mask(severity), buffer_size, initial_cursor)
            gpu_id = gpuid(device)
            print("###################")
            print(f"cper entries for '{severity}' severity on gpu #{gpu_id}:")
            for entry_key, entry_info in entries.items():
                print("----------------")
                print("Entry", entry_key)
                print(" Error Severity:", entry_info.get("error_severity", "Unknown"))
                print(" Notify Type:", entry_info.get("notify_type", "Unknown"))
                print(" Timestamp:", entry_info.get("timestamp", ""))
                print(f" Cper entry metadata: {entry_info}")
                dump_cper_entry(entry_info, cper_data, entry_key)
    except AmdSmiException as e:
        print(e)
get_gpu_cper_entries()
```
Output:
``` shell
###################
cper entries for 'all' severity on gpu #0:
###################
cper entries for 'all' severity on gpu #1:
###################
cper entries for 'all' severity on gpu #2:
###################
cper entries for 'all' severity on gpu #3:
###################
cper entries for 'all' severity on gpu #4:
###################
cper entries for 'all' severity on gpu #5:
###################
cper entries for 'all' severity on gpu #6:
###################
cper entries for 'all' severity on gpu #7:
----------------
Entry 0
Error Severity: non_fatal_corrected
Notify Type: CMC
Timestamp: 2025/08/13 20:07:56
Cper entry metadata: {'error_severity': 'non_fatal_corrected', 'notify_type': 'CMC', 'timestamp': '2025/08/13 20:07:56', 'signature': b'CPER', 'revision': 256, 'signature_end': '0xffffffff', 'sec_cnt': 1, 'record_length': 472, 'platform_id': b'0xcafe:0xbeef', 'creator_id': b'amdgpu', 'record_id': b'0:1', 'flags': 0, 'persistence_info': 0}
Wrote cper data to file: /tmp/cper_dump/cper_entry_0.bin
----------------
Entry 1
Error Severity: non_fatal_corrected
Notify Type: CMC
Timestamp: 2025/08/13 20:14:58
Cper entry metadata: {'error_severity': 'non_fatal_corrected', 'notify_type': 'CMC', 'timestamp': '2025/08/13 20:14:58', 'signature': b'CPER', 'revision': 256, 'signature_end': '0xffffffff', 'sec_cnt': 1, 'record_length': 472, 'platform_id': b'0xcafe:0xbeef', 'creator_id': b'amdgpu', 'record_id': b'0:2', 'flags': 0, 'persistence_info': 0}
Wrote cper data to file: /tmp/cper_dump/cper_entry_1.bin
```
### amdsmi_get_gpu_ras_feature_info
@@ -1389,16 +1517,35 @@ Exceptions that can be thrown by `amdsmi_get_gpu_ras_feature_info` function:
Example:
```python
try:
devices = amdsmi_get_processor_handles()
if len(devices) == 0:
print("No GPUs on machine")
else:
for device in devices:
ras_info = amdsmi_get_gpu_ras_feature_info(device)
print(ras_info)
except AmdSmiException as e:
print(e)
from amdsmi import *
import os
amdsmi_init()
def amdsmi_get_afids_from_cper():
    """Print the AFIDs of every `.bin` CPER dump found in /tmp/cper_dump/.

    Each regular file whose name ends in ".bin" is read as raw bytes and
    passed to `amdsmi_interface.amdsmi_get_afids_from_cper`; the returned
    AFID list is printed.

    Raises:
        FileNotFoundError: If the dump directory does not exist.
    """
    directory_path = "/tmp/cper_dump/"
    print(f"Searching for cper file in {directory_path}")
    with os.scandir(directory_path) as cper_files:
        for cper_file in cper_files:
            # Test the suffix of the file name: a substring test on the full
            # path would also accept names like "x.bin.bak" or any file
            # below a directory whose name contains ".bin".
            if not (cper_file.is_file() and cper_file.name.endswith(".bin")):
                continue
            print(f"Found {cper_file.path}")
            with open(cper_file.path, "rb") as file:
                raw = file.read()
            # The call returns the AFID list together with its count.
            afids, num_afids = amdsmi_interface.amdsmi_get_afids_from_cper(raw)
            print(f"afids: {afids}")
amdsmi_get_afids_from_cper()
```
Output:
```
sudo python3 afid.py
Searching for cper file in /tmp/cper_dump/
Found /tmp/cper_dump/cper_entry_0.bin
afids: [17]
Found /tmp/cper_dump/cper_entry_1.bin
afids: [17]
```
### amdsmi_get_gpu_ras_block_features_enabled