[SWDEV-546239] amd-smi ras cper - no data created (#614)
* Update amd-smi doc with examples of CPER and AFID API usage. --------- Signed-off-by: Oosman Saeed <oossaeed@amd.com> Signed-off-by: Maisam Arif <Maisam.Arif@amd.com>
This commit is contained in:
committed by
GitHub
orang tua
e6af86f44a
melakukan
fd5e37a07e
@@ -1178,7 +1178,7 @@ class AMDSMIHelpers():
|
||||
if not getattr(self, "_cper_display_initialized", False):
|
||||
# Warning if no folder was specified elsewhere
|
||||
if not getattr(self, "_cper_warning_printed", False):
|
||||
print(f"WARNING:No cper files will be dumped unless --folder=<folder_name> is specified.")
|
||||
print(f"WARNING:No cper files will be dumped unless --folder=<folder_name> is specified and cper entries exist.")
|
||||
self._cper_warning_printed = True
|
||||
|
||||
self._print_header(folder)
|
||||
@@ -1442,11 +1442,13 @@ class AMDSMIHelpers():
|
||||
logger.set_cper_exit_message(False)
|
||||
self.stop = False
|
||||
|
||||
num_entries = 0
|
||||
while True:
|
||||
try:
|
||||
entries, new_cursor, cper_data, status_code = amdsmi_interface.amdsmi_get_gpu_cper_entries(
|
||||
device_handle, severity_mask, buffer_size, args.cursor[gpu_idx])
|
||||
logging.debug(f"cper_entries | entries: {entries}")
|
||||
num_entries = num_entries + len(entries)
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM:
|
||||
raise PermissionError('Error opening CPER file. This command requires elevation') from e
|
||||
@@ -1466,6 +1468,11 @@ class AMDSMIHelpers():
|
||||
break
|
||||
else:
|
||||
self.display_cper_files_generated(entries, device_handle, args.folder)
|
||||
if num_entries == 0 and not args.follow:
|
||||
if args.folder:
|
||||
self.dump_cper_entries(args.folder, entries, cper_data, device_handle, args.file_limit)
|
||||
else:
|
||||
self.display_cper_files_generated(entries, device_handle, args.folder)
|
||||
|
||||
def get_bitmask_ranges(self, bitmask_dict):
|
||||
ranges = {}
|
||||
|
||||
@@ -1260,11 +1260,20 @@ Description: Dump CPER entries for a given GPU in a file using from CPER header
|
||||
Input parameters:
|
||||
|
||||
* `processor_handle` device which to query
|
||||
* `severity_mask` the severity mask of the entries to be retrieved
|
||||
* `buffer_size` pointer to a variable that specifies the size of the cper_data
|
||||
* `cursor` pointer to a variable that will contain the cursor for the next call
|
||||
* `severity_mask` the severity mask of the entries to be retrieved:
|
||||
1:'nonfatal-uncorrected',
|
||||
2: 'fatal',
|
||||
4: 'nonfatal-corrected', 'corrected',
|
||||
7: 'all'
|
||||
* `buffer_size` number of bytes that will be used to create a buffer for copying cper entries into; default is 1048576 bytes
|
||||
* `cursor` the zero based index at which to start retrieving cper entries; default value is 0; for example, if there are 10 cper entries available, then with a cursor value of 8, it will retrieve the last two cper entries only
|
||||
|
||||
Output: Dictionary with fields, updated cursor, and a dictionary of the cper_data
|
||||
Output: Dictionary with fields, updated cursor, and a dictionary of the cper_data, status_code
|
||||
status_code:
|
||||
AMDSMI_STATUS_SUCCESS: If all entries were retrieved successfully
|
||||
AMDSMI_STATUS_MORE_DATA: If some of the entries were retrieved and:
|
||||
* A subsequent call to the API with the updated cursor will result in the fetching the next batch of entries, or
|
||||
* Increasing the input buffer_size will allow more entries to be fetched with the same cursor
|
||||
|
||||
Field | Description
|
||||
---|---
|
||||
@@ -1290,75 +1299,194 @@ Exceptions that can be thrown by `amdsmi_get_gpu_cper_entries` function:
|
||||
Example:
|
||||
|
||||
```python
|
||||
for device in devices:
|
||||
entries, new_cursor, cper_data = amdsmi_get_gpu_cper_entries(device, severity_mask, buffer_size, initial_cursor)
|
||||
print("CPER entries for device", device)
|
||||
for key, entry in entries.items():
|
||||
print("Entry", key)
|
||||
print(" Error Severity:", entry.get("error_severity", "Unknown"))
|
||||
print(" Notify Type:", entry.get("notify_type", "Unknown"))
|
||||
print(" Timestamp:", entry.get("timestamp", ""))
|
||||
print()
|
||||
print("New Cursor Position:", new_cursor)
|
||||
from amdsmi import *
|
||||
|
||||
amdsmi_init()
|
||||
|
||||
def get_severity_mask(severity):
|
||||
severity_mask = 0
|
||||
if severity == "all":
|
||||
# Set bits for NON_FATAL_UNCORRECTED (0), FATAL (1), and NON_FATAL_CORRECTED (2)
|
||||
severity_mask |= ((1 << 0) | (1 << 1) | (1 << 2))
|
||||
elif severity == "fatal":
|
||||
# Set bit corresponding to AMDSMI_CPER_SEV_FATAL (which is 1)
|
||||
severity_mask |= (1 << 1)
|
||||
elif severity in ("nonfatal", "nonfatal-uncorrected"):
|
||||
# Set bit corresponding to AMDSMI_CPER_SEV_NON_FATAL_UNCORRECTED (which is 0)
|
||||
severity_mask |= (1 << 0)
|
||||
elif severity in ("nonfatal-corrected", "corrected"):
|
||||
# Set bit corresponding to AMDSMI_CPER_SEV_NON_FATAL_CORRECTED (which is 2)
|
||||
severity_mask |= (1 << 2)
|
||||
return severity_mask
|
||||
|
||||
def gpuid(device):
|
||||
for gpu_index, device_handle in enumerate(amdsmi_interface.amdsmi_get_processor_handles()):
|
||||
if device.value == device_handle.value:
|
||||
return gpu_index
|
||||
|
||||
try:
|
||||
devices = amdsmi_interface.amdsmi_get_processor_handles()
|
||||
buffer_size = 1024*100
|
||||
initial_cursor = 0
|
||||
severity = "all"
|
||||
for device in devices:
|
||||
entries, new_cursor, cper_data, status_code = amdsmi_get_gpu_cper_entries(
|
||||
device, get_severity_mask(severity), buffer_size, initial_cursor)
|
||||
gpu_id = gpuid(device)
|
||||
print(f"cper entries for '{severity}' severity on gpu #{gpu_id}:")
|
||||
for key, entry in entries.items():
|
||||
print("Entry", key)
|
||||
print(" Error Severity:", entry.get("error_severity", "Unknown"))
|
||||
print(" Notify Type:", entry.get("notify_type", "Unknown"))
|
||||
print(" Timestamp:", entry.get("timestamp", ""))
|
||||
except AmdSmiException as e:
|
||||
print(e)
|
||||
```
|
||||
|
||||
Output:
|
||||
|
||||
```shell
|
||||
cper entries for 'nonfatal-corrected' severity on gpu #0:
|
||||
cper entries for 'nonfatal-corrected' severity on gpu #1:
|
||||
Entry 0
|
||||
Error Severity: non_fatal_corrected
|
||||
Notify Type: CMC
|
||||
Timestamp: 2025/08/13 19:28:31
|
||||
Entry 1
|
||||
Error Severity: non_fatal_corrected
|
||||
Notify Type: CMC
|
||||
Timestamp: 2025/08/13 19:36:38
|
||||
```
|
||||
|
||||
### amdsmi_get_afids_from_cper
|
||||
|
||||
Description: Get the AFIDs from CPER buffer
|
||||
|
||||
Input parameters:
|
||||
|
||||
* `processor_handle` device which to query
|
||||
* `severity_mask` the severity mask of the entries to be retrieved
|
||||
* `buffer_size` pointer to a variable that specifies the size of the cper_data
|
||||
* `cursor` pointer to a variable that will contain the cursor for the next call
|
||||
* `cper_afid_data`: Either
|
||||
- raw bytes or bytearray of a single CPER record, or
|
||||
- a list of dicts each with keys "bytes" (List[int]) and "size" (int).
|
||||
|
||||
Output: Dictionary with fields, updated cursor, a dictionary of the cper_data, and API status_code
|
||||
|
||||
Field | Description
|
||||
---|---
|
||||
`error_severity` | The severity of the CPER error ex: `non_fatal_uncorrected`, `fatal`, `non_fatal_corrected`. |
|
||||
`notify_type` | The notification type associated with the CPER entry. |
|
||||
`timestamp` | The time when the CPER entry was recorded, formatted as `YYYY/MM/DD HH:MM:SS`. |
|
||||
`signature` | A 4-byte signature identifying the entry, typically `CPER`. |
|
||||
`revision` | The revision number of the CPER record format. |
|
||||
`signature_end` | A marker value (typically `0xFFFFFFFF`) confirming the integrity of the signature. |
|
||||
`sec_cnt` | The count of sections included in the CPER entry. |
|
||||
`record_length` | The total length in bytes of the CPER entry. |
|
||||
`platform_id` | A character array identifying the GPU or platform. |
|
||||
`creator_id` | A character array indicating the creator of the CPER entry. |
|
||||
`record_id` | A unique identifier for the CPER entry. |
|
||||
`flags` | Reserved flags related to the CPER entry. |
|
||||
`persistence_info` | Reserved information related to persistence. |
|
||||
Output: Tuple[List[int], int]: A tuple containing:
|
||||
- A list of extracted AFIDs.
|
||||
- The total count of AFIDs.
|
||||
|
||||
* `status_code` | Upon successful retrieval of data, status_code will be AMDSMI_STATUS_SUCCESS (0) or AMDSMI_STATUS_MORE_DATA (39) if more data can be retrieve by subsequent call to the `amdsmi_get_gpu_cper_entries` function. In the later case, the input parameter `cursor` should be set to the updated `cursor` that was returned from the previous call.
|
||||
|
||||
Exceptions that can be thrown by `amdsmi_get_gpu_cper_entries` function:
|
||||
|
||||
* `AmdSmiParameterException`
|
||||
* `AmdSmiLibraryException` with these possible error codes:
|
||||
AMDSMI_STATUS_INVAL
|
||||
AMDSMI_STATUS_UNEXPECTED_SIZE
|
||||
AMDSMI_STATUS_UNEXPECTED_DATA
|
||||
AMDSMI_STATUS_NOT_SUPPORTED
|
||||
* `AmdSmiParameterException`
|
||||
|
||||
Example:
|
||||
|
||||
```python
|
||||
for device in devices:
|
||||
entries, new_cursor, cper_data, status_code = amdsmi_get_gpu_cper_entries(device, severity_mask, buffer_size, initial_cursor)
|
||||
print("CPER entries for device", device)
|
||||
for key, entry in entries.items():
|
||||
print("Entry", key)
|
||||
print(" Error Severity:", entry.get("error_severity", "Unknown"))
|
||||
print(" Notify Type:", entry.get("notify_type", "Unknown"))
|
||||
print(" Timestamp:", entry.get("timestamp", ""))
|
||||
print()
|
||||
print("New Cursor Position:", new_cursor)
|
||||
except AmdSmiException as e:
|
||||
print(e)
|
||||
from amdsmi import *
|
||||
import os
|
||||
|
||||
amdsmi_init()
|
||||
|
||||
def get_severity_mask(severity):
|
||||
severity_mask = 0
|
||||
if severity == "all":
|
||||
# Set bits for NON_FATAL_UNCORRECTED (0), FATAL (1), and NON_FATAL_CORRECTED (2)
|
||||
severity_mask |= ((1 << 0) | (1 << 1) | (1 << 2))
|
||||
elif severity == "fatal":
|
||||
# Set bit corresponding to AMDSMI_CPER_SEV_FATAL (which is 1)
|
||||
severity_mask |= (1 << 1)
|
||||
elif severity in ("nonfatal", "nonfatal-uncorrected"):
|
||||
# Set bit corresponding to AMDSMI_CPER_SEV_NON_FATAL_UNCORRECTED (which is 0)
|
||||
severity_mask |= (1 << 0)
|
||||
elif severity in ("nonfatal-corrected", "corrected"):
|
||||
# Set bit corresponding to AMDSMI_CPER_SEV_NON_FATAL_CORRECTED (which is 2)
|
||||
severity_mask |= (1 << 2)
|
||||
return severity_mask
|
||||
|
||||
def gpuid(device):
|
||||
for gpu_index, device_handle in enumerate(amdsmi_interface.amdsmi_get_processor_handles()):
|
||||
if device.value == device_handle.value:
|
||||
return gpu_index
|
||||
|
||||
def dump_cper_entry(entry, cper_data, key):
|
||||
try:
|
||||
os.mkdir("/tmp/cper_dump", mode=0o777, dir_fd=None)
|
||||
except FileExistsError:
|
||||
pass
|
||||
cper_file = f"/tmp/cper_dump/cper_entry_{key}.bin"
|
||||
with open(cper_file, "wb") as file:
|
||||
size = cper_data[key]["size"]
|
||||
data = cper_data[key]["bytes"]
|
||||
data = bytes(x % 256 for x in data[:size])
|
||||
file.write(data)
|
||||
print(f" Wrote cper data to file: {cper_file}")
|
||||
json_file = f"/tmp/cper_dump/cper_entry_{key}.json"
|
||||
with open(json_file, "wt") as file:
|
||||
file.write(str(entry))
|
||||
|
||||
def get_gpu_cper_entries():
|
||||
try:
|
||||
devices = amdsmi_interface.amdsmi_get_processor_handles()
|
||||
buffer_size = 1024*100
|
||||
initial_cursor = 0
|
||||
severity = "all"
|
||||
for device in devices:
|
||||
entries, new_cursor, cper_data, status_code = amdsmi_get_gpu_cper_entries(
|
||||
device, get_severity_mask(severity), buffer_size, initial_cursor)
|
||||
gpu_id = gpuid(device)
|
||||
print("###################")
|
||||
print(f"cper entries for '{severity}' severity on gpu #{gpu_id}:")
|
||||
for key, entry in entries.items():
|
||||
print("----------------")
|
||||
print("Entry", key)
|
||||
print(" Error Severity:", entry.get("error_severity", "Unknown"))
|
||||
print(" Notify Type:", entry.get("notify_type", "Unknown"))
|
||||
print(" Timestamp:", entry.get("timestamp", ""))
|
||||
print(f" Cper entry metadata: {entry}")
|
||||
dump_cper_entry(entry, cper_data, key)
|
||||
except AmdSmiException as e:
|
||||
print(e)
|
||||
|
||||
get_gpu_cper_entries()
|
||||
```
|
||||
|
||||
Output:
|
||||
|
||||
``` shell
|
||||
###################
|
||||
cper entries for 'all' severity on gpu #0:
|
||||
###################
|
||||
cper entries for 'all' severity on gpu #1:
|
||||
###################
|
||||
cper entries for 'all' severity on gpu #2:
|
||||
###################
|
||||
cper entries for 'all' severity on gpu #3:
|
||||
###################
|
||||
cper entries for 'all' severity on gpu #4:
|
||||
###################
|
||||
cper entries for 'all' severity on gpu #5:
|
||||
###################
|
||||
cper entries for 'all' severity on gpu #6:
|
||||
###################
|
||||
cper entries for 'all' severity on gpu #7:
|
||||
----------------
|
||||
Entry 0
|
||||
Error Severity: non_fatal_corrected
|
||||
Notify Type: CMC
|
||||
Timestamp: 2025/08/13 20:07:56
|
||||
Cper entry metadata: {'error_severity': 'non_fatal_corrected', 'notify_type': 'CMC', 'timestamp': '2025/08/13 20:07:56', 'signature': b'CPER', 'revision': 256, 'signature_end': '0xffffffff', 'sec_cnt': 1, 'record_length': 472, 'platform_id': b'0xcafe:0xbeef', 'creator_id': b'amdgpu', 'record_id': b'0:1', 'flags': 0, 'persistence_info': 0}
|
||||
Wrote cper data to file: /tmp/cper_dump/cper_entry_0.bin
|
||||
----------------
|
||||
Entry 1
|
||||
Error Severity: non_fatal_corrected
|
||||
Notify Type: CMC
|
||||
Timestamp: 2025/08/13 20:14:58
|
||||
Cper entry metadata: {'error_severity': 'non_fatal_corrected', 'notify_type': 'CMC', 'timestamp': '2025/08/13 20:14:58', 'signature': b'CPER', 'revision': 256, 'signature_end': '0xffffffff', 'sec_cnt': 1, 'record_length': 472, 'platform_id': b'0xcafe:0xbeef', 'creator_id': b'amdgpu', 'record_id': b'0:2', 'flags': 0, 'persistence_info': 0}
|
||||
Wrote cper data to file: /tmp/cper_dump/cper_entry_1.bin
|
||||
```
|
||||
|
||||
### amdsmi_get_gpu_ras_feature_info
|
||||
@@ -1389,16 +1517,35 @@ Exceptions that can be thrown by `amdsmi_get_gpu_ras_feature_info` function:
|
||||
Example:
|
||||
|
||||
```python
|
||||
try:
|
||||
devices = amdsmi_get_processor_handles()
|
||||
if len(devices) == 0:
|
||||
print("No GPUs on machine")
|
||||
else:
|
||||
for device in devices:
|
||||
ras_info = amdsmi_get_gpu_ras_feature_info(device)
|
||||
print(ras_info)
|
||||
except AmdSmiException as e:
|
||||
print(e)
|
||||
from amdsmi import *
|
||||
import os
|
||||
|
||||
amdsmi_init()
|
||||
|
||||
def amdsmi_get_afids_from_cper():
|
||||
directory_path = "/tmp/cper_dump/"
|
||||
print(f"Searching for cper file in {directory_path}")
|
||||
with os.scandir(directory_path) as cper_files:
|
||||
for cper_file in cper_files:
|
||||
if cper_file.is_file():
|
||||
if ".bin" in cper_file.path:
|
||||
print(f"Found {cper_file.path}")
|
||||
with open(cper_file.path, "rb") as file:
|
||||
raw = file.read()
|
||||
afids, num_afids = amdsmi_interface.amdsmi_get_afids_from_cper(raw)
|
||||
print(f"afids: {afids}")
|
||||
|
||||
amdsmi_get_afids_from_cper()
|
||||
|
||||
```
|
||||
Output:
|
||||
```
|
||||
sudo python3 afid.py
|
||||
Searching for cper file in /tmp/cper_dump/
|
||||
Found /tmp/cper_dump/cper_entry_0.bin
|
||||
afids: [17]
|
||||
Found /tmp/cper_dump/cper_entry_1.bin
|
||||
afids: [17]
|
||||
```
|
||||
|
||||
### amdsmi_get_gpu_ras_block_features_enabled
|
||||
|
||||
Reference in New Issue
Block a user