[SWDEV-538308] CPER CLI 20 limit bug (#499)

The bug was reproduced like this.

In terminal #1, run command:
sudo amd-smi ras --cper --gpu 6 --severity all --folder /tmp/cper_dump --follow 

In terminal #2, inject errors:
while true; do sudo amdgpuras -b 7 -s 1 -m 6 -t 2; sleep 2; done

Terminal #1 starts dumping the CPER entry information that it captures. After 20 entries have been captured, open terminal #3 and run the same command as in terminal #1:
sudo amd-smi ras --cper --gpu 6 --severity all --folder /tmp/cper_dump --follow 

Terminal #3 produces no output, even though terminal #1 continues capturing and printing entries.

The fix:

By the time the command is run in terminal #3, the GPU buffer already holds more than 20 CPER entries. That command starts reading from the beginning and provides room for only 20 entries, so the C++ API returns a status code (AMDSMI_STATUS_MORE_DATA) indicating that more data is available.

The Python CLI should not treat this as an error, but should continue to print what the API returned.

---------

Signed-off-by: Oosman Saeed <oossaeed@amd.com>
Этот коммит содержится в:
Saeed, Oosman
2025-07-07 11:11:13 -05:00
коммит произвёл GitHub
родитель f559075a81
Коммит 5b95d227bc
4 изменённых файлов: 25 добавлений и 18 удалений
+2 -2
Просмотреть файл
@@ -1374,7 +1374,7 @@ class AMDSMIHelpers():
while True:
try:
entries, new_cursor, cper_data = amdsmi_interface.amdsmi_get_gpu_cper_entries(
entries, new_cursor, cper_data, status_code = amdsmi_interface.amdsmi_get_gpu_cper_entries(
device_handle, severity_mask, buffer_size, args.cursor[gpu_idx])
logging.debug(f"cper_entries | entries: {entries}")
except amdsmi_exception.AmdSmiLibraryException as e:
@@ -1386,7 +1386,7 @@ class AMDSMIHelpers():
if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_FILE_ERROR:
raise FileExistsError('Error opening CPER file. Unable to read CPER File') from e
else:
logging.debug(f"Error retrieving CPER entries: {e}")
logging.debug(f"Cannot retrieve CPER entries: {e}")
break
args.cursor[gpu_idx] = new_cursor
if len(entries) == 0:
+9 -3
Просмотреть файл
@@ -1279,7 +1279,7 @@ Input parameters:
* `buffer_size` pointer to a variable that specifies the size of the cper_data
* `cursor` pointer to a variable that will contain the cursor for the next call
Output: Dictionary with fields, updated cursor, and a dictionary of the cper_data
Output: Dictionary with fields, updated cursor, a list of dictionaries with the cper_data, and the API status_code
Field | Description
---|---
@@ -1297,16 +1297,22 @@ Field | Description
`flags` | Reserved flags related to the CPER entry. |
`persistence_info` | Reserved information related to persistence. |
* `status_code` | Upon successful retrieval of data, status_code will be AMDSMI_STATUS_SUCCESS (0), or AMDSMI_STATUS_MORE_DATA (39) if more data can be retrieved by a subsequent call to the `amdsmi_get_gpu_cper_entries` function. In the latter case, the input parameter `cursor` should be set to the updated `cursor` that was returned from the previous call.
Exceptions that can be thrown by `amdsmi_get_gpu_cper_entries` function:
* `AmdSmiLibraryException`
* `AmdSmiLibraryException` with these possible error codes:
AMDSMI_STATUS_INVAL
AMDSMI_STATUS_UNEXPECTED_SIZE
AMDSMI_STATUS_UNEXPECTED_DATA
AMDSMI_STATUS_NOT_SUPPORTED
* `AmdSmiParameterException`
Example:
```python
for device in devices:
entries, new_cursor, cper_data = amdsmi_get_gpu_cper_entries(device, severity_mask, buffer_size, initial_cursor)
entries, new_cursor, cper_data, status_code = amdsmi_get_gpu_cper_entries(device, severity_mask, buffer_size, initial_cursor)
print("CPER entries for device", device)
for key, entry in entries.items():
print("Entry", key)
+9 -8
Просмотреть файл
@@ -2435,7 +2435,7 @@ def amdsmi_get_gpu_cper_entries(
severity_mask: int,
buffer_size: int = 4 * 1048576,
cursor: int = 0
) -> Tuple[Dict[str, Any], int, List[Dict[str, Any]]]:
) -> Tuple[Dict[str, Any], int, List[Dict[str, Any]], int]:
if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle):
raise AmdSmiParameterException(
@@ -2445,15 +2445,16 @@ def amdsmi_get_gpu_cper_entries(
# Allocate a buffer for CPER data.
buf = ctypes.create_string_buffer(buffer_size)
buf_size = ctypes.c_uint64(buffer_size)
entry_count = ctypes.c_uint64(20)
num_cper_hdrs = 20
entry_count = ctypes.c_uint64(num_cper_hdrs)
cur = ctypes.c_uint64(cursor)
# Allocate a pointer for the CPER header array.
cper_hdrs_array = (POINTER(amdsmi_wrapper.amdsmi_cper_hdr_t) * 20)()
cper_hdrs = ctypes.cast(cper_hdrs_array, POINTER(POINTER(amdsmi_wrapper.amdsmi_cper_hdr_t)))
cper_hdrs_array = (ctypes.POINTER(amdsmi_wrapper.amdsmi_cper_hdr_t) * num_cper_hdrs)()
cper_hdrs = ctypes.cast(cper_hdrs_array, ctypes.POINTER(ctypes.POINTER(amdsmi_wrapper.amdsmi_cper_hdr_t)))
# Call the underlying AMD-SMI API.
ret = amdsmi_wrapper.amdsmi_get_gpu_cper_entries(
status_code = amdsmi_wrapper.amdsmi_get_gpu_cper_entries(
processor_handle,
ctypes.c_uint32(severity_mask),
buf,
@@ -2462,8 +2463,8 @@ def amdsmi_get_gpu_cper_entries(
ctypes.byref(entry_count),
ctypes.byref(cur)
)
if ret != amdsmi_wrapper.AMDSMI_STATUS_SUCCESS:
raise AmdSmiLibraryException(ret)
if status_code not in {amdsmi_wrapper.AMDSMI_STATUS_SUCCESS, amdsmi_wrapper.AMDSMI_STATUS_MORE_DATA}:
raise AmdSmiLibraryException(status_code)
entries = {}
cper_data = []
@@ -2518,7 +2519,7 @@ def amdsmi_get_gpu_cper_entries(
entries[i] = cper_entry.copy()
offset += entry_ptr.contents.record_length # Use the actual record length to advance the offset.
return entries, cur.value, cper_data
return entries, cur.value, cper_data, status_code
def amdsmi_get_afids_from_cper(
+5 -5
Просмотреть файл
@@ -420,7 +420,7 @@ amdsmi_status_t amdsmi_get_gpu_cper_entries_by_path(
if(((1 << header->error_severity) & severity_mask) !=
static_cast<uint32_t>(1 << header->error_severity)) {
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] cper header rejected with severity: 0x"
<< std::hex << (1 << header->error_severity) << ", given severity_mask: 0x"
<< std::hex << (header->error_severity) << ", given severity_mask: 0x"
<< std::hex << severity_mask << ", record_length:"
<< std::dec << header->record_length;
LOG_DEBUG(ss);
@@ -428,7 +428,7 @@ amdsmi_status_t amdsmi_get_gpu_cper_entries_by_path(
}
else {
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] cper header accepted with severity: 0x"
<< std::hex << (1 << header->error_severity) << ", given severity_mask: 0x"
<< std::hex << (header->error_severity) << ", given severity_mask: 0x"
<< std::hex << severity_mask << ", record_length:"
<< std::dec << header->record_length;
LOG_DEBUG(ss);
@@ -468,9 +468,9 @@ amdsmi_status_t amdsmi_get_gpu_cper_entries_by_path(
*buf_size = data_idx;
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__
<< "[CPER] *entry_count: " << entry_count
<< ", *cursor: " << cursor
<< ", *buf_size: " << buf_size;
<< "[CPER] *entry_count: " << *entry_count
<< ", *cursor: " << *cursor
<< ", *buf_size: " << *buf_size;
LOG_DEBUG(ss);
return AMDSMI_STATUS_SUCCESS;