[SWDEV-538308] CPER CLI 20 limit bug (#499)
The bug was reproduced like this. In terminal #1, run the command: sudo amd-smi ras --cper --gpu 6 --severity all --folder /tmp/cper_dump --follow In terminal #2, inject errors: while true; do sudo amdgpuras -b 7 -s 1 -m 6 -t 2; sleep 2; done Terminal #1 starts dumping the CPER entry information that it captures. After 20 entries have been captured, open terminal #3 and run the same command as in terminal #1: sudo amd-smi ras --cper --gpu 6 --severity all --folder /tmp/cper_dump --follow In terminal #3, there will be no output, even while terminal #1 continues capturing and printing information. The fix: Since more than 20 CPER entries are already available in the GPU buffer, when we run the command from terminal #3 to start capturing from the beginning and pass 20 buffers to copy entries to, the C++ API returns a code saying there is more data available. The Python CLI should not treat this as an error, but should continue to print what the API returned. --------- Signed-off-by: Oosman Saeed <oossaeed@amd.com>
Этот коммит содержится в:
коммит произвёл
GitHub
родитель
f559075a81
Коммит
5b95d227bc
@@ -1374,7 +1374,7 @@ class AMDSMIHelpers():
|
||||
|
||||
while True:
|
||||
try:
|
||||
entries, new_cursor, cper_data = amdsmi_interface.amdsmi_get_gpu_cper_entries(
|
||||
entries, new_cursor, cper_data, status_code = amdsmi_interface.amdsmi_get_gpu_cper_entries(
|
||||
device_handle, severity_mask, buffer_size, args.cursor[gpu_idx])
|
||||
logging.debug(f"cper_entries | entries: {entries}")
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
@@ -1386,7 +1386,7 @@ class AMDSMIHelpers():
|
||||
if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_FILE_ERROR:
|
||||
raise FileExistsError('Error opening CPER file. Unable to read CPER File') from e
|
||||
else:
|
||||
logging.debug(f"Error retrieving CPER entries: {e}")
|
||||
logging.debug(f"Cannot retrieve CPER entries: {e}")
|
||||
break
|
||||
args.cursor[gpu_idx] = new_cursor
|
||||
if len(entries) == 0:
|
||||
|
||||
@@ -1279,7 +1279,7 @@ Input parameters:
|
||||
* `buffer_size` pointer to a variable that specifies the size of the cper_data
|
||||
* `cursor` pointer to a variable that will contain the cursor for the next call
|
||||
|
||||
Output: Dictionary with fields, updated cursor, and a dictionary of the cper_data
|
||||
Output: Dictionary with fields, updated cursor, a list of dictionaries of the cper_data, and API status_code
|
||||
|
||||
Field | Description
|
||||
---|---
|
||||
@@ -1297,16 +1297,22 @@ Field | Description
|
||||
`flags` | Reserved flags related to the CPER entry. |
|
||||
`persistence_info` | Reserved information related to persistence. |
|
||||
|
||||
* `status_code` | Upon successful retrieval of data, status_code will be AMDSMI_STATUS_SUCCESS (0) or AMDSMI_STATUS_MORE_DATA (39) if more data can be retrieved by a subsequent call to the `amdsmi_get_gpu_cper_entries` function. In the latter case, the input parameter `cursor` should be set to the updated `cursor` that was returned from the previous call.
|
||||
|
||||
Exceptions that can be thrown by `amdsmi_get_gpu_cper_entries` function:
|
||||
|
||||
* `AmdSmiLibraryException`
|
||||
* `AmdSmiLibraryException` with these possible error codes:
|
||||
AMDSMI_STATUS_INVAL
|
||||
AMDSMI_STATUS_UNEXPECTED_SIZE
|
||||
AMDSMI_STATUS_UNEXPECTED_DATA
|
||||
AMDSMI_STATUS_NOT_SUPPORTED
|
||||
* `AmdSmiParameterException`
|
||||
|
||||
Example:
|
||||
|
||||
```python
|
||||
for device in devices:
|
||||
entries, new_cursor, cper_data = amdsmi_get_gpu_cper_entries(device, severity_mask, buffer_size, initial_cursor)
|
||||
entries, new_cursor, cper_data, status_code = amdsmi_get_gpu_cper_entries(device, severity_mask, buffer_size, initial_cursor)
|
||||
print("CPER entries for device", device)
|
||||
for key, entry in entries.items():
|
||||
print("Entry", key)
|
||||
|
||||
@@ -2435,7 +2435,7 @@ def amdsmi_get_gpu_cper_entries(
|
||||
severity_mask: int,
|
||||
buffer_size: int = 4 * 1048576,
|
||||
cursor: int = 0
|
||||
) -> Tuple[Dict[str, Any], int, List[Dict[str, Any]]]:
|
||||
) -> Tuple[Dict[str, Any], int, List[Dict[str, Any]], int]:
|
||||
|
||||
if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle):
|
||||
raise AmdSmiParameterException(
|
||||
@@ -2445,15 +2445,16 @@ def amdsmi_get_gpu_cper_entries(
|
||||
# Allocate a buffer for CPER data.
|
||||
buf = ctypes.create_string_buffer(buffer_size)
|
||||
buf_size = ctypes.c_uint64(buffer_size)
|
||||
entry_count = ctypes.c_uint64(20)
|
||||
num_cper_hdrs = 20
|
||||
entry_count = ctypes.c_uint64(num_cper_hdrs)
|
||||
cur = ctypes.c_uint64(cursor)
|
||||
|
||||
# Allocate a pointer for the CPER header array.
|
||||
cper_hdrs_array = (POINTER(amdsmi_wrapper.amdsmi_cper_hdr_t) * 20)()
|
||||
cper_hdrs = ctypes.cast(cper_hdrs_array, POINTER(POINTER(amdsmi_wrapper.amdsmi_cper_hdr_t)))
|
||||
cper_hdrs_array = (ctypes.POINTER(amdsmi_wrapper.amdsmi_cper_hdr_t) * num_cper_hdrs)()
|
||||
cper_hdrs = ctypes.cast(cper_hdrs_array, ctypes.POINTER(ctypes.POINTER(amdsmi_wrapper.amdsmi_cper_hdr_t)))
|
||||
|
||||
# Call the underlying AMD-SMI API.
|
||||
ret = amdsmi_wrapper.amdsmi_get_gpu_cper_entries(
|
||||
status_code = amdsmi_wrapper.amdsmi_get_gpu_cper_entries(
|
||||
processor_handle,
|
||||
ctypes.c_uint32(severity_mask),
|
||||
buf,
|
||||
@@ -2462,8 +2463,8 @@ def amdsmi_get_gpu_cper_entries(
|
||||
ctypes.byref(entry_count),
|
||||
ctypes.byref(cur)
|
||||
)
|
||||
if ret != amdsmi_wrapper.AMDSMI_STATUS_SUCCESS:
|
||||
raise AmdSmiLibraryException(ret)
|
||||
if status_code not in {amdsmi_wrapper.AMDSMI_STATUS_SUCCESS, amdsmi_wrapper.AMDSMI_STATUS_MORE_DATA}:
|
||||
raise AmdSmiLibraryException(status_code)
|
||||
|
||||
entries = {}
|
||||
cper_data = []
|
||||
@@ -2518,7 +2519,7 @@ def amdsmi_get_gpu_cper_entries(
|
||||
entries[i] = cper_entry.copy()
|
||||
offset += entry_ptr.contents.record_length # Use the actual record length to advance the offset.
|
||||
|
||||
return entries, cur.value, cper_data
|
||||
return entries, cur.value, cper_data, status_code
|
||||
|
||||
|
||||
def amdsmi_get_afids_from_cper(
|
||||
|
||||
@@ -420,7 +420,7 @@ amdsmi_status_t amdsmi_get_gpu_cper_entries_by_path(
|
||||
if(((1 << header->error_severity) & severity_mask) !=
|
||||
static_cast<uint32_t>(1 << header->error_severity)) {
|
||||
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] cper header rejected with severity: 0x"
|
||||
<< std::hex << (1 << header->error_severity) << ", given severity_mask: 0x"
|
||||
<< std::hex << (header->error_severity) << ", given severity_mask: 0x"
|
||||
<< std::hex << severity_mask << ", record_length:"
|
||||
<< std::dec << header->record_length;
|
||||
LOG_DEBUG(ss);
|
||||
@@ -428,7 +428,7 @@ amdsmi_status_t amdsmi_get_gpu_cper_entries_by_path(
|
||||
}
|
||||
else {
|
||||
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] cper header accepted with severity: 0x"
|
||||
<< std::hex << (1 << header->error_severity) << ", given severity_mask: 0x"
|
||||
<< std::hex << (header->error_severity) << ", given severity_mask: 0x"
|
||||
<< std::hex << severity_mask << ", record_length:"
|
||||
<< std::dec << header->record_length;
|
||||
LOG_DEBUG(ss);
|
||||
@@ -468,9 +468,9 @@ amdsmi_status_t amdsmi_get_gpu_cper_entries_by_path(
|
||||
*buf_size = data_idx;
|
||||
|
||||
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__
|
||||
<< "[CPER] *entry_count: " << entry_count
|
||||
<< ", *cursor: " << cursor
|
||||
<< ", *buf_size: " << buf_size;
|
||||
<< "[CPER] *entry_count: " << *entry_count
|
||||
<< ", *cursor: " << *cursor
|
||||
<< ", *buf_size: " << *buf_size;
|
||||
|
||||
LOG_DEBUG(ss);
|
||||
return AMDSMI_STATUS_SUCCESS;
|
||||
|
||||
Ссылка в новой задаче
Block a user