diff --git a/amdsmi_cli/amdsmi_helpers.py b/amdsmi_cli/amdsmi_helpers.py index b98638b507..39495b4c3b 100755 --- a/amdsmi_cli/amdsmi_helpers.py +++ b/amdsmi_cli/amdsmi_helpers.py @@ -1374,7 +1374,7 @@ class AMDSMIHelpers(): while True: try: - entries, new_cursor, cper_data = amdsmi_interface.amdsmi_get_gpu_cper_entries( + entries, new_cursor, cper_data, status_code = amdsmi_interface.amdsmi_get_gpu_cper_entries( device_handle, severity_mask, buffer_size, args.cursor[gpu_idx]) logging.debug(f"cper_entries | entries: {entries}") except amdsmi_exception.AmdSmiLibraryException as e: @@ -1386,7 +1386,7 @@ class AMDSMIHelpers(): if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_FILE_ERROR: raise FileExistsError('Error opening CPER file. Unable to read CPER File') from e else: - logging.debug(f"Error retrieving CPER entries: {e}") + logging.debug(f"Cannot retrieve CPER entries: {e}") break args.cursor[gpu_idx] = new_cursor if len(entries) == 0: diff --git a/docs/reference/amdsmi-py-api.md b/docs/reference/amdsmi-py-api.md index 476d50076a..a993ba266c 100644 --- a/docs/reference/amdsmi-py-api.md +++ b/docs/reference/amdsmi-py-api.md @@ -1279,7 +1279,7 @@ Input parameters: * `buffer_size` pointer to a variable that specifies the size of the cper_data * `cursor` pointer to a variable that will contain the cursor for the next call -Output: Dictionary with fields, updated cursor, and a dictionary of the cper_data +Output: Dictionary with fields, updated cursor, a dictionary of the cper_data, and API status_code Field | Description ---|--- @@ -1297,16 +1297,22 @@ Field | Description `flags` | Reserved flags related to the CPER entry. | `persistence_info` | Reserved information related to persistence. | +* `status_code` | Upon successful retrieval of data, status_code will be AMDSMI_STATUS_SUCCESS (0), or AMDSMI_STATUS_MORE_DATA (39) if more data can be retrieved by a subsequent call to the `amdsmi_get_gpu_cper_entries` function. 
In the latter case, the input parameter `cursor` should be set to the updated `cursor` that was returned from the previous call. + Exceptions that can be thrown by `amdsmi_get_gpu_cper_entries` function: -* `AmdSmiLibraryException` +* `AmdSmiLibraryException` with these possible error codes: + AMDSMI_STATUS_INVAL + AMDSMI_STATUS_UNEXPECTED_SIZE + AMDSMI_STATUS_UNEXPECTED_DATA + AMDSMI_STATUS_NOT_SUPPORTED * `AmdSmiParameterException` Example: ```python for device in devices: - entries, new_cursor, cper_data = amdsmi_get_gpu_cper_entries(device, severity_mask, buffer_size, initial_cursor) + entries, new_cursor, cper_data, status_code = amdsmi_get_gpu_cper_entries(device, severity_mask, buffer_size, initial_cursor) print("CPER entries for device", device) for key, entry in entries.items(): print("Entry", key) diff --git a/py-interface/amdsmi_interface.py b/py-interface/amdsmi_interface.py index 8fc49f0e94..9619706375 100644 --- a/py-interface/amdsmi_interface.py +++ b/py-interface/amdsmi_interface.py @@ -2435,7 +2435,7 @@ def amdsmi_get_gpu_cper_entries( severity_mask: int, buffer_size: int = 4 * 1048576, cursor: int = 0 -) -> Tuple[Dict[str, Any], int, List[Dict[str, Any]]]: +) -> Tuple[Dict[str, Any], int, List[Dict[str, Any]], int]: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( @@ -2445,15 +2445,16 @@ def amdsmi_get_gpu_cper_entries( # Allocate a buffer for CPER data. buf = ctypes.create_string_buffer(buffer_size) buf_size = ctypes.c_uint64(buffer_size) - entry_count = ctypes.c_uint64(20) + num_cper_hdrs = 20 + entry_count = ctypes.c_uint64(num_cper_hdrs) cur = ctypes.c_uint64(cursor) # Allocate a pointer for the CPER header array. 
- cper_hdrs_array = (POINTER(amdsmi_wrapper.amdsmi_cper_hdr_t) * 20)() - cper_hdrs = ctypes.cast(cper_hdrs_array, POINTER(POINTER(amdsmi_wrapper.amdsmi_cper_hdr_t))) + cper_hdrs_array = (ctypes.POINTER(amdsmi_wrapper.amdsmi_cper_hdr_t) * num_cper_hdrs)() + cper_hdrs = ctypes.cast(cper_hdrs_array, ctypes.POINTER(ctypes.POINTER(amdsmi_wrapper.amdsmi_cper_hdr_t))) # Call the underlying AMD-SMI API. - ret = amdsmi_wrapper.amdsmi_get_gpu_cper_entries( + status_code = amdsmi_wrapper.amdsmi_get_gpu_cper_entries( processor_handle, ctypes.c_uint32(severity_mask), buf, @@ -2462,8 +2463,8 @@ def amdsmi_get_gpu_cper_entries( ctypes.byref(entry_count), ctypes.byref(cur) ) - if ret != amdsmi_wrapper.AMDSMI_STATUS_SUCCESS: - raise AmdSmiLibraryException(ret) + if status_code not in {amdsmi_wrapper.AMDSMI_STATUS_SUCCESS, amdsmi_wrapper.AMDSMI_STATUS_MORE_DATA}: + raise AmdSmiLibraryException(status_code) entries = {} cper_data = [] @@ -2518,7 +2519,7 @@ def amdsmi_get_gpu_cper_entries( entries[i] = cper_entry.copy() offset += entry_ptr.contents.record_length # Use the actual record length to advance the offset. 
- return entries, cur.value, cper_data + return entries, cur.value, cper_data, status_code def amdsmi_get_afids_from_cper( diff --git a/src/amd_smi/amd_smi_cper.cc b/src/amd_smi/amd_smi_cper.cc index d9504ceb52..83b3ac42e7 100644 --- a/src/amd_smi/amd_smi_cper.cc +++ b/src/amd_smi/amd_smi_cper.cc @@ -420,7 +420,7 @@ amdsmi_status_t amdsmi_get_gpu_cper_entries_by_path( if(((1 << header->error_severity) & severity_mask) != static_cast(1 << header->error_severity)) { ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] cper header rejected with severity: 0x" - << std::hex << (1 << header->error_severity) << ", given severity_mask: 0x" + << std::hex << (header->error_severity) << ", given severity_mask: 0x" << std::hex << severity_mask << ", record_length:" << std::dec << header->record_length; LOG_DEBUG(ss); @@ -428,7 +428,7 @@ amdsmi_status_t amdsmi_get_gpu_cper_entries_by_path( } else { ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] cper header accepted with severity: 0x" - << std::hex << (1 << header->error_severity) << ", given severity_mask: 0x" + << std::hex << (header->error_severity) << ", given severity_mask: 0x" << std::hex << severity_mask << ", record_length:" << std::dec << header->record_length; LOG_DEBUG(ss); @@ -468,9 +468,9 @@ amdsmi_status_t amdsmi_get_gpu_cper_entries_by_path( *buf_size = data_idx; ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ - << "[CPER] *entry_count: " << entry_count - << ", *cursor: " << cursor - << ", *buf_size: " << buf_size; + << "[CPER] *entry_count: " << *entry_count + << ", *cursor: " << *cursor + << ", *buf_size: " << *buf_size; LOG_DEBUG(ss); return AMDSMI_STATUS_SUCCESS;