diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py b/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py index afb0eb42dc..1bca9a754a 100755 --- a/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py @@ -1178,7 +1178,7 @@ class AMDSMIHelpers(): if not getattr(self, "_cper_display_initialized", False): # Warning if no folder was specified elsewhere if not getattr(self, "_cper_warning_printed", False): - print(f"WARNING:No cper files will be dumped unless --folder= is specified.") + print(f"WARNING:No cper files will be dumped unless --folder= is specified and cper entries exist.") self._cper_warning_printed = True self._print_header(folder) @@ -1442,11 +1442,13 @@ class AMDSMIHelpers(): logger.set_cper_exit_message(False) self.stop = False + num_entries = 0 while True: try: entries, new_cursor, cper_data, status_code = amdsmi_interface.amdsmi_get_gpu_cper_entries( device_handle, severity_mask, buffer_size, args.cursor[gpu_idx]) logging.debug(f"cper_entries | entries: {entries}") + num_entries = num_entries + len(entries) except amdsmi_exception.AmdSmiLibraryException as e: if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: raise PermissionError('Error opening CPER file. 
This command requires elevation') from e @@ -1466,6 +1468,11 @@ class AMDSMIHelpers(): break else: self.display_cper_files_generated(entries, device_handle, args.folder) + if num_entries == 0 and not args.follow: + if args.folder: + self.dump_cper_entries(args.folder, entries, cper_data, device_handle, args.file_limit) + else: + self.display_cper_files_generated(entries, device_handle, args.folder) def get_bitmask_ranges(self, bitmask_dict): ranges = {} diff --git a/projects/amdsmi/docs/reference/amdsmi-py-api.md b/projects/amdsmi/docs/reference/amdsmi-py-api.md index e982484e54..42c25e2c5d 100644 --- a/projects/amdsmi/docs/reference/amdsmi-py-api.md +++ b/projects/amdsmi/docs/reference/amdsmi-py-api.md @@ -1260,11 +1260,20 @@ Description: Dump CPER entries for a given GPU in a file using from CPER header Input parameters: * `processor_handle` device which to query -* `severity_mask` the severity mask of the entries to be retrieved -* `buffer_size` pointer to a variable that specifies the size of the cper_data -* `cursor` pointer to a variable that will contain the cursor for the next call +* `severity_mask` the severity mask of the entries to be retrieved: + 1:'nonfatal-uncorrected', + 2: 'fatal', + 4: 'nonfatal-corrected', 'corrected', + 7: 'all' +* `buffer_size` number of bytes that will be used to create a buffer for copying cper entries into; default is 1048576 bytes +* `cursor` the zero based index at which to start retrieving cper entries; default value is 0; for example, if there are 10 cper entries available, then with a cursor value of 8, it will retrieve the last two cper entries only -Output: Dictionary with fields, updated cursor, and a dictionary of the cper_data +Output: Dictionary with fields, updated cursor, and a dictionary of the cper_data, status_code + status_code: + AMDSMI_STATUS_SUCCESS: If all entries were retrieved successfully + AMDSMI_STATUS_MORE_DATA: If some of the entries were retrieved and: + * A subsequent call to the API with the 
updated cursor will fetch the next batch of entries, or + * Increasing the input buffer_size will allow more entries to be fetched with the same cursor Field | Description ---|--- @@ -1290,75 +1299,194 @@ Exceptions that can be thrown by `amdsmi_get_gpu_cper_entries` function: Example: ```python -for device in devices: - entries, new_cursor, cper_data = amdsmi_get_gpu_cper_entries(device, severity_mask, buffer_size, initial_cursor) - print("CPER entries for device", device) - for key, entry in entries.items(): - print("Entry", key) - print(" Error Severity:", entry.get("error_severity", "Unknown")) - print(" Notify Type:", entry.get("notify_type", "Unknown")) - print(" Timestamp:", entry.get("timestamp", "")) - print() - print("New Cursor Position:", new_cursor) +from amdsmi import * + +amdsmi_init() + +def get_severity_mask(severity): + severity_mask = 0 + if severity == "all": + # Set bits for NON_FATAL_UNCORRECTED (0), FATAL (1), and NON_FATAL_CORRECTED (2) + severity_mask |= ((1 << 0) | (1 << 1) | (1 << 2)) + elif severity == "fatal": + # Set bit corresponding to AMDSMI_CPER_SEV_FATAL (which is 1) + severity_mask |= (1 << 1) + elif severity in ("nonfatal", "nonfatal-uncorrected"): + # Set bit corresponding to AMDSMI_CPER_SEV_NON_FATAL_UNCORRECTED (which is 0) + severity_mask |= (1 << 0) + elif severity in ("nonfatal-corrected", "corrected"): + # Set bit corresponding to AMDSMI_CPER_SEV_NON_FATAL_CORRECTED (which is 2) + severity_mask |= (1 << 2) + return severity_mask + +def gpuid(device): + for gpu_index, device_handle in enumerate(amdsmi_interface.amdsmi_get_processor_handles()): + if device.value == device_handle.value: + return gpu_index + +try: + devices = amdsmi_interface.amdsmi_get_processor_handles() + buffer_size = 1024*100 + initial_cursor = 0 + severity = "all" + for device in devices: + entries, new_cursor, cper_data, status_code = amdsmi_get_gpu_cper_entries( + device, get_severity_mask(severity), buffer_size, initial_cursor) + 
gpu_id = gpuid(device) + print(f"cper entries for '{severity}' severity on gpu #{gpu_id}:") + for key, entry in entries.items(): + print("Entry", key) + print(" Error Severity:", entry.get("error_severity", "Unknown")) + print(" Notify Type:", entry.get("notify_type", "Unknown")) + print(" Timestamp:", entry.get("timestamp", "")) except AmdSmiException as e: print(e) ``` +Output: + +```shell +cper entries for 'all' severity on gpu #0: +cper entries for 'all' severity on gpu #1: +Entry 0 + Error Severity: non_fatal_corrected + Notify Type: CMC + Timestamp: 2025/08/13 19:28:31 +Entry 1 + Error Severity: non_fatal_corrected + Notify Type: CMC + Timestamp: 2025/08/13 19:36:38 +``` + ### amdsmi_get_afids_from_cper Description: Get the AFIDs from CPER buffer Input parameters: -* `processor_handle` device which to query -* `severity_mask` the severity mask of the entries to be retrieved -* `buffer_size` pointer to a variable that specifies the size of the cper_data -* `cursor` pointer to a variable that will contain the cursor for the next call +* `cper_afid_data`: Either + - raw bytes or bytearray of a single CPER record, or + - a list of dicts each with keys "bytes" (List[int]) and "size" (int). -Output: Dictionary with fields, updated cursor, a dictionary of the cper_data, and API status_code - -Field | Description ----|--- -`error_severity` | The severity of the CPER error ex: `non_fatal_uncorrected`, `fatal`, `non_fatal_corrected`. | -`notify_type` | The notification type associated with the CPER entry. | -`timestamp` | The time when the CPER entry was recorded, formatted as `YYYY/MM/DD HH:MM:SS`. | -`signature` | A 4-byte signature identifying the entry, typically `CPER`. | -`revision` | The revision number of the CPER record format. | -`signature_end` | A marker value (typically `0xFFFFFFFF`) confirming the integrity of the signature. | -`sec_cnt` | The count of sections included in the CPER entry. 
| -`record_length` | The total length in bytes of the CPER entry. | -`platform_id` | A character array identifying the GPU or platform. | -`creator_id` | A character array indicating the creator of the CPER entry. | -`record_id` | A unique identifier for the CPER entry. | -`flags` | Reserved flags related to the CPER entry. | -`persistence_info` | Reserved information related to persistence. | +Output: Tuple[List[int], int]: A tuple containing: + - A list of extracted AFIDs. + - The total count of AFIDs. * `status_code` | Upon successful retrieval of data, status_code will be AMDSMI_STATUS_SUCCESS (0) or AMDSMI_STATUS_MORE_DATA (39) if more data can be retrieve by subsequent call to the `amdsmi_get_gpu_cper_entries` function. In the later case, the input parameter `cursor` should be set to the updated `cursor` that was returned from the previous call. Exceptions that can be thrown by `amdsmi_get_gpu_cper_entries` function: +* `AmdSmiParameterException` * `AmdSmiLibraryException` with these possible error codes: AMDSMI_STATUS_INVAL AMDSMI_STATUS_UNEXPECTED_SIZE AMDSMI_STATUS_UNEXPECTED_DATA AMDSMI_STATUS_NOT_SUPPORTED -* `AmdSmiParameterException` Example: ```python -for device in devices: - entries, new_cursor, cper_data, status_code = amdsmi_get_gpu_cper_entries(device, severity_mask, buffer_size, initial_cursor) - print("CPER entries for device", device) - for key, entry in entries.items(): - print("Entry", key) - print(" Error Severity:", entry.get("error_severity", "Unknown")) - print(" Notify Type:", entry.get("notify_type", "Unknown")) - print(" Timestamp:", entry.get("timestamp", "")) - print() - print("New Cursor Position:", new_cursor) -except AmdSmiException as e: - print(e) +from amdsmi import * +import os + +amdsmi_init() + +def get_severity_mask(severity): + severity_mask = 0 + if severity == "all": + # Set bits for NON_FATAL_UNCORRECTED (0), FATAL (1), and NON_FATAL_CORRECTED (2) + severity_mask |= ((1 << 0) | (1 << 1) | (1 << 2)) + elif severity == 
"fatal": + # Set bit corresponding to AMDSMI_CPER_SEV_FATAL (which is 1) + severity_mask |= (1 << 1) + elif severity in ("nonfatal", "nonfatal-uncorrected"): + # Set bit corresponding to AMDSMI_CPER_SEV_NON_FATAL_UNCORRECTED (which is 0) + severity_mask |= (1 << 0) + elif severity in ("nonfatal-corrected", "corrected"): + # Set bit corresponding to AMDSMI_CPER_SEV_NON_FATAL_CORRECTED (which is 2) + severity_mask |= (1 << 2) + return severity_mask + +def gpuid(device): + for gpu_index, device_handle in enumerate(amdsmi_interface.amdsmi_get_processor_handles()): + if device.value == device_handle.value: + return gpu_index + +def dump_cper_entry(entry, cper_data, key): + try: + os.mkdir("/tmp/cper_dump", mode=0o777, dir_fd=None) + except FileExistsError: + pass + cper_file = f"/tmp/cper_dump/cper_entry_{key}.bin" + with open(cper_file, "wb") as file: + size = cper_data[key]["size"] + data = cper_data[key]["bytes"] + data = bytes(x % 256 for x in data[:size]) + file.write(data) + print(f" Wrote cper data to file: {cper_file}") + json_file = f"/tmp/cper_dump/cper_entry_{key}.json" + with open(json_file, "wt") as file: + file.write(str(entry)) + +def get_gpu_cper_entries(): + try: + devices = amdsmi_interface.amdsmi_get_processor_handles() + buffer_size = 1024*100 + initial_cursor = 0 + severity = "all" + for device in devices: + entries, new_cursor, cper_data, status_code = amdsmi_get_gpu_cper_entries( + device, get_severity_mask(severity), buffer_size, initial_cursor) + gpu_id = gpuid(device) + print("###################") + print(f"cper entries for '{severity}' severity on gpu #{gpu_id}:") + for key, entry in entries.items(): + print("----------------") + print("Entry", key) + print(" Error Severity:", entry.get("error_severity", "Unknown")) + print(" Notify Type:", entry.get("notify_type", "Unknown")) + print(" Timestamp:", entry.get("timestamp", "")) + print(f" Cper entry metadata: {entry}") + dump_cper_entry(entry, cper_data, key) + except AmdSmiException as e: + 
print(e) + +get_gpu_cper_entries() +``` + +Output: + +``` shell +################### +cper entries for 'all' severity on gpu #0: +################### +cper entries for 'all' severity on gpu #1: +################### +cper entries for 'all' severity on gpu #2: +################### +cper entries for 'all' severity on gpu #3: +################### +cper entries for 'all' severity on gpu #4: +################### +cper entries for 'all' severity on gpu #5: +################### +cper entries for 'all' severity on gpu #6: +################### +cper entries for 'all' severity on gpu #7: +---------------- +Entry 0 + Error Severity: non_fatal_corrected + Notify Type: CMC + Timestamp: 2025/08/13 20:07:56 + Cper entry metadata: {'error_severity': 'non_fatal_corrected', 'notify_type': 'CMC', 'timestamp': '2025/08/13 20:07:56', 'signature': b'CPER', 'revision': 256, 'signature_end': '0xffffffff', 'sec_cnt': 1, 'record_length': 472, 'platform_id': b'0xcafe:0xbeef', 'creator_id': b'amdgpu', 'record_id': b'0:1', 'flags': 0, 'persistence_info': 0} + Wrote cper data to file: /tmp/cper_dump/cper_entry_0.bin +---------------- +Entry 1 + Error Severity: non_fatal_corrected + Notify Type: CMC + Timestamp: 2025/08/13 20:14:58 + Cper entry metadata: {'error_severity': 'non_fatal_corrected', 'notify_type': 'CMC', 'timestamp': '2025/08/13 20:14:58', 'signature': b'CPER', 'revision': 256, 'signature_end': '0xffffffff', 'sec_cnt': 1, 'record_length': 472, 'platform_id': b'0xcafe:0xbeef', 'creator_id': b'amdgpu', 'record_id': b'0:2', 'flags': 0, 'persistence_info': 0} + Wrote cper data to file: /tmp/cper_dump/cper_entry_1.bin ``` ### amdsmi_get_gpu_ras_feature_info @@ -1389,16 +1517,35 @@ Exceptions that can be thrown by `amdsmi_get_gpu_ras_feature_info` function: Example: ```python -try: - devices = amdsmi_get_processor_handles() - if len(devices) == 0: - print("No GPUs on machine") - else: - for device in devices: - ras_info = amdsmi_get_gpu_ras_feature_info(device) - print(ras_info) -except 
AmdSmiException as e: - print(e) +from amdsmi import * +import os + +amdsmi_init() + +def amdsmi_get_afids_from_cper(): + directory_path = "/tmp/cper_dump/" + print(f"Searching for cper file in {directory_path}") + with os.scandir(directory_path) as cper_files: + for cper_file in cper_files: + if cper_file.is_file(): + if ".bin" in cper_file.path: + print(f"Found {cper_file.path}") + with open(cper_file.path, "rb") as file: + raw = file.read() + afids, num_afids = amdsmi_interface.amdsmi_get_afids_from_cper(raw) + print(f"afids: {afids}") + +amdsmi_get_afids_from_cper() + +``` +Output: +``` +sudo python3 afid.py +Searching for cper file in /tmp/cper_dump/ +Found /tmp/cper_dump/cper_entry_0.bin +afids: [17] +Found /tmp/cper_dump/cper_entry_1.bin +afids: [17] ``` ### amdsmi_get_gpu_ras_block_features_enabled