diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py index 7975be5d11..68ad3840c8 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py @@ -33,6 +33,7 @@ from amdsmi_cli_exceptions import AmdSmiInvalidParameterException, AmdSmiRequire from amdsmi_helpers import AMDSMIHelpers from amdsmi_logger import AMDSMILogger from amdsmi import amdsmi_exception, amdsmi_interface +from pathlib import Path class AMDSMICommands(): """This class contains all the commands corresponding to AMDSMIParser @@ -6325,9 +6326,35 @@ class AMDSMICommands(): with self.logger.destination.open('a', encoding="utf-8") as output_file: output_file.write(legend_output + '\n') + def __pvtDumpAfids(self, cper_file): + # 1) Fetch the CPER “file” and ensure we have raw bytes + raw_data = cper_file + if hasattr(raw_data, "read"): + # fetch_cper_file returned a file‐object + raw = raw_data.read() + elif isinstance(raw_data, Path): + # Path: read the bytes directly + raw = raw_data.read_bytes() + elif isinstance(raw_data, str): + # fetch_cper_file returned a filename + with open(raw_data, "rb") as f: + raw = f.read() + else: + # assume it's already bytes + raw = raw_data + size = len(raw) + self.helpers.hexdump_to_string(raw) + afids, num_afids = amdsmi_interface.amdsmi_get_afids_from_cper(raw) + print(f"AFIDS: ", end="") + for afid in afids: + print(afid, end=" ") + print("") - def ras(self, args, multiple_devices=False, gpu=None, cper=None, - severity=None, folder=None, file_limit=None, follow=None): + + + + def ras(self, args, multiple_devices=False, gpu=None, cper=None, afid=None, + severity=None, folder=None, file_limit=None, cper_file=None, follow=None): """ Retrieve and process CPER (RAS) entries for a target GPU. 
@@ -6338,23 +6365,32 @@ class AMDSMICommands(): The output file name is auto-generated using the timestamp from the CPER header data (converted from the header’s "YYYY/MM/DD HH:MM:SS" format), along with the GPU/platform ID and error severity. """ + # GPU handle logic. if gpu: args.gpu = gpu if cper: args.cper = cper + if afid: + args.afid = afid if severity: args.severity = severity if folder: args.folder = folder if file_limit: args.file_limit = file_limit + if cper_file: + args.cper_file = cper_file if follow: args.follow = follow - if args.gpu == None: args.gpu = self.device_handles + #Fetching AFID + if args.afid and args.cper_file: + self.__pvtDumpAfids(args.cper_file) + return + if not self.group_check_printed: self.helpers.check_required_groups() self.group_check_printed = True @@ -6362,7 +6398,6 @@ class AMDSMICommands(): handled_multiple_gpus, device_handle = self.helpers.handle_gpus(args, self.logger, self.ras) if handled_multiple_gpus: return - args.gpu = device_handle # Parse severity mask dynamically from the --severity option. @@ -6381,17 +6416,15 @@ class AMDSMICommands(): severity_mask |= (1 << 0) elif sev in ("nonfatal-corrected", "corrected"): # Set bit corresponding to AMDSMI_CPER_SEV_NON_FATAL_CORRECTED (which is 2) - severity_mask |= (1 << 2) - + severity_mask |= (1 << 2) + + cursor = 0 + buffer_size = 1048576 if args.cper: # Start from cursor 0 (no timestamp argument provided). - cursor = 0 - buffer_size = 1048576 file_limit = int(args.file_limit) if args.file_limit else 1000 - # Main loop: continuously retrieve CPER entries if --follow is set. 
gpu_id = self.helpers.get_gpu_id_from_device_handle(args.gpu) - # Print header only when dumping to a folder if args.follow and not getattr(self, "_cper_follow_prompted", False): print("Press CTRL + C to stop.") @@ -6409,12 +6442,11 @@ class AMDSMICommands(): if partition_id != 0: logging.debug(f"Skipping gpu {gpu_id} on non zero partition {partition_id}") return - - if args.folder and args.gpu: - print(f"Dumping CPER file header entries for GPU {gpu_id} in folder {args.folder}") - elif args.folder: + + if args.folder and not getattr(self, "_cper_folder_prompted", False): print(f"Dumping CPER file header entries in folder {args.folder}") - + self._cper_folder_prompted = True + self.logger.set_cper_exit_message(False) self.stop = False diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py b/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py index b7c38fa345..2a6b39109a 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py @@ -1078,7 +1078,7 @@ class AMDSMIHelpers(): msg = ( "WARNING: User is missing the following required groups: %s. " "Please add user to these groups." 
- ) % ", ".join(sorted(missing_groups)) + ) % ", ".join(sorted(missing_groups)) print(msg) logging.warning(msg) @@ -1116,7 +1116,7 @@ class AMDSMIHelpers(): self._cper_warning_printed = True # Header - print(f"{'timestamp':<20} {'gpu_id':<6} {'severity':<10} {'file_name'}") + print(f"{'timestamp':<20} {'gpu_id':<7} {'severity':<12} {'file_name':<17}") self._cper_display_initialized = True for entry_index, entry in enumerate(entries.values()): @@ -1138,7 +1138,7 @@ class AMDSMIHelpers(): timestamp = entry.get("timestamp", "unknown") gpu_id = self.get_gpu_id_from_device_handle(device_handle) - print(f"{timestamp:<20} {gpu_id:<6} {prefix:<10} {cper_data_file}") + print(f"{timestamp:<20} {gpu_id:<7} {prefix:<12} {cper_data_file:<17}") self.increment_cper_count() time.sleep(1) @@ -1156,7 +1156,7 @@ class AMDSMIHelpers(): self._cper_warning_printed = True # Header - print(f"{'timestamp':<20} {'gpu_id':<6} {'severity':<10} {'file_name'}") + print(f"{'timestamp':<20} {'gpu_id':<7} {'severity':<12} {'file_name':<17}") self._cper_display_initialized = True # Loop through all entries in the dictionary. 
@@ -1180,14 +1180,16 @@ class AMDSMIHelpers(): timestamp = entry.get("timestamp", "unknown") gpu_id = self.get_gpu_id_from_device_handle(device_handle) - print(f"{timestamp:<20} {gpu_id:<6} {prefix:<10} {cper_data_file}") + print(f"{timestamp:<20} {gpu_id:<7} {prefix:<12} {cper_data_file:<17}") self.increment_cper_count() def dump_gpu_entries(self, folder, entries, cper_data, device_handle): - # Header - print(f"{'timestamp':<20} {'gpu_id':<6} {'severity':<10} {'file_name'}") - self._cper_display_initialized = True + # One‐time initialization: print warning & header only once + if not getattr(self, "_cper_display_initialized", False): + # Warning if no folder was specified elsewhere + print(f"{'timestamp':<20} {'gpu_id':<7} {'severity':<12} {'file_name':<17}") + self._cper_display_initialized = True if folder: @@ -1220,7 +1222,7 @@ class AMDSMIHelpers(): #print header timestamp = entry.get("timestamp", "unknown") gpu_id = self.get_gpu_id_from_device_handle(device_handle) - print(f"{timestamp:<20} {gpu_id:<6} {prefix:<10} {cper_data_file}") + print(f"{timestamp:<20} {gpu_id:<7} {prefix:<12} {cper_data_file:<17}") self.increment_cper_count() @@ -1241,9 +1243,11 @@ class AMDSMIHelpers(): def dump_all_entries(self, folder, entries, cper_data, device_handle): - # Header - print(f"{'timestamp':<20} {'gpu_id':<6} {'severity':<10} {'file_name'}") - self._cper_display_initialized = True + # One‐time initialization: print warning & header only once + if not getattr(self, "_cper_display_initialized", False): + # Warning if no folder was specified elsewhere + print(f"{'timestamp':<20} {'gpu_id':<7} {'severity':<12} {'file_name':<17}") + self._cper_display_initialized = True if folder: @@ -1276,7 +1280,7 @@ class AMDSMIHelpers(): #print header timestamp = entry.get("timestamp", "unknown") gpu_id = self.get_gpu_id_from_device_handle(device_handle) - print(f"{timestamp:<20} {gpu_id:<6} {prefix:<10} {cper_data_file}") + print(f"{timestamp:<20} {gpu_id:<7} {prefix:<12} 
{cper_data_file:<17}") self.increment_cper_count() try: @@ -1293,9 +1297,11 @@ class AMDSMIHelpers(): def dump_all_entries_follow(self, folder, entries, cper_data, device_handle): - # Header - print(f"{'timestamp':<20} {'gpu_id':<6} {'severity':<10} {'file_name'}") - self._cper_display_initialized = True + # One‐time initialization: print warning & header only once + if not getattr(self, "_cper_display_initialized", False): + # Warning if no folder was specified elsewhere + print(f"{'timestamp':<20} {'gpu_id':<7} {'severity':<12} {'file_name':<17}") + self._cper_display_initialized = True if folder: @@ -1328,7 +1334,7 @@ class AMDSMIHelpers(): #print header timestamp = entry.get("timestamp", "unknown") gpu_id = self.get_gpu_id_from_device_handle(device_handle) - print(f"{timestamp:<20} {gpu_id:<6} {prefix:<10} {cper_data_file}") + print(f"{timestamp:<20} {gpu_id:<7} {prefix:<12} {cper_data_file:<17}") self.increment_cper_count() time.sleep(1) @@ -1346,9 +1352,11 @@ class AMDSMIHelpers(): def dump_gpu_entries_follow(self, folder, entries, cper_data, device_handle): - # Header - print(f"{'timestamp':<20} {'gpu_id':<6} {'severity':<10} {'file_name'}") - self._cper_display_initialized = True + # One‐time initialization: print warning & header only once + if not getattr(self, "_cper_display_initialized", False): + # Warning if no folder was specified elsewhere + print(f"{'timestamp':<20} {'gpu_id':<7} {'severity':<12} {'file_name':<17}") + self._cper_display_initialized = True if folder: @@ -1381,7 +1389,7 @@ class AMDSMIHelpers(): #print header timestamp = entry.get("timestamp", "unknown") gpu_id = self.get_gpu_id_from_device_handle(device_handle) - print(f"{timestamp:<20} {gpu_id:<6} {prefix:<10} {cper_data_file}") + print(f"{timestamp:<20} {gpu_id:<7} {prefix:<12} {cper_data_file:<17}") self.increment_cper_count() time.sleep(1) @@ -1396,3 +1404,35 @@ class AMDSMIHelpers(): else: print(json.dumps(entries, indent=2, default=lambda o: o.decode('utf-8') if isinstance(o, 
bytes) else o)) + + + def hexdump_to_string(self, data: Union[bytes, List[int]]) -> str: + """ + Convert binary data to a hexdump string. + + Args: + data: bytes object or list of integer byte values (0–255). + + Returns: + A multiline string, each line showing: + offset (in hex), hex bytes (16 per line), and printable ASCII. + """ + # Normalize to list of ints + if isinstance(data, bytes): + data_ints = list(data) + else: + # allow list of ints or single-character strings + data_ints = [b if isinstance(b, int) else ord(b) for b in data] + + lines: List[str] = [] + size = len(data_ints) + + for offset in range(0, size, 16): + chunk = data_ints[offset : offset + 16] + hex_values = " ".join(f"{b:02x}" for b in chunk) + # pad hex_values to 16*3-1 = 47 chars (two hex digits + space) + hex_values = hex_values.ljust(16 * 3 - 1) + ascii_values = "".join(chr(b) if 32 <= b <= 126 else "." for b in chunk) + lines.append(f"{offset:08x} {hex_values} |{ascii_values}|") + + return "\n".join(lines) \ No newline at end of file diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_parser.py b/projects/amdsmi/amdsmi_cli/amdsmi_parser.py index 8f941d5689..fd4b66f312 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_parser.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_parser.py @@ -300,7 +300,7 @@ class AMDSMIParser(argparse.ArgumentParser): return CheckOutputFilePath - def _check_input_file_path(self): + def _check_cper_file_path(self): """ Argument action validator: Returns a path to a file from the input file path provided. If the file doesn't exist or is empty raise error @@ -310,8 +310,7 @@ class AMDSMIParser(argparse.ArgumentParser): def __call__(self, parser, args, values, option_string=None): path = Path(values) if not path.exists(): - raise FileNotFoundError( - errno.ENOENT, os.strerror(errno.ENOENT), values) + raise FileNotFoundError(f"CPER file could not be read. Make sure the path '{path}' is correct. 
") if path.is_dir(): raise argparse.ArgumentTypeError( @@ -1413,12 +1412,13 @@ class AMDSMIParser(argparse.ArgumentParser): # Help text for RAS arguments cper_help = "Trigger CPER data retrieval" - + afid_help = "Generate an AFID (AMD Field ID) using CPER record, which is similar to XID." severity_choices = ["nonfatal-uncorrected", "fatal", "nonfatal-corrected", "all"] severity_choices_str = ", ".join(severity_choices) severity_help = f"Set the SEVERITY filters from the following:\n {severity_choices_str}" folder_help = "Folder to dump CPER report files" file_limit_help = "Maximum number of entries per output file" + cper_file_help = "Full path of the cper record file to generate the AFID" follow_help = "Continuously monitor for new entries" ras_parser = subparsers.add_parser("ras", help=ras_help, description=ras_description) @@ -1427,10 +1427,12 @@ class AMDSMIParser(argparse.ArgumentParser): ras_parser.set_defaults(func=func) # Required flags and arguments: - ras_parser.add_argument("--cper", action="store_true", required=True, help=cper_help) + ras_parser.add_argument("--cper", action="store_true", required=False, help=cper_help) + ras_parser.add_argument("--afid", action="store_true", required=False, help=afid_help) ras_parser.add_argument("--severity", type=str.lower, nargs='+', default=['all'], help=severity_help, choices=severity_choices, metavar='SEVERITY') ras_parser.add_argument("--folder", type=str, action=self._check_folder_path(), default=False, help=folder_help) ras_parser.add_argument("--file_limit", type=self._positive_int, action='store', default=1000, help=file_limit_help) + ras_parser.add_argument("--cper_file", action=self._check_cper_file_path(), metavar="CPER_FILE", help=cper_file_help) ras_parser.add_argument("--follow", action="store_true", default=False, help=follow_help) # Add common modifiers and device selection arguments. 
diff --git a/projects/amdsmi/docs/reference/amdsmi-py-api.md b/projects/amdsmi/docs/reference/amdsmi-py-api.md index 656f6f5629..1ce318e3ac 100644 --- a/projects/amdsmi/docs/reference/amdsmi-py-api.md +++ b/projects/amdsmi/docs/reference/amdsmi-py-api.md @@ -5274,3 +5274,52 @@ try: except AmdSmiException as e: print(e) ``` +### amdsmi_get_afids_from_cper + +Description: Get the AFIDs from CPER buffer + +Input parameters: +* `processor_handle` device which to query +* `severity_mask` the severity mask of the entries to be retrieved +* `buffer_size` pointer to a variable that specifies the size of the cper_data +* `cursor` pointer to a variable that will contain the cursor for the next call + +Output: Dictionary with fields, updated cursor, and a dictionary of the cper_data + +Field | Description +---|--- +`error_severity` | The severity of the CPER error ex: `non_fatal_uncorrected`, `fatal`, `non_fatal_corrected`. | +`notify_type` | The notification type associated with the CPER entry. | +`timestamp` | The time when the CPER entry was recorded, formatted as `YYYY/MM/DD HH:MM:SS`. | +`signature` | A 4-byte signature identifying the entry, typically `CPER`. | +`revision` | The revision number of the CPER record format. | +`signature_end` | A marker value (typically `0xFFFFFFFF`) confirming the integrity of the signature. | +`sec_cnt` | The count of sections included in the CPER entry. | +`record_length` | The total length in bytes of the CPER entry. | +`platform_id` | A character array identifying the GPU or platform. | +`creator_id` | A character array indicating the creator of the CPER entry. | +`record_id` | A unique identifier for the CPER entry. | +`flags` | Reserved flags related to the CPER entry. | +`persistence_info` | Reserved information related to persistence. 
| + +Exceptions that can be thrown by `amdsmi_get_afids_from_cper` function: + +* `AmdSmiLibraryException` +* `AmdSmiParameterException` + +Example: + +```python +for device in devices: + entries, new_cursor, cper_data = amdsmi_get_gpu_cper_entries(device, severity_mask, buffer_size, initial_cursor) + print("CPER entries for device", device) + for key, entry in entries.items(): + print("Entry", key) + print(" Error Severity:", entry.get("error_severity", "Unknown")) + print(" Notify Type:", entry.get("notify_type", "Unknown")) + print(" Timestamp:", entry.get("timestamp", "")) + print() + print("New Cursor Position:", new_cursor) +except AmdSmiException as e: + print(e) +``` diff --git a/projects/amdsmi/include/aca-decode/aca_decode.h b/projects/amdsmi/include/aca-decode/aca_decode.h new file mode 100755 index 0000000000..692eb0fe37 --- /dev/null +++ b/projects/amdsmi/include/aca-decode/aca_decode.h @@ -0,0 +1,64 @@ +/** + * @file aca_decode.h + * @brief Internal decoder interface and data structures + */ +#ifndef ACA_DECODE_H +#define ACA_DECODE_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include "aca_fields.h" + +/** + * @brief Internal decoder structure with parsed register fields + */ +typedef struct +{ + uint64_t aca_status; /**< Raw status register value */ + uint64_t aca_ipid; /**< Raw IPID register value */ + uint64_t aca_synd; /**< Raw syndrome register value */ + uint32_t flags; /**< Decoder flags */ + uint16_t hw_revision; /**< Hardware hw_revision */ + + aca_status_fields_t status; /**< Parsed status fields */ + aca_ipid_fields_t ipid; /**< Parsed IPID fields */ + aca_synd_fields_t synd; /**< Parsed syndrome fields */ +} aca_decoder_t; + +/** + * @brief Structure containing raw ACA error data from hardware + */ +typedef struct +{ + uint64_t aca_status; /**< Raw status register value */ + uint64_t aca_ipid; /**< Raw IPID register value */ + uint64_t aca_synd; /**< Raw syndrome register value */ + uint32_t flags; /**< Flags from descriptor */ + 
uint16_t hw_revision; /**< Hardware hw_revision number */ +} aca_raw_data_t; + +/** + * @brief Structure containing decoded error information + */ +typedef struct +{ + const char *bank_ref; /**< Reference to bank name string */ + const char *error_type_ref; /**< Reference to error type string */ + const char *severity_ref; /**< Reference to error severity string */ + const char *category_ref; /**< Reference to error category string */ + int afid; /**< AFID value (AMD Field ID) */ +} aca_error_info_t; + +/** + * @brief Main decode function that processes raw ACA error data + * @param[in] raw_data Pointer to structure containing raw ACA error data + * @return Decoded error information structure + */ +aca_error_info_t aca_decode(const aca_raw_data_t *raw_data); + +#ifdef __cplusplus +} +#endif +#endif /* ACA_DECODE_H */ diff --git a/projects/amdsmi/include/aca-decode/aca_fields.h b/projects/amdsmi/include/aca-decode/aca_fields.h new file mode 100644 index 0000000000..b51214f727 --- /dev/null +++ b/projects/amdsmi/include/aca-decode/aca_fields.h @@ -0,0 +1,110 @@ +/** + * @file aca_fields.h + * @brief ACA register field definitions and manipulation functions + * + * Contains structures and functions for decoding and handling + * ACA register fields. It provides field + * definitions for status, IPID, and syndrome registers, along with + * functions to initialize and access these fields. 
+ */ +#ifndef ACA_FIELDS_H +#define ACA_FIELDS_H + +#include <stdint.h> + +/** + * @brief Base structure for ACA fields containing raw register value + */ +typedef struct +{ + uint64_t raw_value; /**< Raw 64-bit register value */ +} aca_fields_t; + +/** + * @brief Structure containing decoded ACA status register fields + */ +typedef struct +{ + aca_fields_t base; + uint16_t error_code; + uint8_t error_code_ext; + uint8_t reserv22; + uint8_t addr_lsb; + uint8_t reserv30; + uint8_t err_core_id; + uint8_t reserv38; + uint8_t scrub; + uint8_t reserv41; + uint8_t poison; + uint8_t deferred; + uint8_t uecc; + uint8_t cecc; + uint8_t reserv47; + uint8_t synd_v; + uint8_t reserv54; + uint8_t tcc; + uint8_t err_core_id_val; + uint8_t pcc; + uint8_t addr_v; + uint8_t misc_v; + uint8_t en; + uint8_t uc; + uint8_t overflow; + uint8_t val; +} aca_status_fields_t; + +/** + * @brief Structure containing decoded ACA IPID register fields + */ +typedef struct +{ + aca_fields_t base; + uint32_t instance_id_lo; + uint16_t hardware_id; + uint16_t aca_type; + uint8_t instance_id_hi; +} aca_ipid_fields_t; + +/** + * @brief Structure containing decoded ACA syndrome register fields + */ +typedef struct +{ + aca_fields_t base; + uint32_t error_information; + uint8_t length; + uint8_t error_priority; + uint8_t reserved27; + uint16_t syndrome; + uint32_t reserved39; +} aca_synd_fields_t; + +/** + * @brief Reads the raw value from an ACA field structure + * @param[in] fields Pointer to the ACA fields structure + * @return The raw 64-bit value stored in the structure + */ +uint64_t aca_fields_read(const aca_fields_t *fields); + +/** + * @brief Initializes ACA status fields from a raw status register value + * @param[out] fields Pointer to the status fields structure to initialize + * @param[in] status_reg Raw 64-bit status register value + */ +void aca_status_init(aca_status_fields_t *fields, uint64_t status_reg); + +/** + * @brief Initializes ACA IPID fields from a raw IPID register value + * @param[out] 
fields Pointer to the IPID fields structure to initialize + * @param[in] ipid_reg Raw 64-bit IPID register value + */ +void aca_ipid_init(aca_ipid_fields_t *fields, uint64_t ipid_reg); + +/** + * @brief Initializes ACA syndrome fields from a raw syndrome register value + * @param[out] fields Pointer to the syndrome fields structure to initialize + * @param[in] synd_reg Raw 64-bit syndrome register value + */ +void aca_synd_init(aca_synd_fields_t *fields, uint64_t synd_reg); + +#endif diff --git a/projects/amdsmi/include/aca-decode/aca_tables.h b/projects/amdsmi/include/aca-decode/aca_tables.h new file mode 100644 index 0000000000..11660f0d1b --- /dev/null +++ b/projects/amdsmi/include/aca-decode/aca_tables.h @@ -0,0 +1,84 @@ +/** + * @file aca_tables.h + * @brief ACA lookup table definitions and helper functions + * @details Contains data structures and function definitions for mapping ACA Registers + * into their corresponding names and types. + */ + +#ifndef ACA_TABLES_H +#define ACA_TABLES_H + +#include <stdint.h> +#include <stddef.h> + +/** + * @brief Structure mapping hardware ID and ACA type to bank names + */ +typedef struct +{ + uint16_t hw_id; /**< Hardware ID value */ + uint16_t aca_type; /**< ACA type identifier */ + const char *name; /**< Bank name string */ +} aca_bank_entry_t; + +/** + * @brief Structure mapping bank-specific error codes to error types + */ +typedef struct +{ + const char *bank; /**< Bank name string */ + uint32_t error_code; /**< Error code value */ + const char *type; /**< Error type string */ +} aca_error_type_t; + +/** + * @brief Structure for generic error code to error type mapping + */ +typedef struct +{ + uint32_t error_code; /**< Error code value */ + const char *type; /**< Error type string */ +} aca_error_entry_t; + +// External table declarations +extern const aca_bank_entry_t bank_table[]; +extern const aca_error_type_t error_table[]; +extern const aca_error_entry_t xcd_error_table[]; +extern const aca_error_entry_t aid_error_table[]; + +// 
Table size constants +extern const size_t NUM_BANKS; +extern const size_t NUM_ERRORS; +extern const size_t NUM_XCD_ERRORS; +extern const size_t NUM_AID_ERRORS; + +/** + * @brief Find bank name based on hardware ID and ACA type + * @param[in] hw_id Hardware ID value + * @param[in] aca_type ACA type value + * @param[out] bank_name Pointer to store result string + * @return 0 on success, 1 if not found, -1 on parameter error + */ +int find_bank_name(uint16_t hw_id, uint16_t aca_type, const char **bank_name); + +/** + * @brief Find error type for a specific bank and error code + * @param[in] bank Bank name string + * @param[in] error_code Error code value + * @param[out] error_type Pointer to store result string + * @return 0 on success, 1 if not found, -1 on parameter error + */ +int find_error_type_by_bank(const char *bank, uint32_t error_code, const char **error_type); + +/** + * @brief Generic lookup for error codes in an error table + * @param[in] table Pointer to error table + * @param[in] table_size Number of table entries + * @param[in] error_code Error code to look up + * @param[out] error_type Pointer to store result string + * @return 0 on success, 1 if not found, -1 on parameter error + */ +int find_error_in_table(const aca_error_entry_t *table, size_t table_size, + uint32_t error_code, const char **error_type); + +#endif diff --git a/projects/amdsmi/include/aca-decode/error_map.h b/projects/amdsmi/include/aca-decode/error_map.h new file mode 100644 index 0000000000..0ce012e19b --- /dev/null +++ b/projects/amdsmi/include/aca-decode/error_map.h @@ -0,0 +1,27 @@ +#ifndef ERROR_MAP_H +#define ERROR_MAP_H + +#include <stdint.h> + +/** + * @brief Structure representing an error mapping entry + */ +typedef struct +{ + uint32_t id; + const char *error_category; + const char *error_type; + const char *method; + const char *error_severity; +} error_map_entry_t; + +/** + * @brief Get error ID based on category, type and severity + * @param[in] error_category Error category 
string + * @param[in] error_type Error type string + * @param[in] error_severity Error severity string + * @return Error ID if found, -1 if not found + */ +int get_error_id(const char *error_category, const char *error_type, const char *error_severity); + +#endif /* ERROR_MAP_H */ diff --git a/projects/amdsmi/include/amd_smi/amdsmi.h b/projects/amdsmi/include/amd_smi/amdsmi.h index 28b22f6a74..8ea02158ef 100644 --- a/projects/amdsmi/include/amd_smi/amdsmi.h +++ b/projects/amdsmi/include/amd_smi/amdsmi.h @@ -150,7 +150,7 @@ typedef enum { #define AMDSMI_MAX_NUM_JPEG 32 /** - * @brief new for gpu metrics v1.8, document presents NUM_JPEG_ENG_V1 + * @brief Introduced in gpu metrics v1.8, document presents NUM_JPEG_ENG_V1 * but will change to AMDSMI_MAX_NUM_JPEG_ENG_V1 for continuity */ #define AMDSMI_MAX_NUM_JPEG_ENG_V1 40 @@ -182,6 +182,11 @@ typedef enum { */ #define AMDSMI_MAX_NUM_XCP 8 +/** + * @brief Max Number of AFIDs that will be inside one cper entry + */ +#define MAX_NUMBER_OF_AFIDS_PER_RECORD 12 + /* string format */ #define AMDSMI_TIME_FORMAT "%02d:%02d:%02d.%03d" #define AMDSMI_DATE_FORMAT "%04d-%02d-%02d:%02d:%02d:%02d.%03d" @@ -4795,6 +4800,32 @@ amdsmi_get_gpu_cper_entries(amdsmi_processor_handle processor_handle, uint32_t s /** @} End tagECCInfo */ +/** + * @brief Get the AFIDs from CPER buffer + * + * @platform{gpu_bm_linux} @platform{host} @platform{guest_1vf} + * @platform{guest_mvf} @platform{guest_windows} + * + * @details A utility function which retrieves the AFIDs from the CPER record. + * + * @param[in] cper_buffer a pointer to the buffer with one CPER record. The caller must make sure the whole CPER record is loaded into the buffer. + * + * @param[in] buf_size is the size of the cper_buffer. 
+ * + * @param[out] afids a pointer to an array of uint64_t to which the AF IDs will be written + * + * @param[in,out] num_afids As input, the value passed through this parameter is the number of + * uint64_t that may be safely written to the memory pointed to by @p afids. This is the limit + * on how many AF IDs will be written to @p afids. On return, @p num_afids will contain the + * number of AF IDs written to @p afids, or the number of AF IDs that could have been written + * if enough memory had been provided. It is suggested to pass MAX_NUMBER_OF_AFIDS_PER_RECORD for all + * AF IDs. + * + * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail + */ +amdsmi_status_t amdsmi_get_afids_from_cper( + char* cper_buffer, uint32_t buf_size, uint64_t* afids, uint32_t* num_afids); + /*****************************************************************************/ /** @defgroup tagErrorQuery Error Queries * These functions provide error information about AMDSMI calls as well as diff --git a/projects/amdsmi/include/amd_smi/impl/amd_smi_cper.h b/projects/amdsmi/include/amd_smi/impl/amd_smi_cper.h new file mode 100644 index 0000000000..03960f6142 --- /dev/null +++ b/projects/amdsmi/include/amd_smi/impl/amd_smi_cper.h @@ -0,0 +1,223 @@ +/* + * Copyright (c) Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#pragma once + +#include "amd_smi/amdsmi.h" + +#pragma pack(1) + +#define CPER_MAX_OAM_COUNT (8) + +typedef enum cper_error_severity { + CPER_SEV_FATAL_UNCORRECTED = 0, + CPER_SEV_FATAL = 1, + CPER_SEV_FATAL_CORRECTED = 2, + + CPER_SEV_UNUSED = 10, +}; + +typedef enum cper_aca_reg { + CPER_ACA_REG_CTL_LO = 0, + CPER_ACA_REG_CTL_HI = 1, + CPER_ACA_REG_STATUS_LO = 2, + CPER_ACA_REG_STATUS_HI = 3, + CPER_ACA_REG_ADDR_LO = 4, + CPER_ACA_REG_ADDR_HI = 5, + CPER_ACA_REG_MISC0_LO = 6, + CPER_ACA_REG_MISC0_HI = 7, + CPER_ACA_REG_CONFIG_LO = 8, + CPER_ACA_REG_CONFIG_HI = 9, + CPER_ACA_REG_IPID_LO = 10, + CPER_ACA_REG_IPID_HI = 11, + CPER_ACA_REG_SYND_LO = 12, + CPER_ACA_REG_SYND_HI = 13, + + CPER_ACA_REG_COUNT = 32, +}; + +struct cper_sec_desc { + uint32_t sec_offset; /* Offset from the start of CPER entry */ + uint32_t sec_length; + uint8_t revision_minor; /* CPER_SEC_MINOR_REV_1 */ + uint8_t revision_major; /* CPER_SEC_MAJOR_REV_22 */ + union { + struct { + uint8_t fru_id : 1; + uint8_t fru_text : 1; + uint8_t reserved : 6; + } valid_bits; + uint8_t valid_mask; + }; + uint8_t reserved; + union { + struct { + uint32_t primary : 1; + uint32_t reserved1 : 2; + uint32_t exceed_err_threshold : 1; + uint32_t latent_err : 1; /* "Deferred" error Creation*/ + uint32_t reserved2 : 27; + } flags_bits; + uint32_t flags_mask; + }; + amdsmi_cper_guid_t sec_type; /* AMD non-Standard, AMD Crashdump */ + char fru_id[16]; /* FRU Serial ID */ + amdsmi_cper_sev_t severity; + char fru_text[20]; /* 
"OAM%d" */ +}; + +struct cper_sec_nonstd_err_info { + amdsmi_cper_guid_t error_type; + union { + struct { + uint64_t ms_chk : 1; + uint64_t target_addr_id : 1; + uint64_t req_id : 1; + uint64_t resp_id : 1; + uint64_t instr_ptr : 1; + uint64_t reserved : 59; + } valid_bits; + uint64_t valid_mask; + }; + union { + struct { + uint64_t err_type_valid : 1; + uint64_t pcc_valid : 1; + uint64_t uncorr_valid : 1; + uint64_t precise_ip_valid : 1; + uint64_t restartable_ip_valid : 1; + uint64_t overflow_valid : 1; + uint64_t reserved1 : 10; + + uint64_t err_type : 2; + uint64_t pcc : 1; + uint64_t uncorr : 1; + uint64_t precised_ip : 1; + uint64_t restartable_ip : 1; + uint64_t overflow : 1; + uint64_t reserved2 : 41; + } ms_chk_bits; + uint64_t ms_chk_mask; + }; + + uint64_t target_addr_id; + uint64_t req_id; + uint64_t resp_id; + uint64_t instr_ptr; +}; + +struct cper_sec_nonstd_err_ctx { + uint16_t reg_ctx_type; + uint16_t reg_arr_size; + uint32_t msr_addr; + uint64_t mm_reg_addr; + uint32_t reg_dump[CPER_ACA_REG_COUNT]; /* This buffer can grow */ +}; + +struct cper_sec_nonstd_err_hdr { + union { + struct { + uint64_t apic_id : 1; + uint64_t fw_id : 1; + uint64_t err_info_cnt : 6; /* should match context_cnt */ + uint64_t err_context_cnt : 6; /* should match info_cnt */ + } valid_bits; + uint64_t valid_mask; + }; + + uint64_t apic_id; + char fw_id[48]; +}; + +struct cper_sec_nonstd_err_body { + struct cper_sec_nonstd_err_info err_info; + struct cper_sec_nonstd_err_ctx err_ctx; +}; + +struct cper_sec_nonstd_err { + struct cper_sec_nonstd_err_hdr hdr; + struct cper_sec_nonstd_err_body body[]; /* Variable Size, today only 1 entry */ +}; + +struct cper_sec_crashdump_data { + uint16_t reg_ctx_type; + uint16_t reg_arr_size; + uint32_t reserved1; + uint64_t reserved2; + + union { + struct { + uint32_t status_lo; + uint32_t status_hi; + uint32_t addr_lo; + uint32_t addr_hi; + uint32_t ipid_lo; + uint32_t ipid_hi; + uint32_t synd_lo; + uint32_t synd_hi; + } fatal_err; + + struct 
{ + uint64_t msg[CPER_MAX_OAM_COUNT]; + } boot_err; + } dump; + +}; + +struct cper_sec_crashdump { + uint64_t reserved1; + uint64_t reserved2; + char fw_id[48]; + uint64_t reserved3[8]; + + struct cper_sec_crashdump_data data; +}; + +struct cper_sec { + union { + struct { + uint8_t fru_id : 1; + uint8_t fru_text : 1; + uint8_t reserved : 6; + } valid_bits; + uint8_t valid_mask; + }; + + union { + struct cper_sec_crashdump crashdump; + struct cper_sec_nonstd_err runtime_err; + }; +}; + +/* General CPER record structure */ +struct cper_1_0 { + struct cper_hdr *hdr; + struct cper_sec_desc *sec_desc; /* Variable Size */ + struct cper_sec *sec; /* Variable Size */ +}; + +#pragma pack() + +amdsmi_status_t amdsmi_get_gpu_cper_entries_by_path(const char *amdgpu_ring_cper_file, uint32_t severity_mask, + char *cper_data, uint64_t *buf_size, amdsmi_cper_hdr_t **cper_hdrs, + uint64_t *entry_count, uint64_t *cursor); +std::vector cper_decode(const amdsmi_cper_hdr_t *cper); diff --git a/projects/amdsmi/include/amd_smi/impl/amd_smi_utils.h b/projects/amdsmi/include/amd_smi/impl/amd_smi_utils.h index a550d651aa..1db60cbe7d 100644 --- a/projects/amdsmi/include/amd_smi/impl/amd_smi_utils.h +++ b/projects/amdsmi/include/amd_smi/impl/amd_smi_utils.h @@ -56,7 +56,6 @@ std::string smi_split_string(std::string str, char delim); std::string smi_amdgpu_get_status_string(amdsmi_status_t ret, bool fullStatus); amdsmi_status_t smi_clear_char_and_reinitialize(char buffer[], uint32_t len, std::string newString); -amdsmi_status_t amdsmi_get_gpu_cper_entries_by_path(const char *amdgpu_ring_cper_file, uint32_t severity_mask, char *cper_data, uint64_t *buf_size, amdsmi_cper_hdr_t **cper_hdrs, uint64_t *entry_count, uint64_t *cursor); /** * @brief Wait for user input, a debugging function to pause the program * diff --git a/projects/amdsmi/py-interface/amdsmi_interface.py b/projects/amdsmi/py-interface/amdsmi_interface.py index 8f96eed3b0..3142b7b565 100644 --- 
a/projects/amdsmi/py-interface/amdsmi_interface.py +++ b/projects/amdsmi/py-interface/amdsmi_interface.py @@ -60,6 +60,9 @@ AMDSMI_MAX_NUM_JPEG = 32 AMDSMI_MAX_NUM_XCC = 8 AMDSMI_MAX_NUM_XCP = 8 +# max num afids per cper record +MAX_NUMBER_OF_AFIDS_PER_RECORD = 12 + # Max number of DPM policies AMDSMI_MAX_NUM_PM_POLICIES = 32 @@ -1888,7 +1891,6 @@ def amdsmi_get_gpu_asic_info( # Remove commas from vendor name for clean output asic_info["vendor_name"] = asic_info["vendor_name"].replace(',', '') - # logging.debug("amdsmi_interface.py | amdsmi_get_gpu_asic_info | return_dictionary = \n" + str(json.dumps(asic_info, indent=4))) return asic_info @@ -2300,9 +2302,10 @@ def notifyTypeToString(notify_type_b): idx = idx +1 return "".join(guid[::-1]) -def amdsmi_get_gpu_cper_entries(processor_handle: amdsmi_wrapper.amdsmi_processor_handle, +def amdsmi_get_gpu_cper_entries( + processor_handle: amdsmi_wrapper.amdsmi_processor_handle, severity_mask: int, - buffer_size: int = 4*1048576, + buffer_size: int = 4 * 1048576, cursor: int = 0 ) -> Tuple[List[Dict[str, Any]], int]: @@ -2316,6 +2319,7 @@ def amdsmi_get_gpu_cper_entries(processor_handle: amdsmi_wrapper.amdsmi_processo buf_size = ctypes.c_uint64(buffer_size) entry_count = ctypes.c_uint64(20) cur = ctypes.c_uint64(cursor) + # Allocate a pointer for the CPER header array. cper_hdrs_array = (ctypes.POINTER(amdsmi_wrapper.amdsmi_cper_hdr_t) * 20)() cper_hdrs = ctypes.cast(cper_hdrs_array, ctypes.POINTER(ctypes.POINTER(amdsmi_wrapper.amdsmi_cper_hdr_t))) @@ -2336,51 +2340,114 @@ def amdsmi_get_gpu_cper_entries(processor_handle: amdsmi_wrapper.amdsmi_processo entries = {} cper_data = [] offset = 0 + # Iterate over each entry using its variable record_length. for i in range(entry_count.value): entry_address = ctypes.addressof(buf) + offset entry_ptr = ctypes.cast(entry_address, ctypes.POINTER(amdsmi_wrapper.amdsmi_cper_hdr_t)) + + # Extract the raw bytes and size of the entry. 
cper_data.append({ - "bytes":list((entry_ptr.contents.record_length * ctypes.c_byte).from_address(entry_address)), - "size":entry_ptr.contents.record_length + "bytes": list((entry_ptr.contents.record_length * ctypes.c_byte).from_address(entry_address)), + "size": entry_ptr.contents.record_length }) + # Extract the timestamp fields. year = entry_ptr.contents.timestamp.year - # Adjust the year if it's less than 100. You can tweak this logic based on your expected data. - if year < 100: - year += 2000 + if year < 100: # Adjust the year if it's less than 100. + year += 2000 formatted_timestamp = ( - f"{year:04d}/" - f"{entry_ptr.contents.timestamp.month:02d}/" - f"{entry_ptr.contents.timestamp.day:02d} " - f"{entry_ptr.contents.timestamp.hours:02d}:" - f"{entry_ptr.contents.timestamp.minutes:02d}:" - f"{entry_ptr.contents.timestamp.seconds:02d}" + f"{year:04d}/" + f"{entry_ptr.contents.timestamp.month:02d}/" + f"{entry_ptr.contents.timestamp.day:02d} " + f"{entry_ptr.contents.timestamp.hours:02d}:" + f"{entry_ptr.contents.timestamp.minutes:02d}:" + f"{entry_ptr.contents.timestamp.seconds:02d}" ) + + # Create a dictionary for the CPER entry. 
cper_entry = { - "error_severity": amdsmi_wrapper.amdsmi_cper_sev_t__enumvalues.get(entry_ptr.contents.error_severity, "AMDSMI_CPER_SEV_UNUSED").replace("AMDSMI_CPER_SEV_", "").lower(), + "error_severity": amdsmi_wrapper.amdsmi_cper_sev_t__enumvalues.get( + entry_ptr.contents.error_severity, "AMDSMI_CPER_SEV_UNUSED" + ).replace("AMDSMI_CPER_SEV_", "").lower(), "notify_type": _notifyTypeToString(entry_ptr.contents.notify_type.b), "timestamp": formatted_timestamp, - "signature" : entry_ptr.contents.signature, - "revision" : entry_ptr.contents.revision, - "signature_end" : hex(entry_ptr.contents.signature_end), - "sec_cnt" : entry_ptr.contents.sec_cnt, - "record_length" : entry_ptr.contents.record_length, - "platform_id" : entry_ptr.contents.platform_id, - "creator_id" : entry_ptr.contents.creator_id, - "record_id" : entry_ptr.contents.record_id, - "flags" : entry_ptr.contents.flags, - "persistence_info" : entry_ptr.contents.persistence_info, + "signature": entry_ptr.contents.signature, + "revision": entry_ptr.contents.revision, + "signature_end": hex(entry_ptr.contents.signature_end), + "sec_cnt": entry_ptr.contents.sec_cnt, + "record_length": entry_ptr.contents.record_length, + "platform_id": entry_ptr.contents.platform_id, + "creator_id": entry_ptr.contents.creator_id, + "record_id": entry_ptr.contents.record_id, + "flags": entry_ptr.contents.flags, + "persistence_info": entry_ptr.contents.persistence_info, #"reserved" : entry_ptr.contents.reserved #"cper_valid_bit" : entry_ptr.contents.cper_valid_bits, #"partition_id" : entry_ptr.contents.partition_id, } + entries[i] = cper_entry.copy() - offset += entry_ptr.contents.record_length # Use the actual record length to advance the offset + offset += entry_ptr.contents.record_length # Use the actual record length to advance the offset. 
def amdsmi_get_afids_from_cper(
    cper_afid_data: Union[bytes, bytearray, List[Dict[str, Any]]]
) -> Tuple[List[int], int]:
    """
    Extract AFIDs from one or more CPER blobs.

    Args:
        cper_afid_data: Either
            - raw bytes or bytearray of a single CPER record, or
            - a list of dicts each with keys "bytes" (bytes-like or List[int])
              and "size" (int). Integer lists may hold signed values
              (-128..127), which is what listing a ``ctypes.c_byte`` array
              yields; they are reinterpreted as unsigned bytes.

    Returns:
        Tuple[List[int], int]: A tuple containing:
            - A list of extracted AFIDs (all records concatenated, in order).
            - The total count of AFIDs.

    Raises:
        AmdSmiLibraryException: if the library call fails for any record.
        ValueError: if a record's payload is longer than its declared "size"
            (raised by ``ctypes.create_string_buffer``).
    """
    # Normalize a single blob into a one-element list of records. The blob is
    # kept as-is; there is no need to expand it into a list of ints first.
    if isinstance(cper_afid_data, (bytes, bytearray)):
        cper_records = [{
            "bytes": cper_afid_data,
            "size": len(cper_afid_data)
        }]
    else:
        cper_records = cper_afid_data

    all_afids: List[int] = []

    for record in cper_records:
        payload = record["bytes"]
        if isinstance(payload, (bytes, bytearray)):
            raw_bytes = bytes(payload)
        else:
            # Mask to 0..255: bytes() rejects negative ints, and signed byte
            # lists are a documented input form for this function.
            raw_bytes = bytes(value & 0xFF for value in payload)
        record_size = record["size"]

        # Wrap as char*
        buf = ctypes.create_string_buffer(raw_bytes, record_size)
        buf_ptr = ctypes.cast(buf, ctypes.POINTER(ctypes.c_char))

        # In/out parameter: initialised to the capacity of afid_array.
        afid_array = (ctypes.c_uint64 * MAX_NUMBER_OF_AFIDS_PER_RECORD)()
        num_afids_ct = ctypes.c_uint32(MAX_NUMBER_OF_AFIDS_PER_RECORD)

        # Call the wrapper function
        status = amdsmi_wrapper.amdsmi_get_afids_from_cper(
            buf_ptr,
            ctypes.c_uint32(record_size),
            afid_array,
            ctypes.byref(num_afids_ct)
        )
        if status != amdsmi_wrapper.AMDSMI_STATUS_SUCCESS:
            # NOTE(review): confirm AmdSmiLibraryException accepts a message
            # string here rather than the raw status code.
            raise AmdSmiLibraryException(f"get_afids failed: {status}")

        # Collect exactly the decoded AFIDs, never reading past the array
        # even if the reported count exceeds its capacity.
        count = min(num_afids_ct.value, MAX_NUMBER_OF_AFIDS_PER_RECORD)
        all_afids.extend(afid_array[:count])

    return all_afids, len(all_afids)
struct_amdsmi_cper_hdr_t amdsmi_get_gpu_cper_entries = _libraries['libamd_smi.so'].amdsmi_get_gpu_cper_entries amdsmi_get_gpu_cper_entries.restype = amdsmi_status_t amdsmi_get_gpu_cper_entries.argtypes = [amdsmi_processor_handle, uint32_t, ctypes.POINTER(ctypes.c_char), ctypes.POINTER(ctypes.c_uint64), ctypes.POINTER(ctypes.POINTER(struct_amdsmi_cper_hdr_t)), ctypes.POINTER(ctypes.c_uint64), ctypes.POINTER(ctypes.c_uint64)] +amdsmi_get_afids_from_cper = _libraries['libamd_smi.so'].amdsmi_get_afids_from_cper +amdsmi_get_afids_from_cper.restype = amdsmi_status_t +amdsmi_get_afids_from_cper.argtypes = [ctypes.POINTER(ctypes.c_char), uint32_t, ctypes.POINTER(ctypes.c_uint64), ctypes.POINTER(ctypes.c_uint32)] amdsmi_get_gpu_ecc_status = _libraries['libamd_smi.so'].amdsmi_get_gpu_ecc_status amdsmi_get_gpu_ecc_status.restype = amdsmi_status_t amdsmi_get_gpu_ecc_status.argtypes = [amdsmi_processor_handle, amdsmi_gpu_block_t, ctypes.POINTER(amdsmi_ras_err_state_t)] @@ -3171,9 +3174,9 @@ __all__ = \ 'amdsmi_free_name_value_pairs', 'amdsmi_freq_ind_t', 'amdsmi_freq_volt_region_t', 'amdsmi_frequencies_t', 'amdsmi_frequency_range_t', 'amdsmi_fw_block_t', - 'amdsmi_fw_info_t', 'amdsmi_get_clk_freq', - 'amdsmi_get_clock_info', 'amdsmi_get_cpu_cclk_limit', - 'amdsmi_get_cpu_core_boostlimit', + 'amdsmi_fw_info_t', 'amdsmi_get_afids_from_cper', + 'amdsmi_get_clk_freq', 'amdsmi_get_clock_info', + 'amdsmi_get_cpu_cclk_limit', 'amdsmi_get_cpu_core_boostlimit', 'amdsmi_get_cpu_core_current_freq_limit', 'amdsmi_get_cpu_core_energy', 'amdsmi_get_cpu_current_io_bandwidth', diff --git a/projects/amdsmi/src/CMakeLists.txt b/projects/amdsmi/src/CMakeLists.txt index d28bee67b9..10f9c832ff 100644 --- a/projects/amdsmi/src/CMakeLists.txt +++ b/projects/amdsmi/src/CMakeLists.txt @@ -16,6 +16,7 @@ set(INC_DIR "${PROJECT_SOURCE_DIR}/include/amd_smi") set(SRC_LIST "${SRC_DIR}/amd_smi.cc" + "${SRC_DIR}/amd_smi_cper.cc" "${SRC_DIR}/amd_smi_common.cc" "${SRC_DIR}/amd_smi_drm.cc" 
"${SRC_DIR}/amd_smi_gpu_device.cc" @@ -29,6 +30,7 @@ set(SRC_LIST set(INC_LIST "${INC_DIR}/amdsmi.h" "${INC_DIR}/impl/amd_smi_common.h" + "${INC_DIR}/impl/amd_smi_cper.h" "${INC_DIR}/impl/amd_smi_processor.h" "${INC_DIR}/impl/amd_smi_drm.h" "${INC_DIR}/impl/amd_smi_gpu_device.h" @@ -38,6 +40,13 @@ set(INC_LIST "${PROJECT_SOURCE_DIR}/rocm_smi/include/rocm_smi/rocm_smi.h" "${PROJECT_SOURCE_DIR}/rocm_smi/include/rocm_smi/rocm_smi_utils.h") +set(ACA_SRC_DIR "aca-decode") +set(SRC_LIST ${SRC_LIST} ${ACA_SRC_DIR}/aca_decode.c ${ACA_SRC_DIR}/aca_fields.c ${ACA_SRC_DIR}/aca_tables.c + ${ACA_SRC_DIR}/error_map.c) +set(ACA_INC_DIR "${PROJECT_SOURCE_DIR}/include/aca-decode") +set(INC_LIST ${INC_LIST} ${ACA_INC_DIR}/aca_decode.h ${ACA_INC_DIR}/aca_fields.h ${ACA_INC_DIR}/aca_tables.h + ${ACA_INC_DIR}/error_map.h) + if(ENABLE_ESMI_LIB) list(APPEND INC_LIST ${ESMI_INC_DIR}/e_smi/e_smi.h) list(APPEND INC_LIST ${ESMI_INC_DIR}/e_smi/e_smi_monitor.h) @@ -72,7 +81,7 @@ target_link_libraries(amd_smi_ex ${AMD_SMI}) add_library(${AMD_SMI} ${SRC_LIST} ${INC_LIST}) target_link_libraries(${AMD_SMI} pthread rt dl ${DRM_LIBRARIES} ${AMDGPU_DRM_LIBRARIES}) target_include_directories(${AMD_SMI} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} ${PROJECT_SOURCE_DIR}/rocm_smi/include - ${PROJECT_SOURCE_DIR}/common/shared_mutex) + ${PROJECT_SOURCE_DIR}/common/shared_mutex ${ACA_INC_DIR}) # use the target_include_directories() command to specify the include directories for the target target_include_directories(${AMD_SMI} PUBLIC "$" diff --git a/projects/amdsmi/src/aca-decode/aca_decode.c b/projects/amdsmi/src/aca-decode/aca_decode.c new file mode 100644 index 0000000000..b38fada092 --- /dev/null +++ b/projects/amdsmi/src/aca-decode/aca_decode.c @@ -0,0 +1,285 @@ +/** + * @file aca_decode.c + * @brief Implementation of ACA error decoding functions + * + * This file contains functions for decoding and analyzing ACA error information from + * raw register data. 
It provides functionality to determine error severity, bank + * information, and specific error types based on hardware-specific error codes. + */ + +#include "aca_decode.h" +#include "aca_tables.h" +#include "error_map.h" +#include + +/** + * @brief Gets the bank name based on hardware ID and ACA type + * @param[in] decoder Pointer to the ACA decoder structure + * @param[out] bank_name Pointer to a string containing the bank name + * @return 0 on success, -1 on failure + */ +static int +aca_decoder_get_bank(const aca_decoder_t *decoder, const char **bank_name) +{ + if (!decoder || !bank_name) + { + return -1; + } + + const aca_ipid_fields_t *ipid = &decoder->ipid; + return find_bank_name(ipid->hardware_id, ipid->aca_type, bank_name); +} + +/** + * @brief Determines the error severity based on status fields + * @param[in] status Pointer to the ACA status fields structure + * @return String indicating error severity: "Fatal", "Uncorrected, Non-fatal", "Corrected", or "UNKNOWN" + */ +static const char *get_error_severity(const aca_status_fields_t *status) +{ + if (status->poison) + return "Uncorrected, Non-fatal"; + if (status->pcc) + return "Fatal"; + if (!status->pcc && status->uc && status->tcc) + return "Fatal"; + if (!status->pcc && status->uc && !status->tcc) + return "Uncorrected, Non-fatal"; + if (!status->pcc && !status->uc && !status->tcc && status->deferred) + return "Uncorrected, Non-fatal"; + if (!status->pcc && !status->uc && !status->tcc && !status->deferred) + return "Corrected"; + return "UNKNOWN"; +} + +/** + * @brief Determines the error category based on bank and error type + * @param[in] bank Pointer to the bank name + * @param[in] error_type Pointer to the error type + * @return String indicating error category: "HBM Errors", "Off-Package Link Errors", or "Device Internal Errors" + */ +static const char *get_error_category(const char *bank, const char *error_type) +{ + if (!bank || !error_type) + { + return "UNKNOWN"; + } + + if (strcmp(bank, 
"umc") == 0) + { + if (strcmp(error_type, "On-die ECC") == 0 || + strcmp(error_type, "WriteDataPoisonErr") == 0 || + strcmp(error_type, "AddressCommandParityErr") == 0 || + strcmp(error_type, "WriteDataCrcErr") == 0 || + strcmp(error_type, "EcsErr") == 0 || + strcmp(error_type, "RdCrcErr") == 0 || + strcmp(error_type, "End-to-end CRC") == 0) + { + return "HBM Errors"; + } + } + else if (strcmp(bank, "pcs_xgmi") == 0 || + strcmp(bank, "kpx_serdes") == 0 || + strcmp(bank, "kpx_wafl") == 0 || + (strcmp(bank, "psp") == 0 && strcmp(error_type, "WAFL") == 0)) + { + return "Off-Package Link Errors"; + } + + return "Device Internal Errors"; +} + +/** + * @brief Determines the service error type from error attributes + * @param[in] error_category Pointer to the error category string + * @param[in] error_bank Pointer to the error bank string + * @param[in] error_type Pointer to the error type string + * @param[in] error_severity Pointer to the error severity string + * @param[out] service_error_type Pointer to store the resulting service error type string + * @return 0 on success, non-zero on failure + */ +static int get_service_error_type(const char *error_category, const char *error_bank, const char *error_type, + const char *error_severity, const char **service_error_type) +{ + if (!error_category || !error_type || !error_severity || !service_error_type || + strcmp(error_category, "UNKNOWN") == 0 || + strcmp(error_type, "UNKNOWN") == 0 || + strcmp(error_severity, "UNKNOWN") == 0) + { + return -1; + } + if (strcmp(error_type, "Bad Page Retirement Threshold") == 0) + { + *service_error_type = "Bad Page Retirement Threshold"; + return 0; + } + if (strcmp(error_type, "RdCrcErr") == 0) + { + *service_error_type = "End-to-end CRC"; + return 0; + } + if ((strcmp(error_category, "HBM Errors") == 0) && (strcmp(error_severity, "Corrected") == 0)) + { + *service_error_type = "All"; + return 0; + } + if ((strcmp(error_category, "HBM Errors") == 0) && (strcmp(error_severity, "Fatal") 
== 0) && + (strcmp(error_type, "On-die ECC") != 0) && (strcmp(error_type, "End-to-end CRC") != 0)) + { + *service_error_type = "All Others"; + return 0; + } + if (strcmp(error_category, "Device Internal Errors") == 0) + { + if ((strcmp(error_severity, "Uncorrected, Non-fatal") == 0 || + strcmp(error_severity, "Corrected") == 0 || + strcmp(error_severity, "Fatal") == 0) && + strcmp(error_type, "Hardware Assertion (HWA)") != 0 && + strcmp(error_type, "Watchdog Timeout (WDT)") != 0) + { + *service_error_type = "All Others"; + return 0; + } + } + if (strcmp(error_category, "Off-Package Link Errors") == 0) + { + if (strcmp(error_bank, "pcs_xgmi") == 0) + { + *service_error_type = "XGMI"; + return 0; + } + if (strcmp(error_bank, "kpx_wafl") == 0) + { + *service_error_type = "WAFL"; + return 0; + } + } + + return -1; +} + +/** + * @brief Extracts error information from the decoder and populates the info structure + * @param[in] decoder Pointer to the ACA decoder structure + * @param[out] info Pointer to the error info structure to be populated + */ +static void aca_decoder_get_error_info(const aca_decoder_t *decoder, aca_error_info_t *info) +{ + const char *bank; + const char *error_type; + int result; + + result = aca_decoder_get_bank(decoder, &bank); + if (result < 0) + { + bank = "UNKNOWN"; + } + info->bank_ref = bank; + + // 0b1000 indicate error threshold has been exceeded, and is always fatal + if (decoder->flags & 0x8) + { + info->severity_ref = "Fatal"; + } + else + { + info->severity_ref = get_error_severity(&decoder->status); + } + + if (decoder->status.error_code_ext >= 0x3A && decoder->status.error_code_ext <= 0x3E) + { + uint32_t instance_id = decoder->ipid.instance_id_lo; + uint32_t error_info = decoder->synd.error_information & 0xFF; + + if ((instance_id == 0x36430400 || instance_id == 0x38430400 || + instance_id == 0x36430401 || instance_id == 0x38430401) && + find_error_in_table(xcd_error_table, NUM_XCD_ERRORS, error_info, &error_type) == 0) + { + 
info->error_type_ref = error_type; + } + else if ((instance_id == 0x3B30400 || instance_id == 0x3B30401) && + find_error_in_table(aid_error_table, NUM_AID_ERRORS, error_info, &error_type) == 0) + { + info->error_type_ref = error_type; + } + else + { + info->error_type_ref = "UNKNOWN"; + } + } + // 0b1000 indicate error threshold has been exceeded + else if (decoder->flags & 0x8) + { + info->error_type_ref = "Bad Page Retirement Threshold"; + } + else + { + if (find_error_type_by_bank(bank, decoder->status.error_code_ext, &error_type) == 0) + { + info->error_type_ref = error_type; + } + else + { + info->error_type_ref = "UNKNOWN"; + } + } + + // 0b1000 indicate error threshold has been exceeded, and is always a HBM error + if (decoder->flags & 0x8) + { + info->category_ref = "HBM Errors"; + } + else + { + info->category_ref = get_error_category(bank, info->error_type_ref); + } + + const char *service_error; + if (get_service_error_type(info->category_ref, info->bank_ref, info->error_type_ref, info->severity_ref, &service_error) != 0) + { + service_error = info->error_type_ref; + } + + info->afid = get_error_id(info->category_ref, service_error, info->severity_ref); +} + +/** + * @brief Initializes an ACA decoder structure with raw register values + * @param[out] decoder Pointer to the decoder structure to initialize + * @param[in] hw_revision Hardware hw_revision number + * @param[in] flags Decoder flags + * @param[in] status_reg Raw status register value + * @param[in] ipid_reg Raw IPID register value + * @param[in] synd_reg Raw syndrome register value + */ +static void aca_decoder_init(aca_decoder_t *decoder, uint16_t hw_revision, uint32_t flags, + uint64_t status_reg, uint64_t ipid_reg, uint64_t synd_reg) +{ + memset(decoder, 0, sizeof(aca_decoder_t)); + + decoder->hw_revision = hw_revision; + decoder->flags = flags; + decoder->aca_status = status_reg; + decoder->aca_ipid = ipid_reg; + decoder->aca_synd = synd_reg; + + aca_status_init(&decoder->status, 
status_reg); + aca_ipid_init(&decoder->ipid, ipid_reg); + aca_synd_init(&decoder->synd, synd_reg); +} + +aca_error_info_t aca_decode(const aca_raw_data_t *raw_data) +{ + aca_decoder_t decoder = {0}; + aca_error_info_t info = {0}; + + aca_decoder_init(&decoder, + raw_data->hw_revision, + raw_data->flags, + raw_data->aca_status, + raw_data->aca_ipid, + raw_data->aca_synd); + + aca_decoder_get_error_info(&decoder, &info); + return info; +} diff --git a/projects/amdsmi/src/aca-decode/aca_fields.c b/projects/amdsmi/src/aca-decode/aca_fields.c new file mode 100644 index 0000000000..465c33fa92 --- /dev/null +++ b/projects/amdsmi/src/aca-decode/aca_fields.c @@ -0,0 +1,76 @@ +/** + * @file aca_fields.c + * @brief Implementation of ACA register field handling + * + * This file contains functions for initializing and reading various ACA register fields + * including status, IPID, and syndrome registers. Each function + * extracts specific bit fields from raw register values and populates corresponding + * field structures. 
+ */ + +#include "aca_fields.h" + +/** + * @brief Extracts a bit field from a value + * @param[in] value The source value to extract bits from + * @param[in] start Starting bit position + * @param[in] count Number of bits to extract + * @param[in] type The type to cast the extracted bits to + * @return The extracted bits as a value of the specified type + */ +#define EXTRACT_BITS(value, start, count, type) ((type)(((value) >> (start)) & ((1ULL << (count)) - 1))) + +uint64_t aca_fields_read(const aca_fields_t *fields) +{ + return fields->raw_value; +} + +void aca_status_init(aca_status_fields_t *fields, uint64_t status_reg) +{ + fields->base.raw_value = status_reg; + fields->error_code = EXTRACT_BITS(status_reg, 0, 16, uint16_t); + fields->error_code_ext = EXTRACT_BITS(status_reg, 16, 6, uint8_t); + fields->reserv22 = EXTRACT_BITS(status_reg, 22, 2, uint8_t); + fields->addr_lsb = EXTRACT_BITS(status_reg, 24, 6, uint8_t); + fields->reserv30 = EXTRACT_BITS(status_reg, 30, 2, uint8_t); + fields->err_core_id = EXTRACT_BITS(status_reg, 32, 6, uint8_t); + fields->reserv38 = EXTRACT_BITS(status_reg, 38, 2, uint8_t); + fields->scrub = EXTRACT_BITS(status_reg, 40, 1, uint8_t); + fields->reserv41 = EXTRACT_BITS(status_reg, 41, 2, uint8_t); + fields->poison = EXTRACT_BITS(status_reg, 43, 1, uint8_t); + fields->deferred = EXTRACT_BITS(status_reg, 44, 1, uint8_t); + fields->uecc = EXTRACT_BITS(status_reg, 45, 1, uint8_t); + fields->cecc = EXTRACT_BITS(status_reg, 46, 1, uint8_t); + fields->reserv47 = EXTRACT_BITS(status_reg, 47, 5, uint8_t); + fields->synd_v = EXTRACT_BITS(status_reg, 53, 1, uint8_t); + fields->reserv54 = EXTRACT_BITS(status_reg, 54, 1, uint8_t); + fields->tcc = EXTRACT_BITS(status_reg, 55, 1, uint8_t); + fields->err_core_id_val = EXTRACT_BITS(status_reg, 56, 1, uint8_t); + fields->pcc = EXTRACT_BITS(status_reg, 57, 1, uint8_t); + fields->addr_v = EXTRACT_BITS(status_reg, 58, 1, uint8_t); + fields->misc_v = EXTRACT_BITS(status_reg, 59, 1, uint8_t); + fields->en 
= EXTRACT_BITS(status_reg, 60, 1, uint8_t); + fields->uc = EXTRACT_BITS(status_reg, 61, 1, uint8_t); + fields->overflow = EXTRACT_BITS(status_reg, 62, 1, uint8_t); + fields->val = EXTRACT_BITS(status_reg, 63, 1, uint8_t); +} + +void aca_ipid_init(aca_ipid_fields_t *fields, uint64_t ipid_reg) +{ + fields->base.raw_value = ipid_reg; + fields->instance_id_lo = EXTRACT_BITS(ipid_reg, 0, 32, uint32_t); + fields->hardware_id = EXTRACT_BITS(ipid_reg, 32, 12, uint16_t); + fields->instance_id_hi = EXTRACT_BITS(ipid_reg, 44, 4, uint8_t); + fields->aca_type = EXTRACT_BITS(ipid_reg, 48, 16, uint16_t); +} + +void aca_synd_init(aca_synd_fields_t *fields, uint64_t synd_reg) +{ + fields->base.raw_value = synd_reg; + fields->error_information = EXTRACT_BITS(synd_reg, 0, 18, uint32_t); + fields->length = EXTRACT_BITS(synd_reg, 18, 6, uint8_t); + fields->error_priority = EXTRACT_BITS(synd_reg, 24, 3, uint8_t); + fields->reserved27 = EXTRACT_BITS(synd_reg, 27, 5, uint8_t); + fields->syndrome = EXTRACT_BITS(synd_reg, 32, 7, uint16_t); + fields->reserved39 = EXTRACT_BITS(synd_reg, 39, 25, uint32_t); +} diff --git a/projects/amdsmi/src/aca-decode/aca_tables.c b/projects/amdsmi/src/aca-decode/aca_tables.c new file mode 100644 index 0000000000..aecc61e985 --- /dev/null +++ b/projects/amdsmi/src/aca-decode/aca_tables.c @@ -0,0 +1,368 @@ +/** + * @file aca_tables.c + * @brief ACA Decode Tables Implementation + * + * This file contains lookup tables and helper functions for mapping ACA error codes + * to human-readable strings. 
It includes: + * - Bank mapping table for hardware IDs and ACA types + * - Error type mapping table for bank-specific error codes + * - GFX error mapping tables for XCD and AID errors + * - Lookup functions to find bank names and error types + */ + +#include "aca_tables.h" +#include +#include +#include + +/** + * @brief Mapping table for hardware IDs and ACA types to bank names + */ +const aca_bank_entry_t bank_table[] = { + {0x2E, 0x02, "cs"}, + {0x2E, 0x01, "pie"}, + {0x96, 0x00, "umc"}, + {0xFF, 0x01, "psp"}, + {0x01, 0x01, "smu"}, + {0x18, 0x00, "nbio"}, + {0x46, 0x01, "pcie"}, + {0x05, 0x00, "pb"}, + {0x259, 0x00, "kpx_serdes"}, + {0x2E, 0x04, "mall"}, + {0x267, 0x00, "kpx_wafl"}, + {0x50, 0x00, "pcs_xgmi"}, + {0x6C, 0x00, "nbif"}, + {0x80, 0x00, "shub"}, + {0x170, 0x00, "usr_dp"}, + {0x180, 0x00, "usr_cp"}}; + +/** + * @brief Mapping table for bank-specific error codes to error types + */ +const aca_error_type_t error_table[] = { + {"cs", 0x0, "FTI_ILL_REQ"}, + {"cs", 0x1, "FTI_ADDR_VIOL"}, + {"cs", 0x2, "FTI_SEC_VIOL"}, + {"cs", 0x3, "FTI_ILL_RSP"}, + {"cs", 0x4, "FTI_RSP_NO_MTCH"}, + {"cs", 0x5, "FTI_PAR_ERR"}, + {"cs", 0x6, "SDP_PAR_ERR"}, + {"cs", 0x7, "ATM_PAR_ERR"}, + {"cs", 0x8, "SDP_RSP_NO_MTCH"}, + {"cs", 0x9, "SPF_PRT_ERR"}, + {"cs", 0xa, "SPF_ECC_ERR"}, + {"cs", 0xb, "SDP_UNEXP_RETRY"}, + {"cs", 0xc, "CNTR_OVFL"}, + {"cs", 0xd, "CNTR_UNFL"}, + {"cs", 0xe, "FTI_ND_ILL_REQ"}, + {"cs", 0xf, "FTI_ND_ADDR_VIOL"}, + {"cs", 0x10, "FTI_ND_SEC_VIOL"}, + {"cs", 0x11, "Hardware Assertion (HWA)"}, + {"cs", 0x12, "ST_PRT_ERR"}, + {"cs", 0x13, "ST_ECC_ERR"}, + {"cs", 0x14, "ST_TXN_ERR"}, + {"pie", 0x0, "Hardware Assertion (HWA)"}, + {"pie", 0x1, "CSW"}, + {"pie", 0x2, "GMI"}, + {"pie", 0x3, "FTI_DAT_STAT"}, + {"pie", 0x4, "DEF"}, + {"pie", 0x5, "Watchdog Timeout (WDT)"}, + {"pie", 0x6, "CNLI"}, + {"pie", 0x7, "RSLVFCI"}, + {"umc", 0x0, "On-die ECC"}, + {"umc", 0x1, "WriteDataPoisonErr"}, + {"umc", 0x2, "SdpParityErr"}, + {"umc", 0x4, "AddressCommandParityErr"}, 
+ {"umc", 0x5, "WriteDataCrcErr"}, + {"umc", 0x6, "SramEccErr"}, + {"umc", 0x9, "EcsErr"}, + {"umc", 0xa, "ThrttlErr"}, + {"umc", 0xb, "RdCrcErr"}, + {"umc", 0xd, "MpFwErr"}, + {"umc", 0xe, "MpParErr"}, + {"umc", 0xf, "End-to-end CRC"}, + {"psp", 0x0, "Mp0HighSramError"}, + {"psp", 0x1, "Mp0LowSramError"}, + {"psp", 0x2, "Mp0IDataBank0Error"}, + {"psp", 0x3, "Mp0IDataBank1Error"}, + {"psp", 0x4, "Mp0ITagRam0Error"}, + {"psp", 0x5, "Mp0ITagRam1Error"}, + {"psp", 0x6, "Mp0DDataBank0Error"}, + {"psp", 0x7, "Mp0DDataBank1Error"}, + {"psp", 0x8, "Mp0DDataBank2Error"}, + {"psp", 0x9, "Mp0DDataBank3Error"}, + {"psp", 0xa, "Mp0DTagBank0Error"}, + {"psp", 0xb, "Mp0DTagBank1Error"}, + {"psp", 0xc, "Mp0DTagBank2Error"}, + {"psp", 0xd, "Mp0DTagBank3Error"}, + {"psp", 0xe, "Mp0DDirtyRamError"}, + {"psp", 0xf, "Mp0TlbBank0Error"}, + {"psp", 0x10, "Mp0TlbBank1Error"}, + {"psp", 0x11, "Mp0SHubIfRdBufError"}, + {"psp", 0x12, "PhyRamEccError"}, + {"psp", 0x3a, "PoisonDataConsumption"}, + {"psp", 0x3b, "SRAM_EDC"}, + {"psp", 0x3c, "SMN_Parity"}, + {"psp", 0x3d, "SMN_Timeout"}, + {"psp", 0x3f, "WAFL"}, + {"smu", 0x0, "Mp5HighSramError"}, + {"smu", 0x1, "Mp5LowSramError"}, + {"smu", 0x2, "Mp5DCacheAError"}, + {"smu", 0x3, "Mp5DCacheBError"}, + {"smu", 0x4, "Mp5DTagAError"}, + {"smu", 0x5, "Mp5DTagBError"}, + {"smu", 0x6, "Mp5ICacheAError"}, + {"smu", 0x7, "Mp5ICacheBError"}, + {"smu", 0x8, "Mp5ITagAError"}, + {"smu", 0x9, "Mp5ITagBError"}, + {"smu", 0xb, "PhyRamEccError"}, + {"smu", 0x3a, "GFX_IP_Correctable_Error"}, + {"smu", 0x3b, "GFX_IP_Fatal_Error"}, + {"smu", 0x3d, "Reserved"}, + {"smu", 0x3e, "GFX_IP_Poison_Error"}, + {"nbio", 0x0, "EccParityError"}, + {"nbio", 0x1, "PCIE_Sideband"}, + {"nbio", 0x2, "Ext_ErrEvent"}, + {"nbio", 0x3, "Egress_Poison"}, + {"nbio", 0x4, "IOHC_Internal_Poison"}, + {"nbio", 0x5, "Int_ErrEvent"}, + {"pcie", 0x0, "SDP_PARITY_ERR_LOG"}, + {"pb", 0x0, "EccError"}, + {"kpx_serdes", 0x0, "RAMECC"}, + {"kpx_serdes", 0x1, "ARCIns"}, + {"kpx_serdes", 0x2, 
"ARCData"}, + {"kpx_serdes", 0x3, "APB"}, + {"mall", 0x0, "CNTR_OVFL"}, + {"mall", 0x1, "CNTR_UNFL"}, + {"mall", 0x2, "CSDP_PAR_ERR"}, + {"mall", 0x3, "USDP_PAR_ERR"}, + {"mall", 0x4, "CACHE_TAG0_ERR"}, + {"mall", 0x5, "CACHE_TAG1_ERR"}, + {"mall", 0x6, "CACHE_DAT_ERR"}, + {"kpx_wafl", 0x0, "RAMECC"}, + {"kpx_wafl", 0x1, "ARCIns"}, + {"kpx_wafl", 0x2, "ARCData"}, + {"kpx_wafl", 0x3, "APB"}, + {"pcs_xgmi", 0x0, "DataLossErr"}, + {"pcs_xgmi", 0x1, "TrainingErr"}, + {"pcs_xgmi", 0x2, "FlowCtrlAckErr"}, + {"pcs_xgmi", 0x3, "RxFifoUnderflowErr"}, + {"pcs_xgmi", 0x4, "RxFifoOverflowErr"}, + {"pcs_xgmi", 0x5, "CRCErr"}, + {"pcs_xgmi", 0x6, "BERExceededErr"}, + {"pcs_xgmi", 0x7, "TxMetaDataErr_TxVcidDataErr"}, + {"pcs_xgmi", 0x8, "ReplayBufParityErr"}, + {"pcs_xgmi", 0x9, "DataParityErr"}, + {"pcs_xgmi", 0xa, "ReplayFifoOverflowErr"}, + {"pcs_xgmi", 0xb, "ReplaFifoUnderflowErr"}, + {"pcs_xgmi", 0xc, "ElasticFifoOverflowErr"}, + {"pcs_xgmi", 0xd, "DeskewErr"}, + {"pcs_xgmi", 0xe, "FlowCtrlCRCErr"}, + {"pcs_xgmi", 0xf, "DataStartupLimitErr"}, + {"pcs_xgmi", 0x10, "FCInitTimeoutErr"}, + {"pcs_xgmi", 0x11, "RecoveryTimeoutErr"}, + {"pcs_xgmi", 0x12, "ReadySerialTimeoutErr"}, + {"pcs_xgmi", 0x13, "ReadySerialAttemptErr"}, + {"pcs_xgmi", 0x14, "RecoveryAttemptErr"}, + {"pcs_xgmi", 0x15, "RecoveryRelockAttemptErr"}, + {"pcs_xgmi", 0x16, "ReplayAttemptErr"}, + {"pcs_xgmi", 0x17, "SyncHdrErr"}, + {"pcs_xgmi", 0x18, "TxReplayTimeoutErr"}, + {"pcs_xgmi", 0x19, "RxReplayTimeoutErr"}, + {"pcs_xgmi", 0x1a, "LinkSubTxTimeoutErr"}, + {"pcs_xgmi", 0x1b, "LinkSubRxTimeoutErr"}, + {"pcs_xgmi", 0x1c, "RxCMDPktErr"}, + {"nbif", 0x0, "TIMEOUT_ERR"}, + {"nbif", 0x1, "SRAM_ECC_ERR"}, + {"nbif", 0x2, "NTB_ERR_EVENT"}, + {"nbif", 0x3, "SDP_PARITY_ERR"}, + {"shub", 0x0, "TIMEOUT_ERR"}, + {"shub", 0x1, "SRAM_ECC_ERR"}, + {"shub", 0x2, "NTB_ERR_EVENT"}, + {"shub", 0x3, "SDP_PARITY_ERR"}, + {"usr_dp", 0x0, "MstCMDErr"}, + {"usr_dp", 0x1, "MstRxFIFOErr"}, + {"usr_dp", 0x2, "MstDeskewErr"}, + {"usr_dp", 
0x3, "MstDetectTimeoutErr"}, + {"usr_dp", 0x4, "MstFlowControlErr"}, + {"usr_dp", 0x5, "MstDataValidFifoErr"}, + {"usr_dp", 0x6, "macLinkStateErr"}, + {"usr_dp", 0x7, "DeskewErr"}, + {"usr_dp", 0x8, "InitTimeoutErr"}, + {"usr_dp", 0x9, "InitAttemptErr"}, + {"usr_dp", 0xa, "RecoveryTimeoutErr"}, + {"usr_dp", 0xb, "RecoveryAttemptErr"}, + {"usr_dp", 0xc, "EyeTrainingTimeoutErr"}, + {"usr_dp", 0xd, "DataStartupLimitErr"}, + {"usr_dp", 0xe, "LS0ExitErr"}, + {"usr_dp", 0xf, "PLLpowerStateUpdateTimeoutErr"}, + {"usr_dp", 0x10, "RxFifoErr"}, + {"usr_dp", 0x11, "LcuErr"}, + {"usr_dp", 0x12, "convCECCErr"}, + {"usr_dp", 0x13, "convUECCErr"}, + {"usr_dp", 0x15, "rxDataLossErr"}, + {"usr_dp", 0x16, "ReplayCECCErr"}, + {"usr_dp", 0x17, "ReplayUECCErr"}, + {"usr_dp", 0x18, "CRCErr"}, + {"usr_dp", 0x19, "BERExceededErr"}, + {"usr_dp", 0x1a, "FCInitTimeoutErr"}, + {"usr_dp", 0x1b, "FCInitAttemptErr"}, + {"usr_dp", 0x1c, "ReplayTimoutErr"}, + {"usr_dp", 0x1d, "ReplayAttemptErr"}, + {"usr_dp", 0x1e, "ReplayUnderflowErr"}, + {"usr_dp", 0x1f, "ReplayOverflowErr"}, + {"usr_cp", 0x0, "PacketTypeErr"}, + {"usr_cp", 0x1, "RxFifoErr"}, + {"usr_cp", 0x2, "DeskewErr"}, + {"usr_cp", 0x3, "RxDetectTimeoutErr"}, + {"usr_cp", 0x4, "DataParityErr"}, + {"usr_cp", 0x5, "DataLossErr"}, + {"usr_cp", 0x6, "LcuErr"}, + {"usr_cp", 0x7, "HB1HandshakeTimeoutErr"}, + {"usr_cp", 0x8, "HB2HandshakeTimeoutErr"}, + {"usr_cp", 0x9, "ClkSleepRspTimeoutErr"}, + {"usr_cp", 0xa, "ClkWakeRspTimeoutErr"}, + {"usr_cp", 0xb, "resetAttackErr"}, + {"usr_cp", 0xc, "remoteLinkFatalErr"}, +}; + +/** + * @brief Error GFX mapping table for XCD errors + */ +const aca_error_entry_t xcd_error_table[] = { + {0x0, "GfxGcError"}, + {0x1, "GfxGcError"}, + {0x2, "GfxGcError"}, + {0x3, "GfxGcError"}, + {0x4, "GfxGcError"}, + {0x5, "GfxGcError"}, + {0x6, "GfxGcError"}, + {0x7, "GfxGcError"}, + {0x8, "GfxGcError"}, + {0x9, "GfxGcError"}, + {0xa, "GfxGcError"}, + {0xb, "GfxGcError"}, + {0xc, "GfxGcError"}, + {0xd, "GfxGcError"}, + {0xe, 
"GfxGcError"}, + {0xf, "GfxGcError"}, + {0x10, "GfxGcError"}, + {0x28, "Reserved"}, + {0x2a, "Reserved"}}; + +/** + * @brief Error GFX mapping table for AID errors + */ +const aca_error_entry_t aid_error_table[] = { + {0x0, "GfxGcError"}, + {0x1, "GfxGcError"}, + {0x2, "GfxGcError"}, + {0x3, "GfxGcError"}, + {0x4, "GfxGcError"}, + {0x5, "GfxMmhubError"}, + {0x6, "GfxMmhubError"}, + {0x7, "GfxMmhubError"}, + {0x8, "GfxMmhubError"}, + {0x9, "GfxMmhubError"}, + {0xa, "GfxMmhubError"}, + {0xb, "GfxMmhubError"}, + {0xc, "GfxMmhubError"}, + {0xd, "GfxGcError"}, + {0xe, "GfxVcnError"}, + {0xf, "GfxVcnError"}, + {0x10, "GfxVcnError"}, + {0x11, "GfxVcnError"}, + {0x12, "GfxVcnError"}, + {0x13, "GfxVcnError"}, + {0x14, "GfxVcnError"}, + {0x15, "GfxVcnError"}, + {0x16, "GfxVcnError"}, + {0x17, "GfxVcnError"}, + {0x18, "GfxVcnError"}, + {0x19, "GfxVcnError"}, + {0x1a, "GfxVcnError"}, + {0x1b, "GfxVcnError"}, + {0x1c, "GfxVcnError"}, + {0x1d, "GfxVcnError"}, + {0x1e, "GfxVcnError"}, + {0x1f, "GfxVcnError"}, + {0x20, "GfxVcnError"}, + {0x21, "GfxSdmaError"}, + {0x22, "GfxSdmaError"}, + {0x23, "GfxSdmaError"}, + {0x24, "GfxSdmaError"}, + {0x25, "GfxHdpError"}, + {0x26, "GfxAthubError"}, + {0x27, "GfxGcError"}, + {0x28, "Reserved"}, + {0x29, "Reserved"}, + {0x2a, "Reserved"}, + {0x2b, "Reserved"}}; + +const size_t NUM_BANKS = sizeof(bank_table) / sizeof(bank_table[0]); +const size_t NUM_ERRORS = sizeof(error_table) / sizeof(error_table[0]); +const size_t NUM_XCD_ERRORS = sizeof(xcd_error_table) / sizeof(xcd_error_table[0]); +const size_t NUM_AID_ERRORS = sizeof(aid_error_table) / sizeof(aid_error_table[0]); + +int find_bank_name(uint16_t hw_id, uint16_t aca_type, const char **bank_name) +{ + if (!bank_name) + { + return -1; + } + + for (size_t i = 0; i < NUM_BANKS; i++) + { + if (bank_table[i].hw_id == hw_id && + bank_table[i].aca_type == aca_type) + { + *bank_name = bank_table[i].name; + return 0; + } + } + + *bank_name = "UNKNOWN"; + return 1; +} + +int 
find_error_type_by_bank(const char *bank, uint32_t error_code, const char **error_type) +{ + if (!bank || !error_type) + { + return -1; + } + + for (size_t i = 0; i < NUM_ERRORS; i++) + { + if (error_code == error_table[i].error_code && + strcmp(bank, error_table[i].bank) == 0) + { + *error_type = error_table[i].type; + return 0; + } + } + + *error_type = "UNKNOWN"; + return 1; +} + +int find_error_in_table(const aca_error_entry_t *table, size_t table_size, + uint32_t error_code, const char **error_type) +{ + if (!table || !error_type) + { + return -1; + } + + for (size_t i = 0; i < table_size; i++) + { + if (table[i].error_code == error_code) + { + *error_type = table[i].type; + return 0; + } + } + + *error_type = "UNKNOWN"; + return 1; +} diff --git a/projects/amdsmi/src/aca-decode/error_map.c b/projects/amdsmi/src/aca-decode/error_map.c new file mode 100644 index 0000000000..3a051465a0 --- /dev/null +++ b/projects/amdsmi/src/aca-decode/error_map.c @@ -0,0 +1,59 @@ +#include "error_map.h" +#include + +static const error_map_entry_t error_map[] = { + {1, "Boot-Time Errors", "FW Load", "CPER", "Fail-to-init"}, + {2, "Boot-Time Errors", "HBM BIST Test", "CPER", "Fail-to-init"}, + {3, "Boot-Time Errors", "HBM Memory Test", "CPER", "Fail-to-init"}, + {4, "Boot-Time Errors", "HBM Training", "CPER", "Fail-to-init"}, + {5, "Boot-Time Errors", "Unhandled", "CPER", "Fail-to-init"}, + {6, "Boot-Time Errors", "Unknown", "CPER", "Fail-to-init"}, + {7, "Boot-Time Errors", "USR CP Link Training", "CPER", "Fail-to-init"}, + {8, "Boot-Time Errors", "USR DP Link Training", "CPER", "Fail-to-init"}, + {9, "Boot-Time Errors", "WAFL Link Training", "CPER", "Fail-to-init"}, + {10, "Boot-Time Errors", "XGMI Link Training", "CPER", "Fail-to-init"}, + {11, "Boot-Time Errors", "Boot Controller Data Abort", "CPER", "Fail-to-init"}, + {12, "Boot-Time Errors", "Boot Controller Generic", "CPER ", "Fail-to-init"}, + {13, "Off-Package Link Errors", "PCIe AER", "CPER", "Corrected"}, + {14, 
"Off-Package Link Errors", "PCIe AER", "CPER", "Fatal"}, + {15, "Off-Package Link Errors", "WAFL", "CPER", "Corrected"}, + {16, "Off-Package Link Errors", "WAFL", "CPER", "Fatal"}, + {17, "Off-Package Link Errors", "XGMI", "CPER", "Corrected"}, + {18, "Off-Package Link Errors", "XGMI", "CPER", "Fatal"}, + {19, "HBM Errors", "Bad Page Retirement Threshold", "CPER", "Fatal"}, + {20, "HBM Errors", "On-die ECC", "CPER", "Fatal"}, + {21, "HBM Errors", "End-to-end CRC", "CPER", "Fatal"}, + {22, "HBM Errors", "On-die ECC", "CPER", "Uncorrected, Non-fatal"}, + {23, "HBM Errors", "End-to-end CRC", "CPER", "Uncorrected, Non-fatal"}, + {24, "HBM Errors", "All", "CPER", "Corrected"}, + {25, "HBM Errors", "All Others", "CPER", "Fatal"}, + {26, "Device Internal Errors", "Hardware Assertion (HWA)", "CPER", "Fatal"}, + {27, "Device Internal Errors", "Watchdog Timeout (WDT)", "CPER", "Fatal"}, + {28, "Device Internal Errors", "All Others", "CPER", "Uncorrected, Non-fatal"}, + {29, "Device Internal Errors", "All Others", "CPER", "Corrected"}, + {30, "Device Internal Errors", "All Others", "CPER", "Fatal"}}; + +static const size_t NUM_ERROR_ENTRIES = sizeof(error_map) / sizeof(error_map[0]); + +int get_error_id(const char *error_category, const char *error_type, const char *error_severity) +{ + if (!error_category || !error_type || !error_severity || + strcmp(error_category, "UNKNOWN") == 0 || + strcmp(error_type, "UNKNOWN") == 0 || + strcmp(error_severity, "UNKNOWN") == 0) + { + return -1; + } + + for (size_t i = 0; i < NUM_ERROR_ENTRIES; i++) + { + if (strcmp(error_map[i].error_category, error_category) == 0 && + strcmp(error_map[i].error_type, error_type) == 0 && + strcmp(error_map[i].error_severity, error_severity) == 0) + { + return (int)error_map[i].id; + } + } + + return -1; +} diff --git a/projects/amdsmi/src/amd_smi/amd_smi.cc b/projects/amdsmi/src/amd_smi/amd_smi.cc index 8c9217d076..3093784fc7 100644 --- a/projects/amdsmi/src/amd_smi/amd_smi.cc +++ 
b/projects/amdsmi/src/amd_smi/amd_smi.cc @@ -46,6 +46,7 @@ #include "amd_smi/amdsmi.h" #include "amd_smi/impl/fdinfo.h" #include "amd_smi/impl/amd_smi_common.h" +#include "amd_smi/impl/amd_smi_cper.h" #include "amd_smi/impl/amd_smi_system.h" #include "amd_smi/impl/amd_smi_socket.h" #include "amd_smi/impl/amd_smi_gpu_device.h" @@ -3950,6 +3951,65 @@ amdsmi_get_gpu_cper_entries( cursor); } +amdsmi_status_t amdsmi_get_afids_from_cper( + char* cper_buffer, uint32_t buf_size, uint64_t* afids, uint32_t* num_afids) { + + AMDSMI_CHECK_INIT(); + + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] begin\n"; + LOG_DEBUG(ss); + + if(!cper_buffer) { + ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] cper_buffer should be a valid memory address\n"; + LOG_ERROR(ss); + return AMDSMI_STATUS_INVAL; + } + else if(!buf_size) { + ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] buf_size should be greater than 0\n"; + LOG_ERROR(ss); + return AMDSMI_STATUS_INVAL; + } + else if(!afids) { + ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] afids should be a valid memory address\n"; + LOG_ERROR(ss); + return AMDSMI_STATUS_INVAL; + } + else if(!num_afids) { + ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] num_afids should be a valid memory address\n"; + LOG_ERROR(ss); + return AMDSMI_STATUS_INVAL; + } + else if(!*num_afids) { + ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] num_afids should be greater than 0\n"; + LOG_ERROR(ss); + return AMDSMI_STATUS_INVAL; + } + + const amdsmi_cper_hdr_t *cper = reinterpret_cast(cper_buffer); + if(cper->record_length > buf_size) { + ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] cper buffer size " << std::dec << buf_size << " is smaller than cper record length " << std::dec << cper->record_length << "\n"; + LOG_ERROR(ss); + return AMDSMI_STATUS_INVAL; + } + else if(strncmp(cper->signature, "CPER", 4) != 0) { + ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << 
"[AFIDS] cper buffer does not have the correct signature\n"; + LOG_ERROR(ss); + return AMDSMI_STATUS_INVAL; + } + + int i = 0; + for(int afid: cper_decode(cper)) { + if(i < *num_afids) { + afids[i] = afid; + } + ++i; + } + *num_afids = i; + + return AMDSMI_STATUS_SUCCESS; +} + amdsmi_status_t amdsmi_get_gpu_process_list(amdsmi_processor_handle processor_handle, uint32_t *max_processes, amdsmi_proc_info_t *list) { AMDSMI_CHECK_INIT(); diff --git a/projects/amdsmi/src/amd_smi/amd_smi_cper.cc b/projects/amdsmi/src/amd_smi/amd_smi_cper.cc new file mode 100644 index 0000000000..aa4a118ff9 --- /dev/null +++ b/projects/amdsmi/src/amd_smi/amd_smi_cper.cc @@ -0,0 +1,567 @@ +/* + * Copyright (c) Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include +#include +#include + +#include +#include +#include +#include + +#include "aca-decode/aca_decode.h" +#include "amd_smi/impl/amd_smi_cper.h" +#include "rocm_smi/rocm_smi_logger.h" + +namespace { +static std::vector +amdsmi_get_gpu_cper_headers(const char *buffer, size_t buffer_sz) { + + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ + << "[CPER] buffer_sz: " << buffer_sz; + LOG_DEBUG(ss); + + std::vector headers; + if(!buffer) { + ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ + << "[CPER] buffer is null"; + LOG_ERROR(ss); + return headers; + } + static constexpr char cper_signature[] = "CPER"; + static constexpr size_t cper_signature_size = sizeof(cper_signature) - 1; + for(size_t data_idx = 0; + buffer_sz >= cper_signature_size && + data_idx < buffer_sz - cper_signature_size; + ++data_idx) { + + const amdsmi_cper_hdr_t *hdr = reinterpret_cast( + &buffer[data_idx]); + if(hdr->signature[0] != 'C' || hdr->signature[1] != 'P' || + hdr->signature[2] != 'E' || hdr->signature[3] != 'R' ) { + continue; + } + if(hdr->signature_end != 0xFFFFFFFF) { + continue; + } + if(hdr->record_length > buffer_sz) { + continue; + } + ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ + << "[CPER] add header at data_idx: " << data_idx + << ", sig: " << hdr->signature[0] << hdr->signature[1] << hdr->signature[2] << hdr->signature[3]; + LOG_DEBUG(ss); + headers.emplace_back(hdr); + } + return headers; +} + +struct CperFileCtx { + amdsmi_status_t status = AMDSMI_STATUS_FILE_ERROR; + std::unique_ptr buffer; + long file_size = 0; +}; + +static auto amdsmi_read_cper_file(const std::string &filepath) -> CperFileCtx { + + std::ostringstream ss; + + CperFileCtx ctx; + ctx.status = AMDSMI_STATUS_FILE_ERROR; + ctx.file_size = 0; + + struct stat file_stats; + if (stat(filepath.c_str(), &file_stats) == 0) { + if (!S_ISREG(file_stats.st_mode)) { + ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] file is not a regular file: " + << filepath << ", errno: " << 
errno << "): " << strerror(errno); + return ctx; + } + } else { + ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] file does not exist: " + << filepath << ", errno: " << errno << "): " << strerror(errno); + ctx.status = AMDSMI_STATUS_NOT_SUPPORTED; + return ctx; + } + + ctx.file_size = file_stats.st_size; + ctx.buffer = std::make_unique(ctx.file_size); + int file = open(filepath.c_str(), O_RDONLY); + if (file == -1) { + ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] failed to open file: " + << filepath << ", errno:()" << errno << "): " << strerror(errno); + LOG_ERROR(ss); + return ctx; + } + long bytes_read = read(file, ctx.buffer.get(), ctx.file_size); + if (bytes_read <= 0) { + ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ + << "[CPER] failed to read complete file, read only " + << bytes_read << " of " << ctx.file_size << " bytes"; + LOG_ERROR(ss); + return ctx; + } + close(file); + + ctx.status = AMDSMI_STATUS_SUCCESS; + ctx.file_size = bytes_read; + return ctx; +} + +#define GUID_INIT(a, b, c, d0, d1, d2, d3, d4, d5, d6, d7) \ +{ (a) & 0xff, ((a) >> 8) & 0xff, ((a) >> 16) & 0xff, ((a) >> 24) & 0xff, \ + (b) & 0xff, ((b) >> 8) & 0xff, \ + (c) & 0xff, ((c) >> 8) & 0xff, \ + (d0), (d1), (d2), (d3), (d4), (d5), (d6), (d7) }; + +/* Machine Check Exception */ +#define CPER_NOTIFY_MCE \ + GUID_INIT(0xE8F56FFE, 0x919C, 0x4cc5, 0xBA, 0x88, 0x65, 0xAB, \ + 0xE1, 0x49, 0x13, 0xBB) +#define CPER_NOTIFY_CMC \ + GUID_INIT(0x2DCE8BB1, 0xBDD7, 0x450e, 0xB9, 0xAD, 0x9C, 0xF4, \ + 0xEB, 0xD4, 0xF8, 0x90) +#define BOOT_TYPE \ + GUID_INIT(0x3D61A466, 0xAB40, 0x409a, 0xA6, 0x98,0xF3, 0x62, \ + 0xD4, 0x64, 0xB3, 0x8F) +#define AMD_OOB_CRASHDUMP \ + GUID_INIT(0x32AC0C78, 0x2623, 0x48F6, 0xB0, 0xD0, 0x73, 0x65, \ + 0x72, 0x5F, 0xD6, 0xAE) +#define AMD_GPU_NONSTANDARD_ERROR \ + GUID_INIT(0x32AC0C78, 0x2623, 0x48F6, 0x81, 0xA2, 0xAC, 0x69, \ + 0x17, 0x80, 0x55, 0x1D) +#define PROC_ERR_SECTION_TYPE \ + GUID_INIT(0xDC3EA0B0, 0xA144, 0x4797, 0xB9, 0x5B, 0x53, 0xFA, \ + 
0x24, 0x2B, 0x6E, 0x1D) + +static amdsmi_cper_guid_t mce = CPER_NOTIFY_MCE; +static amdsmi_cper_guid_t cmc = CPER_NOTIFY_CMC; +static amdsmi_cper_guid_t bt = BOOT_TYPE; +static amdsmi_cper_guid_t cr = AMD_OOB_CRASHDUMP; +static amdsmi_cper_guid_t nonstd = AMD_GPU_NONSTANDARD_ERROR; +static amdsmi_cper_guid_t proc_err = PROC_ERR_SECTION_TYPE; + +static int cper_is_cr(const amdsmi_cper_guid_t *guid) +{ + return !memcmp(&cr, guid, sizeof(amdsmi_cper_guid_t)); +} + +static int cper_is_nonstd(const amdsmi_cper_guid_t *guid) +{ + return !memcmp(&nonstd, guid, sizeof(amdsmi_cper_guid_t)); +} + +static int cper_is_proc_err(const amdsmi_cper_guid_t *guid) +{ + return !memcmp(&proc_err, guid, sizeof(amdsmi_cper_guid_t)); +} + +static int cper_is_bt(const amdsmi_cper_guid_t *guid) +{ + return !memcmp(&bt, guid, sizeof(amdsmi_cper_guid_t)); +} + +static int cper_num_sec(const amdsmi_cper_hdr_t *hdr) +{ + return hdr->sec_cnt; +} + +static const amdsmi_cper_guid_t *get_sec_desc_type(const struct cper_sec_desc *desc) +{ + return &desc->sec_type; +} + +static const amdsmi_cper_guid_t *get_cper_type(const amdsmi_cper_hdr_t *hdr) +{ + return &hdr->notify_type; +} + +static void* cper_get_sec_desc_offset(const amdsmi_cper_hdr_t *hdr, int idx) +{ + char *offset; + + if (idx >= hdr->sec_cnt) + return 0; + + offset = (char *)hdr + sizeof(amdsmi_cper_hdr_t); + offset += sizeof(struct cper_sec_desc) * idx; + + return offset; +} + +static void* cper_get_sec_offset(const amdsmi_cper_hdr_t *hdr, int idx) +{ + struct cper_sec_desc *tmp_desc; + char *offset; + + if (idx >= hdr->sec_cnt) + return 0; + + tmp_desc = reinterpret_cast( + (char *)hdr + sizeof(amdsmi_cper_hdr_t) + sizeof(struct cper_sec_desc) * idx + ); + + return (char *)hdr + tmp_desc->sec_offset; +} + +static int cper_dump_sec_desc(const struct cper_sec_desc *desc) +{ + std::ostringstream ss; + + ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS]\n~~~~SECTION DESCRIPTION~~~\n"; + + ss << "[SEC DESC] REV Major = 0x" << 
std::hex << static_cast(desc->revision_major) << "\n"; + ss << "[SEC DESC] REV Minor = 0x" << std::hex << static_cast(desc->revision_minor) << "\n"; + ss << "[SEC DESC] Length = 0x" << std::hex << desc->sec_length << "\n"; + ss << "[SEC DESC] Offset = 0x" << std::hex << desc->sec_offset << "\n"; + + ss << "[SEC DESC] fru_id = " << desc->fru_id << "\n"; + ss << "[SEC DESC] fru_text = " << desc->fru_text << "\n"; + + ss << std::dec << "\n"; + + if (cper_is_cr(&desc->sec_type)) + ss << "[SEC DESC] AMD CrashDump Section\n"; + else if (cper_is_nonstd(&desc->sec_type)) + ss << "[SEC DESC] AMD NonStandard Section\n"; + else if (cper_is_proc_err(&desc->sec_type)) + ss << "[SEC DESC] AMD Proc Error Section\n"; + else + ss << "UNKNOWN ERROR TYPE!!\n"; + + ss << "~~~~SECTION DESCRIPTION~~~\n\n"; + + LOG_DEBUG(ss); + return 0; +} + +static int aca_decode_fatal(const cper_sec_crashdump_data &data) +{ + std::ostringstream ss; + + const uint64_t *register_array = reinterpret_cast(&data.dump.fatal_err); + aca_raw_data_t raw_data; + raw_data.aca_status = register_array[0]; + raw_data.aca_ipid = register_array[2]; + raw_data.aca_synd = register_array[3]; + + ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] aca_status: 0x" << std::hex << raw_data.aca_status << "\n"; + ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] aca_ipid: 0x" << std::hex << raw_data.aca_ipid << "\n"; + ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] aca_synd: 0x" << std::hex << raw_data.aca_synd << "\n"; + + raw_data.flags = 0; + raw_data.hw_revision = 1; + + aca_error_info_t error_info = aca_decode(&raw_data); + ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] fatal error_info.afid: " << std::dec << error_info.afid << "\n"; + LOG_DEBUG(ss); + + return error_info.afid; +} + +static int aca_decode_corrected_error(const uint32_t *reg_dump, size_t num_bytes) { + + std::ostringstream ss; + if(num_bytes != CPER_ACA_REG_COUNT * sizeof(uint32_t)) { + ss << __PRETTY_FUNCTION__ << 
"\n:" << __LINE__ << "[AFIDS] Size of register array must be " << std::dec << (CPER_ACA_REG_COUNT * sizeof(uint32_t)) << " bytes\n"; + LOG_ERROR(ss); + return AMDSMI_STATUS_INVAL; + } + const uint64_t *register_array = reinterpret_cast(reg_dump); + aca_raw_data_t raw_data; + raw_data.aca_status = register_array[2]; + raw_data.aca_ipid = register_array[5]; + raw_data.aca_synd = register_array[6]; + + ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] aca_status: 0x" << std::hex << raw_data.aca_status << "\n"; + ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] aca_ipid: 0x" << std::hex << raw_data.aca_ipid << "\n"; + ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] aca_synd: 0x" << std::hex << raw_data.aca_synd << "\n"; + + raw_data.flags = 0; + raw_data.hw_revision = 1; + + aca_error_info_t error_info = aca_decode(&raw_data); + ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] non-fatal error_info.afid: " << std::dec << error_info.afid << "\n"; + LOG_DEBUG(ss); + + return error_info.afid; +} + +static int cper_dump_nonstd_err(const struct cper_sec_nonstd_err *nonstd_err) +{ + std::ostringstream ss; + + struct cper_sec_nonstd_err_body *body; + char *offset; + + ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS]\n~~~~NON STANDARD SECTION~~~\n"; + + ss << "[NonSTD SEC] Err Info Count = 0x" << std::hex << nonstd_err->hdr.valid_bits.err_info_cnt << "\n"; + ss << "[NonSTD SEC] Err Context Count = 0x" << std::hex << nonstd_err->hdr.valid_bits.err_context_cnt << "\n"; + + if (nonstd_err->hdr.valid_bits.err_context_cnt != nonstd_err->hdr.valid_bits.err_context_cnt) { + ss << "~~~~Malformed Non Standard Section!~~~~\n\n"; + goto exit; + } + + body = reinterpret_cast( + (char *)nonstd_err + sizeof(struct cper_sec_nonstd_err_hdr) + ); + + ss << "[NonSTD SEC] Reg Ctx Type = 0x" << std::hex << body->err_ctx.reg_ctx_type << "\n"; + ss << "[NonSTD SEC] Reg Array Size = 0x" << std::hex << body->err_ctx.reg_arr_size << "\n"; + + for (int i 
= 0; i < CPER_ACA_REG_COUNT; i++) { + ss << "[NonSTD SEC] reg_dump[" << std::dec << i << "] = 0x" << std::hex << body->err_ctx.reg_dump[i] << "\n"; + } + +exit: + ss << std::dec << "~~~~NON STANDARD SECTION~~~\n\n"; + + LOG_DEBUG(ss); + + return aca_decode_corrected_error(body->err_ctx.reg_dump, sizeof(body->err_ctx.reg_dump)); +} + +static int cper_dump_cr_fatal(const struct cper_sec_crashdump *crashdump) +{ + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS]\n~~~~CRASH DUMP - FATAL~~~\n"; + + ss << "[Crash Dump - Fatal] status_lo = 0x" << std::hex << crashdump->data.dump.fatal_err.status_lo << "\n"; + ss << "[Crash Dump - Fatal] status_hi = 0x" << std::hex << crashdump->data.dump.fatal_err.status_hi << "\n"; + ss << "[Crash Dump - Fatal] addr_lo = 0x" << std::hex << crashdump->data.dump.fatal_err.addr_lo << "\n"; + ss << "[Crash Dump - Fatal] addr_hi = 0x" << std::hex << crashdump->data.dump.fatal_err.addr_hi << "\n"; + ss << "[Crash Dump - Fatal] ipid_lo = 0x" << std::hex << crashdump->data.dump.fatal_err.ipid_lo << "\n"; + ss << "[Crash Dump - Fatal] ipid_hi = 0x" << std::hex << crashdump->data.dump.fatal_err.ipid_hi << "\n"; + ss << "[Crash Dump - Fatal] synd_lo = 0x" << std::hex << crashdump->data.dump.fatal_err.synd_lo << "\n"; + ss << "[Crash Dump - Fatal] synd_hi = 0x" << std::hex << crashdump->data.dump.fatal_err.synd_hi << "\n"; + + ss << std::dec << "~~~~CRASH DUMP - FATAL~~~\n\n"; + + LOG_DEBUG(ss); + + return aca_decode_fatal(crashdump->data); +} + +static int cper_dump_cr_boot(const struct cper_sec_crashdump *crashdump) +{ + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS]\n~~~~CRASH DUMP - BOOT TIME~~~\n"; + + for (int i = 0; i < CPER_MAX_OAM_COUNT; i++) { + ss << "[Crash Dump - Boot] bootmsg[" << std::dec << i << "] = 0x" << std::hex << crashdump->data.dump.boot_err.msg[i] << "\n"; + } + + ss << "~~~~CRASH DUMP - BOOT TIME~~~\n\n"; + LOG_DEBUG(ss); + + return 
aca_decode_fatal(crashdump->data); +} + +} //namespace + +amdsmi_status_t amdsmi_get_gpu_cper_entries_by_path( + const char *amdgpu_ring_cper_file, + uint32_t severity_mask, + char *cper_data, + uint64_t *buf_size, + amdsmi_cper_hdr_t **cper_hdrs, + uint64_t *entry_count, + uint64_t *cursor) { + + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] begin\n" + << ", amdgpu_ring_cper_file: " << amdgpu_ring_cper_file + << ", severity_mask: " << severity_mask; + LOG_DEBUG(ss); + + if(!cper_data) { + ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] cper_data should be a valid memory address\n"; + LOG_ERROR(ss); + if(entry_count) {*entry_count = 0;} + if(buf_size) { *buf_size = 0; } + return AMDSMI_STATUS_OUT_OF_RESOURCES; + } + else if(!buf_size) { + ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] buf_size should be a valid memory address"; + LOG_ERROR(ss); + if(entry_count) {*entry_count = 0;} + return AMDSMI_STATUS_OUT_OF_RESOURCES; + } + else if(!entry_count) { + ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] entry_count should be a valid memory address"; + LOG_ERROR(ss); + *buf_size = 0; + return AMDSMI_STATUS_OUT_OF_RESOURCES; + } + else if(!*buf_size) { + ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] buf_size should be greater than zero"; + LOG_ERROR(ss); + *entry_count = 0; + return AMDSMI_STATUS_OUT_OF_RESOURCES; + } + else if(!*entry_count) { + ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] entry_count should be greater than 0"; + LOG_ERROR(ss); + *buf_size = 0; + return AMDSMI_STATUS_OUT_OF_RESOURCES; + } + else if(!cper_hdrs) { + ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] cper_hdrs should be a valid memory address"; + LOG_ERROR(ss); + *entry_count = 0; + *buf_size = 0; + return AMDSMI_STATUS_OUT_OF_RESOURCES; + } + else if(!cursor) { + ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] cursor should be a valid memory address"; + LOG_ERROR(ss); + 
*entry_count = 0; + *buf_size = 0; + return AMDSMI_STATUS_OUT_OF_RESOURCES; + } + + auto ctx = amdsmi_read_cper_file(amdgpu_ring_cper_file); + if(ctx.status != AMDSMI_STATUS_SUCCESS) { + *entry_count = 0; + *buf_size = 0; + return ctx.status; + } + + auto headers = amdsmi_get_gpu_cper_headers(ctx.buffer.get(), ctx.file_size); + ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] num headers: " << headers.size(); + LOG_DEBUG(ss); + + uint64_t data_idx = 0; + uint64_t header_idx = 0; + size_t num_headers_copied = 0; + for(const amdsmi_cper_hdr_t *header: headers) { + if(((1 << header->error_severity) & severity_mask) != + static_cast(1 << header->error_severity)) { + ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] cper header rejected with severity: 0x" + << std::hex << (1 << header->error_severity) << ", given severity_mask: 0x" + << std::hex << severity_mask << ", record_length:" + << std::dec << header->record_length; + LOG_DEBUG(ss); + continue; + } + else { + ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] cper header accepted with severity: 0x" + << std::hex << (1 << header->error_severity) << ", given severity_mask: 0x" + << std::hex << severity_mask << ", record_length:" + << std::dec << header->record_length; + LOG_DEBUG(ss); + } + if((*buf_size - data_idx) < header->record_length ) { + ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] buffer filled up without copying all cper entries, buf_size: " << std::dec << *buf_size; + LOG_ERROR(ss); + *entry_count = num_headers_copied; + *buf_size = data_idx; + return (data_idx == 0) ? + AMDSMI_STATUS_OUT_OF_RESOURCES : + AMDSMI_STATUS_MORE_DATA; + } + if(num_headers_copied == *entry_count) { + ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] cper_hdrs filled up before finished with copying all header pointers, entry_count: " << std::dec << *entry_count; + LOG_ERROR(ss); + *entry_count = num_headers_copied; + *buf_size = data_idx; + return (data_idx == 0) ? 
+ AMDSMI_STATUS_OUT_OF_RESOURCES : + AMDSMI_STATUS_MORE_DATA; + } + if(*cursor != header_idx) { + ++header_idx; + continue; + } + cper_hdrs[num_headers_copied] = reinterpret_cast(&cper_data[data_idx]); + ++num_headers_copied; + *cursor = ++header_idx; + std::memcpy( + &cper_data[data_idx], + reinterpret_cast(header), + header->record_length); + data_idx += header->record_length; + } + *entry_count = num_headers_copied; + *buf_size = data_idx; + + ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ + << "[CPER] *entry_count: " << entry_count + << ", *cursor: " << cursor + << ", *buf_size: " << buf_size; + + LOG_DEBUG(ss); + return AMDSMI_STATUS_SUCCESS; +} + +std::vector cper_decode(const amdsmi_cper_hdr_t *cper) { + + std::vector afids; + std::ostringstream ss; + + for (int i = 0; i < cper_num_sec(cper); i ++) { + void *sec_desc_offset = cper_get_sec_desc_offset(cper, i); + void *sec_offset = cper_get_sec_offset(cper, i); + const amdsmi_cper_guid_t *sec_guid = get_sec_desc_type(static_cast(sec_desc_offset)); + const amdsmi_cper_guid_t *cper_guid = get_cper_type(cper); + + cper_dump_sec_desc(static_cast(sec_desc_offset)); + + if (cper_is_cr(sec_guid)) { + if (cper_is_bt(cper_guid)) { + ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] decoding boot crash dump\n"; + LOG_DEBUG(ss); + afids.emplace_back(cper_dump_cr_boot(static_cast(sec_offset))); + } + else { + ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] decoding crash dump\n"; + LOG_DEBUG(ss); + afids.emplace_back(cper_dump_cr_fatal(static_cast(sec_offset))); + } + } + else if (cper_is_nonstd(sec_guid)) { + ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] decoding non-standard error\n"; + LOG_DEBUG(ss); + afids.emplace_back(cper_dump_nonstd_err(static_cast(sec_offset))); + } + else if (cper_is_proc_err(sec_guid)) { + ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] decoding proc error section type\n"; + LOG_DEBUG(ss); + 
afids.emplace_back(cper_dump_nonstd_err(static_cast(sec_offset))); + } + else { + ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] Unknown error type!!\n"; + for(int i = 0; i < sizeof(sec_guid->b); ++i) { + ss << std::hex << static_cast(sec_guid->b[i]) << ":"; + } + ss << "\n"; + LOG_ERROR(ss); + } + } + + return afids; +} + diff --git a/projects/amdsmi/src/amd_smi/amd_smi_utils.cc b/projects/amdsmi/src/amd_smi/amd_smi_utils.cc index 9219a52267..ff0de75866 100644 --- a/projects/amdsmi/src/amd_smi/amd_smi_utils.cc +++ b/projects/amdsmi/src/amd_smi/amd_smi_utils.cc @@ -1031,142 +1031,6 @@ static auto amdsmi_read_cper_file(const std::string &filepath) -> CperFileCtx { ctx.file_size = bytes_read; return ctx; } - -amdsmi_status_t amdsmi_get_gpu_cper_entries_by_path( - const char *amdgpu_ring_cper_file, - uint32_t severity_mask, - char *cper_data, - uint64_t *buf_size, - amdsmi_cper_hdr_t **cper_hdrs, - uint64_t *entry_count, - uint64_t *cursor) { - - std::ostringstream ss; - ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] begin\n" - << ", amdgpu_ring_cper_file: " << amdgpu_ring_cper_file - << ", severity_mask: " << severity_mask; - LOG_DEBUG(ss); - - if(!cper_data) { - ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] cper_data should be a valid memory address\n"; - LOG_ERROR(ss); - if(entry_count) {*entry_count = 0;} - if(buf_size) { *buf_size = 0; } - return AMDSMI_STATUS_OUT_OF_RESOURCES; - } - else if(!buf_size) { - ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] buf_size should be a valid memory address"; - LOG_ERROR(ss); - if(entry_count) {*entry_count = 0;} - return AMDSMI_STATUS_OUT_OF_RESOURCES; - } - else if(!entry_count) { - ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] entry_count should be a valid memory address"; - LOG_ERROR(ss); - *buf_size = 0; - return AMDSMI_STATUS_OUT_OF_RESOURCES; - } - else if(!*buf_size) { - ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] buf_size should be greater than 
zero"; - LOG_ERROR(ss); - *entry_count = 0; - return AMDSMI_STATUS_OUT_OF_RESOURCES; - } - else if(!*entry_count) { - ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] entry_count should be greater than 0"; - LOG_ERROR(ss); - *buf_size = 0; - return AMDSMI_STATUS_OUT_OF_RESOURCES; - } - else if(!cper_hdrs) { - ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] cper_hdrs should be a valid memory address"; - LOG_ERROR(ss); - *entry_count = 0; - *buf_size = 0; - return AMDSMI_STATUS_OUT_OF_RESOURCES; - } - else if(!cursor) { - ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] cursor should be a valid memory address"; - LOG_ERROR(ss); - *entry_count = 0; - *buf_size = 0; - return AMDSMI_STATUS_OUT_OF_RESOURCES; - } - - auto ctx = amdsmi_read_cper_file(amdgpu_ring_cper_file); - if(ctx.status != AMDSMI_STATUS_SUCCESS) { - *entry_count = 0; - *buf_size = 0; - return ctx.status; - } - - auto headers = amdsmi_get_gpu_cper_headers(ctx.buffer.get(), ctx.file_size); - ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] num headers: " << headers.size(); - LOG_DEBUG(ss); - - uint64_t data_idx = 0; - uint64_t header_idx = 0; - size_t num_headers_copied = 0; - for(const amdsmi_cper_hdr_t *header: headers) { - if(((1 << header->error_severity) & severity_mask) != - static_cast(1 << header->error_severity)) { - ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] cper header rejected with severity: 0x" - << std::hex << (1 << header->error_severity) << ", given severity_mask: 0x" - << std::hex << severity_mask << ", record_length:" - << std::dec << header->record_length; - LOG_DEBUG(ss); - continue; - } - else { - ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] cper header accepted with severity: 0x" - << std::hex << (1 << header->error_severity) << ", given severity_mask: 0x" - << std::hex << severity_mask << ", record_length:" - << std::dec << header->record_length; - LOG_DEBUG(ss); - } - if((*buf_size - data_idx) < header->record_length 
) { - ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] buffer filled up without copying all cper entries, buf_size: " << std::dec << *buf_size; - LOG_ERROR(ss); - *entry_count = num_headers_copied; - *buf_size = data_idx; - return (data_idx == 0) ? - AMDSMI_STATUS_OUT_OF_RESOURCES : - AMDSMI_STATUS_MORE_DATA; - } - if(num_headers_copied == *entry_count) { - ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] cper_hdrs filled up before finished with copying all header pointers, entry_count: " << std::dec << *entry_count; - LOG_ERROR(ss); - *entry_count = num_headers_copied; - *buf_size = data_idx; - return (data_idx == 0) ? - AMDSMI_STATUS_OUT_OF_RESOURCES : - AMDSMI_STATUS_MORE_DATA; - } - if(*cursor != header_idx) { - ++header_idx; - continue; - } - cper_hdrs[num_headers_copied] = reinterpret_cast(&cper_data[data_idx]); - ++num_headers_copied; - *cursor = ++header_idx; - std::memcpy( - &cper_data[data_idx], - reinterpret_cast(header), - header->record_length); - data_idx += header->record_length; - } - *entry_count = num_headers_copied; - *buf_size = data_idx; - - ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ - << "[CPER] *entry_count: " << entry_count - << ", *cursor: " << cursor - << ", *buf_size: " << buf_size; - - LOG_DEBUG(ss); - return AMDSMI_STATUS_SUCCESS; -} - void amdsmi_wait_for_user_input(void) { for (;;) { std::cout << "\n\t**Press any key to continue**" << std::endl;