[SWDEV-522623] Add afid functionality to API and CLI (#330)

Change-Id: I015bde926491d54e09da8f39b05650515711e09f

[SWDEV-522623] Add afid functionality to API and CLI


Change-Id: I015bde926491d54e09da8f39b05650515711e09f

Signed-off-by: Oosman Saeed <oossaeed@amd.com>
Co-authored-by: Oosman Saeed <oossaeed@amd.com>
This commit is contained in:
Saeed, Oosman
2025-05-15 21:49:56 -05:00
committed by GitHub
orang tua d4f057f95f
melakukan 1bb1f8acc2
21 mengubah file dengan 2228 tambahan dan 209 penghapusan
+47 -15
Melihat File
@@ -33,6 +33,7 @@ from amdsmi_cli_exceptions import AmdSmiInvalidParameterException, AmdSmiRequire
from amdsmi_helpers import AMDSMIHelpers
from amdsmi_logger import AMDSMILogger
from amdsmi import amdsmi_exception, amdsmi_interface
from pathlib import Path
class AMDSMICommands():
"""This class contains all the commands corresponding to AMDSMIParser
@@ -6325,9 +6326,35 @@ class AMDSMICommands():
with self.logger.destination.open('a', encoding="utf-8") as output_file:
output_file.write(legend_output + '\n')
def __pvtDumpAfids(self, cper_file):
    """Print the AFIDs (AMD Field IDs) decoded from one CPER record.

    Args:
        cper_file: The CPER record, supplied as any of:
            * an object exposing ``read()`` (its return value is used as-is),
            * a ``pathlib.Path`` (read with ``read_bytes()``),
            * a ``str`` file name (opened in binary mode),
            * anything else, which is passed through unchanged on the
              assumption that it already holds the raw record contents.

    Side effects:
        Writes a single line ``AFIDS: <afid> <afid> ... `` to stdout
        (every AFID is followed by one space, then a newline).
    """
    # Normalize whatever we were handed into the raw record contents.
    if hasattr(cper_file, "read"):
        raw = cper_file.read()
    elif isinstance(cper_file, Path):
        raw = cper_file.read_bytes()
    elif isinstance(cper_file, str):
        with open(cper_file, "rb") as f:
            raw = f.read()
    else:
        raw = cper_file

    # The call returns (afids, count); the list is iterated directly, so the
    # count is not needed here.
    afids, _ = amdsmi_interface.amdsmi_get_afids_from_cper(raw)

    print("AFIDS: ", end="")
    for afid in afids:
        print(afid, end=" ")
    print("")
def ras(self, args, multiple_devices=False, gpu=None, cper=None,
severity=None, folder=None, file_limit=None, follow=None):
def ras(self, args, multiple_devices=False, gpu=None, cper=None, afid=None,
severity=None, folder=None, file_limit=None, cper_file=None, follow=None):
"""
Retrieve and process CPER (RAS) entries for a target GPU.
@@ -6338,23 +6365,32 @@ class AMDSMICommands():
The output file name is auto-generated using the timestamp from the CPER header data (converted from
the headers "YYYY/MM/DD HH:MM:SS" format), along with the GPU/platform ID and error severity.
"""
# GPU handle logic.
if gpu:
args.gpu = gpu
if cper:
args.cper = cper
if afid:
args.afid = afid
if severity:
args.severity = severity
if folder:
args.folder = folder
if file_limit:
args.file_limit = file_limit
if cper_file:
args.cper_file = cper_file
if follow:
args.follow = follow
if args.gpu == None:
args.gpu = self.device_handles
#Fetching AFID
if args.afid and args.cper_file:
self.__pvtDumpAfids(args.cper_file)
return
if not self.group_check_printed:
self.helpers.check_required_groups()
self.group_check_printed = True
@@ -6362,7 +6398,6 @@ class AMDSMICommands():
handled_multiple_gpus, device_handle = self.helpers.handle_gpus(args, self.logger, self.ras)
if handled_multiple_gpus:
return
args.gpu = device_handle
# Parse severity mask dynamically from the --severity option.
@@ -6381,17 +6416,15 @@ class AMDSMICommands():
severity_mask |= (1 << 0)
elif sev in ("nonfatal-corrected", "corrected"):
# Set bit corresponding to AMDSMI_CPER_SEV_NON_FATAL_CORRECTED (which is 2)
severity_mask |= (1 << 2)
severity_mask |= (1 << 2)
cursor = 0
buffer_size = 1048576
if args.cper:
# Start from cursor 0 (no timestamp argument provided).
cursor = 0
buffer_size = 1048576
file_limit = int(args.file_limit) if args.file_limit else 1000
# Main loop: continuously retrieve CPER entries if --follow is set.
gpu_id = self.helpers.get_gpu_id_from_device_handle(args.gpu)
# Print header only when dumping to a folder
if args.follow and not getattr(self, "_cper_follow_prompted", False):
print("Press CTRL + C to stop.")
@@ -6409,12 +6442,11 @@ class AMDSMICommands():
if partition_id != 0:
logging.debug(f"Skipping gpu {gpu_id} on non zero partition {partition_id}")
return
if args.folder and args.gpu:
print(f"Dumping CPER file header entries for GPU {gpu_id} in folder {args.folder}")
elif args.folder:
if args.folder and not getattr(self, "_cper_folder_prompted", False):
print(f"Dumping CPER file header entries in folder {args.folder}")
self._cper_folder_prompted = True
self.logger.set_cper_exit_message(False)
self.stop = False
+61 -21
Melihat File
@@ -1078,7 +1078,7 @@ class AMDSMIHelpers():
msg = (
"WARNING: User is missing the following required groups: %s. "
"Please add user to these groups."
) % ", ".join(sorted(missing_groups))
) % ", ".join(sorted(missing_groups))
print(msg)
logging.warning(msg)
@@ -1116,7 +1116,7 @@ class AMDSMIHelpers():
self._cper_warning_printed = True
# Header
print(f"{'timestamp':<20} {'gpu_id':<6} {'severity':<10} {'file_name'}")
print(f"{'timestamp':<20} {'gpu_id':<7} {'severity':<12} {'file_name':<17}")
self._cper_display_initialized = True
for entry_index, entry in enumerate(entries.values()):
@@ -1138,7 +1138,7 @@ class AMDSMIHelpers():
timestamp = entry.get("timestamp", "unknown")
gpu_id = self.get_gpu_id_from_device_handle(device_handle)
print(f"{timestamp:<20} {gpu_id:<6} {prefix:<10} {cper_data_file}")
print(f"{timestamp:<20} {gpu_id:<7} {prefix:<12} {cper_data_file:<17}")
self.increment_cper_count()
time.sleep(1)
@@ -1156,7 +1156,7 @@ class AMDSMIHelpers():
self._cper_warning_printed = True
# Header
print(f"{'timestamp':<20} {'gpu_id':<6} {'severity':<10} {'file_name'}")
print(f"{'timestamp':<20} {'gpu_id':<7} {'severity':<12} {'file_name':<17}")
self._cper_display_initialized = True
# Loop through all entries in the dictionary.
@@ -1180,14 +1180,16 @@ class AMDSMIHelpers():
timestamp = entry.get("timestamp", "unknown")
gpu_id = self.get_gpu_id_from_device_handle(device_handle)
print(f"{timestamp:<20} {gpu_id:<6} {prefix:<10} {cper_data_file}")
print(f"{timestamp:<20} {gpu_id:<7} {prefix:<12} {cper_data_file:<17}")
self.increment_cper_count()
def dump_gpu_entries(self, folder, entries, cper_data, device_handle):
# Header
print(f"{'timestamp':<20} {'gpu_id':<6} {'severity':<10} {'file_name'}")
self._cper_display_initialized = True
# One-time initialization: print warning & header only once
if not getattr(self, "_cper_display_initialized", False):
# Warning if no folder was specified elsewhere
print(f"{'timestamp':<20} {'gpu_id':<7} {'severity':<12} {'file_name':<17}")
self._cper_display_initialized = True
if folder:
@@ -1220,7 +1222,7 @@ class AMDSMIHelpers():
#print header
timestamp = entry.get("timestamp", "unknown")
gpu_id = self.get_gpu_id_from_device_handle(device_handle)
print(f"{timestamp:<20} {gpu_id:<6} {prefix:<10} {cper_data_file}")
print(f"{timestamp:<20} {gpu_id:<7} {prefix:<12} {cper_data_file:<17}")
self.increment_cper_count()
@@ -1241,9 +1243,11 @@ class AMDSMIHelpers():
def dump_all_entries(self, folder, entries, cper_data, device_handle):
# Header
print(f"{'timestamp':<20} {'gpu_id':<6} {'severity':<10} {'file_name'}")
self._cper_display_initialized = True
# One-time initialization: print warning & header only once
if not getattr(self, "_cper_display_initialized", False):
# Warning if no folder was specified elsewhere
print(f"{'timestamp':<20} {'gpu_id':<7} {'severity':<12} {'file_name':<17}")
self._cper_display_initialized = True
if folder:
@@ -1276,7 +1280,7 @@ class AMDSMIHelpers():
#print header
timestamp = entry.get("timestamp", "unknown")
gpu_id = self.get_gpu_id_from_device_handle(device_handle)
print(f"{timestamp:<20} {gpu_id:<6} {prefix:<10} {cper_data_file}")
print(f"{timestamp:<20} {gpu_id:<7} {prefix:<12} {cper_data_file:<17}")
self.increment_cper_count()
try:
@@ -1293,9 +1297,11 @@ class AMDSMIHelpers():
def dump_all_entries_follow(self, folder, entries, cper_data, device_handle):
# Header
print(f"{'timestamp':<20} {'gpu_id':<6} {'severity':<10} {'file_name'}")
self._cper_display_initialized = True
# One-time initialization: print warning & header only once
if not getattr(self, "_cper_display_initialized", False):
# Warning if no folder was specified elsewhere
print(f"{'timestamp':<20} {'gpu_id':<7} {'severity':<12} {'file_name':<17}")
self._cper_display_initialized = True
if folder:
@@ -1328,7 +1334,7 @@ class AMDSMIHelpers():
#print header
timestamp = entry.get("timestamp", "unknown")
gpu_id = self.get_gpu_id_from_device_handle(device_handle)
print(f"{timestamp:<20} {gpu_id:<6} {prefix:<10} {cper_data_file}")
print(f"{timestamp:<20} {gpu_id:<7} {prefix:<12} {cper_data_file:<17}")
self.increment_cper_count()
time.sleep(1)
@@ -1346,9 +1352,11 @@ class AMDSMIHelpers():
def dump_gpu_entries_follow(self, folder, entries, cper_data, device_handle):
# Header
print(f"{'timestamp':<20} {'gpu_id':<6} {'severity':<10} {'file_name'}")
self._cper_display_initialized = True
# One-time initialization: print warning & header only once
if not getattr(self, "_cper_display_initialized", False):
# Warning if no folder was specified elsewhere
print(f"{'timestamp':<20} {'gpu_id':<7} {'severity':<12} {'file_name':<17}")
self._cper_display_initialized = True
if folder:
@@ -1381,7 +1389,7 @@ class AMDSMIHelpers():
#print header
timestamp = entry.get("timestamp", "unknown")
gpu_id = self.get_gpu_id_from_device_handle(device_handle)
print(f"{timestamp:<20} {gpu_id:<6} {prefix:<10} {cper_data_file}")
print(f"{timestamp:<20} {gpu_id:<7} {prefix:<12} {cper_data_file:<17}")
self.increment_cper_count()
time.sleep(1)
@@ -1396,3 +1404,35 @@ class AMDSMIHelpers():
else:
print(json.dumps(entries, indent=2,
default=lambda o: o.decode('utf-8') if isinstance(o, bytes) else o))
def hexdump_to_string(self, data: Union[bytes, List[int]]) -> str:
    """
    Render binary data as a textual hexdump.

    Args:
        data: a bytes object, or a sequence whose items are integer byte
            values (0-255) or single-character strings.

    Returns:
        One line per 16 input items, joined with newlines. Each line holds
        the offset (8 hex digits), the item values in hex padded to a fixed
        width, and the printable-ASCII rendering between '|' characters
        ('.' stands in for anything outside 32..126). Empty input yields
        an empty string.
    """
    row_len = 16
    # two hex digits plus a separator per item, minus the trailing separator
    hex_width = row_len * 3 - 1

    # Bring every accepted input form down to a plain list of integers.
    if isinstance(data, bytes):
        values = list(data)
    else:
        values = []
        for item in data:
            values.append(item if isinstance(item, int) else ord(item))

    rendered: List[str] = []
    total = len(values)
    start = 0
    while start < total:
        row = values[start : start + row_len]
        hex_part = " ".join(f"{v:02x}" for v in row).ljust(hex_width)
        text_part = "".join([chr(v) if 32 <= v <= 126 else "." for v in row])
        rendered.append(f"{start:08x} {hex_part} |{text_part}|")
        start += row_len

    return "\n".join(rendered)
+7 -5
Melihat File
@@ -300,7 +300,7 @@ class AMDSMIParser(argparse.ArgumentParser):
return CheckOutputFilePath
def _check_input_file_path(self):
def _check_cper_file_path(self):
""" Argument action validator:
Returns a path to a file from the input file path provided.
If the file doesn't exist or is empty raise error
@@ -310,8 +310,7 @@ class AMDSMIParser(argparse.ArgumentParser):
def __call__(self, parser, args, values, option_string=None):
path = Path(values)
if not path.exists():
raise FileNotFoundError(
errno.ENOENT, os.strerror(errno.ENOENT), values)
raise FileNotFoundError(f"CPER file could not be read. Make sure the path '{path}' is correct. ")
if path.is_dir():
raise argparse.ArgumentTypeError(
@@ -1413,12 +1412,13 @@ class AMDSMIParser(argparse.ArgumentParser):
# Help text for RAS arguments
cper_help = "Trigger CPER data retrieval"
afid_help = "Generate an AFID (AMD Field ID) using CPER record, which is similar to XID."
severity_choices = ["nonfatal-uncorrected", "fatal", "nonfatal-corrected", "all"]
severity_choices_str = ", ".join(severity_choices)
severity_help = f"Set the SEVERITY filters from the following:\n {severity_choices_str}"
folder_help = "Folder to dump CPER report files"
file_limit_help = "Maximum number of entries per output file"
cper_file_help = "Full path of the cper record file to generate the AFID"
follow_help = "Continuously monitor for new entries"
ras_parser = subparsers.add_parser("ras", help=ras_help, description=ras_description)
@@ -1427,10 +1427,12 @@ class AMDSMIParser(argparse.ArgumentParser):
ras_parser.set_defaults(func=func)
# Required flags and arguments:
ras_parser.add_argument("--cper", action="store_true", required=True, help=cper_help)
ras_parser.add_argument("--cper", action="store_true", required=False, help=cper_help)
ras_parser.add_argument("--afid", action="store_true", required=False, help=afid_help)
ras_parser.add_argument("--severity", type=str.lower, nargs='+', default=['all'], help=severity_help, choices=severity_choices, metavar='SEVERITY')
ras_parser.add_argument("--folder", type=str, action=self._check_folder_path(), default=False, help=folder_help)
ras_parser.add_argument("--file_limit", type=self._positive_int, action='store', default=1000, help=file_limit_help)
ras_parser.add_argument("--cper_file", action=self._check_cper_file_path(), metavar="CPER_FILE", help=cper_file_help)
ras_parser.add_argument("--follow", action="store_true", default=False, help=follow_help)
# Add common modifiers and device selection arguments.
+49
Melihat File
@@ -5274,3 +5274,52 @@ try:
except AmdSmiException as e:
print(e)
```
### amdsmi_get_afids_from_cper
Description: Get the AFIDs from CPER buffer
Input parameters:
* `cper_buffer` buffer containing one complete CPER record, for example the raw bytes read from a CPER file (parameter name as in the C API)
Output: Tuple of the list of AFIDs decoded from the CPER record and the number of AFIDs. For reference, the header of a CPER record carries the following fields:
Field | Description
---|---
`error_severity` | The severity of the CPER error ex: `non_fatal_uncorrected`, `fatal`, `non_fatal_corrected`. |
`notify_type` | The notification type associated with the CPER entry. |
`timestamp` | The time when the CPER entry was recorded, formatted as `YYYY/MM/DD HH:MM:SS`. |
`signature` | A 4-byte signature identifying the entry, typically `CPER`. |
`revision` | The revision number of the CPER record format. |
`signature_end` | A marker value (typically `0xFFFFFFFF`) confirming the integrity of the signature. |
`sec_cnt` | The count of sections included in the CPER entry. |
`record_length` | The total length in bytes of the CPER entry. |
`platform_id` | A character array identifying the GPU or platform. |
`creator_id` | A character array indicating the creator of the CPER entry. |
`record_id` | A unique identifier for the CPER entry. |
`flags` | Reserved flags related to the CPER entry. |
`persistence_info` | Reserved information related to persistence. |
Exceptions that can be thrown by `amdsmi_get_afids_from_cper` function:
* `AmdSmiLibraryException`
* `AmdSmiParameterException`
Example:
```python
try:
    with open(cper_file_path, "rb") as cper_file:
        cper_record = cper_file.read()
    afids, num_afids = amdsmi_get_afids_from_cper(cper_record)
    print("Number of AFIDs:", num_afids)
    for afid in afids:
        print("AFID:", afid)
except AmdSmiException as e:
    print(e)
```
+64
Melihat File
@@ -0,0 +1,64 @@
/**
* @file aca_decode.h
* @brief Internal decoder interface and data structures
*/
#ifndef ACA_DECODE_H
#define ACA_DECODE_H
#ifdef __cplusplus
extern "C" {
#endif
#include "aca_fields.h"
/**
 * @brief Internal decoder structure with parsed register fields
 *
 * Keeps the three raw 64-bit register values (status, IPID, syndrome) next to
 * their field-by-field breakdowns (aca_status_fields_t, aca_ipid_fields_t,
 * aca_synd_fields_t from aca_fields.h). The first five members have the same
 * names and types as the members of aca_raw_data_t.
 */
typedef struct
{
uint64_t aca_status; /**< Raw status register value */
uint64_t aca_ipid; /**< Raw IPID register value */
uint64_t aca_synd; /**< Raw syndrome register value */
uint32_t flags; /**< Decoder flags */
uint16_t hw_revision; /**< Hardware revision */
aca_status_fields_t status; /**< Parsed status fields */
aca_ipid_fields_t ipid; /**< Parsed IPID fields */
aca_synd_fields_t synd; /**< Parsed syndrome fields */
} aca_decoder_t;
/**
 * @brief Structure containing raw ACA error data from hardware
 *
 * This is the input type of aca_decode(): three raw 64-bit register values
 * plus the descriptor flags and the hardware revision.
 */
typedef struct
{
uint64_t aca_status; /**< Raw status register value */
uint64_t aca_ipid; /**< Raw IPID register value */
uint64_t aca_synd; /**< Raw syndrome register value */
uint32_t flags; /**< Flags from descriptor */
uint16_t hw_revision; /**< Hardware revision number */
} aca_raw_data_t;
/**
 * @brief Structure containing decoded error information
 *
 * Returned by value from aca_decode(). The string members are plain
 * `const char *` references; this header does not state who owns the
 * pointed-to strings.
 * NOTE(review): presumably they point into static lookup tables and must not
 * be freed by the caller - confirm against the implementation.
 */
typedef struct
{
const char *bank_ref; /**< Reference to bank name string */
const char *error_type_ref; /**< Reference to error type string */
const char *severity_ref; /**< Reference to error severity string */
const char *category_ref; /**< Reference to error category string */
int afid; /**< AFID value (AMD Field ID) */
} aca_error_info_t;
/**
* @brief Main decode function that processes raw ACA error data
* @param[in] raw_data Pointer to structure containing raw ACA error data
* @return Decoded error information structure
*/
aca_error_info_t aca_decode(const aca_raw_data_t *raw_data);
#ifdef __cplusplus
}
#endif
#endif /* ACA_DECODE_H */
+110
Melihat File
@@ -0,0 +1,110 @@
/**
* @file aca_fields.h
* @brief ACA register field definitions and manipulation functions
*
* Contains structures and functions for decoding and handling
* ACA register fields. It provides field
* definitions for status, IPID, and syndrome registers, along with
* functions to initialize and access these fields.
*/
#ifndef ACA_FIELDS_H
#define ACA_FIELDS_H
#include <stdint.h>
/**
 * @brief Base structure for ACA fields containing raw register value
 *
 * Embedded as the first member (`base`) of aca_status_fields_t,
 * aca_ipid_fields_t and aca_synd_fields_t; aca_fields_read() returns the
 * value stored here.
 */
typedef struct
{
uint64_t raw_value; /**< Raw 64-bit register value */
} aca_fields_t;
/**
 * @brief Structure containing decoded ACA status register fields
 *
 * Filled in by aca_status_init() from a raw 64-bit status register value.
 * `base` carries the raw value; every other member holds one extracted field
 * in its own integer (these are ordinary members, not C bit-fields, so the
 * struct layout is not the register layout).
 * NOTE(review): the `reservNN` members look like reserved register ranges,
 * with NN presumably the starting bit position - confirm against the
 * register specification.
 */
typedef struct
{
aca_fields_t base;
uint16_t error_code;
uint8_t error_code_ext;
uint8_t reserv22;
uint8_t addr_lsb;
uint8_t reserv30;
uint8_t err_core_id;
uint8_t reserv38;
uint8_t scrub;
uint8_t reserv41;
uint8_t poison;
uint8_t deferred;
uint8_t uecc;
uint8_t cecc;
uint8_t reserv47;
uint8_t synd_v;
uint8_t reserv54;
uint8_t tcc;
uint8_t err_core_id_val;
uint8_t pcc;
uint8_t addr_v;
uint8_t misc_v;
uint8_t en;
uint8_t uc;
uint8_t overflow;
uint8_t val;
} aca_status_fields_t;
/**
 * @brief Structure containing decoded ACA IPID register fields
 *
 * Filled in by aca_ipid_init() from a raw 64-bit IPID register value.
 * `base` carries the raw value. `hardware_id` and `aca_type` have the same
 * types as the (hw_id, aca_type) pair accepted by find_bank_name().
 * NOTE(review): instance_id_lo / instance_id_hi are presumably the low and
 * high parts of one instance identifier - confirm.
 */
typedef struct
{
aca_fields_t base;
uint32_t instance_id_lo;
uint16_t hardware_id;
uint16_t aca_type;
uint8_t instance_id_hi;
} aca_ipid_fields_t;
/**
 * @brief Structure containing decoded ACA syndrome register fields
 *
 * Filled in by aca_synd_init() from a raw 64-bit syndrome register value.
 * `base` carries the raw value; the remaining members each hold one
 * extracted field.
 * NOTE(review): `reserved27` / `reserved39` look like reserved register
 * ranges named after a bit position - confirm against the register
 * specification.
 */
typedef struct
{
aca_fields_t base;
uint32_t error_information;
uint8_t length;
uint8_t error_priority;
uint8_t reserved27;
uint16_t syndrome;
uint32_t reserved39;
} aca_synd_fields_t;
/**
* @brief Reads the raw value from an ACA field structure
* @param[in] fields Pointer to the ACA fields structure
* @return The raw 64-bit value stored in the structure
*/
uint64_t aca_fields_read(const aca_fields_t *fields);
/**
* @brief Initializes ACA status fields from a raw status register value
* @param[out] fields Pointer to the status fields structure to initialize
* @param[in] status_reg Raw 64-bit status register value
*/
void aca_status_init(aca_status_fields_t *fields, uint64_t status_reg);
/**
* @brief Initializes ACA IPID fields from a raw IPID register value
* @param[out] fields Pointer to the IPID fields structure to initialize
* @param[in] ipid_reg Raw 64-bit IPID register value
*/
void aca_ipid_init(aca_ipid_fields_t *fields, uint64_t ipid_reg);
/**
* @brief Initializes ACA syndrome fields from a raw syndrome register value
* @param[out] fields Pointer to the syndrome fields structure to initialize
* @param[in] synd_reg Raw 64-bit syndrome register value
*/
void aca_synd_init(aca_synd_fields_t *fields, uint64_t synd_reg);
#endif
+84
Melihat File
@@ -0,0 +1,84 @@
/**
* @file aca_tables.h
* @brief ACA lookup table definitions and helper functions
* @details Contains data structures and functions definitions for mapping ACA Registers
* into their corresponding names and types.
*/
#ifndef ACA_TABLES_H
#define ACA_TABLES_H
#include <stdint.h>
#include <stddef.h>
/**
 * @brief Structure mapping hardware ID and ACA type to bank names
 *
 * Element type of bank_table[]. The (hw_id, aca_type) pair has the same
 * types as the lookup key of find_bank_name().
 */
typedef struct
{
uint16_t hw_id; /**< Hardware ID value */
uint16_t aca_type; /**< ACA type identifier */
const char *name; /**< Bank name string */
} aca_bank_entry_t;
/**
 * @brief Structure mapping bank-specific error codes to error types
 *
 * Element type of error_table[]. The (bank, error_code) pair has the same
 * types as the lookup key of find_error_type_by_bank().
 */
typedef struct
{
const char *bank; /**< Bank name string */
uint32_t error_code; /**< Error code value */
const char *type; /**< Error type string */
} aca_error_type_t;
/**
 * @brief Structure for generic error code to error type mapping
 *
 * Element type of xcd_error_table[] and aid_error_table[], and of the table
 * argument taken by find_error_in_table().
 */
typedef struct
{
uint32_t error_code; /**< Error code value */
const char *type; /**< Error type string */
} aca_error_entry_t;
// External table declarations
// The arrays are declared with unknown bound, so sizeof() cannot be used on
// them from this header; use the NUM_* constants below for their lengths.
extern const aca_bank_entry_t bank_table[];
extern const aca_error_type_t error_table[];
extern const aca_error_entry_t xcd_error_table[];
extern const aca_error_entry_t aid_error_table[];
// Table size constants
// NOTE(review): presumably NUM_BANKS, NUM_ERRORS, NUM_XCD_ERRORS and
// NUM_AID_ERRORS are the element counts of bank_table, error_table,
// xcd_error_table and aid_error_table respectively - confirm in the
// defining source file.
extern const size_t NUM_BANKS;
extern const size_t NUM_ERRORS;
extern const size_t NUM_XCD_ERRORS;
extern const size_t NUM_AID_ERRORS;
/**
* @brief Find bank name based on hardware ID and ACA type
* @param[in] hw_id Hardware ID value
* @param[in] aca_type ACA type value
* @param[out] bank_name Pointer to store result string
* @return 0 on success, 1 if not found, -1 on parameter error
*/
int find_bank_name(uint16_t hw_id, uint16_t aca_type, const char **bank_name);
/**
* @brief Find error type for a specific bank and error code
* @param[in] bank Bank name string
* @param[in] error_code Error code value
* @param[out] error_type Pointer to store result string
* @return 0 on success, 1 if not found, -1 on parameter error
*/
int find_error_type_by_bank(const char *bank, uint32_t error_code, const char **error_type);
/**
* @brief Generic lookup for error codes in an error table
* @param[in] table Pointer to error table
* @param[in] table_size Number of table entries
* @param[in] error_code Error code to look up
* @param[out] error_type Pointer to store result string
* @return 0 on success, 1 if not found, -1 on parameter error
*/
int find_error_in_table(const aca_error_entry_t *table, size_t table_size,
uint32_t error_code, const char **error_type);
#endif
+27
Melihat File
@@ -0,0 +1,27 @@
#ifndef ERROR_MAP_H
#define ERROR_MAP_H
#include <stdint.h>
/**
 * @brief Structure representing an error mapping entry
 *
 * Associates a numeric id with four descriptive strings. get_error_id()
 * takes a category, a type and a severity as its lookup arguments; `method`
 * is not among them.
 */
typedef struct
{
uint32_t id; /* Numeric error ID of this entry */
const char *error_category; /* Error category string */
const char *error_type; /* Error type string */
const char *method; /* Method string; not a get_error_id() argument */
const char *error_severity; /* Error severity string */
} error_map_entry_t;
/**
 * @brief Get error ID based on category, type and severity
 * @param[in] error_category Error category string
 * @param[in] error_type Error type string
 * @param[in] error_severity Error severity string
 * @return Error ID if found, -1 if not found
 *
 * NOTE(review): error_map_entry_t::id is declared uint32_t while this
 * function returns a signed int so that -1 can signal "not found";
 * presumably all IDs fit in the non-negative int range - confirm.
 */
int get_error_id(const char *error_category, const char *error_type, const char *error_severity);
#endif /* ERROR_MAP_H */
+32 -1
Melihat File
@@ -150,7 +150,7 @@ typedef enum {
#define AMDSMI_MAX_NUM_JPEG 32
/**
* @brief new for gpu metrics v1.8, document presents NUM_JPEG_ENG_V1
* @brief Introduced in gpu metrics v1.8, document presents NUM_JPEG_ENG_V1
* but will change to AMDSMI_MAX_NUM_JPEG_ENG_V1 for continuity
*/
#define AMDSMI_MAX_NUM_JPEG_ENG_V1 40
@@ -182,6 +182,11 @@ typedef enum {
*/
#define AMDSMI_MAX_NUM_XCP 8
/**
 * @brief Maximum number of AFIDs contained in one CPER entry
 *
 * Suggested capacity for the @p afids array passed to
 * amdsmi_get_afids_from_cper().
 * NOTE(review): unlike the neighbouring limits (AMDSMI_MAX_NUM_XCP,
 * AMDSMI_MAX_NUM_JPEG, ...) this macro has no AMDSMI_ prefix.
 */
#define MAX_NUMBER_OF_AFIDS_PER_RECORD 12
/* string format */
#define AMDSMI_TIME_FORMAT "%02d:%02d:%02d.%03d"
#define AMDSMI_DATE_FORMAT "%04d-%02d-%02d:%02d:%02d:%02d.%03d"
@@ -4795,6 +4800,32 @@ amdsmi_get_gpu_cper_entries(amdsmi_processor_handle processor_handle, uint32_t s
/** @} End tagECCInfo */
/**
 * @brief Get the AFIDs from CPER buffer
 *
 * @platform{gpu_bm_linux} @platform{host} @platform{guest_1vf}
 * @platform{guest_mvf} @platform{guest_windows}
 *
 * @details A utility function which retrieves the AFIDs from the CPER record.
 *
 * @param[in] cper_buffer a pointer to the buffer with one CPER record. The caller
 * must make sure the whole CPER record is loaded into the buffer.
 *
 * @param[in] buf_size the size of @p cper_buffer.
 *
 * @param[out] afids a pointer to an array of uint64_t to which the AFIDs will be written
 *
 * @param[in,out] num_afids As input, the value passed through this parameter is the number of
 * uint64_t that may be safely written to the memory pointed to by @p afids. This is the limit
 * on how many AFIDs will be written to @p afids. On return, @p num_afids will contain the
 * number of AFIDs written to @p afids, or the number of AFIDs that could have been written
 * if enough memory had been provided. It is suggested to pass MAX_NUMBER_OF_AFIDS_PER_RECORD
 * so that there is room for all AFIDs.
 *
 * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail
 */
amdsmi_status_t amdsmi_get_afids_from_cper(
char* cper_buffer, uint32_t buf_size, uint64_t* afids, uint32_t* num_afids);
/*****************************************************************************/
/** @defgroup tagErrorQuery Error Queries
* These functions provide error information about AMDSMI calls as well as
+223
Melihat File
@@ -0,0 +1,223 @@
/*
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#pragma once
#include "amd_smi/amdsmi.h"
#pragma pack(1)
#define CPER_MAX_OAM_COUNT (8)
/**
 * @brief Error severity codes used by the CPER structures in this header.
 *
 * Plain (unscoped) enum, so the enumerators remain visible at the enclosing
 * scope and convert implicitly to integers, exactly as before.
 *
 * The previous `typedef enum cper_error_severity { ... };` form supplied no
 * alias name after the closing brace, so the `typedef` keyword declared
 * nothing and only produced a compiler diagnostic. It is dropped here; the
 * type name and all enumerator values are unchanged.
 *
 * NOTE(review): the value 2 is the one amdsmi.h calls
 * AMDSMI_CPER_SEV_NON_FATAL_CORRECTED; the enumerator names here say "FATAL"
 * for values 0 and 2 - confirm the naming is intentional.
 */
enum cper_error_severity {
    CPER_SEV_FATAL_UNCORRECTED = 0,
    CPER_SEV_FATAL = 1,
    CPER_SEV_FATAL_CORRECTED = 2,
    CPER_SEV_UNUSED = 10,
};
/**
 * @brief Indices of the 32-bit ACA register words inside a register dump.
 *
 * Each 64-bit register occupies two consecutive slots (_LO then _HI).
 * CPER_ACA_REG_COUNT is not the number of named registers (those end at 13);
 * it is the array length used for cper_sec_nonstd_err_ctx::reg_dump.
 *
 * The previous `typedef enum cper_aca_reg { ... };` form supplied no alias
 * name after the closing brace, so the `typedef` keyword declared nothing
 * and only produced a compiler diagnostic. It is dropped here; the type name
 * and all enumerator values are unchanged.
 */
enum cper_aca_reg {
    CPER_ACA_REG_CTL_LO = 0,
    CPER_ACA_REG_CTL_HI = 1,
    CPER_ACA_REG_STATUS_LO = 2,
    CPER_ACA_REG_STATUS_HI = 3,
    CPER_ACA_REG_ADDR_LO = 4,
    CPER_ACA_REG_ADDR_HI = 5,
    CPER_ACA_REG_MISC0_LO = 6,
    CPER_ACA_REG_MISC0_HI = 7,
    CPER_ACA_REG_CONFIG_LO = 8,
    CPER_ACA_REG_CONFIG_HI = 9,
    CPER_ACA_REG_IPID_LO = 10,
    CPER_ACA_REG_IPID_HI = 11,
    CPER_ACA_REG_SYND_LO = 12,
    CPER_ACA_REG_SYND_HI = 13,
    CPER_ACA_REG_COUNT = 32,
};
/*
 * Section descriptor of a CPER entry: gives the offset and length of one
 * section together with its revision, validity bits, flags, section-type
 * GUID, FRU identification and severity.
 * Declared while #pragma pack(1) is in effect, so member order and widths
 * define the layout; do not reorder members.
 */
struct cper_sec_desc {
uint32_t sec_offset; /* Offset from the start of CPER entry */
uint32_t sec_length; /* Length of the section (presumably in bytes - confirm) */
uint8_t revision_minor; /* CPER_SEC_MINOR_REV_1 */
uint8_t revision_major; /* CPER_SEC_MAJOR_REV_22 */
/* Validity flags: valid_mask overlays the same byte as the valid_bits bit-fields */
union {
struct {
uint8_t fru_id : 1;
uint8_t fru_text : 1;
uint8_t reserved : 6;
} valid_bits;
uint8_t valid_mask;
};
uint8_t reserved;
/* Section flags: flags_mask overlays the same 32 bits as the flags_bits bit-fields */
union {
struct {
uint32_t primary : 1;
uint32_t reserved1 : 2;
uint32_t exceed_err_threshold : 1;
uint32_t latent_err : 1; /* "Deferred" error creation */
uint32_t reserved2 : 27;
} flags_bits;
uint32_t flags_mask;
};
amdsmi_cper_guid_t sec_type; /* AMD non-Standard, AMD Crashdump */
char fru_id[16]; /* FRU Serial ID */
amdsmi_cper_sev_t severity;
char fru_text[20]; /* "OAM%d" */
};
/*
 * Error-information part of a non-standard error section: an error-type
 * GUID, a 64-bit validity word, a 64-bit "ms_chk" word and four 64-bit
 * values. Declared while #pragma pack(1) is in effect; do not reorder
 * members.
 */
struct cper_sec_nonstd_err_info {
amdsmi_cper_guid_t error_type;
/*
 * Validity flags; valid_mask overlays the same 64 bits. Each named 1-bit
 * flag has a same-named member further down in this struct (ms_chk pairs
 * with the ms_chk_bits/ms_chk_mask union).
 * NOTE(review): presumably a set bit means the matching member is
 * populated - confirm.
 */
union {
struct {
uint64_t ms_chk : 1;
uint64_t target_addr_id : 1;
uint64_t req_id : 1;
uint64_t resp_id : 1;
uint64_t instr_ptr : 1;
uint64_t reserved : 59;
} valid_bits;
uint64_t valid_mask;
};
/*
 * "ms_chk" word; ms_chk_mask overlays the same 64 bits. The low six bits are
 * *_valid flags, followed by reserved bits and then the values themselves.
 */
union {
struct {
uint64_t err_type_valid : 1;
uint64_t pcc_valid : 1;
uint64_t uncorr_valid : 1;
uint64_t precise_ip_valid : 1;
uint64_t restartable_ip_valid : 1;
uint64_t overflow_valid : 1;
uint64_t reserved1 : 10;
uint64_t err_type : 2;
uint64_t pcc : 1;
uint64_t uncorr : 1;
uint64_t precised_ip : 1;
uint64_t restartable_ip : 1;
uint64_t overflow : 1;
uint64_t reserved2 : 41;
} ms_chk_bits;
uint64_t ms_chk_mask;
};
uint64_t target_addr_id;
uint64_t req_id;
uint64_t resp_id;
uint64_t instr_ptr;
};
/*
 * Register-context part of a non-standard error section: a small header
 * followed by a dump of 32-bit register words. Declared while
 * #pragma pack(1) is in effect; do not reorder members.
 */
struct cper_sec_nonstd_err_ctx {
uint16_t reg_ctx_type;
uint16_t reg_arr_size;
uint32_t msr_addr;
uint64_t mm_reg_addr;
/*
 * CPER_ACA_REG_COUNT (32) words.
 * NOTE(review): presumably indexed with enum cper_aca_reg - confirm.
 */
uint32_t reg_dump[CPER_ACA_REG_COUNT]; /* This buffer can grow */
};
/*
 * Header of a non-standard error section: a 64-bit validity/count word, an
 * APIC id and a 48-byte firmware id. Declared while #pragma pack(1) is in
 * effect; do not reorder members.
 */
struct cper_sec_nonstd_err_hdr {
/*
 * valid_mask overlays the same 64 bits as valid_bits. The bit-fields name
 * only the low 14 bits (1 + 1 + 6 + 6); the remaining bits have no named
 * field.
 */
union {
struct {
uint64_t apic_id : 1;
uint64_t fw_id : 1;
uint64_t err_info_cnt : 6; /* should match context_cnt */
uint64_t err_context_cnt : 6; /* should match info_cnt */
} valid_bits;
uint64_t valid_mask;
};
uint64_t apic_id;
char fw_id[48];
};
/*
 * One body entry of a non-standard error section: the error information
 * immediately followed by its register context. Element type of
 * cper_sec_nonstd_err::body.
 */
struct cper_sec_nonstd_err_body {
struct cper_sec_nonstd_err_info err_info;
struct cper_sec_nonstd_err_ctx err_ctx;
};
/*
 * Non-standard (runtime) error section: a header followed by a variable
 * number of body entries.
 * `body` is a flexible array member: it contributes nothing to
 * sizeof(struct cper_sec_nonstd_err), and in C++ it is a compiler extension
 * rather than standard syntax.
 */
struct cper_sec_nonstd_err {
struct cper_sec_nonstd_err_hdr hdr;
struct cper_sec_nonstd_err_body body[]; /* Variable Size, today only 1 entry */
};
/*
 * Data part of a crashdump section. The first four members have the same
 * types and order as the first four members of cper_sec_nonstd_err_ctx
 * (with the two address members replaced by reserved ones). They are
 * followed by a union holding either the fatal-error register words or the
 * boot-error messages. Declared while #pragma pack(1) is in effect; do not
 * reorder members.
 */
struct cper_sec_crashdump_data {
uint16_t reg_ctx_type;
uint16_t reg_arr_size;
uint32_t reserved1;
uint64_t reserved2;
union {
/* Fatal error: status, address, IPID and syndrome, each as lo/hi 32-bit words */
struct {
uint32_t status_lo;
uint32_t status_hi;
uint32_t addr_lo;
uint32_t addr_hi;
uint32_t ipid_lo;
uint32_t ipid_hi;
uint32_t synd_lo;
uint32_t synd_hi;
} fatal_err;
/* Boot error: CPER_MAX_OAM_COUNT (8) 64-bit message words */
struct {
uint64_t msg[CPER_MAX_OAM_COUNT];
} boot_err;
} dump;
};
/*
 * Crashdump section: reserved words, a 48-byte firmware id (same size as
 * cper_sec_nonstd_err_hdr::fw_id), more reserved words, then the crashdump
 * data. Declared while #pragma pack(1) is in effect; do not reorder members.
 */
struct cper_sec_crashdump {
uint64_t reserved1;
uint64_t reserved2;
char fw_id[48];
uint64_t reserved3[8];
struct cper_sec_crashdump_data data;
};
/*
 * One CPER section: a validity byte followed by the section payload, which
 * is either a crashdump section or a runtime (non-standard) error section.
 * Nothing in this struct records which union member is active.
 * NOTE(review): presumably the owning cper_sec_desc::sec_type selects it -
 * confirm.
 */
struct cper_sec {
/* valid_mask overlays the same byte as the valid_bits bit-fields */
union {
struct {
uint8_t fru_id : 1;
uint8_t fru_text : 1;
uint8_t reserved : 6;
} valid_bits;
uint8_t valid_mask;
};
union {
struct cper_sec_crashdump crashdump;
struct cper_sec_nonstd_err runtime_err;
};
};
/*
 * General CPER record structure.
 * Holds three pointers (header, section descriptors, sections) rather than
 * the data itself. `struct cper_hdr` is only referenced by pointer here; its
 * definition is not part of this header section.
 */
struct cper_1_0 {
struct cper_hdr *hdr;
struct cper_sec_desc *sec_desc; /* Variable Size */
struct cper_sec *sec; /* Variable Size */
};
#pragma pack()
/*
 * Path-based variant of amdsmi_get_gpu_cper_entries(): it takes the CPER
 * ring file path directly instead of a processor handle.
 * NOTE(review): parameter semantics are not documented here; presumably
 * severity_mask, cper_data, buf_size, cper_hdrs, entry_count and cursor
 * behave as documented for amdsmi_get_gpu_cper_entries() in amdsmi.h -
 * confirm.
 */
amdsmi_status_t amdsmi_get_gpu_cper_entries_by_path(const char *amdgpu_ring_cper_file, uint32_t severity_mask,
char *cper_data, uint64_t *buf_size, amdsmi_cper_hdr_t **cper_hdrs,
uint64_t *entry_count, uint64_t *cursor);
/*
 * Decodes one CPER record, given by a pointer to its header, and returns a
 * vector of ints.
 * NOTE(review): presumably the returned values are the record's AFIDs -
 * confirm.
 * NOTE(review): this header uses std::vector but its only visible #include
 * is "amd_smi/amdsmi.h"; confirm <vector> is reachable for every includer.
 */
std::vector<int> cper_decode(const amdsmi_cper_hdr_t *cper);
-1
Melihat File
@@ -56,7 +56,6 @@ std::string smi_split_string(std::string str, char delim);
std::string smi_amdgpu_get_status_string(amdsmi_status_t ret, bool fullStatus);
amdsmi_status_t smi_clear_char_and_reinitialize(char buffer[], uint32_t len,
std::string newString);
amdsmi_status_t amdsmi_get_gpu_cper_entries_by_path(const char *amdgpu_ring_cper_file, uint32_t severity_mask, char *cper_data, uint64_t *buf_size, amdsmi_cper_hdr_t **cper_hdrs, uint64_t *entry_count, uint64_t *cursor);
/**
* @brief Wait for user input, a debugging function to pause the program
*
+93 -26
Melihat File
@@ -60,6 +60,9 @@ AMDSMI_MAX_NUM_JPEG = 32
AMDSMI_MAX_NUM_XCC = 8
AMDSMI_MAX_NUM_XCP = 8
# max num afids per cper record
MAX_NUMBER_OF_AFIDS_PER_RECORD = 12
# Max number of DPM policies
AMDSMI_MAX_NUM_PM_POLICIES = 32
@@ -1888,7 +1891,6 @@ def amdsmi_get_gpu_asic_info(
# Remove commas from vendor name for clean output
asic_info["vendor_name"] = asic_info["vendor_name"].replace(',', '')
# logging.debug("amdsmi_interface.py | amdsmi_get_gpu_asic_info | return_dictionary = \n" + str(json.dumps(asic_info, indent=4)))
return asic_info
@@ -2300,9 +2302,10 @@ def notifyTypeToString(notify_type_b):
idx = idx +1
return "".join(guid[::-1])
def amdsmi_get_gpu_cper_entries(processor_handle: amdsmi_wrapper.amdsmi_processor_handle,
def amdsmi_get_gpu_cper_entries(
processor_handle: amdsmi_wrapper.amdsmi_processor_handle,
severity_mask: int,
buffer_size: int = 4*1048576,
buffer_size: int = 4 * 1048576,
cursor: int = 0
) -> Tuple[List[Dict[str, Any]], int]:
@@ -2316,6 +2319,7 @@ def amdsmi_get_gpu_cper_entries(processor_handle: amdsmi_wrapper.amdsmi_processo
buf_size = ctypes.c_uint64(buffer_size)
entry_count = ctypes.c_uint64(20)
cur = ctypes.c_uint64(cursor)
# Allocate a pointer for the CPER header array.
cper_hdrs_array = (ctypes.POINTER(amdsmi_wrapper.amdsmi_cper_hdr_t) * 20)()
cper_hdrs = ctypes.cast(cper_hdrs_array, ctypes.POINTER(ctypes.POINTER(amdsmi_wrapper.amdsmi_cper_hdr_t)))
@@ -2336,51 +2340,114 @@ def amdsmi_get_gpu_cper_entries(processor_handle: amdsmi_wrapper.amdsmi_processo
entries = {}
cper_data = []
offset = 0
# Iterate over each entry using its variable record_length.
for i in range(entry_count.value):
entry_address = ctypes.addressof(buf) + offset
entry_ptr = ctypes.cast(entry_address, ctypes.POINTER(amdsmi_wrapper.amdsmi_cper_hdr_t))
# Extract the raw bytes and size of the entry.
cper_data.append({
"bytes":list((entry_ptr.contents.record_length * ctypes.c_byte).from_address(entry_address)),
"size":entry_ptr.contents.record_length
"bytes": list((entry_ptr.contents.record_length * ctypes.c_byte).from_address(entry_address)),
"size": entry_ptr.contents.record_length
})
# Extract the timestamp fields.
year = entry_ptr.contents.timestamp.year
# Adjust the year if it's less than 100. You can tweak this logic based on your expected data.
if year < 100:
year += 2000
if year < 100: # Adjust the year if it's less than 100.
year += 2000
formatted_timestamp = (
f"{year:04d}/"
f"{entry_ptr.contents.timestamp.month:02d}/"
f"{entry_ptr.contents.timestamp.day:02d} "
f"{entry_ptr.contents.timestamp.hours:02d}:"
f"{entry_ptr.contents.timestamp.minutes:02d}:"
f"{entry_ptr.contents.timestamp.seconds:02d}"
f"{year:04d}/"
f"{entry_ptr.contents.timestamp.month:02d}/"
f"{entry_ptr.contents.timestamp.day:02d} "
f"{entry_ptr.contents.timestamp.hours:02d}:"
f"{entry_ptr.contents.timestamp.minutes:02d}:"
f"{entry_ptr.contents.timestamp.seconds:02d}"
)
# Create a dictionary for the CPER entry.
cper_entry = {
"error_severity": amdsmi_wrapper.amdsmi_cper_sev_t__enumvalues.get(entry_ptr.contents.error_severity, "AMDSMI_CPER_SEV_UNUSED").replace("AMDSMI_CPER_SEV_", "").lower(),
"error_severity": amdsmi_wrapper.amdsmi_cper_sev_t__enumvalues.get(
entry_ptr.contents.error_severity, "AMDSMI_CPER_SEV_UNUSED"
).replace("AMDSMI_CPER_SEV_", "").lower(),
"notify_type": _notifyTypeToString(entry_ptr.contents.notify_type.b),
"timestamp": formatted_timestamp,
"signature" : entry_ptr.contents.signature,
"revision" : entry_ptr.contents.revision,
"signature_end" : hex(entry_ptr.contents.signature_end),
"sec_cnt" : entry_ptr.contents.sec_cnt,
"record_length" : entry_ptr.contents.record_length,
"platform_id" : entry_ptr.contents.platform_id,
"creator_id" : entry_ptr.contents.creator_id,
"record_id" : entry_ptr.contents.record_id,
"flags" : entry_ptr.contents.flags,
"persistence_info" : entry_ptr.contents.persistence_info,
"signature": entry_ptr.contents.signature,
"revision": entry_ptr.contents.revision,
"signature_end": hex(entry_ptr.contents.signature_end),
"sec_cnt": entry_ptr.contents.sec_cnt,
"record_length": entry_ptr.contents.record_length,
"platform_id": entry_ptr.contents.platform_id,
"creator_id": entry_ptr.contents.creator_id,
"record_id": entry_ptr.contents.record_id,
"flags": entry_ptr.contents.flags,
"persistence_info": entry_ptr.contents.persistence_info,
#"reserved" : entry_ptr.contents.reserved
#"cper_valid_bit" : entry_ptr.contents.cper_valid_bits,
#"partition_id" : entry_ptr.contents.partition_id,
}
entries[i] = cper_entry.copy()
offset += entry_ptr.contents.record_length # Use the actual record length to advance the offset
offset += entry_ptr.contents.record_length # Use the actual record length to advance the offset.
return entries, cur.value, cper_data
def amdsmi_get_afids_from_cper(
    cper_afid_data: Union[bytes, bytearray, List[Dict[str, Any]]]
) -> Tuple[List[int], int]:
    """
    Extract AFIDs from one or more CPER blobs.

    Args:
        cper_afid_data: Either
            - raw bytes or bytearray of a single CPER record, or
            - a list of dicts each with keys "bytes" (List[int]) and "size" (int),
              i.e. the ``cper_data`` list returned by ``amdsmi_get_gpu_cper_entries``.

    Returns:
        Tuple[List[int], int]: A tuple containing:
            - A list of extracted AFIDs (all records concatenated, in input order).
            - The total count of AFIDs.

    Raises:
        AmdSmiLibraryException: if the library call reports a non-success status
            for any record.
    """
    # Normalize single blob into a list of records
    if isinstance(cper_afid_data, (bytes, bytearray)):
        cper_records = [{
            "bytes": cper_afid_data,
            "size": len(cper_afid_data)
        }]
    else:
        cper_records = cper_afid_data

    all_afids: List[int] = []

    for record in cper_records:
        # amdsmi_get_gpu_cper_entries builds "bytes" from a ctypes.c_byte array,
        # whose elements are *signed* (-128..127). bytes() rejects negative
        # values, so map every element back to its unsigned 0..255 form first.
        raw_bytes = bytes(b & 0xFF for b in record["bytes"])
        record_size = record["size"]

        # Wrap as char*
        buf = ctypes.create_string_buffer(raw_bytes, record_size)
        buf_ptr = ctypes.cast(buf, ctypes.POINTER(ctypes.c_char))

        # In/out parameters: the output array and its capacity.
        afid_array = (ctypes.c_uint64 * MAX_NUMBER_OF_AFIDS_PER_RECORD)()
        num_afids_ct = ctypes.c_uint32(MAX_NUMBER_OF_AFIDS_PER_RECORD)

        # Call the wrapper function
        status = amdsmi_wrapper.amdsmi_get_afids_from_cper(
            buf_ptr,
            ctypes.c_uint32(record_size),
            afid_array,
            ctypes.byref(num_afids_ct)
        )
        if status != amdsmi_wrapper.AMDSMI_STATUS_SUCCESS:
            raise AmdSmiLibraryException(f"get_afids failed: {status}")

        # Collect exactly the decoded AFIDs; never index past the array we
        # allocated even if the library reports a larger count.
        count = min(num_afids_ct.value, MAX_NUMBER_OF_AFIDS_PER_RECORD)
        all_afids.extend(afid_array[i] for i in range(count))

    return all_afids, len(all_afids)
def amdsmi_get_gpu_board_info(
processor_handle: amdsmi_wrapper.amdsmi_processor_handle,
) -> Dict[str, Any]:
+6 -3
Melihat File
@@ -2642,6 +2642,9 @@ amdsmi_cper_hdr_t = struct_amdsmi_cper_hdr_t
amdsmi_get_gpu_cper_entries = _libraries['libamd_smi.so'].amdsmi_get_gpu_cper_entries
amdsmi_get_gpu_cper_entries.restype = amdsmi_status_t
amdsmi_get_gpu_cper_entries.argtypes = [amdsmi_processor_handle, uint32_t, ctypes.POINTER(ctypes.c_char), ctypes.POINTER(ctypes.c_uint64), ctypes.POINTER(ctypes.POINTER(struct_amdsmi_cper_hdr_t)), ctypes.POINTER(ctypes.c_uint64), ctypes.POINTER(ctypes.c_uint64)]
amdsmi_get_afids_from_cper = _libraries['libamd_smi.so'].amdsmi_get_afids_from_cper
amdsmi_get_afids_from_cper.restype = amdsmi_status_t
amdsmi_get_afids_from_cper.argtypes = [ctypes.POINTER(ctypes.c_char), uint32_t, ctypes.POINTER(ctypes.c_uint64), ctypes.POINTER(ctypes.c_uint32)]
amdsmi_get_gpu_ecc_status = _libraries['libamd_smi.so'].amdsmi_get_gpu_ecc_status
amdsmi_get_gpu_ecc_status.restype = amdsmi_status_t
amdsmi_get_gpu_ecc_status.argtypes = [amdsmi_processor_handle, amdsmi_gpu_block_t, ctypes.POINTER(amdsmi_ras_err_state_t)]
@@ -3171,9 +3174,9 @@ __all__ = \
'amdsmi_free_name_value_pairs', 'amdsmi_freq_ind_t',
'amdsmi_freq_volt_region_t', 'amdsmi_frequencies_t',
'amdsmi_frequency_range_t', 'amdsmi_fw_block_t',
'amdsmi_fw_info_t', 'amdsmi_get_clk_freq',
'amdsmi_get_clock_info', 'amdsmi_get_cpu_cclk_limit',
'amdsmi_get_cpu_core_boostlimit',
'amdsmi_fw_info_t', 'amdsmi_get_afids_from_cper',
'amdsmi_get_clk_freq', 'amdsmi_get_clock_info',
'amdsmi_get_cpu_cclk_limit', 'amdsmi_get_cpu_core_boostlimit',
'amdsmi_get_cpu_core_current_freq_limit',
'amdsmi_get_cpu_core_energy',
'amdsmi_get_cpu_current_io_bandwidth',
+10 -1
Melihat File
@@ -16,6 +16,7 @@ set(INC_DIR "${PROJECT_SOURCE_DIR}/include/amd_smi")
set(SRC_LIST
"${SRC_DIR}/amd_smi.cc"
"${SRC_DIR}/amd_smi_cper.cc"
"${SRC_DIR}/amd_smi_common.cc"
"${SRC_DIR}/amd_smi_drm.cc"
"${SRC_DIR}/amd_smi_gpu_device.cc"
@@ -29,6 +30,7 @@ set(SRC_LIST
set(INC_LIST
"${INC_DIR}/amdsmi.h"
"${INC_DIR}/impl/amd_smi_common.h"
"${INC_DIR}/impl/amd_smi_cper.h"
"${INC_DIR}/impl/amd_smi_processor.h"
"${INC_DIR}/impl/amd_smi_drm.h"
"${INC_DIR}/impl/amd_smi_gpu_device.h"
@@ -38,6 +40,13 @@ set(INC_LIST
"${PROJECT_SOURCE_DIR}/rocm_smi/include/rocm_smi/rocm_smi.h"
"${PROJECT_SOURCE_DIR}/rocm_smi/include/rocm_smi/rocm_smi_utils.h")
set(ACA_SRC_DIR "aca-decode")
set(SRC_LIST ${SRC_LIST} ${ACA_SRC_DIR}/aca_decode.c ${ACA_SRC_DIR}/aca_fields.c ${ACA_SRC_DIR}/aca_tables.c
${ACA_SRC_DIR}/error_map.c)
set(ACA_INC_DIR "${PROJECT_SOURCE_DIR}/include/aca-decode")
set(INC_LIST ${INC_LIST} ${ACA_INC_DIR}/aca_decode.h ${ACA_INC_DIR}/aca_fields.h ${ACA_INC_DIR}/aca_tables.h
${ACA_INC_DIR}/error_map.h)
if(ENABLE_ESMI_LIB)
list(APPEND INC_LIST ${ESMI_INC_DIR}/e_smi/e_smi.h)
list(APPEND INC_LIST ${ESMI_INC_DIR}/e_smi/e_smi_monitor.h)
@@ -72,7 +81,7 @@ target_link_libraries(amd_smi_ex ${AMD_SMI})
add_library(${AMD_SMI} ${SRC_LIST} ${INC_LIST})
target_link_libraries(${AMD_SMI} pthread rt dl ${DRM_LIBRARIES} ${AMDGPU_DRM_LIBRARIES})
target_include_directories(${AMD_SMI} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} ${PROJECT_SOURCE_DIR}/rocm_smi/include
${PROJECT_SOURCE_DIR}/common/shared_mutex)
${PROJECT_SOURCE_DIR}/common/shared_mutex ${ACA_INC_DIR})
# use the target_include_directories() command to specify the include directories for the target
target_include_directories(${AMD_SMI} PUBLIC "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>"
+285
Melihat File
@@ -0,0 +1,285 @@
/**
* @file aca_decode.c
* @brief Implementation of ACA error decoding functions
*
* This file contains functions for decoding and analyzing ACA error information from
* raw register data. It provides functionality to determine error severity, bank
* information, and specific error types based on hardware-specific error codes.
*/
#include "aca_decode.h"
#include "aca_tables.h"
#include "error_map.h"
#include <string.h>
/**
 * @brief Resolves the bank name for a decoder from its IPID fields
 *
 * Forwards the decoded hardware ID and ACA type to find_bank_name().
 *
 * @param[in] decoder Pointer to the ACA decoder structure
 * @param[out] bank_name Receives a pointer to the bank name string
 * @return -1 if either argument is NULL; otherwise the result of
 *         find_bank_name() (0 when a bank matched, 1 when none matched and
 *         *bank_name was set to "UNKNOWN")
 */
static int
aca_decoder_get_bank(const aca_decoder_t *decoder, const char **bank_name)
{
    if (decoder == NULL || bank_name == NULL)
    {
        return -1;
    }

    return find_bank_name(decoder->ipid.hardware_id, decoder->ipid.aca_type, bank_name);
}
/**
 * @brief Classifies an error by severity from the decoded status bits
 *
 * Decision order: poison first, then PCC, then the UC/TCC pair, and finally
 * the deferred bit.
 *
 * @param[in] status Pointer to the ACA status fields structure (must not be NULL)
 * @return "Fatal", "Uncorrected, Non-fatal", "Corrected", or "UNKNOWN"
 */
static const char *get_error_severity(const aca_status_fields_t *status)
{
    /* Poison takes precedence over every other bit. */
    if (status->poison)
    {
        return "Uncorrected, Non-fatal";
    }

    if (status->pcc)
    {
        return "Fatal";
    }

    /* From here on PCC is clear. */
    if (status->uc)
    {
        return status->tcc ? "Fatal" : "Uncorrected, Non-fatal";
    }

    /* PCC and UC clear but TCC set: no severity is defined for this combination. */
    if (status->tcc)
    {
        return "UNKNOWN";
    }

    return status->deferred ? "Uncorrected, Non-fatal" : "Corrected";
}
/**
 * @brief Maps a bank name and error type to an error category
 *
 * - "umc" bank with one of the listed memory error types -> "HBM Errors"
 * - "pcs_xgmi", "kpx_serdes", "kpx_wafl" banks, or "psp" bank reporting
 *   "WAFL" -> "Off-Package Link Errors"
 * - anything else -> "Device Internal Errors"
 *
 * @param[in] bank Bank name string
 * @param[in] error_type Error type string
 * @return The category string, or "UNKNOWN" if either argument is NULL
 */
static const char *get_error_category(const char *bank, const char *error_type)
{
    static const char *const hbm_error_types[] = {
        "On-die ECC",
        "WriteDataPoisonErr",
        "AddressCommandParityErr",
        "WriteDataCrcErr",
        "EcsErr",
        "RdCrcErr",
        "End-to-end CRC"};
    static const char *const link_banks[] = {"pcs_xgmi", "kpx_serdes", "kpx_wafl"};
    size_t idx;

    if (bank == NULL || error_type == NULL)
    {
        return "UNKNOWN";
    }

    if (strcmp(bank, "umc") == 0)
    {
        for (idx = 0; idx < sizeof(hbm_error_types) / sizeof(hbm_error_types[0]); idx++)
        {
            if (strcmp(error_type, hbm_error_types[idx]) == 0)
            {
                return "HBM Errors";
            }
        }
        /* Remaining umc error types are treated as device-internal. */
        return "Device Internal Errors";
    }

    for (idx = 0; idx < sizeof(link_banks) / sizeof(link_banks[0]); idx++)
    {
        if (strcmp(bank, link_banks[idx]) == 0)
        {
            return "Off-Package Link Errors";
        }
    }

    if (strcmp(bank, "psp") == 0 && strcmp(error_type, "WAFL") == 0)
    {
        return "Off-Package Link Errors";
    }

    return "Device Internal Errors";
}
/**
 * @brief Determines the service error type from error attributes
 *
 * The service error type is the coarse error name used to look an error up in
 * the AFID map. Rules are evaluated top to bottom and the first match wins.
 *
 * @param[in] error_category Pointer to the error category string
 * @param[in] error_bank Pointer to the error bank string; only consulted for
 *                       "Off-Package Link Errors" and may be NULL (no link
 *                       rule matches in that case)
 * @param[in] error_type Pointer to the error type string
 * @param[in] error_severity Pointer to the error severity string
 * @param[out] service_error_type Pointer to store the resulting service error type string;
 *                                left untouched on failure
 * @return 0 on success, -1 if an argument is NULL/"UNKNOWN" or no rule matches
 */
static int get_service_error_type(const char *error_category, const char *error_bank, const char *error_type,
                                  const char *error_severity, const char **service_error_type)
{
    if (!error_category || !error_type || !error_severity || !service_error_type ||
        strcmp(error_category, "UNKNOWN") == 0 ||
        strcmp(error_type, "UNKNOWN") == 0 ||
        strcmp(error_severity, "UNKNOWN") == 0)
    {
        return -1;
    }

    if (strcmp(error_type, "Bad Page Retirement Threshold") == 0)
    {
        *service_error_type = "Bad Page Retirement Threshold";
        return 0;
    }

    /* Read CRC errors are reported under the end-to-end CRC service type. */
    if (strcmp(error_type, "RdCrcErr") == 0)
    {
        *service_error_type = "End-to-end CRC";
        return 0;
    }

    if ((strcmp(error_category, "HBM Errors") == 0) && (strcmp(error_severity, "Corrected") == 0))
    {
        *service_error_type = "All";
        return 0;
    }

    if ((strcmp(error_category, "HBM Errors") == 0) && (strcmp(error_severity, "Fatal") == 0) &&
        (strcmp(error_type, "On-die ECC") != 0) && (strcmp(error_type, "End-to-end CRC") != 0))
    {
        *service_error_type = "All Others";
        return 0;
    }

    if (strcmp(error_category, "Device Internal Errors") == 0)
    {
        /* HWA and WDT keep their own names; everything else is bucketed. */
        if ((strcmp(error_severity, "Uncorrected, Non-fatal") == 0 ||
             strcmp(error_severity, "Corrected") == 0 ||
             strcmp(error_severity, "Fatal") == 0) &&
            strcmp(error_type, "Hardware Assertion (HWA)") != 0 &&
            strcmp(error_type, "Watchdog Timeout (WDT)") != 0)
        {
            *service_error_type = "All Others";
            return 0;
        }
    }

    /* error_bank is the only argument not validated above, so guard it here
     * before handing it to strcmp(). */
    if (error_bank && strcmp(error_category, "Off-Package Link Errors") == 0)
    {
        if (strcmp(error_bank, "pcs_xgmi") == 0)
        {
            *service_error_type = "XGMI";
            return 0;
        }
        if (strcmp(error_bank, "kpx_wafl") == 0)
        {
            *service_error_type = "WAFL";
            return 0;
        }
    }

    return -1;
}
/**
 * @brief Derives bank, severity, error type, category and AFID from a decoder
 *
 * Flag bit 0x8 (0b1000) signals that the error threshold has been exceeded;
 * when set it forces severity "Fatal" and category "HBM Errors".
 *
 * @param[in] decoder Pointer to an initialized ACA decoder structure (must not be NULL)
 * @param[out] info Pointer to the error info structure to be populated (must not be NULL);
 *                  info->afid receives the result of get_error_id()
 */
static void aca_decoder_get_error_info(const aca_decoder_t *decoder, aca_error_info_t *info)
{
    const int threshold_exceeded = (decoder->flags & 0x8) != 0;
    const char *bank_name;
    const char *type_name = "UNKNOWN";
    const char *service_type;

    /* Bank */
    if (aca_decoder_get_bank(decoder, &bank_name) < 0)
    {
        bank_name = "UNKNOWN";
    }
    info->bank_ref = bank_name;

    /* Severity: an exceeded threshold is always fatal. */
    info->severity_ref = threshold_exceeded ? "Fatal" : get_error_severity(&decoder->status);

    /* Error type */
    if (decoder->status.error_code_ext >= 0x3A && decoder->status.error_code_ext <= 0x3E)
    {
        /* Extended codes 0x3A..0x3E are resolved through the XCD/AID tables,
         * selected by instance ID and keyed by the low byte of the syndrome's
         * error information. */
        const uint32_t instance_id = decoder->ipid.instance_id_lo;
        const uint32_t error_info = decoder->synd.error_information & 0xFF;
        const char *looked_up;

        switch (instance_id)
        {
        case 0x36430400:
        case 0x38430400:
        case 0x36430401:
        case 0x38430401:
            if (find_error_in_table(xcd_error_table, NUM_XCD_ERRORS, error_info, &looked_up) == 0)
            {
                type_name = looked_up;
            }
            break;
        case 0x3B30400:
        case 0x3B30401:
            if (find_error_in_table(aid_error_table, NUM_AID_ERRORS, error_info, &looked_up) == 0)
            {
                type_name = looked_up;
            }
            break;
        default:
            break;
        }
    }
    else if (threshold_exceeded)
    {
        type_name = "Bad Page Retirement Threshold";
    }
    else
    {
        const char *looked_up;

        if (find_error_type_by_bank(bank_name, decoder->status.error_code_ext, &looked_up) == 0)
        {
            type_name = looked_up;
        }
    }
    info->error_type_ref = type_name;

    /* Category: an exceeded threshold is always an HBM error. */
    info->category_ref = threshold_exceeded
                             ? "HBM Errors"
                             : get_error_category(bank_name, info->error_type_ref);

    /* AFID: fall back to the raw error type when no service type applies. */
    if (get_service_error_type(info->category_ref, info->bank_ref, info->error_type_ref,
                               info->severity_ref, &service_type) != 0)
    {
        service_type = info->error_type_ref;
    }
    info->afid = get_error_id(info->category_ref, service_type, info->severity_ref);
}
/**
 * @brief Resets a decoder and loads it with raw register values
 *
 * The structure is zeroed first, then the raw inputs are stored and each
 * register is split into its field structure.
 *
 * @param[out] decoder Pointer to the decoder structure to initialize (must not be NULL)
 * @param[in] hw_revision Hardware revision number
 * @param[in] flags Decoder flags
 * @param[in] status_reg Raw status register value
 * @param[in] ipid_reg Raw IPID register value
 * @param[in] synd_reg Raw syndrome register value
 */
static void aca_decoder_init(aca_decoder_t *decoder, uint16_t hw_revision, uint32_t flags,
                             uint64_t status_reg, uint64_t ipid_reg, uint64_t synd_reg)
{
    /* Start from a clean slate so no stale field survives. */
    memset(decoder, 0, sizeof(*decoder));

    /* Keep the raw inputs alongside the decoded views. */
    decoder->hw_revision = hw_revision;
    decoder->flags = flags;
    decoder->aca_status = status_reg;
    decoder->aca_ipid = ipid_reg;
    decoder->aca_synd = synd_reg;

    /* Break each register down into its bit fields. */
    aca_status_init(&decoder->status, status_reg);
    aca_ipid_init(&decoder->ipid, ipid_reg);
    aca_synd_init(&decoder->synd, synd_reg);
}
/**
 * @brief Decodes one set of raw ACA register values into error information
 * @param[in] raw_data Raw register data to decode (must not be NULL)
 * @return The populated error info structure, returned by value
 */
aca_error_info_t aca_decode(const aca_raw_data_t *raw_data)
{
    aca_error_info_t result = {0};
    aca_decoder_t ctx = {0};

    /* Stage 1: split the raw registers into decoded fields. */
    aca_decoder_init(&ctx, raw_data->hw_revision, raw_data->flags,
                     raw_data->aca_status, raw_data->aca_ipid, raw_data->aca_synd);

    /* Stage 2: interpret the fields. */
    aca_decoder_get_error_info(&ctx, &result);

    return result;
}
+76
Melihat File
@@ -0,0 +1,76 @@
/**
* @file aca_fields.c
* @brief Implementation of ACA register field handling
*
* This file contains functions for initializing and reading various ACA register fields
* including status, IPID, and syndrome registers. Each function
* extracts specific bit fields from raw register values and populates corresponding
* field structures.
*/
#include "aca_fields.h"
/**
 * @brief Extracts a bit field from a value
 *
 * Shifts @p value right by @p start, masks the low @p count bits and casts the
 * result to @p type. Each macro argument appears exactly once in the expansion.
 *
 * @param[in] value The source value to extract bits from
 * @param[in] start Starting bit position (least significant bit of the field)
 * @param[in] count Number of bits to extract; must be less than 64 because the
 *                  mask is built with (1ULL << count)
 * @param[in] type The type to cast the extracted bits to
 * @return The extracted bits as a value of the specified type
 */
#define EXTRACT_BITS(value, start, count, type) ((type)(((value) >> (start)) & ((1ULL << (count)) - 1)))
/**
 * @brief Returns the raw register value stored in a field structure
 * @param[in] fields Pointer to the common fields base (must not be NULL)
 * @return The stored raw register value
 */
uint64_t aca_fields_read(const aca_fields_t *fields)
{
    const uint64_t raw = fields->raw_value;

    return raw;
}
/**
 * @brief Splits a raw ACA status register into its individual fields
 *
 * Stores the raw value, then extracts each field from least to most
 * significant bit. The (start, count) pairs below define the register layout
 * used by this decoder.
 *
 * @param[out] fields Pointer to the status field structure to fill (must not be NULL)
 * @param[in] status_reg Raw status register value
 */
void aca_status_init(aca_status_fields_t *fields, uint64_t status_reg)
{
fields->base.raw_value = status_reg;
fields->error_code = EXTRACT_BITS(status_reg, 0, 16, uint16_t);
fields->error_code_ext = EXTRACT_BITS(status_reg, 16, 6, uint8_t);
fields->reserv22 = EXTRACT_BITS(status_reg, 22, 2, uint8_t);
fields->addr_lsb = EXTRACT_BITS(status_reg, 24, 6, uint8_t);
fields->reserv30 = EXTRACT_BITS(status_reg, 30, 2, uint8_t);
fields->err_core_id = EXTRACT_BITS(status_reg, 32, 6, uint8_t);
fields->reserv38 = EXTRACT_BITS(status_reg, 38, 2, uint8_t);
fields->scrub = EXTRACT_BITS(status_reg, 40, 1, uint8_t);
fields->reserv41 = EXTRACT_BITS(status_reg, 41, 2, uint8_t);
fields->poison = EXTRACT_BITS(status_reg, 43, 1, uint8_t);
fields->deferred = EXTRACT_BITS(status_reg, 44, 1, uint8_t);
fields->uecc = EXTRACT_BITS(status_reg, 45, 1, uint8_t);
fields->cecc = EXTRACT_BITS(status_reg, 46, 1, uint8_t);
/* NOTE(review): this covers bits 47..51 and the next field starts at bit 53,
 * so bit 52 is never extracted. Every other field here is contiguous -
 * confirm whether reserv47 should be 6 bits wide. */
fields->reserv47 = EXTRACT_BITS(status_reg, 47, 5, uint8_t);
fields->synd_v = EXTRACT_BITS(status_reg, 53, 1, uint8_t);
fields->reserv54 = EXTRACT_BITS(status_reg, 54, 1, uint8_t);
fields->tcc = EXTRACT_BITS(status_reg, 55, 1, uint8_t);
fields->err_core_id_val = EXTRACT_BITS(status_reg, 56, 1, uint8_t);
fields->pcc = EXTRACT_BITS(status_reg, 57, 1, uint8_t);
fields->addr_v = EXTRACT_BITS(status_reg, 58, 1, uint8_t);
fields->misc_v = EXTRACT_BITS(status_reg, 59, 1, uint8_t);
fields->en = EXTRACT_BITS(status_reg, 60, 1, uint8_t);
fields->uc = EXTRACT_BITS(status_reg, 61, 1, uint8_t);
fields->overflow = EXTRACT_BITS(status_reg, 62, 1, uint8_t);
fields->val = EXTRACT_BITS(status_reg, 63, 1, uint8_t);
}
/**
 * @brief Splits a raw ACA IPID register into its individual fields
 *
 * Layout used here: bits 0..31 instance ID (low), 32..43 hardware ID,
 * 44..47 instance ID (high), 48..63 ACA type.
 *
 * @param[out] fields Pointer to the IPID field structure to fill (must not be NULL)
 * @param[in] ipid_reg Raw IPID register value
 */
void aca_ipid_init(aca_ipid_fields_t *fields, uint64_t ipid_reg)
{
fields->base.raw_value = ipid_reg;
fields->instance_id_lo = EXTRACT_BITS(ipid_reg, 0, 32, uint32_t);
fields->hardware_id = EXTRACT_BITS(ipid_reg, 32, 12, uint16_t);
fields->instance_id_hi = EXTRACT_BITS(ipid_reg, 44, 4, uint8_t);
fields->aca_type = EXTRACT_BITS(ipid_reg, 48, 16, uint16_t);
}
/**
 * @brief Splits a raw ACA syndrome register into its individual fields
 *
 * Layout used here: bits 0..17 error information, 18..23 length,
 * 24..26 error priority, 27..31 reserved, 32..38 syndrome, 39..63 reserved.
 *
 * @param[out] fields Pointer to the syndrome field structure to fill (must not be NULL)
 * @param[in] synd_reg Raw syndrome register value
 */
void aca_synd_init(aca_synd_fields_t *fields, uint64_t synd_reg)
{
fields->base.raw_value = synd_reg;
fields->error_information = EXTRACT_BITS(synd_reg, 0, 18, uint32_t);
fields->length = EXTRACT_BITS(synd_reg, 18, 6, uint8_t);
fields->error_priority = EXTRACT_BITS(synd_reg, 24, 3, uint8_t);
fields->reserved27 = EXTRACT_BITS(synd_reg, 27, 5, uint8_t);
fields->syndrome = EXTRACT_BITS(synd_reg, 32, 7, uint16_t);
fields->reserved39 = EXTRACT_BITS(synd_reg, 39, 25, uint32_t);
}
+368
Melihat File
@@ -0,0 +1,368 @@
/**
* @file aca_tables.c
* @brief ACA Decode Tables Implementation
*
* This file contains lookup tables and helper functions for mapping ACA error codes
* to human-readable strings. It includes:
* - Bank mapping table for hardware IDs and ACA types
* - Error type mapping table for bank-specific error codes
* - GFX error mapping tables for XCD and AID errors
* - Lookup functions to find bank names and error types
*/
#include "aca_tables.h"
#include <stdint.h>
#include <stddef.h>
#include <string.h>
/**
 * @brief Mapping table for hardware IDs and ACA types to bank names
 *
 * Searched linearly by find_bank_name(), which matches on both the hardware ID
 * and the ACA type and returns the first hit. Several banks share hardware ID
 * 0x2E and differ only by ACA type.
 * NOTE(review): initializer order is assumed to be {hw_id, aca_type, name} -
 * confirm against the aca_bank_entry_t declaration in aca_tables.h.
 */
const aca_bank_entry_t bank_table[] = {
{0x2E, 0x02, "cs"},
{0x2E, 0x01, "pie"},
{0x96, 0x00, "umc"},
{0xFF, 0x01, "psp"},
{0x01, 0x01, "smu"},
{0x18, 0x00, "nbio"},
{0x46, 0x01, "pcie"},
{0x05, 0x00, "pb"},
{0x259, 0x00, "kpx_serdes"},
{0x2E, 0x04, "mall"},
{0x267, 0x00, "kpx_wafl"},
{0x50, 0x00, "pcs_xgmi"},
{0x6C, 0x00, "nbif"},
{0x80, 0x00, "shub"},
{0x170, 0x00, "usr_dp"},
{0x180, 0x00, "usr_cp"}};
/**
 * @brief Mapping table for bank-specific error codes to error types
 *
 * Searched linearly by find_error_type_by_bank(), which matches on the bank
 * name and the error code. Entries are grouped by bank; codes within a bank
 * are not always contiguous, and a missing code resolves to "UNKNOWN".
 * The type strings are runtime values that other code compares against
 * (e.g. "On-die ECC", "RdCrcErr", "WAFL"), so they must not be reworded.
 * NOTE(review): initializer order is assumed to be {bank, error_code, type} -
 * confirm against the aca_error_type_t declaration in aca_tables.h.
 */
const aca_error_type_t error_table[] = {
/* cs */
{"cs", 0x0, "FTI_ILL_REQ"},
{"cs", 0x1, "FTI_ADDR_VIOL"},
{"cs", 0x2, "FTI_SEC_VIOL"},
{"cs", 0x3, "FTI_ILL_RSP"},
{"cs", 0x4, "FTI_RSP_NO_MTCH"},
{"cs", 0x5, "FTI_PAR_ERR"},
{"cs", 0x6, "SDP_PAR_ERR"},
{"cs", 0x7, "ATM_PAR_ERR"},
{"cs", 0x8, "SDP_RSP_NO_MTCH"},
{"cs", 0x9, "SPF_PRT_ERR"},
{"cs", 0xa, "SPF_ECC_ERR"},
{"cs", 0xb, "SDP_UNEXP_RETRY"},
{"cs", 0xc, "CNTR_OVFL"},
{"cs", 0xd, "CNTR_UNFL"},
{"cs", 0xe, "FTI_ND_ILL_REQ"},
{"cs", 0xf, "FTI_ND_ADDR_VIOL"},
{"cs", 0x10, "FTI_ND_SEC_VIOL"},
{"cs", 0x11, "Hardware Assertion (HWA)"},
{"cs", 0x12, "ST_PRT_ERR"},
{"cs", 0x13, "ST_ECC_ERR"},
{"cs", 0x14, "ST_TXN_ERR"},
/* pie */
{"pie", 0x0, "Hardware Assertion (HWA)"},
{"pie", 0x1, "CSW"},
{"pie", 0x2, "GMI"},
{"pie", 0x3, "FTI_DAT_STAT"},
{"pie", 0x4, "DEF"},
{"pie", 0x5, "Watchdog Timeout (WDT)"},
{"pie", 0x6, "CNLI"},
{"pie", 0x7, "RSLVFCI"},
/* umc */
{"umc", 0x0, "On-die ECC"},
{"umc", 0x1, "WriteDataPoisonErr"},
{"umc", 0x2, "SdpParityErr"},
{"umc", 0x4, "AddressCommandParityErr"},
{"umc", 0x5, "WriteDataCrcErr"},
{"umc", 0x6, "SramEccErr"},
{"umc", 0x9, "EcsErr"},
{"umc", 0xa, "ThrttlErr"},
{"umc", 0xb, "RdCrcErr"},
{"umc", 0xd, "MpFwErr"},
{"umc", 0xe, "MpParErr"},
{"umc", 0xf, "End-to-end CRC"},
/* psp */
{"psp", 0x0, "Mp0HighSramError"},
{"psp", 0x1, "Mp0LowSramError"},
{"psp", 0x2, "Mp0IDataBank0Error"},
{"psp", 0x3, "Mp0IDataBank1Error"},
{"psp", 0x4, "Mp0ITagRam0Error"},
{"psp", 0x5, "Mp0ITagRam1Error"},
{"psp", 0x6, "Mp0DDataBank0Error"},
{"psp", 0x7, "Mp0DDataBank1Error"},
{"psp", 0x8, "Mp0DDataBank2Error"},
{"psp", 0x9, "Mp0DDataBank3Error"},
{"psp", 0xa, "Mp0DTagBank0Error"},
{"psp", 0xb, "Mp0DTagBank1Error"},
{"psp", 0xc, "Mp0DTagBank2Error"},
{"psp", 0xd, "Mp0DTagBank3Error"},
{"psp", 0xe, "Mp0DDirtyRamError"},
{"psp", 0xf, "Mp0TlbBank0Error"},
{"psp", 0x10, "Mp0TlbBank1Error"},
{"psp", 0x11, "Mp0SHubIfRdBufError"},
{"psp", 0x12, "PhyRamEccError"},
{"psp", 0x3a, "PoisonDataConsumption"},
{"psp", 0x3b, "SRAM_EDC"},
{"psp", 0x3c, "SMN_Parity"},
{"psp", 0x3d, "SMN_Timeout"},
{"psp", 0x3f, "WAFL"},
/* smu */
{"smu", 0x0, "Mp5HighSramError"},
{"smu", 0x1, "Mp5LowSramError"},
{"smu", 0x2, "Mp5DCacheAError"},
{"smu", 0x3, "Mp5DCacheBError"},
{"smu", 0x4, "Mp5DTagAError"},
{"smu", 0x5, "Mp5DTagBError"},
{"smu", 0x6, "Mp5ICacheAError"},
{"smu", 0x7, "Mp5ICacheBError"},
{"smu", 0x8, "Mp5ITagAError"},
{"smu", 0x9, "Mp5ITagBError"},
{"smu", 0xb, "PhyRamEccError"},
{"smu", 0x3a, "GFX_IP_Correctable_Error"},
{"smu", 0x3b, "GFX_IP_Fatal_Error"},
{"smu", 0x3d, "Reserved"},
{"smu", 0x3e, "GFX_IP_Poison_Error"},
/* nbio */
{"nbio", 0x0, "EccParityError"},
{"nbio", 0x1, "PCIE_Sideband"},
{"nbio", 0x2, "Ext_ErrEvent"},
{"nbio", 0x3, "Egress_Poison"},
{"nbio", 0x4, "IOHC_Internal_Poison"},
{"nbio", 0x5, "Int_ErrEvent"},
/* pcie */
{"pcie", 0x0, "SDP_PARITY_ERR_LOG"},
/* pb */
{"pb", 0x0, "EccError"},
/* kpx_serdes */
{"kpx_serdes", 0x0, "RAMECC"},
{"kpx_serdes", 0x1, "ARCIns"},
{"kpx_serdes", 0x2, "ARCData"},
{"kpx_serdes", 0x3, "APB"},
/* mall */
{"mall", 0x0, "CNTR_OVFL"},
{"mall", 0x1, "CNTR_UNFL"},
{"mall", 0x2, "CSDP_PAR_ERR"},
{"mall", 0x3, "USDP_PAR_ERR"},
{"mall", 0x4, "CACHE_TAG0_ERR"},
{"mall", 0x5, "CACHE_TAG1_ERR"},
{"mall", 0x6, "CACHE_DAT_ERR"},
/* kpx_wafl */
{"kpx_wafl", 0x0, "RAMECC"},
{"kpx_wafl", 0x1, "ARCIns"},
{"kpx_wafl", 0x2, "ARCData"},
{"kpx_wafl", 0x3, "APB"},
/* pcs_xgmi */
{"pcs_xgmi", 0x0, "DataLossErr"},
{"pcs_xgmi", 0x1, "TrainingErr"},
{"pcs_xgmi", 0x2, "FlowCtrlAckErr"},
{"pcs_xgmi", 0x3, "RxFifoUnderflowErr"},
{"pcs_xgmi", 0x4, "RxFifoOverflowErr"},
{"pcs_xgmi", 0x5, "CRCErr"},
{"pcs_xgmi", 0x6, "BERExceededErr"},
{"pcs_xgmi", 0x7, "TxMetaDataErr_TxVcidDataErr"},
{"pcs_xgmi", 0x8, "ReplayBufParityErr"},
{"pcs_xgmi", 0x9, "DataParityErr"},
{"pcs_xgmi", 0xa, "ReplayFifoOverflowErr"},
{"pcs_xgmi", 0xb, "ReplaFifoUnderflowErr"},
{"pcs_xgmi", 0xc, "ElasticFifoOverflowErr"},
{"pcs_xgmi", 0xd, "DeskewErr"},
{"pcs_xgmi", 0xe, "FlowCtrlCRCErr"},
{"pcs_xgmi", 0xf, "DataStartupLimitErr"},
{"pcs_xgmi", 0x10, "FCInitTimeoutErr"},
{"pcs_xgmi", 0x11, "RecoveryTimeoutErr"},
{"pcs_xgmi", 0x12, "ReadySerialTimeoutErr"},
{"pcs_xgmi", 0x13, "ReadySerialAttemptErr"},
{"pcs_xgmi", 0x14, "RecoveryAttemptErr"},
{"pcs_xgmi", 0x15, "RecoveryRelockAttemptErr"},
{"pcs_xgmi", 0x16, "ReplayAttemptErr"},
{"pcs_xgmi", 0x17, "SyncHdrErr"},
{"pcs_xgmi", 0x18, "TxReplayTimeoutErr"},
{"pcs_xgmi", 0x19, "RxReplayTimeoutErr"},
{"pcs_xgmi", 0x1a, "LinkSubTxTimeoutErr"},
{"pcs_xgmi", 0x1b, "LinkSubRxTimeoutErr"},
{"pcs_xgmi", 0x1c, "RxCMDPktErr"},
/* nbif */
{"nbif", 0x0, "TIMEOUT_ERR"},
{"nbif", 0x1, "SRAM_ECC_ERR"},
{"nbif", 0x2, "NTB_ERR_EVENT"},
{"nbif", 0x3, "SDP_PARITY_ERR"},
/* shub */
{"shub", 0x0, "TIMEOUT_ERR"},
{"shub", 0x1, "SRAM_ECC_ERR"},
{"shub", 0x2, "NTB_ERR_EVENT"},
{"shub", 0x3, "SDP_PARITY_ERR"},
/* usr_dp */
{"usr_dp", 0x0, "MstCMDErr"},
{"usr_dp", 0x1, "MstRxFIFOErr"},
{"usr_dp", 0x2, "MstDeskewErr"},
{"usr_dp", 0x3, "MstDetectTimeoutErr"},
{"usr_dp", 0x4, "MstFlowControlErr"},
{"usr_dp", 0x5, "MstDataValidFifoErr"},
{"usr_dp", 0x6, "macLinkStateErr"},
{"usr_dp", 0x7, "DeskewErr"},
{"usr_dp", 0x8, "InitTimeoutErr"},
{"usr_dp", 0x9, "InitAttemptErr"},
{"usr_dp", 0xa, "RecoveryTimeoutErr"},
{"usr_dp", 0xb, "RecoveryAttemptErr"},
{"usr_dp", 0xc, "EyeTrainingTimeoutErr"},
{"usr_dp", 0xd, "DataStartupLimitErr"},
{"usr_dp", 0xe, "LS0ExitErr"},
{"usr_dp", 0xf, "PLLpowerStateUpdateTimeoutErr"},
{"usr_dp", 0x10, "RxFifoErr"},
{"usr_dp", 0x11, "LcuErr"},
{"usr_dp", 0x12, "convCECCErr"},
{"usr_dp", 0x13, "convUECCErr"},
{"usr_dp", 0x15, "rxDataLossErr"},
{"usr_dp", 0x16, "ReplayCECCErr"},
{"usr_dp", 0x17, "ReplayUECCErr"},
{"usr_dp", 0x18, "CRCErr"},
{"usr_dp", 0x19, "BERExceededErr"},
{"usr_dp", 0x1a, "FCInitTimeoutErr"},
{"usr_dp", 0x1b, "FCInitAttemptErr"},
{"usr_dp", 0x1c, "ReplayTimoutErr"},
{"usr_dp", 0x1d, "ReplayAttemptErr"},
{"usr_dp", 0x1e, "ReplayUnderflowErr"},
{"usr_dp", 0x1f, "ReplayOverflowErr"},
/* usr_cp */
{"usr_cp", 0x0, "PacketTypeErr"},
{"usr_cp", 0x1, "RxFifoErr"},
{"usr_cp", 0x2, "DeskewErr"},
{"usr_cp", 0x3, "RxDetectTimeoutErr"},
{"usr_cp", 0x4, "DataParityErr"},
{"usr_cp", 0x5, "DataLossErr"},
{"usr_cp", 0x6, "LcuErr"},
{"usr_cp", 0x7, "HB1HandshakeTimeoutErr"},
{"usr_cp", 0x8, "HB2HandshakeTimeoutErr"},
{"usr_cp", 0x9, "ClkSleepRspTimeoutErr"},
{"usr_cp", 0xa, "ClkWakeRspTimeoutErr"},
{"usr_cp", 0xb, "resetAttackErr"},
{"usr_cp", 0xc, "remoteLinkFatalErr"},
};
/**
 * @brief Error GFX mapping table for XCD errors
 *
 * Maps an error code to an error type string; searched with
 * find_error_in_table(). Codes 0x0..0x10 all map to "GfxGcError"; codes
 * 0x11..0x27 and 0x29 have no entry here and therefore resolve to "UNKNOWN".
 */
const aca_error_entry_t xcd_error_table[] = {
{0x0, "GfxGcError"},
{0x1, "GfxGcError"},
{0x2, "GfxGcError"},
{0x3, "GfxGcError"},
{0x4, "GfxGcError"},
{0x5, "GfxGcError"},
{0x6, "GfxGcError"},
{0x7, "GfxGcError"},
{0x8, "GfxGcError"},
{0x9, "GfxGcError"},
{0xa, "GfxGcError"},
{0xb, "GfxGcError"},
{0xc, "GfxGcError"},
{0xd, "GfxGcError"},
{0xe, "GfxGcError"},
{0xf, "GfxGcError"},
{0x10, "GfxGcError"},
{0x28, "Reserved"},
{0x2a, "Reserved"}};
/**
 * @brief Error GFX mapping table for AID errors
 *
 * Maps an error code to an error type string; searched with
 * find_error_in_table(). Covers codes 0x0..0x2b contiguously, grouped by the
 * reporting block (GC, MMHUB, VCN, SDMA, HDP, ATHUB) with the last four codes
 * marked "Reserved".
 */
const aca_error_entry_t aid_error_table[] = {
{0x0, "GfxGcError"},
{0x1, "GfxGcError"},
{0x2, "GfxGcError"},
{0x3, "GfxGcError"},
{0x4, "GfxGcError"},
{0x5, "GfxMmhubError"},
{0x6, "GfxMmhubError"},
{0x7, "GfxMmhubError"},
{0x8, "GfxMmhubError"},
{0x9, "GfxMmhubError"},
{0xa, "GfxMmhubError"},
{0xb, "GfxMmhubError"},
{0xc, "GfxMmhubError"},
{0xd, "GfxGcError"},
{0xe, "GfxVcnError"},
{0xf, "GfxVcnError"},
{0x10, "GfxVcnError"},
{0x11, "GfxVcnError"},
{0x12, "GfxVcnError"},
{0x13, "GfxVcnError"},
{0x14, "GfxVcnError"},
{0x15, "GfxVcnError"},
{0x16, "GfxVcnError"},
{0x17, "GfxVcnError"},
{0x18, "GfxVcnError"},
{0x19, "GfxVcnError"},
{0x1a, "GfxVcnError"},
{0x1b, "GfxVcnError"},
{0x1c, "GfxVcnError"},
{0x1d, "GfxVcnError"},
{0x1e, "GfxVcnError"},
{0x1f, "GfxVcnError"},
{0x20, "GfxVcnError"},
{0x21, "GfxSdmaError"},
{0x22, "GfxSdmaError"},
{0x23, "GfxSdmaError"},
{0x24, "GfxSdmaError"},
{0x25, "GfxHdpError"},
{0x26, "GfxAthubError"},
{0x27, "GfxGcError"},
{0x28, "Reserved"},
{0x29, "Reserved"},
{0x2a, "Reserved"},
{0x2b, "Reserved"}};
/* Element counts of the lookup tables above, derived from the array sizes so
 * they stay correct when entries are added or removed. */
const size_t NUM_BANKS = sizeof(bank_table) / sizeof(bank_table[0]);
const size_t NUM_ERRORS = sizeof(error_table) / sizeof(error_table[0]);
const size_t NUM_XCD_ERRORS = sizeof(xcd_error_table) / sizeof(xcd_error_table[0]);
const size_t NUM_AID_ERRORS = sizeof(aid_error_table) / sizeof(aid_error_table[0]);
int find_bank_name(uint16_t hw_id, uint16_t aca_type, const char **bank_name)
{
if (!bank_name)
{
return -1;
}
for (size_t i = 0; i < NUM_BANKS; i++)
{
if (bank_table[i].hw_id == hw_id &&
bank_table[i].aca_type == aca_type)
{
*bank_name = bank_table[i].name;
return 0;
}
}
*bank_name = "UNKNOWN";
return 1;
}
int find_error_type_by_bank(const char *bank, uint32_t error_code, const char **error_type)
{
if (!bank || !error_type)
{
return -1;
}
for (size_t i = 0; i < NUM_ERRORS; i++)
{
if (error_code == error_table[i].error_code &&
strcmp(bank, error_table[i].bank) == 0)
{
*error_type = error_table[i].type;
return 0;
}
}
*error_type = "UNKNOWN";
return 1;
}
/**
 * @brief Looks up an error code in a caller-supplied error table
 * @param[in] table Table to search
 * @param[in] table_size Number of entries in the table
 * @param[in] error_code Error code to match
 * @param[out] error_type Receives the matching error type, or "UNKNOWN" when
 *                        no entry matches
 * @return 0 if an entry was found, 1 if not found, -1 if table or error_type is NULL
 */
int find_error_in_table(const aca_error_entry_t *table, size_t table_size,
                        uint32_t error_code, const char **error_type)
{
    const aca_error_entry_t *entry;
    const aca_error_entry_t *table_end;

    if (table == NULL || error_type == NULL)
    {
        return -1;
    }

    table_end = table + table_size;
    for (entry = table; entry != table_end; ++entry)
    {
        if (entry->error_code != error_code)
        {
            continue;
        }

        *error_type = entry->type;
        return 0;
    }

    *error_type = "UNKNOWN";
    return 1;
}
+59
Melihat File
@@ -0,0 +1,59 @@
#include "error_map.h"
#include <string.h>
/*
 * AFID lookup table.
 *
 * get_error_id() matches on error category, error type and error severity and
 * returns the id of the first matching row. The fourth column is not used by
 * the lookup in this file.
 * NOTE(review): initializer order is assumed to be {id, category, type,
 * <4th column>, severity} - confirm against error_map_entry_t in error_map.h.
 * NOTE(review): row 12 spells the fourth column "CPER " with a trailing
 * space, unlike every other row - confirm whether that is intentional.
 */
static const error_map_entry_t error_map[] = {
{1, "Boot-Time Errors", "FW Load", "CPER", "Fail-to-init"},
{2, "Boot-Time Errors", "HBM BIST Test", "CPER", "Fail-to-init"},
{3, "Boot-Time Errors", "HBM Memory Test", "CPER", "Fail-to-init"},
{4, "Boot-Time Errors", "HBM Training", "CPER", "Fail-to-init"},
{5, "Boot-Time Errors", "Unhandled", "CPER", "Fail-to-init"},
{6, "Boot-Time Errors", "Unknown", "CPER", "Fail-to-init"},
{7, "Boot-Time Errors", "USR CP Link Training", "CPER", "Fail-to-init"},
{8, "Boot-Time Errors", "USR DP Link Training", "CPER", "Fail-to-init"},
{9, "Boot-Time Errors", "WAFL Link Training", "CPER", "Fail-to-init"},
{10, "Boot-Time Errors", "XGMI Link Training", "CPER", "Fail-to-init"},
{11, "Boot-Time Errors", "Boot Controller Data Abort", "CPER", "Fail-to-init"},
{12, "Boot-Time Errors", "Boot Controller Generic", "CPER ", "Fail-to-init"},
{13, "Off-Package Link Errors", "PCIe AER", "CPER", "Corrected"},
{14, "Off-Package Link Errors", "PCIe AER", "CPER", "Fatal"},
{15, "Off-Package Link Errors", "WAFL", "CPER", "Corrected"},
{16, "Off-Package Link Errors", "WAFL", "CPER", "Fatal"},
{17, "Off-Package Link Errors", "XGMI", "CPER", "Corrected"},
{18, "Off-Package Link Errors", "XGMI", "CPER", "Fatal"},
{19, "HBM Errors", "Bad Page Retirement Threshold", "CPER", "Fatal"},
{20, "HBM Errors", "On-die ECC", "CPER", "Fatal"},
{21, "HBM Errors", "End-to-end CRC", "CPER", "Fatal"},
{22, "HBM Errors", "On-die ECC", "CPER", "Uncorrected, Non-fatal"},
{23, "HBM Errors", "End-to-end CRC", "CPER", "Uncorrected, Non-fatal"},
{24, "HBM Errors", "All", "CPER", "Corrected"},
{25, "HBM Errors", "All Others", "CPER", "Fatal"},
{26, "Device Internal Errors", "Hardware Assertion (HWA)", "CPER", "Fatal"},
{27, "Device Internal Errors", "Watchdog Timeout (WDT)", "CPER", "Fatal"},
{28, "Device Internal Errors", "All Others", "CPER", "Uncorrected, Non-fatal"},
{29, "Device Internal Errors", "All Others", "CPER", "Corrected"},
{30, "Device Internal Errors", "All Others", "CPER", "Fatal"}};
/* Number of rows in error_map, derived from the array size. */
static const size_t NUM_ERROR_ENTRIES = sizeof(error_map) / sizeof(error_map[0]);
int get_error_id(const char *error_category, const char *error_type, const char *error_severity)
{
if (!error_category || !error_type || !error_severity ||
strcmp(error_category, "UNKNOWN") == 0 ||
strcmp(error_type, "UNKNOWN") == 0 ||
strcmp(error_severity, "UNKNOWN") == 0)
{
return -1;
}
for (size_t i = 0; i < NUM_ERROR_ENTRIES; i++)
{
if (strcmp(error_map[i].error_category, error_category) == 0 &&
strcmp(error_map[i].error_type, error_type) == 0 &&
strcmp(error_map[i].error_severity, error_severity) == 0)
{
return (int)error_map[i].id;
}
}
return -1;
}
+60
Melihat File
@@ -46,6 +46,7 @@
#include "amd_smi/amdsmi.h"
#include "amd_smi/impl/fdinfo.h"
#include "amd_smi/impl/amd_smi_common.h"
#include "amd_smi/impl/amd_smi_cper.h"
#include "amd_smi/impl/amd_smi_system.h"
#include "amd_smi/impl/amd_smi_socket.h"
#include "amd_smi/impl/amd_smi_gpu_device.h"
@@ -3950,6 +3951,65 @@ amdsmi_get_gpu_cper_entries(
cursor);
}
/**
 * @brief Decode one CPER record held in memory and report the AFIDs found.
 *
 * @param[in]     cper_buffer Start of the CPER record (header first).
 * @param[in]     buf_size    Number of valid bytes at @p cper_buffer.
 * @param[out]    afids       Caller-provided array receiving the decoded AFIDs.
 * @param[in,out] num_afids   On entry, capacity of @p afids (must be > 0).
 *                            On success, the number of AFIDs produced by the
 *                            decoder; this may exceed the entry capacity, in
 *                            which case only the first `capacity` values were
 *                            stored.
 *
 * @return AMDSMI_STATUS_SUCCESS on success; AMDSMI_STATUS_INVAL when an
 *         argument is null/zero, the buffer is too small to hold a CPER
 *         header, the record length exceeds @p buf_size, or the signature is
 *         not "CPER".
 */
amdsmi_status_t amdsmi_get_afids_from_cper(
    char* cper_buffer, uint32_t buf_size, uint64_t* afids, uint32_t* num_afids) {
    AMDSMI_CHECK_INIT();

    std::ostringstream ss;
    ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] begin\n";
    LOG_DEBUG(ss);

    if (!cper_buffer) {
        ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] cper_buffer should be a valid memory address\n";
        LOG_ERROR(ss);
        return AMDSMI_STATUS_INVAL;
    }
    if (!buf_size) {
        ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] buf_size should be greater than 0\n";
        LOG_ERROR(ss);
        return AMDSMI_STATUS_INVAL;
    }
    if (!afids) {
        ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] afids should be a valid memory address\n";
        LOG_ERROR(ss);
        return AMDSMI_STATUS_INVAL;
    }
    if (!num_afids) {
        ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] num_afids should be a valid memory address\n";
        LOG_ERROR(ss);
        return AMDSMI_STATUS_INVAL;
    }
    if (!*num_afids) {
        ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] num_afids should be greater than 0\n";
        LOG_ERROR(ss);
        return AMDSMI_STATUS_INVAL;
    }

    // The header fields below may only be read once we know the caller's
    // buffer is large enough to contain a complete header.
    if (buf_size < sizeof(amdsmi_cper_hdr_t)) {
        ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] cper buffer size " << std::dec << buf_size
           << " is smaller than a cper header (" << sizeof(amdsmi_cper_hdr_t) << " bytes)\n";
        LOG_ERROR(ss);
        return AMDSMI_STATUS_INVAL;
    }

    const amdsmi_cper_hdr_t *cper = reinterpret_cast<const amdsmi_cper_hdr_t *>(cper_buffer);
    if (cper->record_length > buf_size) {
        ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] cper buffer size " << std::dec << buf_size << " is smaller than cper record length " << std::dec << cper->record_length << "\n";
        LOG_ERROR(ss);
        return AMDSMI_STATUS_INVAL;
    }
    if (strncmp(cper->signature, "CPER", 4) != 0) {
        ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] cper buffer does not have the correct signature\n";
        LOG_ERROR(ss);
        return AMDSMI_STATUS_INVAL;
    }

    // Store as many AFIDs as fit, but keep counting so the caller learns how
    // many the record actually produced.
    const uint32_t capacity = *num_afids;
    uint32_t found = 0;
    for (int afid : cper_decode(cper)) {
        if (found < capacity) {
            afids[found] = static_cast<uint64_t>(afid);
        }
        ++found;
    }
    *num_afids = found;
    return AMDSMI_STATUS_SUCCESS;
}
amdsmi_status_t
amdsmi_get_gpu_process_list(amdsmi_processor_handle processor_handle, uint32_t *max_processes, amdsmi_proc_info_t *list) {
AMDSMI_CHECK_INIT();
+567
Melihat File
@@ -0,0 +1,567 @@
/*
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <fcntl.h>
#include <unistd.h>
#include <sys/stat.h>
#include <memory>
#include <cstring>
#include <sstream>
#include <vector>
#include "aca-decode/aca_decode.h"
#include "amd_smi/impl/amd_smi_cper.h"
#include "rocm_smi/rocm_smi_logger.h"
namespace {
static std::vector<const amdsmi_cper_hdr_t *>
amdsmi_get_gpu_cper_headers(const char *buffer, size_t buffer_sz) {
std::ostringstream ss;
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__
<< "[CPER] buffer_sz: " << buffer_sz;
LOG_DEBUG(ss);
std::vector<const amdsmi_cper_hdr_t *> headers;
if(!buffer) {
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__
<< "[CPER] buffer is null";
LOG_ERROR(ss);
return headers;
}
static constexpr char cper_signature[] = "CPER";
static constexpr size_t cper_signature_size = sizeof(cper_signature) - 1;
for(size_t data_idx = 0;
buffer_sz >= cper_signature_size &&
data_idx < buffer_sz - cper_signature_size;
++data_idx) {
const amdsmi_cper_hdr_t *hdr = reinterpret_cast<const amdsmi_cper_hdr_t *>(
&buffer[data_idx]);
if(hdr->signature[0] != 'C' || hdr->signature[1] != 'P' ||
hdr->signature[2] != 'E' || hdr->signature[3] != 'R' ) {
continue;
}
if(hdr->signature_end != 0xFFFFFFFF) {
continue;
}
if(hdr->record_length > buffer_sz) {
continue;
}
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__
<< "[CPER] add header at data_idx: " << data_idx
<< ", sig: " << hdr->signature[0] << hdr->signature[1] << hdr->signature[2] << hdr->signature[3];
LOG_DEBUG(ss);
headers.emplace_back(hdr);
}
return headers;
}
/** Outcome of loading a CPER file into memory. */
struct CperFileCtx {
    // AMDSMI_STATUS_SUCCESS only when at least one byte was read.
    amdsmi_status_t status = AMDSMI_STATUS_FILE_ERROR;
    // Heap buffer holding the file contents (may be null on early failure).
    std::unique_ptr<char[]> buffer;
    // On success: number of bytes actually read into buffer.
    long file_size = 0;
};

/**
 * @brief Read a regular file into a freshly allocated buffer.
 *
 * The buffer is sized from stat() and filled with a single read() call; the
 * byte count returned by that call is what ends up in file_size.
 *
 * @param filepath Path of the file to load.
 * @return A context whose status is
 *         - AMDSMI_STATUS_SUCCESS       when one or more bytes were read,
 *         - AMDSMI_STATUS_NOT_SUPPORTED when stat() fails on the path,
 *         - AMDSMI_STATUS_FILE_ERROR    when the path is not a regular file,
 *           cannot be opened, or read() returns no data.
 */
static auto amdsmi_read_cper_file(const std::string &filepath) -> CperFileCtx {
    std::ostringstream ss;
    CperFileCtx ctx;  // starts out as FILE_ERROR with size 0

    struct stat file_stats;
    if (stat(filepath.c_str(), &file_stats) != 0) {
        ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] file does not exist: "
           << filepath << ", errno: " << errno << "): " << strerror(errno);
        LOG_ERROR(ss);
        ctx.status = AMDSMI_STATUS_NOT_SUPPORTED;
        return ctx;
    }
    if (!S_ISREG(file_stats.st_mode)) {
        ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] file is not a regular file: "
           << filepath << ", errno: " << errno << "): " << strerror(errno);
        LOG_ERROR(ss);
        return ctx;
    }

    ctx.file_size = file_stats.st_size;
    ctx.buffer = std::make_unique<char[]>(ctx.file_size);

    const int file = open(filepath.c_str(), O_RDONLY);
    if (file == -1) {
        ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] failed to open file: "
           << filepath << ", errno:()" << errno << "): " << strerror(errno);
        LOG_ERROR(ss);
        return ctx;
    }

    const long bytes_read = read(file, ctx.buffer.get(), ctx.file_size);
    // Close before looking at the result so the descriptor is released on the
    // failure path as well as on success.
    close(file);

    if (bytes_read <= 0) {
        ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__
           << "[CPER] failed to read complete file, read only "
           << bytes_read << " of " << ctx.file_size << " bytes";
        LOG_ERROR(ss);
        return ctx;
    }

    ctx.status = AMDSMI_STATUS_SUCCESS;
    ctx.file_size = bytes_read;
    return ctx;
}
/*
 * Expands to the 16-byte brace initializer of a GUID: `a` (32 bit), `b` and
 * `c` (16 bit each) are emitted least-significant byte first, followed by the
 * eight bytes d0..d7 verbatim.
 *
 * The expansion deliberately carries no trailing semicolon; the statement
 * using the macro supplies it.
 */
#define GUID_INIT(a, b, c, d0, d1, d2, d3, d4, d5, d6, d7)                   \
    { (a) & 0xff, ((a) >> 8) & 0xff, ((a) >> 16) & 0xff, ((a) >> 24) & 0xff, \
      (b) & 0xff, ((b) >> 8) & 0xff,                                         \
      (c) & 0xff, ((c) >> 8) & 0xff,                                         \
      (d0), (d1), (d2), (d3), (d4), (d5), (d6), (d7) }

/* Machine Check Exception */
#define CPER_NOTIFY_MCE                                               \
    GUID_INIT(0xE8F56FFE, 0x919C, 0x4cc5, 0xBA, 0x88, 0x65, 0xAB,     \
              0xE1, 0x49, 0x13, 0xBB)
#define CPER_NOTIFY_CMC                                               \
    GUID_INIT(0x2DCE8BB1, 0xBDD7, 0x450e, 0xB9, 0xAD, 0x9C, 0xF4,     \
              0xEB, 0xD4, 0xF8, 0x90)
#define BOOT_TYPE                                                     \
    GUID_INIT(0x3D61A466, 0xAB40, 0x409a, 0xA6, 0x98, 0xF3, 0x62,     \
              0xD4, 0x64, 0xB3, 0x8F)
#define AMD_OOB_CRASHDUMP                                             \
    GUID_INIT(0x32AC0C78, 0x2623, 0x48F6, 0xB0, 0xD0, 0x73, 0x65,     \
              0x72, 0x5F, 0xD6, 0xAE)
#define AMD_GPU_NONSTANDARD_ERROR                                     \
    GUID_INIT(0x32AC0C78, 0x2623, 0x48F6, 0x81, 0xA2, 0xAC, 0x69,     \
              0x17, 0x80, 0x55, 0x1D)
#define PROC_ERR_SECTION_TYPE                                         \
    GUID_INIT(0xDC3EA0B0, 0xA144, 0x4797, 0xB9, 0x5B, 0x53, 0xFA,     \
              0x24, 0x2B, 0x6E, 0x1D)

/*
 * Reference GUIDs used by the cper_is_*() comparisons. They are lookup
 * constants and are never modified, hence const.
 */
static const amdsmi_cper_guid_t mce = CPER_NOTIFY_MCE;
static const amdsmi_cper_guid_t cmc = CPER_NOTIFY_CMC;
static const amdsmi_cper_guid_t bt = BOOT_TYPE;
static const amdsmi_cper_guid_t cr = AMD_OOB_CRASHDUMP;
static const amdsmi_cper_guid_t nonstd = AMD_GPU_NONSTANDARD_ERROR;
static const amdsmi_cper_guid_t proc_err = PROC_ERR_SECTION_TYPE;
/** Returns 1 when @p guid is byte-identical to the AMD_OOB_CRASHDUMP GUID, otherwise 0. */
static int cper_is_cr(const amdsmi_cper_guid_t *guid)
{
    const bool matches = (memcmp(guid, &cr, sizeof(cr)) == 0);
    return matches ? 1 : 0;
}
/** Returns 1 when @p guid is byte-identical to the AMD_GPU_NONSTANDARD_ERROR GUID, otherwise 0. */
static int cper_is_nonstd(const amdsmi_cper_guid_t *guid)
{
    const bool matches = (memcmp(guid, &nonstd, sizeof(nonstd)) == 0);
    return matches ? 1 : 0;
}
/** Returns 1 when @p guid is byte-identical to the PROC_ERR_SECTION_TYPE GUID, otherwise 0. */
static int cper_is_proc_err(const amdsmi_cper_guid_t *guid)
{
    const bool matches = (memcmp(guid, &proc_err, sizeof(proc_err)) == 0);
    return matches ? 1 : 0;
}
/** Returns 1 when @p guid is byte-identical to the BOOT_TYPE GUID, otherwise 0. */
static int cper_is_bt(const amdsmi_cper_guid_t *guid)
{
    const bool matches = (memcmp(guid, &bt, sizeof(bt)) == 0);
    return matches ? 1 : 0;
}
/** Returns the section count (sec_cnt) stored in the CPER header @p hdr. */
static int cper_num_sec(const amdsmi_cper_hdr_t *hdr)
{
    const int section_count = hdr->sec_cnt;
    return section_count;
}
/** Returns the address of the sec_type GUID inside the section descriptor @p desc. */
static const amdsmi_cper_guid_t *get_sec_desc_type(const struct cper_sec_desc *desc)
{
    const amdsmi_cper_guid_t &section_type = desc->sec_type;
    return &section_type;
}
/** Returns the address of the notify_type GUID inside the CPER header @p hdr. */
static const amdsmi_cper_guid_t *get_cper_type(const amdsmi_cper_hdr_t *hdr)
{
    const amdsmi_cper_guid_t &notify_type = hdr->notify_type;
    return &notify_type;
}
/**
 * @brief Locate the idx-th section descriptor of a CPER record.
 *
 * Descriptors are laid out back to back immediately after the record header.
 *
 * @param hdr Start of the CPER record.
 * @param idx Zero-based descriptor index.
 * @return Address of the descriptor, or nullptr when @p idx is negative or
 *         not below hdr->sec_cnt. The address is computed only; nothing is
 *         read from it here.
 */
static void* cper_get_sec_desc_offset(const amdsmi_cper_hdr_t *hdr, int idx)
{
    if (idx < 0 || idx >= hdr->sec_cnt)
        return nullptr;

    const char *record = reinterpret_cast<const char *>(hdr);
    const char *descriptor = record + sizeof(amdsmi_cper_hdr_t)
                           + sizeof(struct cper_sec_desc) * static_cast<size_t>(idx);
    // The return type is non-const for historical reasons; callers treat the
    // result as read-only.
    return const_cast<char *>(descriptor);
}
static void* cper_get_sec_offset(const amdsmi_cper_hdr_t *hdr, int idx)
{
struct cper_sec_desc *tmp_desc;
char *offset;
if (idx >= hdr->sec_cnt)
return 0;
tmp_desc = reinterpret_cast<struct cper_sec_desc *>(
(char *)hdr + sizeof(amdsmi_cper_hdr_t) + sizeof(struct cper_sec_desc) * idx
);
return (char *)hdr + tmp_desc->sec_offset;
}
/**
 * @brief Write a human-readable dump of a section descriptor to the debug log.
 *
 * Logs the revision, length, offset, fru_id and fru_text fields followed by a
 * line naming the section type (crash dump, non-standard, proc error or
 * unknown).
 *
 * @param desc Section descriptor to dump.
 * @return Always 0.
 */
static int cper_dump_sec_desc(const struct cper_sec_desc *desc)
{
    // Work out the type line first; the GUID comparisons have no side effects.
    const char *section_kind = "UNKNOWN ERROR TYPE!!\n";
    if (cper_is_cr(&desc->sec_type)) {
        section_kind = "[SEC DESC] AMD CrashDump Section\n";
    } else if (cper_is_nonstd(&desc->sec_type)) {
        section_kind = "[SEC DESC] AMD NonStandard Section\n";
    } else if (cper_is_proc_err(&desc->sec_type)) {
        section_kind = "[SEC DESC] AMD Proc Error Section\n";
    }

    std::ostringstream ss;
    ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS]\n~~~~SECTION DESCRIPTION~~~\n";
    // Hex stays in effect for every field until it is switched back below.
    ss << std::hex;
    ss << "[SEC DESC] REV Major = 0x" << static_cast<int>(desc->revision_major) << "\n"
       << "[SEC DESC] REV Minor = 0x" << static_cast<int>(desc->revision_minor) << "\n"
       << "[SEC DESC] Length = 0x" << desc->sec_length << "\n"
       << "[SEC DESC] Offset = 0x" << desc->sec_offset << "\n"
       << "[SEC DESC] fru_id = " << desc->fru_id << "\n"
       << "[SEC DESC] fru_text = " << desc->fru_text << "\n";
    ss << std::dec << "\n";
    ss << section_kind;
    ss << "~~~~SECTION DESCRIPTION~~~\n\n";
    LOG_DEBUG(ss);
    return 0;
}
static int aca_decode_fatal(const cper_sec_crashdump_data &data)
{
std::ostringstream ss;
const uint64_t *register_array = reinterpret_cast<const uint64_t *>(&data.dump.fatal_err);
aca_raw_data_t raw_data;
raw_data.aca_status = register_array[0];
raw_data.aca_ipid = register_array[2];
raw_data.aca_synd = register_array[3];
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] aca_status: 0x" << std::hex << raw_data.aca_status << "\n";
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] aca_ipid: 0x" << std::hex << raw_data.aca_ipid << "\n";
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] aca_synd: 0x" << std::hex << raw_data.aca_synd << "\n";
raw_data.flags = 0;
raw_data.hw_revision = 1;
aca_error_info_t error_info = aca_decode(&raw_data);
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] fatal error_info.afid: " << std::dec << error_info.afid << "\n";
LOG_DEBUG(ss);
return error_info.afid;
}
static int aca_decode_corrected_error(const uint32_t *reg_dump, size_t num_bytes) {
std::ostringstream ss;
if(num_bytes != CPER_ACA_REG_COUNT * sizeof(uint32_t)) {
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] Size of register array must be " << std::dec << (CPER_ACA_REG_COUNT * sizeof(uint32_t)) << " bytes\n";
LOG_ERROR(ss);
return AMDSMI_STATUS_INVAL;
}
const uint64_t *register_array = reinterpret_cast<const uint64_t *>(reg_dump);
aca_raw_data_t raw_data;
raw_data.aca_status = register_array[2];
raw_data.aca_ipid = register_array[5];
raw_data.aca_synd = register_array[6];
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] aca_status: 0x" << std::hex << raw_data.aca_status << "\n";
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] aca_ipid: 0x" << std::hex << raw_data.aca_ipid << "\n";
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] aca_synd: 0x" << std::hex << raw_data.aca_synd << "\n";
raw_data.flags = 0;
raw_data.hw_revision = 1;
aca_error_info_t error_info = aca_decode(&raw_data);
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] non-fatal error_info.afid: " << std::dec << error_info.afid << "\n";
LOG_DEBUG(ss);
return error_info.afid;
}
static int cper_dump_nonstd_err(const struct cper_sec_nonstd_err *nonstd_err)
{
std::ostringstream ss;
struct cper_sec_nonstd_err_body *body;
char *offset;
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS]\n~~~~NON STANDARD SECTION~~~\n";
ss << "[NonSTD SEC] Err Info Count = 0x" << std::hex << nonstd_err->hdr.valid_bits.err_info_cnt << "\n";
ss << "[NonSTD SEC] Err Context Count = 0x" << std::hex << nonstd_err->hdr.valid_bits.err_context_cnt << "\n";
if (nonstd_err->hdr.valid_bits.err_context_cnt != nonstd_err->hdr.valid_bits.err_context_cnt) {
ss << "~~~~Malformed Non Standard Section!~~~~\n\n";
goto exit;
}
body = reinterpret_cast<struct cper_sec_nonstd_err_body *>(
(char *)nonstd_err + sizeof(struct cper_sec_nonstd_err_hdr)
);
ss << "[NonSTD SEC] Reg Ctx Type = 0x" << std::hex << body->err_ctx.reg_ctx_type << "\n";
ss << "[NonSTD SEC] Reg Array Size = 0x" << std::hex << body->err_ctx.reg_arr_size << "\n";
for (int i = 0; i < CPER_ACA_REG_COUNT; i++) {
ss << "[NonSTD SEC] reg_dump[" << std::dec << i << "] = 0x" << std::hex << body->err_ctx.reg_dump[i] << "\n";
}
exit:
ss << std::dec << "~~~~NON STANDARD SECTION~~~\n\n";
LOG_DEBUG(ss);
return aca_decode_corrected_error(body->err_ctx.reg_dump, sizeof(body->err_ctx.reg_dump));
}
/**
 * @brief Log the fatal-error registers of a crash-dump section and decode them.
 *
 * @param crashdump Crash-dump section to dump.
 * @return Result of aca_decode_fatal() on the section's data.
 */
static int cper_dump_cr_fatal(const struct cper_sec_crashdump *crashdump)
{
    const auto &regs = crashdump->data.dump.fatal_err;

    std::ostringstream ss;
    ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS]\n~~~~CRASH DUMP - FATAL~~~\n";
    // All register values are printed in hex.
    ss << std::hex;
    ss << "[Crash Dump - Fatal] status_lo = 0x" << regs.status_lo << "\n"
       << "[Crash Dump - Fatal] status_hi = 0x" << regs.status_hi << "\n"
       << "[Crash Dump - Fatal] addr_lo = 0x" << regs.addr_lo << "\n"
       << "[Crash Dump - Fatal] addr_hi = 0x" << regs.addr_hi << "\n"
       << "[Crash Dump - Fatal] ipid_lo = 0x" << regs.ipid_lo << "\n"
       << "[Crash Dump - Fatal] ipid_hi = 0x" << regs.ipid_hi << "\n"
       << "[Crash Dump - Fatal] synd_lo = 0x" << regs.synd_lo << "\n"
       << "[Crash Dump - Fatal] synd_hi = 0x" << regs.synd_hi << "\n";
    ss << std::dec << "~~~~CRASH DUMP - FATAL~~~\n\n";
    LOG_DEBUG(ss);

    return aca_decode_fatal(crashdump->data);
}
/**
 * @brief Log the boot-time messages of a crash-dump section and decode it.
 *
 * Dumps CPER_MAX_OAM_COUNT entries of boot_err.msg to the debug log.
 *
 * @param crashdump Crash-dump section to dump.
 * @return Result of aca_decode_fatal() on the section's data.
 */
static int cper_dump_cr_boot(const struct cper_sec_crashdump *crashdump)
{
    const auto &boot_msgs = crashdump->data.dump.boot_err.msg;

    std::ostringstream ss;
    ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS]\n~~~~CRASH DUMP - BOOT TIME~~~\n";
    int slot = 0;
    while (slot < CPER_MAX_OAM_COUNT) {
        // Index in decimal, message value in hex.
        ss << "[Crash Dump - Boot] bootmsg[" << std::dec << slot << "] = 0x" << std::hex << boot_msgs[slot] << "\n";
        ++slot;
    }
    ss << "~~~~CRASH DUMP - BOOT TIME~~~\n\n";
    LOG_DEBUG(ss);

    return aca_decode_fatal(crashdump->data);
}
} //namespace
/**
 * @brief Copy CPER records matching a severity mask from a file to the caller.
 *
 * The file is loaded, scanned for CPER headers, and every record whose
 * severity bit (1 << error_severity) is set in @p severity_mask is considered
 * in file order. Matching records with an index below *cursor are skipped;
 * the rest are copied back to back into @p cper_data and a pointer to each
 * copy is stored in @p cper_hdrs.
 *
 * @param[in]     amdgpu_ring_cper_file Path of the CPER file.
 * @param[in]     severity_mask Bit mask of accepted severities.
 * @param[out]    cper_data     Destination buffer for record bytes.
 * @param[in,out] buf_size      In: capacity of @p cper_data in bytes (> 0).
 *                              Out: number of bytes written.
 * @param[out]    cper_hdrs     Receives pointers into @p cper_data.
 * @param[in,out] entry_count   In: capacity of @p cper_hdrs (> 0).
 *                              Out: number of records copied.
 * @param[in,out] cursor        In: index of the first matching record to
 *                              copy. Out: index following the last copied one.
 *
 * @return AMDSMI_STATUS_SUCCESS when every remaining record was copied;
 *         AMDSMI_STATUS_MORE_DATA when an output ran full after at least one
 *         byte was written; AMDSMI_STATUS_OUT_OF_RESOURCES when an output
 *         argument is null/zero or nothing fitted; AMDSMI_STATUS_INVAL when
 *         the path is null; otherwise the status of reading the file.
 */
amdsmi_status_t amdsmi_get_gpu_cper_entries_by_path(
    const char *amdgpu_ring_cper_file,
    uint32_t severity_mask,
    char *cper_data,
    uint64_t *buf_size,
    amdsmi_cper_hdr_t **cper_hdrs,
    uint64_t *entry_count,
    uint64_t *cursor) {
    std::ostringstream ss;

    // The path is streamed and turned into a std::string below; both are
    // undefined for a null pointer, so reject it before anything else.
    if (!amdgpu_ring_cper_file) {
        ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] amdgpu_ring_cper_file should be a valid path\n";
        LOG_ERROR(ss);
        if (entry_count) { *entry_count = 0; }
        if (buf_size) { *buf_size = 0; }
        return AMDSMI_STATUS_INVAL;
    }

    ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] begin\n"
       << ", amdgpu_ring_cper_file: " << amdgpu_ring_cper_file
       << ", severity_mask: " << severity_mask;
    LOG_DEBUG(ss);

    if (!cper_data) {
        ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] cper_data should be a valid memory address\n";
        LOG_ERROR(ss);
        if (entry_count) { *entry_count = 0; }
        if (buf_size) { *buf_size = 0; }
        return AMDSMI_STATUS_OUT_OF_RESOURCES;
    }
    if (!buf_size) {
        ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] buf_size should be a valid memory address";
        LOG_ERROR(ss);
        if (entry_count) { *entry_count = 0; }
        return AMDSMI_STATUS_OUT_OF_RESOURCES;
    }
    if (!entry_count) {
        ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] entry_count should be a valid memory address";
        LOG_ERROR(ss);
        *buf_size = 0;
        return AMDSMI_STATUS_OUT_OF_RESOURCES;
    }
    if (!*buf_size) {
        ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] buf_size should be greater than zero";
        LOG_ERROR(ss);
        *entry_count = 0;
        return AMDSMI_STATUS_OUT_OF_RESOURCES;
    }
    if (!*entry_count) {
        ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] entry_count should be greater than 0";
        LOG_ERROR(ss);
        *buf_size = 0;
        return AMDSMI_STATUS_OUT_OF_RESOURCES;
    }
    if (!cper_hdrs) {
        ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] cper_hdrs should be a valid memory address";
        LOG_ERROR(ss);
        *entry_count = 0;
        *buf_size = 0;
        return AMDSMI_STATUS_OUT_OF_RESOURCES;
    }
    if (!cursor) {
        ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] cursor should be a valid memory address";
        LOG_ERROR(ss);
        *entry_count = 0;
        *buf_size = 0;
        return AMDSMI_STATUS_OUT_OF_RESOURCES;
    }

    auto ctx = amdsmi_read_cper_file(amdgpu_ring_cper_file);
    if (ctx.status != AMDSMI_STATUS_SUCCESS) {
        *entry_count = 0;
        *buf_size = 0;
        return ctx.status;
    }

    auto headers = amdsmi_get_gpu_cper_headers(ctx.buffer.get(), ctx.file_size);
    ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] num headers: " << headers.size();
    LOG_DEBUG(ss);

    uint64_t data_idx = 0;           // bytes written to cper_data so far
    uint64_t header_idx = 0;         // index among severity-matching records
    size_t num_headers_copied = 0;   // entries written to cper_hdrs so far

    for (const amdsmi_cper_hdr_t *header : headers) {
        // error_severity comes from file data; shifting by 32 or more is
        // undefined, so such a value simply has no bit in the mask.
        const uint32_t severity = static_cast<uint32_t>(header->error_severity);
        const uint32_t severity_bit = (severity < 32u) ? (1u << severity) : 0u;

        if (severity_bit == 0u || (severity_bit & severity_mask) != severity_bit) {
            ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] cper header rejected with severity: 0x"
               << std::hex << severity_bit << ", given severity_mask: 0x"
               << std::hex << severity_mask << ", record_length:"
               << std::dec << header->record_length;
            LOG_DEBUG(ss);
            continue;
        }
        ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] cper header accepted with severity: 0x"
           << std::hex << severity_bit << ", given severity_mask: 0x"
           << std::hex << severity_mask << ", record_length:"
           << std::dec << header->record_length;
        LOG_DEBUG(ss);

        if ((*buf_size - data_idx) < header->record_length) {
            ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] buffer filled up without copying all cper entries, buf_size: " << std::dec << *buf_size;
            LOG_ERROR(ss);
            *entry_count = num_headers_copied;
            *buf_size = data_idx;
            return (data_idx == 0) ?
                AMDSMI_STATUS_OUT_OF_RESOURCES :
                AMDSMI_STATUS_MORE_DATA;
        }
        if (num_headers_copied == *entry_count) {
            ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] cper_hdrs filled up before finished with copying all header pointers, entry_count: " << std::dec << *entry_count;
            LOG_ERROR(ss);
            *entry_count = num_headers_copied;
            *buf_size = data_idx;
            return (data_idx == 0) ?
                AMDSMI_STATUS_OUT_OF_RESOURCES :
                AMDSMI_STATUS_MORE_DATA;
        }

        // Skip matching records the caller has already consumed.
        if (*cursor != header_idx) {
            ++header_idx;
            continue;
        }

        cper_hdrs[num_headers_copied] = reinterpret_cast<amdsmi_cper_hdr_t*>(&cper_data[data_idx]);
        ++num_headers_copied;
        *cursor = ++header_idx;
        std::memcpy(
            &cper_data[data_idx],
            reinterpret_cast<const char*>(header),
            header->record_length);
        data_idx += header->record_length;
    }

    *entry_count = num_headers_copied;
    *buf_size = data_idx;
    // Log the values, not the addresses of the out-parameters.
    ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__
       << "[CPER] *entry_count: " << *entry_count
       << ", *cursor: " << *cursor
       << ", *buf_size: " << *buf_size;
    LOG_DEBUG(ss);
    return AMDSMI_STATUS_SUCCESS;
}
std::vector<int> cper_decode(const amdsmi_cper_hdr_t *cper) {
std::vector<int> afids;
std::ostringstream ss;
for (int i = 0; i < cper_num_sec(cper); i ++) {
void *sec_desc_offset = cper_get_sec_desc_offset(cper, i);
void *sec_offset = cper_get_sec_offset(cper, i);
const amdsmi_cper_guid_t *sec_guid = get_sec_desc_type(static_cast<struct cper_sec_desc *>(sec_desc_offset));
const amdsmi_cper_guid_t *cper_guid = get_cper_type(cper);
cper_dump_sec_desc(static_cast<struct cper_sec_desc *>(sec_desc_offset));
if (cper_is_cr(sec_guid)) {
if (cper_is_bt(cper_guid)) {
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] decoding boot crash dump\n";
LOG_DEBUG(ss);
afids.emplace_back(cper_dump_cr_boot(static_cast<struct cper_sec_crashdump *>(sec_offset)));
}
else {
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] decoding crash dump\n";
LOG_DEBUG(ss);
afids.emplace_back(cper_dump_cr_fatal(static_cast<struct cper_sec_crashdump *>(sec_offset)));
}
}
else if (cper_is_nonstd(sec_guid)) {
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] decoding non-standard error\n";
LOG_DEBUG(ss);
afids.emplace_back(cper_dump_nonstd_err(static_cast<struct cper_sec_nonstd_err *>(sec_offset)));
}
else if (cper_is_proc_err(sec_guid)) {
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] decoding proc error section type\n";
LOG_DEBUG(ss);
afids.emplace_back(cper_dump_nonstd_err(static_cast<struct cper_sec_nonstd_err *>(sec_offset)));
}
else {
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] Unknown error type!!\n";
for(int i = 0; i < sizeof(sec_guid->b); ++i) {
ss << std::hex << static_cast<int>(sec_guid->b[i]) << ":";
}
ss << "\n";
LOG_ERROR(ss);
}
}
return afids;
}
-136
Melihat File
@@ -1031,142 +1031,6 @@ static auto amdsmi_read_cper_file(const std::string &filepath) -> CperFileCtx {
ctx.file_size = bytes_read;
return ctx;
}
/**
 * @brief Copy CPER records matching a severity mask from a file to the caller.
 *
 * The file is loaded, scanned for CPER headers, and every record whose
 * severity bit (1 << error_severity) is set in @p severity_mask is considered
 * in file order. Matching records with an index below *cursor are skipped;
 * the rest are copied back to back into @p cper_data and a pointer to each
 * copy is stored in @p cper_hdrs.
 *
 * @param[in]     amdgpu_ring_cper_file Path of the CPER file.
 * @param[in]     severity_mask Bit mask of accepted severities.
 * @param[out]    cper_data     Destination buffer for record bytes.
 * @param[in,out] buf_size      In: capacity of @p cper_data in bytes (> 0).
 *                              Out: number of bytes written.
 * @param[out]    cper_hdrs     Receives pointers into @p cper_data.
 * @param[in,out] entry_count   In: capacity of @p cper_hdrs (> 0).
 *                              Out: number of records copied.
 * @param[in,out] cursor        In: index of the first matching record to
 *                              copy. Out: index following the last copied one.
 *
 * @return AMDSMI_STATUS_SUCCESS when every remaining record was copied;
 *         AMDSMI_STATUS_MORE_DATA when an output ran full after at least one
 *         byte was written; AMDSMI_STATUS_OUT_OF_RESOURCES when an output
 *         argument is null/zero or nothing fitted; AMDSMI_STATUS_INVAL when
 *         the path is null; otherwise the status of reading the file.
 */
amdsmi_status_t amdsmi_get_gpu_cper_entries_by_path(
    const char *amdgpu_ring_cper_file,
    uint32_t severity_mask,
    char *cper_data,
    uint64_t *buf_size,
    amdsmi_cper_hdr_t **cper_hdrs,
    uint64_t *entry_count,
    uint64_t *cursor) {
    std::ostringstream ss;

    // The path is streamed and turned into a std::string below; both are
    // undefined for a null pointer, so reject it before anything else.
    if (!amdgpu_ring_cper_file) {
        ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] amdgpu_ring_cper_file should be a valid path\n";
        LOG_ERROR(ss);
        if (entry_count) { *entry_count = 0; }
        if (buf_size) { *buf_size = 0; }
        return AMDSMI_STATUS_INVAL;
    }

    ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] begin\n"
       << ", amdgpu_ring_cper_file: " << amdgpu_ring_cper_file
       << ", severity_mask: " << severity_mask;
    LOG_DEBUG(ss);

    if (!cper_data) {
        ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] cper_data should be a valid memory address\n";
        LOG_ERROR(ss);
        if (entry_count) { *entry_count = 0; }
        if (buf_size) { *buf_size = 0; }
        return AMDSMI_STATUS_OUT_OF_RESOURCES;
    }
    if (!buf_size) {
        ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] buf_size should be a valid memory address";
        LOG_ERROR(ss);
        if (entry_count) { *entry_count = 0; }
        return AMDSMI_STATUS_OUT_OF_RESOURCES;
    }
    if (!entry_count) {
        ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] entry_count should be a valid memory address";
        LOG_ERROR(ss);
        *buf_size = 0;
        return AMDSMI_STATUS_OUT_OF_RESOURCES;
    }
    if (!*buf_size) {
        ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] buf_size should be greater than zero";
        LOG_ERROR(ss);
        *entry_count = 0;
        return AMDSMI_STATUS_OUT_OF_RESOURCES;
    }
    if (!*entry_count) {
        ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] entry_count should be greater than 0";
        LOG_ERROR(ss);
        *buf_size = 0;
        return AMDSMI_STATUS_OUT_OF_RESOURCES;
    }
    if (!cper_hdrs) {
        ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] cper_hdrs should be a valid memory address";
        LOG_ERROR(ss);
        *entry_count = 0;
        *buf_size = 0;
        return AMDSMI_STATUS_OUT_OF_RESOURCES;
    }
    if (!cursor) {
        ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] cursor should be a valid memory address";
        LOG_ERROR(ss);
        *entry_count = 0;
        *buf_size = 0;
        return AMDSMI_STATUS_OUT_OF_RESOURCES;
    }

    auto ctx = amdsmi_read_cper_file(amdgpu_ring_cper_file);
    if (ctx.status != AMDSMI_STATUS_SUCCESS) {
        *entry_count = 0;
        *buf_size = 0;
        return ctx.status;
    }

    auto headers = amdsmi_get_gpu_cper_headers(ctx.buffer.get(), ctx.file_size);
    ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] num headers: " << headers.size();
    LOG_DEBUG(ss);

    uint64_t data_idx = 0;           // bytes written to cper_data so far
    uint64_t header_idx = 0;         // index among severity-matching records
    size_t num_headers_copied = 0;   // entries written to cper_hdrs so far

    for (const amdsmi_cper_hdr_t *header : headers) {
        // error_severity comes from file data; shifting by 32 or more is
        // undefined, so such a value simply has no bit in the mask.
        const uint32_t severity = static_cast<uint32_t>(header->error_severity);
        const uint32_t severity_bit = (severity < 32u) ? (1u << severity) : 0u;

        if (severity_bit == 0u || (severity_bit & severity_mask) != severity_bit) {
            ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] cper header rejected with severity: 0x"
               << std::hex << severity_bit << ", given severity_mask: 0x"
               << std::hex << severity_mask << ", record_length:"
               << std::dec << header->record_length;
            LOG_DEBUG(ss);
            continue;
        }
        ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] cper header accepted with severity: 0x"
           << std::hex << severity_bit << ", given severity_mask: 0x"
           << std::hex << severity_mask << ", record_length:"
           << std::dec << header->record_length;
        LOG_DEBUG(ss);

        if ((*buf_size - data_idx) < header->record_length) {
            ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] buffer filled up without copying all cper entries, buf_size: " << std::dec << *buf_size;
            LOG_ERROR(ss);
            *entry_count = num_headers_copied;
            *buf_size = data_idx;
            return (data_idx == 0) ?
                AMDSMI_STATUS_OUT_OF_RESOURCES :
                AMDSMI_STATUS_MORE_DATA;
        }
        if (num_headers_copied == *entry_count) {
            ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] cper_hdrs filled up before finished with copying all header pointers, entry_count: " << std::dec << *entry_count;
            LOG_ERROR(ss);
            *entry_count = num_headers_copied;
            *buf_size = data_idx;
            return (data_idx == 0) ?
                AMDSMI_STATUS_OUT_OF_RESOURCES :
                AMDSMI_STATUS_MORE_DATA;
        }

        // Skip matching records the caller has already consumed.
        if (*cursor != header_idx) {
            ++header_idx;
            continue;
        }

        cper_hdrs[num_headers_copied] = reinterpret_cast<amdsmi_cper_hdr_t*>(&cper_data[data_idx]);
        ++num_headers_copied;
        *cursor = ++header_idx;
        std::memcpy(
            &cper_data[data_idx],
            reinterpret_cast<const char*>(header),
            header->record_length);
        data_idx += header->record_length;
    }

    *entry_count = num_headers_copied;
    *buf_size = data_idx;
    // Log the values, not the addresses of the out-parameters.
    ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__
       << "[CPER] *entry_count: " << *entry_count
       << ", *cursor: " << *cursor
       << ", *buf_size: " << *buf_size;
    LOG_DEBUG(ss);
    return AMDSMI_STATUS_SUCCESS;
}
void amdsmi_wait_for_user_input(void) {
for (;;) {
std::cout << "\n\t**Press any key to continue**" << std::endl;