[SWDEV-522623] Add afid functionality to API and CLI (#330)
Change-Id: I015bde926491d54e09da8f39b05650515711e09f
[SWDEV-522623] Add afid functionality to API and CLI
Change-Id: I015bde926491d54e09da8f39b05650515711e09f
Signed-off-by: Oosman Saeed <oossaeed@amd.com>
Co-authored-by: Oosman Saeed <oossaeed@amd.com>
[ROCm/amdsmi commit: 1bb1f8acc2]
Этот коммит содержится в:
коммит произвёл
GitHub
родитель
f637f2818e
Коммит
ffc8b09056
@@ -33,6 +33,7 @@ from amdsmi_cli_exceptions import AmdSmiInvalidParameterException, AmdSmiRequire
|
||||
from amdsmi_helpers import AMDSMIHelpers
|
||||
from amdsmi_logger import AMDSMILogger
|
||||
from amdsmi import amdsmi_exception, amdsmi_interface
|
||||
from pathlib import Path
|
||||
|
||||
class AMDSMICommands():
|
||||
"""This class contains all the commands corresponding to AMDSMIParser
|
||||
@@ -6325,9 +6326,35 @@ class AMDSMICommands():
|
||||
with self.logger.destination.open('a', encoding="utf-8") as output_file:
|
||||
output_file.write(legend_output + '\n')
|
||||
|
||||
def __pvtDumpAfids(self, cper_file):
    """Print the AFIDs (AMD Field IDs) decoded from one CPER record.

    Args:
        cper_file: The CPER record, supplied as a readable file-like
            object, a ``pathlib.Path``, a filename string, or the raw
            record contents (any other type is passed through unchanged).

    Output:
        Writes a single line ``AFIDS: <id> <id> ... `` to stdout; every
        AFID is followed by one space.
    """
    # Normalize whatever the caller supplied into the raw record contents.
    if hasattr(cper_file, "read"):
        # File-like object: consume it.
        raw = cper_file.read()
    elif isinstance(cper_file, Path):
        raw = cper_file.read_bytes()
    elif isinstance(cper_file, str):
        # Filename: open in binary mode so the record is read undecoded.
        with open(cper_file, "rb") as f:
            raw = f.read()
    else:
        # Assume the caller already handed over the raw record.
        raw = cper_file

    # The second return value is the AFID count; it is not needed because
    # the returned list is iterated directly.
    afids, _ = amdsmi_interface.amdsmi_get_afids_from_cper(raw)

    # Emitted as one write; the text is identical to printing each AFID
    # with end=" " followed by a newline.
    print("AFIDS: " + "".join(str(afid) + " " for afid in afids))
|
||||
|
||||
def ras(self, args, multiple_devices=False, gpu=None, cper=None,
|
||||
severity=None, folder=None, file_limit=None, follow=None):
|
||||
|
||||
|
||||
|
||||
def ras(self, args, multiple_devices=False, gpu=None, cper=None, afid=None,
|
||||
severity=None, folder=None, file_limit=None, cper_file=None, follow=None):
|
||||
"""
|
||||
Retrieve and process CPER (RAS) entries for a target GPU.
|
||||
|
||||
@@ -6338,23 +6365,32 @@ class AMDSMICommands():
|
||||
The output file name is auto-generated using the timestamp from the CPER header data (converted from
|
||||
the header’s "YYYY/MM/DD HH:MM:SS" format), along with the GPU/platform ID and error severity.
|
||||
"""
|
||||
|
||||
# GPU handle logic.
|
||||
if gpu:
|
||||
args.gpu = gpu
|
||||
if cper:
|
||||
args.cper = cper
|
||||
if afid:
|
||||
args.afid = afid
|
||||
if severity:
|
||||
args.severity = severity
|
||||
if folder:
|
||||
args.folder = folder
|
||||
if file_limit:
|
||||
args.file_limit = file_limit
|
||||
if cper_file:
|
||||
args.cper_file = cper_file
|
||||
if follow:
|
||||
args.follow = follow
|
||||
|
||||
if args.gpu == None:
|
||||
args.gpu = self.device_handles
|
||||
|
||||
#Fetching AFID
|
||||
if args.afid and args.cper_file:
|
||||
self.__pvtDumpAfids(args.cper_file)
|
||||
return
|
||||
|
||||
if not self.group_check_printed:
|
||||
self.helpers.check_required_groups()
|
||||
self.group_check_printed = True
|
||||
@@ -6362,7 +6398,6 @@ class AMDSMICommands():
|
||||
handled_multiple_gpus, device_handle = self.helpers.handle_gpus(args, self.logger, self.ras)
|
||||
if handled_multiple_gpus:
|
||||
return
|
||||
|
||||
args.gpu = device_handle
|
||||
|
||||
# Parse severity mask dynamically from the --severity option.
|
||||
@@ -6381,17 +6416,15 @@ class AMDSMICommands():
|
||||
severity_mask |= (1 << 0)
|
||||
elif sev in ("nonfatal-corrected", "corrected"):
|
||||
# Set bit corresponding to AMDSMI_CPER_SEV_NON_FATAL_CORRECTED (which is 2)
|
||||
severity_mask |= (1 << 2)
|
||||
|
||||
severity_mask |= (1 << 2)
|
||||
|
||||
cursor = 0
|
||||
buffer_size = 1048576
|
||||
if args.cper:
|
||||
# Start from cursor 0 (no timestamp argument provided).
|
||||
cursor = 0
|
||||
buffer_size = 1048576
|
||||
file_limit = int(args.file_limit) if args.file_limit else 1000
|
||||
|
||||
# Main loop: continuously retrieve CPER entries if --follow is set.
|
||||
gpu_id = self.helpers.get_gpu_id_from_device_handle(args.gpu)
|
||||
|
||||
# Print header only when dumping to a folder
|
||||
if args.follow and not getattr(self, "_cper_follow_prompted", False):
|
||||
print("Press CTRL + C to stop.")
|
||||
@@ -6409,12 +6442,11 @@ class AMDSMICommands():
|
||||
if partition_id != 0:
|
||||
logging.debug(f"Skipping gpu {gpu_id} on non zero partition {partition_id}")
|
||||
return
|
||||
|
||||
if args.folder and args.gpu:
|
||||
print(f"Dumping CPER file header entries for GPU {gpu_id} in folder {args.folder}")
|
||||
elif args.folder:
|
||||
|
||||
if args.folder and not getattr(self, "_cper_folder_prompted", False):
|
||||
print(f"Dumping CPER file header entries in folder {args.folder}")
|
||||
|
||||
self._cper_folder_prompted = True
|
||||
|
||||
self.logger.set_cper_exit_message(False)
|
||||
self.stop = False
|
||||
|
||||
|
||||
@@ -1078,7 +1078,7 @@ class AMDSMIHelpers():
|
||||
msg = (
|
||||
"WARNING: User is missing the following required groups: %s. "
|
||||
"Please add user to these groups."
|
||||
) % ", ".join(sorted(missing_groups))
|
||||
) % ", ".join(sodurted(missing_groups))
|
||||
print(msg)
|
||||
logging.warning(msg)
|
||||
|
||||
@@ -1116,7 +1116,7 @@ class AMDSMIHelpers():
|
||||
self._cper_warning_printed = True
|
||||
|
||||
# Header
|
||||
print(f"{'timestamp':<20} {'gpu_id':<6} {'severity':<10} {'file_name'}")
|
||||
print(f"{'timestamp':<20} {'gpu_id':<7} {'severity':<12} {'file_name':<17}")
|
||||
self._cper_display_initialized = True
|
||||
|
||||
for entry_index, entry in enumerate(entries.values()):
|
||||
@@ -1138,7 +1138,7 @@ class AMDSMIHelpers():
|
||||
|
||||
timestamp = entry.get("timestamp", "unknown")
|
||||
gpu_id = self.get_gpu_id_from_device_handle(device_handle)
|
||||
print(f"{timestamp:<20} {gpu_id:<6} {prefix:<10} {cper_data_file}")
|
||||
print(f"{timestamp:<20} {gpu_id:<7} {prefix:<12} {cper_data_file:<17}")
|
||||
self.increment_cper_count()
|
||||
time.sleep(1)
|
||||
|
||||
@@ -1156,7 +1156,7 @@ class AMDSMIHelpers():
|
||||
self._cper_warning_printed = True
|
||||
|
||||
# Header
|
||||
print(f"{'timestamp':<20} {'gpu_id':<6} {'severity':<10} {'file_name'}")
|
||||
print(f"{'timestamp':<20} {'gpu_id':<7} {'severity':<12} {'file_name':<17}")
|
||||
self._cper_display_initialized = True
|
||||
|
||||
# Loop through all entries in the dictionary.
|
||||
@@ -1180,14 +1180,16 @@ class AMDSMIHelpers():
|
||||
|
||||
timestamp = entry.get("timestamp", "unknown")
|
||||
gpu_id = self.get_gpu_id_from_device_handle(device_handle)
|
||||
print(f"{timestamp:<20} {gpu_id:<6} {prefix:<10} {cper_data_file}")
|
||||
print(f"{timestamp:<20} {gpu_id:<7} {prefix:<12} {cper_data_file:<17}")
|
||||
self.increment_cper_count()
|
||||
|
||||
|
||||
def dump_gpu_entries(self, folder, entries, cper_data, device_handle):
|
||||
# Header
|
||||
print(f"{'timestamp':<20} {'gpu_id':<6} {'severity':<10} {'file_name'}")
|
||||
self._cper_display_initialized = True
|
||||
# One‐time initialization: print warning & header only once
|
||||
if not getattr(self, "_cper_display_initialized", False):
|
||||
# Warning if no folder was specified elsewhere
|
||||
print(f"{'timestamp':<20} {'gpu_id':<7} {'severity':<12} {'file_name':<17}")
|
||||
self._cper_display_initialized = True
|
||||
|
||||
|
||||
if folder:
|
||||
@@ -1220,7 +1222,7 @@ class AMDSMIHelpers():
|
||||
#print header
|
||||
timestamp = entry.get("timestamp", "unknown")
|
||||
gpu_id = self.get_gpu_id_from_device_handle(device_handle)
|
||||
print(f"{timestamp:<20} {gpu_id:<6} {prefix:<10} {cper_data_file}")
|
||||
print(f"{timestamp:<20} {gpu_id:<7} {prefix:<12} {cper_data_file:<17}")
|
||||
self.increment_cper_count()
|
||||
|
||||
|
||||
@@ -1241,9 +1243,11 @@ class AMDSMIHelpers():
|
||||
|
||||
|
||||
def dump_all_entries(self, folder, entries, cper_data, device_handle):
|
||||
# Header
|
||||
print(f"{'timestamp':<20} {'gpu_id':<6} {'severity':<10} {'file_name'}")
|
||||
self._cper_display_initialized = True
|
||||
# One‐time initialization: print warning & header only once
|
||||
if not getattr(self, "_cper_display_initialized", False):
|
||||
# Warning if no folder was specified elsewhere
|
||||
print(f"{'timestamp':<20} {'gpu_id':<7} {'severity':<12} {'file_name':<17}")
|
||||
self._cper_display_initialized = True
|
||||
|
||||
|
||||
if folder:
|
||||
@@ -1276,7 +1280,7 @@ class AMDSMIHelpers():
|
||||
#print header
|
||||
timestamp = entry.get("timestamp", "unknown")
|
||||
gpu_id = self.get_gpu_id_from_device_handle(device_handle)
|
||||
print(f"{timestamp:<20} {gpu_id:<6} {prefix:<10} {cper_data_file}")
|
||||
print(f"{timestamp:<20} {gpu_id:<7} {prefix:<12} {cper_data_file:<17}")
|
||||
self.increment_cper_count()
|
||||
|
||||
try:
|
||||
@@ -1293,9 +1297,11 @@ class AMDSMIHelpers():
|
||||
|
||||
|
||||
def dump_all_entries_follow(self, folder, entries, cper_data, device_handle):
|
||||
# Header
|
||||
print(f"{'timestamp':<20} {'gpu_id':<6} {'severity':<10} {'file_name'}")
|
||||
self._cper_display_initialized = True
|
||||
# One‐time initialization: print warning & header only once
|
||||
if not getattr(self, "_cper_display_initialized", False):
|
||||
# Warning if no folder was specified elsewhere
|
||||
print(f"{'timestamp':<20} {'gpu_id':<7} {'severity':<12} {'file_name':<17}")
|
||||
self._cper_display_initialized = True
|
||||
|
||||
|
||||
if folder:
|
||||
@@ -1328,7 +1334,7 @@ class AMDSMIHelpers():
|
||||
#print header
|
||||
timestamp = entry.get("timestamp", "unknown")
|
||||
gpu_id = self.get_gpu_id_from_device_handle(device_handle)
|
||||
print(f"{timestamp:<20} {gpu_id:<6} {prefix:<10} {cper_data_file}")
|
||||
print(f"{timestamp:<20} {gpu_id:<7} {prefix:<12} {cper_data_file:<17}")
|
||||
self.increment_cper_count()
|
||||
time.sleep(1)
|
||||
|
||||
@@ -1346,9 +1352,11 @@ class AMDSMIHelpers():
|
||||
|
||||
|
||||
def dump_gpu_entries_follow(self, folder, entries, cper_data, device_handle):
|
||||
# Header
|
||||
print(f"{'timestamp':<20} {'gpu_id':<6} {'severity':<10} {'file_name'}")
|
||||
self._cper_display_initialized = True
|
||||
# One‐time initialization: print warning & header only once
|
||||
if not getattr(self, "_cper_display_initialized", False):
|
||||
# Warning if no folder was specified elsewhere
|
||||
print(f"{'timestamp':<20} {'gpu_id':<7} {'severity':<12} {'file_name':<17}")
|
||||
self._cper_display_initialized = True
|
||||
|
||||
|
||||
if folder:
|
||||
@@ -1381,7 +1389,7 @@ class AMDSMIHelpers():
|
||||
#print header
|
||||
timestamp = entry.get("timestamp", "unknown")
|
||||
gpu_id = self.get_gpu_id_from_device_handle(device_handle)
|
||||
print(f"{timestamp:<20} {gpu_id:<6} {prefix:<10} {cper_data_file}")
|
||||
print(f"{timestamp:<20} {gpu_id:<7} {prefix:<12} {cper_data_file:<17}")
|
||||
self.increment_cper_count()
|
||||
time.sleep(1)
|
||||
|
||||
@@ -1396,3 +1404,35 @@ class AMDSMIHelpers():
|
||||
else:
|
||||
print(json.dumps(entries, indent=2,
|
||||
default=lambda o: o.decode('utf-8') if isinstance(o, bytes) else o))
|
||||
|
||||
|
||||
def hexdump_to_string(self, data: Union[bytes, List[int]]) -> str:
    """
    Render binary data as a hexdump.

    Args:
        data: a bytes object, or an iterable whose items are integer byte
            values or single-character strings.

    Returns:
        One text line per 16 input bytes, joined with newlines. Each line
        holds the offset as 8 hex digits, the bytes as two-digit hex values
        padded to a fixed width, and the printable-ASCII rendering between
        '|' characters ('.' replaces anything outside 32..126).
        Empty input yields an empty string.
    """
    # Bring the input to a plain list of integers.
    if isinstance(data, bytes):
        byte_values = list(data)
    else:
        byte_values = [item if isinstance(item, int) else ord(item) for item in data]

    # 16 two-digit values separated by single spaces.
    hex_column_width = 16 * 3 - 1

    rendered: List[str] = []
    total = len(byte_values)
    start = 0
    while start < total:
        row = byte_values[start : start + 16]
        hex_part = " ".join(format(value, "02x") for value in row).ljust(hex_column_width)
        text_part = "".join(chr(value) if 32 <= value <= 126 else "." for value in row)
        rendered.append(f"{start:08x} {hex_part} |{text_part}|")
        start += 16

    return "\n".join(rendered)
|
||||
@@ -300,7 +300,7 @@ class AMDSMIParser(argparse.ArgumentParser):
|
||||
return CheckOutputFilePath
|
||||
|
||||
|
||||
def _check_input_file_path(self):
|
||||
def _check_cper_file_path(self):
|
||||
""" Argument action validator:
|
||||
Returns a path to a file from the input file path provided.
|
||||
If the file doesn't exist or is empty raise error
|
||||
@@ -310,8 +310,7 @@ class AMDSMIParser(argparse.ArgumentParser):
|
||||
def __call__(self, parser, args, values, option_string=None):
|
||||
path = Path(values)
|
||||
if not path.exists():
|
||||
raise FileNotFoundError(
|
||||
errno.ENOENT, os.strerror(errno.ENOENT), values)
|
||||
raise FileNotFoundError(f"CPER file could not be read. Make sure the path '{path}' is correct. ")
|
||||
|
||||
if path.is_dir():
|
||||
raise argparse.ArgumentTypeError(
|
||||
@@ -1413,12 +1412,13 @@ class AMDSMIParser(argparse.ArgumentParser):
|
||||
|
||||
# Help text for RAS arguments
|
||||
cper_help = "Trigger CPER data retrieval"
|
||||
|
||||
afid_help = "Generate an AFID (AMD Field ID) using CPER record, which is similar to XID."
|
||||
severity_choices = ["nonfatal-uncorrected", "fatal", "nonfatal-corrected", "all"]
|
||||
severity_choices_str = ", ".join(severity_choices)
|
||||
severity_help = f"Set the SEVERITY filters from the following:\n {severity_choices_str}"
|
||||
folder_help = "Folder to dump CPER report files"
|
||||
file_limit_help = "Maximum number of entries per output file"
|
||||
cper_file_help = "Full path of the cper record file to generate the AFID"
|
||||
follow_help = "Continuously monitor for new entries"
|
||||
|
||||
ras_parser = subparsers.add_parser("ras", help=ras_help, description=ras_description)
|
||||
@@ -1427,10 +1427,12 @@ class AMDSMIParser(argparse.ArgumentParser):
|
||||
ras_parser.set_defaults(func=func)
|
||||
|
||||
# Required flags and arguments:
|
||||
ras_parser.add_argument("--cper", action="store_true", required=True, help=cper_help)
|
||||
ras_parser.add_argument("--cper", action="store_true", required=False, help=cper_help)
|
||||
ras_parser.add_argument("--afid", action="store_true", required=False, help=afid_help)
|
||||
ras_parser.add_argument("--severity", type=str.lower, nargs='+', default=['all'], help=severity_help, choices=severity_choices, metavar='SEVERITY')
|
||||
ras_parser.add_argument("--folder", type=str, action=self._check_folder_path(), default=False, help=folder_help)
|
||||
ras_parser.add_argument("--file_limit", type=self._positive_int, action='store', default=1000, help=file_limit_help)
|
||||
ras_parser.add_argument("--cper_file", action=self._check_cper_file_path(), metavar="CPER_FILE", help=cper_file_help)
|
||||
ras_parser.add_argument("--follow", action="store_true", default=False, help=follow_help)
|
||||
|
||||
# Add common modifiers and device selection arguments.
|
||||
|
||||
@@ -5274,3 +5274,52 @@ try:
|
||||
except AmdSmiException as e:
|
||||
print(e)
|
||||
```
|
||||
### amdsmi_get_afids_from_cper
|
||||
|
||||
Description: Get the AFIDs from CPER buffer
|
||||
|
||||
Input parameters:

* `cper_buffer` bytes of one complete CPER record, for example the contents of a CPER record file read in binary mode
|
||||
|
||||
Output: Tuple `(afids, num_afids)`

Field | Description
---|---
`afids` | The AFIDs (AMD Field IDs) decoded from the CPER record. |
`num_afids` | The number of AFIDs reported for the record. |
|
||||
|
||||
Exceptions that can be thrown by `amdsmi_get_afids_from_cper` function:
|
||||
|
||||
* `AmdSmiLibraryException`
|
||||
* `AmdSmiParameterException`
|
||||
|
||||
Example:
|
||||
|
||||
```python
try:
    # Load one complete CPER record from disk as raw bytes.
    with open("cper_record.bin", "rb") as cper_file:
        cper_buffer = cper_file.read()
    afids, num_afids = amdsmi_get_afids_from_cper(cper_buffer)
    print("Number of AFIDs:", num_afids)
    for afid in afids:
        print("AFID:", afid)
except AmdSmiException as e:
    print(e)
```
|
||||
|
||||
Исполняемый файл
+64
@@ -0,0 +1,64 @@
|
||||
/**
 * @file aca_decode.h
 * @brief Internal decoder interface and data structures
 */
#ifndef ACA_DECODE_H
#define ACA_DECODE_H

/*
 * Fixed-width integer types used by the structures below. Included here,
 * ahead of the extern "C" block, so this header does not depend on a
 * transitive include and the system header is processed outside the
 * linkage block.
 */
#include <stdint.h>

#ifdef __cplusplus
extern "C" {
#endif

/*
 * Kept inside the extern "C" block on purpose: the declarations pulled in
 * from aca_fields.h get C linkage for C++ consumers of this header.
 */
#include "aca_fields.h"

/**
 * @brief Internal decoder structure with parsed register fields
 */
typedef struct
{
    uint64_t aca_status;  /**< Raw status register value */
    uint64_t aca_ipid;    /**< Raw IPID register value */
    uint64_t aca_synd;    /**< Raw syndrome register value */
    uint32_t flags;       /**< Decoder flags */
    uint16_t hw_revision; /**< Hardware revision */

    aca_status_fields_t status; /**< Parsed status fields */
    aca_ipid_fields_t ipid;     /**< Parsed IPID fields */
    aca_synd_fields_t synd;     /**< Parsed syndrome fields */
} aca_decoder_t;

/**
 * @brief Structure containing raw ACA error data from hardware
 */
typedef struct
{
    uint64_t aca_status;  /**< Raw status register value */
    uint64_t aca_ipid;    /**< Raw IPID register value */
    uint64_t aca_synd;    /**< Raw syndrome register value */
    uint32_t flags;       /**< Flags from descriptor */
    uint16_t hw_revision; /**< Hardware revision number */
} aca_raw_data_t;

/**
 * @brief Structure containing decoded error information
 */
typedef struct
{
    const char *bank_ref;       /**< Reference to bank name string */
    const char *error_type_ref; /**< Reference to error type string */
    const char *severity_ref;   /**< Reference to error severity string */
    const char *category_ref;   /**< Reference to error category string */
    int afid;                   /**< AFID value (AMD Field ID) */
} aca_error_info_t;

/**
 * @brief Main decode function that processes raw ACA error data
 * @param[in] raw_data Pointer to structure containing raw ACA error data
 * @return Decoded error information structure
 */
aca_error_info_t aca_decode(const aca_raw_data_t *raw_data);

#ifdef __cplusplus
}
#endif

#endif /* ACA_DECODE_H */
|
||||
@@ -0,0 +1,110 @@
|
||||
/**
|
||||
* @file aca_fields.h
|
||||
* @brief ACA register field definitions and manipulation functions
|
||||
*
|
||||
* Contains structures and functions for decoding and handling
|
||||
* ACA register fields. It provides field
|
||||
* definitions for status, IPID, and syndrome registers, along with
|
||||
* functions to initialize and access these fields.
|
||||
*/
|
||||
#ifndef ACA_FIELDS_H
|
||||
#define ACA_FIELDS_H
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
/**
|
||||
* @brief Base structure for ACA fields containing raw register value
|
||||
*/
|
||||
typedef struct
|
||||
{
|
||||
uint64_t raw_value; /**< Raw 64-bit register value */
|
||||
} aca_fields_t;
|
||||
|
||||
/**
|
||||
* @brief Structure containing decoded ACA status register fields
|
||||
*/
|
||||
typedef struct
|
||||
{
|
||||
aca_fields_t base;
|
||||
uint16_t error_code;
|
||||
uint8_t error_code_ext;
|
||||
uint8_t reserv22;
|
||||
uint8_t addr_lsb;
|
||||
uint8_t reserv30;
|
||||
uint8_t err_core_id;
|
||||
uint8_t reserv38;
|
||||
uint8_t scrub;
|
||||
uint8_t reserv41;
|
||||
uint8_t poison;
|
||||
uint8_t deferred;
|
||||
uint8_t uecc;
|
||||
uint8_t cecc;
|
||||
uint8_t reserv47;
|
||||
uint8_t synd_v;
|
||||
uint8_t reserv54;
|
||||
uint8_t tcc;
|
||||
uint8_t err_core_id_val;
|
||||
uint8_t pcc;
|
||||
uint8_t addr_v;
|
||||
uint8_t misc_v;
|
||||
uint8_t en;
|
||||
uint8_t uc;
|
||||
uint8_t overflow;
|
||||
uint8_t val;
|
||||
} aca_status_fields_t;
|
||||
|
||||
/**
|
||||
* @brief Structure containing decoded ACA IPID register fields
|
||||
*/
|
||||
typedef struct
|
||||
{
|
||||
aca_fields_t base;
|
||||
uint32_t instance_id_lo;
|
||||
uint16_t hardware_id;
|
||||
uint16_t aca_type;
|
||||
uint8_t instance_id_hi;
|
||||
} aca_ipid_fields_t;
|
||||
|
||||
/**
|
||||
* @brief Structure containing decoded ACA syndrome register fields
|
||||
*/
|
||||
typedef struct
|
||||
{
|
||||
aca_fields_t base;
|
||||
uint32_t error_information;
|
||||
uint8_t length;
|
||||
uint8_t error_priority;
|
||||
uint8_t reserved27;
|
||||
uint16_t syndrome;
|
||||
uint32_t reserved39;
|
||||
} aca_synd_fields_t;
|
||||
|
||||
/**
|
||||
* @brief Reads the raw value from an ACA field structure
|
||||
* @param[in] fields Pointer to the ACA fields structure
|
||||
* @return The raw 64-bit value stored in the structure
|
||||
*/
|
||||
uint64_t aca_fields_read(const aca_fields_t *fields);
|
||||
|
||||
/**
|
||||
* @brief Initializes ACA status fields from a raw status register value
|
||||
* @param[out] fields Pointer to the status fields structure to initialize
|
||||
* @param[in] status_reg Raw 64-bit status register value
|
||||
*/
|
||||
void aca_status_init(aca_status_fields_t *fields, uint64_t status_reg);
|
||||
|
||||
/**
|
||||
* @brief Initializes ACA IPID fields from a raw IPID register value
|
||||
* @param[out] fields Pointer to the IPID fields structure to initialize
|
||||
* @param[in] ipid_reg Raw 64-bit IPID register value
|
||||
*/
|
||||
void aca_ipid_init(aca_ipid_fields_t *fields, uint64_t ipid_reg);
|
||||
|
||||
/**
|
||||
* @brief Initializes ACA syndrome fields from a raw syndrome register value
|
||||
* @param[out] fields Pointer to the syndrome fields structure to initialize
|
||||
* @param[in] synd_reg Raw 64-bit syndrome register value
|
||||
*/
|
||||
void aca_synd_init(aca_synd_fields_t *fields, uint64_t synd_reg);
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,84 @@
|
||||
/**
|
||||
* @file aca_tables.h
|
||||
* @brief ACA lookup table definitions and helper functions
|
||||
* @details Contains data structures and functions definitions for mapping ACA Registers
|
||||
* into their corresponding names and types.
|
||||
*/
|
||||
|
||||
#ifndef ACA_TABLES_H
|
||||
#define ACA_TABLES_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
|
||||
/**
|
||||
* @brief Structure mapping hardware ID and ACA type to bank names
|
||||
*/
|
||||
typedef struct
|
||||
{
|
||||
uint16_t hw_id; /**< Hardware ID value */
|
||||
uint16_t aca_type; /**< ACA type identifier */
|
||||
const char *name; /**< Bank name string */
|
||||
} aca_bank_entry_t;
|
||||
|
||||
/**
|
||||
* @brief Structure mapping bank-specific error codes to error types
|
||||
*/
|
||||
typedef struct
|
||||
{
|
||||
const char *bank; /**< Bank name string */
|
||||
uint32_t error_code; /**< Error code value */
|
||||
const char *type; /**< Error type string */
|
||||
} aca_error_type_t;
|
||||
|
||||
/**
|
||||
* @brief Structure for generic error code to error type mapping
|
||||
*/
|
||||
typedef struct
|
||||
{
|
||||
uint32_t error_code; /**< Error code value */
|
||||
const char *type; /**< Error type string */
|
||||
} aca_error_entry_t;
|
||||
|
||||
// External table declarations
|
||||
extern const aca_bank_entry_t bank_table[];
|
||||
extern const aca_error_type_t error_table[];
|
||||
extern const aca_error_entry_t xcd_error_table[];
|
||||
extern const aca_error_entry_t aid_error_table[];
|
||||
|
||||
// Table size constants
|
||||
extern const size_t NUM_BANKS;
|
||||
extern const size_t NUM_ERRORS;
|
||||
extern const size_t NUM_XCD_ERRORS;
|
||||
extern const size_t NUM_AID_ERRORS;
|
||||
|
||||
/**
|
||||
* @brief Find bank name based on hardware ID and ACA type
|
||||
* @param[in] hw_id Hardware ID value
|
||||
* @param[in] aca_type ACA type value
|
||||
* @param[out] bank_name Pointer to store result string
|
||||
* @return 0 on success, 1 if not found, -1 on parameter error
|
||||
*/
|
||||
int find_bank_name(uint16_t hw_id, uint16_t aca_type, const char **bank_name);
|
||||
|
||||
/**
|
||||
* @brief Find error type for a specific bank and error code
|
||||
* @param[in] bank Bank name string
|
||||
* @param[in] error_code Error code value
|
||||
* @param[out] error_type Pointer to store result string
|
||||
* @return 0 on success, 1 if not found, -1 on parameter error
|
||||
*/
|
||||
int find_error_type_by_bank(const char *bank, uint32_t error_code, const char **error_type);
|
||||
|
||||
/**
|
||||
* @brief Generic lookup for error codes in an error table
|
||||
* @param[in] table Pointer to error table
|
||||
* @param[in] table_size Number of table entries
|
||||
* @param[in] error_code Error code to look up
|
||||
* @param[out] error_type Pointer to store result string
|
||||
* @return 0 on success, 1 if not found, -1 on parameter error
|
||||
*/
|
||||
int find_error_in_table(const aca_error_entry_t *table, size_t table_size,
|
||||
uint32_t error_code, const char **error_type);
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,27 @@
|
||||
#ifndef ERROR_MAP_H
|
||||
#define ERROR_MAP_H
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
/**
|
||||
* @brief Structure representing an error mapping entry
|
||||
*/
|
||||
typedef struct
|
||||
{
|
||||
uint32_t id;
|
||||
const char *error_category;
|
||||
const char *error_type;
|
||||
const char *method;
|
||||
const char *error_severity;
|
||||
} error_map_entry_t;
|
||||
|
||||
/**
|
||||
* @brief Get error ID based on category, type and severity
|
||||
* @param[in] error_category Error category string
|
||||
* @param[in] error_type Error type string
|
||||
* @param[in] error_severity Error severity string
|
||||
* @return Error ID if found, -1 if not found
|
||||
*/
|
||||
int get_error_id(const char *error_category, const char *error_type, const char *error_severity);
|
||||
|
||||
#endif /* ERROR_MAP_H */
|
||||
@@ -150,7 +150,7 @@ typedef enum {
|
||||
#define AMDSMI_MAX_NUM_JPEG 32
|
||||
|
||||
/**
|
||||
* @brief new for gpu metrics v1.8, document presents NUM_JPEG_ENG_V1
|
||||
* @brief Introduced in gpu metrics v1.8, document presents NUM_JPEG_ENG_V1
|
||||
* but will change to AMDSMI_MAX_NUM_JPEG_ENG_V1 for continuity
|
||||
*/
|
||||
#define AMDSMI_MAX_NUM_JPEG_ENG_V1 40
|
||||
@@ -182,6 +182,11 @@ typedef enum {
|
||||
*/
|
||||
#define AMDSMI_MAX_NUM_XCP 8
|
||||
|
||||
/**
|
||||
* @brief Max Number of AFIDs that will be inside one cper entry
|
||||
*/
|
||||
#define MAX_NUMBER_OF_AFIDS_PER_RECORD 12
|
||||
|
||||
/* string format */
|
||||
#define AMDSMI_TIME_FORMAT "%02d:%02d:%02d.%03d"
|
||||
#define AMDSMI_DATE_FORMAT "%04d-%02d-%02d:%02d:%02d:%02d.%03d"
|
||||
@@ -4795,6 +4800,32 @@ amdsmi_get_gpu_cper_entries(amdsmi_processor_handle processor_handle, uint32_t s
|
||||
|
||||
/** @} End tagECCInfo */
|
||||
|
||||
/**
|
||||
* @brief Get the AFIDs from CPER buffer
|
||||
*
|
||||
* @platform{gpu_bm_linux} @platform{host} @platform{guest_1vf}
|
||||
* @platform{guest_mvf} @platform{guest_windows}
|
||||
*
|
||||
* @details A utility function which retrieves the AFIDs from the CPER record.
|
||||
*
|
||||
* @param[in] cper_buffer a pointer to the buffer with one CPER record. The caller must make sure the whole CPER record is loaded into the buffer.
|
||||
*
|
||||
* @param[in] buf_size is the size of the cper_buffer.
|
||||
*
|
||||
* @param[out] afids a pointer to an array of uint64_t to which the AF IDs will be written
|
||||
*
|
||||
* @param[in,out] num_afids As input, the value passed through this parameter is the number of
|
||||
* uint64_t that may be safely written to the memory pointed to by @p afids. This is the limit
|
||||
* on how many AF IDs will be written to @p afids. On return, @p num_afids will contain the
|
||||
* number of AF IDs written to @p afids, or the number of AF IDs that could have been written
|
||||
* if enough memory had been provided. It is suggested to pass MAX_NUMBER_OF_AFIDS_PER_RECORD for all
|
||||
* AF IDs.
|
||||
*
|
||||
* @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail
|
||||
*/
|
||||
amdsmi_status_t amdsmi_get_afids_from_cper(
|
||||
char* cper_buffer, uint32_t buf_size, uint64_t* afids, uint32_t* num_afids);
|
||||
|
||||
/*****************************************************************************/
|
||||
/** @defgroup tagErrorQuery Error Queries
|
||||
* These functions provide error information about AMDSMI calls as well as
|
||||
|
||||
@@ -0,0 +1,223 @@
|
||||
/*
|
||||
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
* THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "amd_smi/amdsmi.h"
|
||||
|
||||
#pragma pack(1)
|
||||
|
||||
#define CPER_MAX_OAM_COUNT (8)
|
||||
|
||||
/**
 * @brief Severity values for a CPER error.
 *
 * Declared as a plain tagged enum: the former `typedef` had no declarator,
 * so it introduced no alias and was ignored by the compiler.
 *
 * NOTE(review): the enumerator names for 0 and 2 read "FATAL_*", while other
 * code refers to value 2 as "non-fatal corrected" — confirm the intended
 * names before relying on them.
 */
enum cper_error_severity {
    CPER_SEV_FATAL_UNCORRECTED = 0,
    CPER_SEV_FATAL = 1,
    CPER_SEV_FATAL_CORRECTED = 2,

    CPER_SEV_UNUSED = 10,
};
|
||||
|
||||
/**
 * @brief Index of each ACA register word; every register appears as a
 *        _LO / _HI pair of consecutive indices.
 *
 * Declared as a plain tagged enum: the former `typedef` had no declarator,
 * so it introduced no alias and was ignored by the compiler.
 */
enum cper_aca_reg {
    CPER_ACA_REG_CTL_LO = 0,
    CPER_ACA_REG_CTL_HI = 1,
    CPER_ACA_REG_STATUS_LO = 2,
    CPER_ACA_REG_STATUS_HI = 3,
    CPER_ACA_REG_ADDR_LO = 4,
    CPER_ACA_REG_ADDR_HI = 5,
    CPER_ACA_REG_MISC0_LO = 6,
    CPER_ACA_REG_MISC0_HI = 7,
    CPER_ACA_REG_CONFIG_LO = 8,
    CPER_ACA_REG_CONFIG_HI = 9,
    CPER_ACA_REG_IPID_LO = 10,
    CPER_ACA_REG_IPID_HI = 11,
    CPER_ACA_REG_SYND_LO = 12,
    CPER_ACA_REG_SYND_HI = 13,

    /* Larger than the last named index above (13). */
    CPER_ACA_REG_COUNT = 32,
};
|
||||
|
||||
struct cper_sec_desc {
|
||||
uint32_t sec_offset; /* Offset from the start of CPER entry */
|
||||
uint32_t sec_length;
|
||||
uint8_t revision_minor; /* CPER_SEC_MINOR_REV_1 */
|
||||
uint8_t revision_major; /* CPER_SEC_MAJOR_REV_22 */
|
||||
union {
|
||||
struct {
|
||||
uint8_t fru_id : 1;
|
||||
uint8_t fru_text : 1;
|
||||
uint8_t reserved : 6;
|
||||
} valid_bits;
|
||||
uint8_t valid_mask;
|
||||
};
|
||||
uint8_t reserved;
|
||||
union {
|
||||
struct {
|
||||
uint32_t primary : 1;
|
||||
uint32_t reserved1 : 2;
|
||||
uint32_t exceed_err_threshold : 1;
|
||||
uint32_t latent_err : 1; /* "Deferred" error Creation*/
|
||||
uint32_t reserved2 : 27;
|
||||
} flags_bits;
|
||||
uint32_t flags_mask;
|
||||
};
|
||||
amdsmi_cper_guid_t sec_type; /* AMD non-Standard, AMD Crashdump */
|
||||
char fru_id[16]; /* FRU Serial ID */
|
||||
amdsmi_cper_sev_t severity;
|
||||
char fru_text[20]; /* "OAM%d" */
|
||||
};
|
||||
|
||||
struct cper_sec_nonstd_err_info {
|
||||
amdsmi_cper_guid_t error_type;
|
||||
union {
|
||||
struct {
|
||||
uint64_t ms_chk : 1;
|
||||
uint64_t target_addr_id : 1;
|
||||
uint64_t req_id : 1;
|
||||
uint64_t resp_id : 1;
|
||||
uint64_t instr_ptr : 1;
|
||||
uint64_t reserved : 59;
|
||||
} valid_bits;
|
||||
uint64_t valid_mask;
|
||||
};
|
||||
union {
|
||||
struct {
|
||||
uint64_t err_type_valid : 1;
|
||||
uint64_t pcc_valid : 1;
|
||||
uint64_t uncorr_valid : 1;
|
||||
uint64_t precise_ip_valid : 1;
|
||||
uint64_t restartable_ip_valid : 1;
|
||||
uint64_t overflow_valid : 1;
|
||||
uint64_t reserved1 : 10;
|
||||
|
||||
uint64_t err_type : 2;
|
||||
uint64_t pcc : 1;
|
||||
uint64_t uncorr : 1;
|
||||
uint64_t precised_ip : 1;
|
||||
uint64_t restartable_ip : 1;
|
||||
uint64_t overflow : 1;
|
||||
uint64_t reserved2 : 41;
|
||||
} ms_chk_bits;
|
||||
uint64_t ms_chk_mask;
|
||||
};
|
||||
|
||||
uint64_t target_addr_id;
|
||||
uint64_t req_id;
|
||||
uint64_t resp_id;
|
||||
uint64_t instr_ptr;
|
||||
};
|
||||
|
||||
struct cper_sec_nonstd_err_ctx {
|
||||
uint16_t reg_ctx_type;
|
||||
uint16_t reg_arr_size;
|
||||
uint32_t msr_addr;
|
||||
uint64_t mm_reg_addr;
|
||||
uint32_t reg_dump[CPER_ACA_REG_COUNT]; /* This buffer can grow */
|
||||
};
|
||||
|
||||
/*
 * Header of a non-standard error section.
 *
 * The leading union carries validity bits for apic_id / fw_id plus the two
 * 6-bit counters, viewable either as bit-fields or as one raw 64-bit mask.
 */
struct cper_sec_nonstd_err_hdr {
  union {
    struct {
      uint64_t apic_id         : 1;
      uint64_t fw_id           : 1;
      uint64_t err_info_cnt    : 6; /* should match context_cnt */
      uint64_t err_context_cnt : 6; /* should match info_cnt */
    } valid_bits;
    uint64_t valid_mask;
  };

  uint64_t apic_id;
  char     fw_id[48];
};
|
||||
|
||||
struct cper_sec_nonstd_err_body {
|
||||
struct cper_sec_nonstd_err_info err_info;
|
||||
struct cper_sec_nonstd_err_ctx err_ctx;
|
||||
};
|
||||
|
||||
struct cper_sec_nonstd_err {
|
||||
struct cper_sec_nonstd_err_hdr hdr;
|
||||
struct cper_sec_nonstd_err_body body[]; /* Variable Size, today only 1 entry */
|
||||
};
|
||||
|
||||
struct cper_sec_crashdump_data {
|
||||
uint16_t reg_ctx_type;
|
||||
uint16_t reg_arr_size;
|
||||
uint32_t reserved1;
|
||||
uint64_t reserved2;
|
||||
|
||||
union {
|
||||
struct {
|
||||
uint32_t status_lo;
|
||||
uint32_t status_hi;
|
||||
uint32_t addr_lo;
|
||||
uint32_t addr_hi;
|
||||
uint32_t ipid_lo;
|
||||
uint32_t ipid_hi;
|
||||
uint32_t synd_lo;
|
||||
uint32_t synd_hi;
|
||||
} fatal_err;
|
||||
|
||||
struct {
|
||||
uint64_t msg[CPER_MAX_OAM_COUNT];
|
||||
} boot_err;
|
||||
} dump;
|
||||
|
||||
};
|
||||
|
||||
struct cper_sec_crashdump {
|
||||
uint64_t reserved1;
|
||||
uint64_t reserved2;
|
||||
char fw_id[48];
|
||||
uint64_t reserved3[8];
|
||||
|
||||
struct cper_sec_crashdump_data data;
|
||||
};
|
||||
|
||||
struct cper_sec {
|
||||
union {
|
||||
struct {
|
||||
uint8_t fru_id : 1;
|
||||
uint8_t fru_text : 1;
|
||||
uint8_t reserved : 6;
|
||||
} valid_bits;
|
||||
uint8_t valid_mask;
|
||||
};
|
||||
|
||||
union {
|
||||
struct cper_sec_crashdump crashdump;
|
||||
struct cper_sec_nonstd_err runtime_err;
|
||||
};
|
||||
};
|
||||
|
||||
/*
 * General CPER record structure.
 *
 * Holds pointers to the three parts of a record; it does not embed them.
 * NOTE(review): `struct cper_hdr` is not defined in the visible part of this
 * header - confirm where it is declared.
 */
struct cper_1_0 {
  struct cper_hdr      *hdr;
  struct cper_sec_desc *sec_desc; /* Variable Size */
  struct cper_sec      *sec;      /* Variable Size */
};
|
||||
|
||||
#pragma pack()
|
||||
|
||||
amdsmi_status_t amdsmi_get_gpu_cper_entries_by_path(const char *amdgpu_ring_cper_file, uint32_t severity_mask,
|
||||
char *cper_data, uint64_t *buf_size, amdsmi_cper_hdr_t **cper_hdrs,
|
||||
uint64_t *entry_count, uint64_t *cursor);
|
||||
std::vector<int> cper_decode(const amdsmi_cper_hdr_t *cper);
|
||||
@@ -56,7 +56,6 @@ std::string smi_split_string(std::string str, char delim);
|
||||
std::string smi_amdgpu_get_status_string(amdsmi_status_t ret, bool fullStatus);
|
||||
amdsmi_status_t smi_clear_char_and_reinitialize(char buffer[], uint32_t len,
|
||||
std::string newString);
|
||||
amdsmi_status_t amdsmi_get_gpu_cper_entries_by_path(const char *amdgpu_ring_cper_file, uint32_t severity_mask, char *cper_data, uint64_t *buf_size, amdsmi_cper_hdr_t **cper_hdrs, uint64_t *entry_count, uint64_t *cursor);
|
||||
/**
|
||||
* @brief Wait for user input, a debugging function to pause the program
|
||||
*
|
||||
|
||||
@@ -60,6 +60,9 @@ AMDSMI_MAX_NUM_JPEG = 32
|
||||
AMDSMI_MAX_NUM_XCC = 8
|
||||
AMDSMI_MAX_NUM_XCP = 8
|
||||
|
||||
# max num afids per cper record
|
||||
MAX_NUMBER_OF_AFIDS_PER_RECORD = 12
|
||||
|
||||
# Max number of DPM policies
|
||||
AMDSMI_MAX_NUM_PM_POLICIES = 32
|
||||
|
||||
@@ -1888,7 +1891,6 @@ def amdsmi_get_gpu_asic_info(
|
||||
# Remove commas from vendor name for clean output
|
||||
asic_info["vendor_name"] = asic_info["vendor_name"].replace(',', '')
|
||||
|
||||
# logging.debug("amdsmi_interface.py | amdsmi_get_gpu_asic_info | return_dictionary = \n" + str(json.dumps(asic_info, indent=4)))
|
||||
return asic_info
|
||||
|
||||
|
||||
@@ -2300,9 +2302,10 @@ def notifyTypeToString(notify_type_b):
|
||||
idx = idx +1
|
||||
return "".join(guid[::-1])
|
||||
|
||||
def amdsmi_get_gpu_cper_entries(processor_handle: amdsmi_wrapper.amdsmi_processor_handle,
|
||||
def amdsmi_get_gpu_cper_entries(
|
||||
processor_handle: amdsmi_wrapper.amdsmi_processor_handle,
|
||||
severity_mask: int,
|
||||
buffer_size: int = 4*1048576,
|
||||
buffer_size: int = 4 * 1048576,
|
||||
cursor: int = 0
|
||||
) -> Tuple[List[Dict[str, Any]], int]:
|
||||
|
||||
@@ -2316,6 +2319,7 @@ def amdsmi_get_gpu_cper_entries(processor_handle: amdsmi_wrapper.amdsmi_processo
|
||||
buf_size = ctypes.c_uint64(buffer_size)
|
||||
entry_count = ctypes.c_uint64(20)
|
||||
cur = ctypes.c_uint64(cursor)
|
||||
|
||||
# Allocate a pointer for the CPER header array.
|
||||
cper_hdrs_array = (ctypes.POINTER(amdsmi_wrapper.amdsmi_cper_hdr_t) * 20)()
|
||||
cper_hdrs = ctypes.cast(cper_hdrs_array, ctypes.POINTER(ctypes.POINTER(amdsmi_wrapper.amdsmi_cper_hdr_t)))
|
||||
@@ -2336,51 +2340,114 @@ def amdsmi_get_gpu_cper_entries(processor_handle: amdsmi_wrapper.amdsmi_processo
|
||||
entries = {}
|
||||
cper_data = []
|
||||
offset = 0
|
||||
|
||||
# Iterate over each entry using its variable record_length.
|
||||
for i in range(entry_count.value):
|
||||
entry_address = ctypes.addressof(buf) + offset
|
||||
entry_ptr = ctypes.cast(entry_address, ctypes.POINTER(amdsmi_wrapper.amdsmi_cper_hdr_t))
|
||||
|
||||
# Extract the raw bytes and size of the entry.
|
||||
cper_data.append({
|
||||
"bytes":list((entry_ptr.contents.record_length * ctypes.c_byte).from_address(entry_address)),
|
||||
"size":entry_ptr.contents.record_length
|
||||
"bytes": list((entry_ptr.contents.record_length * ctypes.c_byte).from_address(entry_address)),
|
||||
"size": entry_ptr.contents.record_length
|
||||
})
|
||||
|
||||
# Extract the timestamp fields.
|
||||
year = entry_ptr.contents.timestamp.year
|
||||
# Adjust the year if it's less than 100. You can tweak this logic based on your expected data.
|
||||
if year < 100:
|
||||
year += 2000
|
||||
if year < 100: # Adjust the year if it's less than 100.
|
||||
year += 2000
|
||||
formatted_timestamp = (
|
||||
f"{year:04d}/"
|
||||
f"{entry_ptr.contents.timestamp.month:02d}/"
|
||||
f"{entry_ptr.contents.timestamp.day:02d} "
|
||||
f"{entry_ptr.contents.timestamp.hours:02d}:"
|
||||
f"{entry_ptr.contents.timestamp.minutes:02d}:"
|
||||
f"{entry_ptr.contents.timestamp.seconds:02d}"
|
||||
f"{year:04d}/"
|
||||
f"{entry_ptr.contents.timestamp.month:02d}/"
|
||||
f"{entry_ptr.contents.timestamp.day:02d} "
|
||||
f"{entry_ptr.contents.timestamp.hours:02d}:"
|
||||
f"{entry_ptr.contents.timestamp.minutes:02d}:"
|
||||
f"{entry_ptr.contents.timestamp.seconds:02d}"
|
||||
)
|
||||
|
||||
# Create a dictionary for the CPER entry.
|
||||
cper_entry = {
|
||||
"error_severity": amdsmi_wrapper.amdsmi_cper_sev_t__enumvalues.get(entry_ptr.contents.error_severity, "AMDSMI_CPER_SEV_UNUSED").replace("AMDSMI_CPER_SEV_", "").lower(),
|
||||
"error_severity": amdsmi_wrapper.amdsmi_cper_sev_t__enumvalues.get(
|
||||
entry_ptr.contents.error_severity, "AMDSMI_CPER_SEV_UNUSED"
|
||||
).replace("AMDSMI_CPER_SEV_", "").lower(),
|
||||
"notify_type": _notifyTypeToString(entry_ptr.contents.notify_type.b),
|
||||
"timestamp": formatted_timestamp,
|
||||
"signature" : entry_ptr.contents.signature,
|
||||
"revision" : entry_ptr.contents.revision,
|
||||
"signature_end" : hex(entry_ptr.contents.signature_end),
|
||||
"sec_cnt" : entry_ptr.contents.sec_cnt,
|
||||
"record_length" : entry_ptr.contents.record_length,
|
||||
"platform_id" : entry_ptr.contents.platform_id,
|
||||
"creator_id" : entry_ptr.contents.creator_id,
|
||||
"record_id" : entry_ptr.contents.record_id,
|
||||
"flags" : entry_ptr.contents.flags,
|
||||
"persistence_info" : entry_ptr.contents.persistence_info,
|
||||
"signature": entry_ptr.contents.signature,
|
||||
"revision": entry_ptr.contents.revision,
|
||||
"signature_end": hex(entry_ptr.contents.signature_end),
|
||||
"sec_cnt": entry_ptr.contents.sec_cnt,
|
||||
"record_length": entry_ptr.contents.record_length,
|
||||
"platform_id": entry_ptr.contents.platform_id,
|
||||
"creator_id": entry_ptr.contents.creator_id,
|
||||
"record_id": entry_ptr.contents.record_id,
|
||||
"flags": entry_ptr.contents.flags,
|
||||
"persistence_info": entry_ptr.contents.persistence_info,
|
||||
#"reserved" : entry_ptr.contents.reserved
|
||||
#"cper_valid_bit" : entry_ptr.contents.cper_valid_bits,
|
||||
#"partition_id" : entry_ptr.contents.partition_id,
|
||||
}
|
||||
|
||||
entries[i] = cper_entry.copy()
|
||||
offset += entry_ptr.contents.record_length # Use the actual record length to advance the offset
|
||||
offset += entry_ptr.contents.record_length # Use the actual record length to advance the offset.
|
||||
|
||||
return entries, cur.value, cper_data
|
||||
|
||||
|
||||
def amdsmi_get_afids_from_cper(
    cper_afid_data: Union[bytes, bytearray, List[Dict[str, Any]]]
) -> Tuple[List[int], int]:
    """
    Decode the AFIDs contained in one or several CPER records.

    Args:
        cper_afid_data: Either
            - the raw bytes or bytearray of a single CPER record, or
            - a list of dicts, each holding "bytes" (List[int]) and "size" (int).

    Returns:
        Tuple[List[int], int]: A tuple containing:
            - the AFIDs of all records, in record order;
            - the number of AFIDs in that list.

    Raises:
        AmdSmiLibraryException: if the library call reports a non-success
            status for any record.
    """
    # A single blob is handled as a one-element list of records.
    if isinstance(cper_afid_data, (bytes, bytearray)):
        records = [{"bytes": list(cper_afid_data), "size": len(cper_afid_data)}]
    else:
        records = cper_afid_data

    collected: List[int] = []

    for rec in records:
        size = rec["size"]
        payload = bytes(rec["bytes"])

        # Expose the record to the C library as a char* buffer.
        c_buffer = ctypes.create_string_buffer(payload, size)
        c_buffer_ptr = ctypes.cast(c_buffer, ctypes.POINTER(ctypes.c_char))

        # Output array and in/out element count for the library call.
        afid_out = (ctypes.c_uint64 * MAX_NUMBER_OF_AFIDS_PER_RECORD)()
        afid_count = ctypes.c_uint32(MAX_NUMBER_OF_AFIDS_PER_RECORD)

        status = amdsmi_wrapper.amdsmi_get_afids_from_cper(
            c_buffer_ptr,
            ctypes.c_uint32(size),
            afid_out,
            ctypes.byref(afid_count)
        )
        if status != amdsmi_wrapper.AMDSMI_STATUS_SUCCESS:
            raise AmdSmiLibraryException(f"get_afids failed: {status}")

        # Keep only the entries the library reported as decoded.
        for idx in range(afid_count.value):
            collected.append(afid_out[idx])

    return collected, len(collected)
|
||||
|
||||
|
||||
def amdsmi_get_gpu_board_info(
|
||||
processor_handle: amdsmi_wrapper.amdsmi_processor_handle,
|
||||
) -> Dict[str, Any]:
|
||||
|
||||
@@ -2642,6 +2642,9 @@ amdsmi_cper_hdr_t = struct_amdsmi_cper_hdr_t
|
||||
amdsmi_get_gpu_cper_entries = _libraries['libamd_smi.so'].amdsmi_get_gpu_cper_entries
|
||||
amdsmi_get_gpu_cper_entries.restype = amdsmi_status_t
|
||||
amdsmi_get_gpu_cper_entries.argtypes = [amdsmi_processor_handle, uint32_t, ctypes.POINTER(ctypes.c_char), ctypes.POINTER(ctypes.c_uint64), ctypes.POINTER(ctypes.POINTER(struct_amdsmi_cper_hdr_t)), ctypes.POINTER(ctypes.c_uint64), ctypes.POINTER(ctypes.c_uint64)]
|
||||
amdsmi_get_afids_from_cper = _libraries['libamd_smi.so'].amdsmi_get_afids_from_cper
|
||||
amdsmi_get_afids_from_cper.restype = amdsmi_status_t
|
||||
amdsmi_get_afids_from_cper.argtypes = [ctypes.POINTER(ctypes.c_char), uint32_t, ctypes.POINTER(ctypes.c_uint64), ctypes.POINTER(ctypes.c_uint32)]
|
||||
amdsmi_get_gpu_ecc_status = _libraries['libamd_smi.so'].amdsmi_get_gpu_ecc_status
|
||||
amdsmi_get_gpu_ecc_status.restype = amdsmi_status_t
|
||||
amdsmi_get_gpu_ecc_status.argtypes = [amdsmi_processor_handle, amdsmi_gpu_block_t, ctypes.POINTER(amdsmi_ras_err_state_t)]
|
||||
@@ -3171,9 +3174,9 @@ __all__ = \
|
||||
'amdsmi_free_name_value_pairs', 'amdsmi_freq_ind_t',
|
||||
'amdsmi_freq_volt_region_t', 'amdsmi_frequencies_t',
|
||||
'amdsmi_frequency_range_t', 'amdsmi_fw_block_t',
|
||||
'amdsmi_fw_info_t', 'amdsmi_get_clk_freq',
|
||||
'amdsmi_get_clock_info', 'amdsmi_get_cpu_cclk_limit',
|
||||
'amdsmi_get_cpu_core_boostlimit',
|
||||
'amdsmi_fw_info_t', 'amdsmi_get_afids_from_cper',
|
||||
'amdsmi_get_clk_freq', 'amdsmi_get_clock_info',
|
||||
'amdsmi_get_cpu_cclk_limit', 'amdsmi_get_cpu_core_boostlimit',
|
||||
'amdsmi_get_cpu_core_current_freq_limit',
|
||||
'amdsmi_get_cpu_core_energy',
|
||||
'amdsmi_get_cpu_current_io_bandwidth',
|
||||
|
||||
@@ -16,6 +16,7 @@ set(INC_DIR "${PROJECT_SOURCE_DIR}/include/amd_smi")
|
||||
|
||||
set(SRC_LIST
|
||||
"${SRC_DIR}/amd_smi.cc"
|
||||
"${SRC_DIR}/amd_smi_cper.cc"
|
||||
"${SRC_DIR}/amd_smi_common.cc"
|
||||
"${SRC_DIR}/amd_smi_drm.cc"
|
||||
"${SRC_DIR}/amd_smi_gpu_device.cc"
|
||||
@@ -29,6 +30,7 @@ set(SRC_LIST
|
||||
set(INC_LIST
|
||||
"${INC_DIR}/amdsmi.h"
|
||||
"${INC_DIR}/impl/amd_smi_common.h"
|
||||
"${INC_DIR}/impl/amd_smi_cper.h"
|
||||
"${INC_DIR}/impl/amd_smi_processor.h"
|
||||
"${INC_DIR}/impl/amd_smi_drm.h"
|
||||
"${INC_DIR}/impl/amd_smi_gpu_device.h"
|
||||
@@ -38,6 +40,13 @@ set(INC_LIST
|
||||
"${PROJECT_SOURCE_DIR}/rocm_smi/include/rocm_smi/rocm_smi.h"
|
||||
"${PROJECT_SOURCE_DIR}/rocm_smi/include/rocm_smi/rocm_smi_utils.h")
|
||||
|
||||
set(ACA_SRC_DIR "aca-decode")
|
||||
set(SRC_LIST ${SRC_LIST} ${ACA_SRC_DIR}/aca_decode.c ${ACA_SRC_DIR}/aca_fields.c ${ACA_SRC_DIR}/aca_tables.c
|
||||
${ACA_SRC_DIR}/error_map.c)
|
||||
set(ACA_INC_DIR "${PROJECT_SOURCE_DIR}/include/aca-decode")
|
||||
set(INC_LIST ${INC_LIST} ${ACA_INC_DIR}/aca_decode.h ${ACA_INC_DIR}/aca_fields.h ${ACA_INC_DIR}/aca_tables.h
|
||||
${ACA_INC_DIR}/error_map.h)
|
||||
|
||||
if(ENABLE_ESMI_LIB)
|
||||
list(APPEND INC_LIST ${ESMI_INC_DIR}/e_smi/e_smi.h)
|
||||
list(APPEND INC_LIST ${ESMI_INC_DIR}/e_smi/e_smi_monitor.h)
|
||||
@@ -72,7 +81,7 @@ target_link_libraries(amd_smi_ex ${AMD_SMI})
|
||||
add_library(${AMD_SMI} ${SRC_LIST} ${INC_LIST})
|
||||
target_link_libraries(${AMD_SMI} pthread rt dl ${DRM_LIBRARIES} ${AMDGPU_DRM_LIBRARIES})
|
||||
target_include_directories(${AMD_SMI} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} ${PROJECT_SOURCE_DIR}/rocm_smi/include
|
||||
${PROJECT_SOURCE_DIR}/common/shared_mutex)
|
||||
${PROJECT_SOURCE_DIR}/common/shared_mutex ${ACA_INC_DIR})
|
||||
|
||||
# use the target_include_directories() command to specify the include directories for the target
|
||||
target_include_directories(${AMD_SMI} PUBLIC "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>"
|
||||
|
||||
@@ -0,0 +1,285 @@
|
||||
/**
|
||||
* @file aca_decode.c
|
||||
* @brief Implementation of ACA error decoding functions
|
||||
*
|
||||
* This file contains functions for decoding and analyzing ACA error information from
|
||||
* raw register data. It provides functionality to determine error severity, bank
|
||||
* information, and specific error types based on hardware-specific error codes.
|
||||
*/
|
||||
|
||||
#include "aca_decode.h"
|
||||
#include "aca_tables.h"
|
||||
#include "error_map.h"
|
||||
#include <string.h>
|
||||
|
||||
/**
|
||||
* @brief Gets the bank name based on hardware ID and ACA type
|
||||
* @param[in] decoder Pointer to the ACA decoder structure
|
||||
* @param[out] bank_name Pointer to a string containing the bank name
|
||||
* @return 0 on success, -1 on failure
|
||||
*/
|
||||
static int
|
||||
aca_decoder_get_bank(const aca_decoder_t *decoder, const char **bank_name)
|
||||
{
|
||||
if (!decoder || !bank_name)
|
||||
{
|
||||
return -1;
|
||||
}
|
||||
|
||||
const aca_ipid_fields_t *ipid = &decoder->ipid;
|
||||
return find_bank_name(ipid->hardware_id, ipid->aca_type, bank_name);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Determines the error severity based on status fields
|
||||
* @param[in] status Pointer to the ACA status fields structure
|
||||
* @return String indicating error severity: "Fatal", "Uncorrected, Non-fatal", "Corrected", or "UNKNOWN"
|
||||
*/
|
||||
static const char *get_error_severity(const aca_status_fields_t *status)
|
||||
{
|
||||
if (status->poison)
|
||||
return "Uncorrected, Non-fatal";
|
||||
if (status->pcc)
|
||||
return "Fatal";
|
||||
if (!status->pcc && status->uc && status->tcc)
|
||||
return "Fatal";
|
||||
if (!status->pcc && status->uc && !status->tcc)
|
||||
return "Uncorrected, Non-fatal";
|
||||
if (!status->pcc && !status->uc && !status->tcc && status->deferred)
|
||||
return "Uncorrected, Non-fatal";
|
||||
if (!status->pcc && !status->uc && !status->tcc && !status->deferred)
|
||||
return "Corrected";
|
||||
return "UNKNOWN";
|
||||
}
|
||||
|
||||
/**
 * @brief Maps a bank name and error type onto an error category
 *
 * - "umc" bank with one of the listed memory error types: "HBM Errors"
 * - "pcs_xgmi", "kpx_serdes", "kpx_wafl", or "psp" with error type "WAFL":
 *   "Off-Package Link Errors"
 * - anything else: "Device Internal Errors"
 *
 * @param[in] bank Pointer to the bank name
 * @param[in] error_type Pointer to the error type
 * @return The category string, or "UNKNOWN" if either argument is NULL
 */
static const char *get_error_category(const char *bank, const char *error_type)
{
    /* UMC error types that are reported in the HBM category. */
    static const char *const hbm_error_types[] = {
        "On-die ECC",
        "WriteDataPoisonErr",
        "AddressCommandParityErr",
        "WriteDataCrcErr",
        "EcsErr",
        "RdCrcErr",
        "End-to-end CRC",
    };
    size_t idx;

    if (bank == NULL || error_type == NULL)
        return "UNKNOWN";

    if (strcmp(bank, "umc") == 0)
    {
        for (idx = 0; idx < sizeof(hbm_error_types) / sizeof(hbm_error_types[0]); ++idx)
        {
            if (strcmp(error_type, hbm_error_types[idx]) == 0)
                return "HBM Errors";
        }
        /* Any other UMC error type is a device-internal error. */
        return "Device Internal Errors";
    }

    if (strcmp(bank, "pcs_xgmi") == 0 ||
        strcmp(bank, "kpx_serdes") == 0 ||
        strcmp(bank, "kpx_wafl") == 0)
        return "Off-Package Link Errors";

    if (strcmp(bank, "psp") == 0 && strcmp(error_type, "WAFL") == 0)
        return "Off-Package Link Errors";

    return "Device Internal Errors";
}
|
||||
|
||||
/**
 * @brief Determines the service error type from error attributes
 *
 * Rules are evaluated in order and the first match wins:
 *  1. error type "Bad Page Retirement Threshold" maps to itself
 *  2. error type "RdCrcErr" maps to "End-to-end CRC"
 *  3. HBM category with "Corrected" severity maps to "All"
 *  4. HBM category with "Fatal" severity, other than "On-die ECC" and
 *     "End-to-end CRC", maps to "All Others"
 *  5. Device-internal category with a known severity, other than HWA and WDT
 *     error types, maps to "All Others"
 *  6. Off-package link category maps to "XGMI" / "WAFL" for the "pcs_xgmi" /
 *     "kpx_wafl" banks
 *
 * @param[in] error_category Pointer to the error category string
 * @param[in] error_bank Pointer to the error bank string; may be NULL, in
 *                       which case the bank-based rule (6) never matches
 * @param[in] error_type Pointer to the error type string
 * @param[in] error_severity Pointer to the error severity string
 * @param[out] service_error_type Receives the resulting service error type
 *                                string; left untouched on failure
 * @return 0 on success, -1 if an argument is NULL/"UNKNOWN" or no rule matches
 */
static int get_service_error_type(const char *error_category, const char *error_bank, const char *error_type,
                                  const char *error_severity, const char **service_error_type)
{
    if (!error_category || !error_type || !error_severity || !service_error_type ||
        strcmp(error_category, "UNKNOWN") == 0 ||
        strcmp(error_type, "UNKNOWN") == 0 ||
        strcmp(error_severity, "UNKNOWN") == 0)
    {
        return -1;
    }

    if (strcmp(error_type, "Bad Page Retirement Threshold") == 0)
    {
        *service_error_type = "Bad Page Retirement Threshold";
        return 0;
    }

    if (strcmp(error_type, "RdCrcErr") == 0)
    {
        *service_error_type = "End-to-end CRC";
        return 0;
    }

    if (strcmp(error_category, "HBM Errors") == 0)
    {
        if (strcmp(error_severity, "Corrected") == 0)
        {
            *service_error_type = "All";
            return 0;
        }
        if (strcmp(error_severity, "Fatal") == 0 &&
            strcmp(error_type, "On-die ECC") != 0 &&
            strcmp(error_type, "End-to-end CRC") != 0)
        {
            *service_error_type = "All Others";
            return 0;
        }
        return -1;
    }

    if (strcmp(error_category, "Device Internal Errors") == 0)
    {
        if ((strcmp(error_severity, "Uncorrected, Non-fatal") == 0 ||
             strcmp(error_severity, "Corrected") == 0 ||
             strcmp(error_severity, "Fatal") == 0) &&
            strcmp(error_type, "Hardware Assertion (HWA)") != 0 &&
            strcmp(error_type, "Watchdog Timeout (WDT)") != 0)
        {
            *service_error_type = "All Others";
            return 0;
        }
        return -1;
    }

    /*
     * The bank is only consulted here. It is the one argument not validated
     * above, so guard it before handing it to strcmp().
     */
    if (strcmp(error_category, "Off-Package Link Errors") == 0 && error_bank != NULL)
    {
        if (strcmp(error_bank, "pcs_xgmi") == 0)
        {
            *service_error_type = "XGMI";
            return 0;
        }
        if (strcmp(error_bank, "kpx_wafl") == 0)
        {
            *service_error_type = "WAFL";
            return 0;
        }
    }

    return -1;
}
|
||||
|
||||
/**
|
||||
* @brief Extracts error information from the decoder and populates the info structure
|
||||
* @param[in] decoder Pointer to the ACA decoder structure
|
||||
* @param[out] info Pointer to the error info structure to be populated
|
||||
*/
|
||||
static void aca_decoder_get_error_info(const aca_decoder_t *decoder, aca_error_info_t *info)
|
||||
{
|
||||
const char *bank;
|
||||
const char *error_type;
|
||||
int result;
|
||||
|
||||
result = aca_decoder_get_bank(decoder, &bank);
|
||||
if (result < 0)
|
||||
{
|
||||
bank = "UNKNOWN";
|
||||
}
|
||||
info->bank_ref = bank;
|
||||
|
||||
// 0b1000 indicate error threshold has been exceeded, and is always fatal
|
||||
if (decoder->flags & 0x8)
|
||||
{
|
||||
info->severity_ref = "Fatal";
|
||||
}
|
||||
else
|
||||
{
|
||||
info->severity_ref = get_error_severity(&decoder->status);
|
||||
}
|
||||
|
||||
if (decoder->status.error_code_ext >= 0x3A && decoder->status.error_code_ext <= 0x3E)
|
||||
{
|
||||
uint32_t instance_id = decoder->ipid.instance_id_lo;
|
||||
uint32_t error_info = decoder->synd.error_information & 0xFF;
|
||||
|
||||
if ((instance_id == 0x36430400 || instance_id == 0x38430400 ||
|
||||
instance_id == 0x36430401 || instance_id == 0x38430401) &&
|
||||
find_error_in_table(xcd_error_table, NUM_XCD_ERRORS, error_info, &error_type) == 0)
|
||||
{
|
||||
info->error_type_ref = error_type;
|
||||
}
|
||||
else if ((instance_id == 0x3B30400 || instance_id == 0x3B30401) &&
|
||||
find_error_in_table(aid_error_table, NUM_AID_ERRORS, error_info, &error_type) == 0)
|
||||
{
|
||||
info->error_type_ref = error_type;
|
||||
}
|
||||
else
|
||||
{
|
||||
info->error_type_ref = "UNKNOWN";
|
||||
}
|
||||
}
|
||||
// 0b1000 indicate error threshold has been exceeded
|
||||
else if (decoder->flags & 0x8)
|
||||
{
|
||||
info->error_type_ref = "Bad Page Retirement Threshold";
|
||||
}
|
||||
else
|
||||
{
|
||||
if (find_error_type_by_bank(bank, decoder->status.error_code_ext, &error_type) == 0)
|
||||
{
|
||||
info->error_type_ref = error_type;
|
||||
}
|
||||
else
|
||||
{
|
||||
info->error_type_ref = "UNKNOWN";
|
||||
}
|
||||
}
|
||||
|
||||
// 0b1000 indicate error threshold has been exceeded, and is always a HBM error
|
||||
if (decoder->flags & 0x8)
|
||||
{
|
||||
info->category_ref = "HBM Errors";
|
||||
}
|
||||
else
|
||||
{
|
||||
info->category_ref = get_error_category(bank, info->error_type_ref);
|
||||
}
|
||||
|
||||
const char *service_error;
|
||||
if (get_service_error_type(info->category_ref, info->bank_ref, info->error_type_ref, info->severity_ref, &service_error) != 0)
|
||||
{
|
||||
service_error = info->error_type_ref;
|
||||
}
|
||||
|
||||
info->afid = get_error_id(info->category_ref, service_error, info->severity_ref);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Initializes an ACA decoder structure with raw register values
|
||||
* @param[out] decoder Pointer to the decoder structure to initialize
|
||||
* @param[in] hw_revision Hardware hw_revision number
|
||||
* @param[in] flags Decoder flags
|
||||
* @param[in] status_reg Raw status register value
|
||||
* @param[in] ipid_reg Raw IPID register value
|
||||
* @param[in] synd_reg Raw syndrome register value
|
||||
*/
|
||||
static void aca_decoder_init(aca_decoder_t *decoder, uint16_t hw_revision, uint32_t flags,
|
||||
uint64_t status_reg, uint64_t ipid_reg, uint64_t synd_reg)
|
||||
{
|
||||
memset(decoder, 0, sizeof(aca_decoder_t));
|
||||
|
||||
decoder->hw_revision = hw_revision;
|
||||
decoder->flags = flags;
|
||||
decoder->aca_status = status_reg;
|
||||
decoder->aca_ipid = ipid_reg;
|
||||
decoder->aca_synd = synd_reg;
|
||||
|
||||
aca_status_init(&decoder->status, status_reg);
|
||||
aca_ipid_init(&decoder->ipid, ipid_reg);
|
||||
aca_synd_init(&decoder->synd, synd_reg);
|
||||
}
|
||||
|
||||
aca_error_info_t aca_decode(const aca_raw_data_t *raw_data)
|
||||
{
|
||||
aca_decoder_t decoder = {0};
|
||||
aca_error_info_t info = {0};
|
||||
|
||||
aca_decoder_init(&decoder,
|
||||
raw_data->hw_revision,
|
||||
raw_data->flags,
|
||||
raw_data->aca_status,
|
||||
raw_data->aca_ipid,
|
||||
raw_data->aca_synd);
|
||||
|
||||
aca_decoder_get_error_info(&decoder, &info);
|
||||
return info;
|
||||
}
|
||||
@@ -0,0 +1,76 @@
|
||||
/**
|
||||
* @file aca_fields.c
|
||||
* @brief Implementation of ACA register field handling
|
||||
*
|
||||
* This file contains functions for initializing and reading various ACA register fields
|
||||
* including status, IPID, and syndrome registers. Each function
|
||||
* extracts specific bit fields from raw register values and populates corresponding
|
||||
* field structures.
|
||||
*/
|
||||
|
||||
#include "aca_fields.h"
|
||||
|
||||
/**
 * @brief Extracts a bit field from a value
 *
 * The mask is built as (1 << count) - 1. Shifting a 64-bit value by 64 is
 * undefined, so a full-width request (count >= 64) selects an all-ones mask
 * instead of performing that shift.
 *
 * @param[in] value The source value to extract bits from
 * @param[in] start Starting bit position
 * @param[in] count Number of bits to extract
 * @param[in] type The type to cast the extracted bits to
 * @return The extracted bits as a value of the specified type
 */
#define EXTRACT_BITS(value, start, count, type) \
    ((type)(((value) >> (start)) & (((count) >= 64) ? ~0ULL : ((1ULL << (count)) - 1))))
|
||||
|
||||
uint64_t aca_fields_read(const aca_fields_t *fields)
|
||||
{
|
||||
return fields->raw_value;
|
||||
}
|
||||
|
||||
/**
 * @brief Splits a raw ACA status register into its individual fields
 *
 * NOTE(review): reserv47 captures bits 47..51 and the next field, synd_v,
 * is bit 53, so bit 52 is not stored in any field - confirm this is intended.
 *
 * @param[out] fields Pointer to the status field structure to populate
 * @param[in] status_reg Raw status register value
 */
void aca_status_init(aca_status_fields_t *fields, uint64_t status_reg)
{
    fields->base.raw_value = status_reg;

    /* Bits 0..31 */
    fields->error_code      = EXTRACT_BITS(status_reg, 0, 16, uint16_t);
    fields->error_code_ext  = EXTRACT_BITS(status_reg, 16, 6, uint8_t);
    fields->reserv22        = EXTRACT_BITS(status_reg, 22, 2, uint8_t);
    fields->addr_lsb        = EXTRACT_BITS(status_reg, 24, 6, uint8_t);
    fields->reserv30        = EXTRACT_BITS(status_reg, 30, 2, uint8_t);

    /* Bits 32..46 */
    fields->err_core_id     = EXTRACT_BITS(status_reg, 32, 6, uint8_t);
    fields->reserv38        = EXTRACT_BITS(status_reg, 38, 2, uint8_t);
    fields->scrub           = EXTRACT_BITS(status_reg, 40, 1, uint8_t);
    fields->reserv41        = EXTRACT_BITS(status_reg, 41, 2, uint8_t);
    fields->poison          = EXTRACT_BITS(status_reg, 43, 1, uint8_t);
    fields->deferred        = EXTRACT_BITS(status_reg, 44, 1, uint8_t);
    fields->uecc            = EXTRACT_BITS(status_reg, 45, 1, uint8_t);
    fields->cecc            = EXTRACT_BITS(status_reg, 46, 1, uint8_t);

    /* Bits 47..55 */
    fields->reserv47        = EXTRACT_BITS(status_reg, 47, 5, uint8_t);
    fields->synd_v          = EXTRACT_BITS(status_reg, 53, 1, uint8_t);
    fields->reserv54        = EXTRACT_BITS(status_reg, 54, 1, uint8_t);
    fields->tcc             = EXTRACT_BITS(status_reg, 55, 1, uint8_t);

    /* Bits 56..63 */
    fields->err_core_id_val = EXTRACT_BITS(status_reg, 56, 1, uint8_t);
    fields->pcc             = EXTRACT_BITS(status_reg, 57, 1, uint8_t);
    fields->addr_v          = EXTRACT_BITS(status_reg, 58, 1, uint8_t);
    fields->misc_v          = EXTRACT_BITS(status_reg, 59, 1, uint8_t);
    fields->en              = EXTRACT_BITS(status_reg, 60, 1, uint8_t);
    fields->uc              = EXTRACT_BITS(status_reg, 61, 1, uint8_t);
    fields->overflow        = EXTRACT_BITS(status_reg, 62, 1, uint8_t);
    fields->val             = EXTRACT_BITS(status_reg, 63, 1, uint8_t);
}
|
||||
|
||||
/**
 * @brief Splits a raw ACA IPID register into its individual fields
 * @param[out] fields Pointer to the IPID field structure to populate
 * @param[in] ipid_reg Raw IPID register value
 */
void aca_ipid_init(aca_ipid_fields_t *fields, uint64_t ipid_reg)
{
    fields->base.raw_value = ipid_reg;

    /* Low half: bits 0..31 */
    fields->instance_id_lo = EXTRACT_BITS(ipid_reg, 0, 32, uint32_t);

    /* High half: bits 32..43, 44..47 and 48..63 */
    fields->hardware_id    = EXTRACT_BITS(ipid_reg, 32, 12, uint16_t);
    fields->instance_id_hi = EXTRACT_BITS(ipid_reg, 44, 4, uint8_t);
    fields->aca_type       = EXTRACT_BITS(ipid_reg, 48, 16, uint16_t);
}
|
||||
|
||||
/**
 * @brief Splits a raw ACA syndrome register into its individual fields
 * @param[out] fields Pointer to the syndrome field structure to populate
 * @param[in] synd_reg Raw syndrome register value
 */
void aca_synd_init(aca_synd_fields_t *fields, uint64_t synd_reg)
{
    fields->base.raw_value = synd_reg;

    /* Low half: bits 0..17, 18..23, 24..26 and 27..31 */
    fields->error_information = EXTRACT_BITS(synd_reg, 0, 18, uint32_t);
    fields->length            = EXTRACT_BITS(synd_reg, 18, 6, uint8_t);
    fields->error_priority    = EXTRACT_BITS(synd_reg, 24, 3, uint8_t);
    fields->reserved27        = EXTRACT_BITS(synd_reg, 27, 5, uint8_t);

    /* High half: bits 32..38 and 39..63 */
    fields->syndrome          = EXTRACT_BITS(synd_reg, 32, 7, uint16_t);
    fields->reserved39        = EXTRACT_BITS(synd_reg, 39, 25, uint32_t);
}
|
||||
@@ -0,0 +1,368 @@
|
||||
/**
|
||||
* @file aca_tables.c
|
||||
* @brief ACA Decode Tables Implementation
|
||||
*
|
||||
* This file contains lookup tables and helper functions for mapping ACA error codes
|
||||
* to human-readable strings. It includes:
|
||||
* - Bank mapping table for hardware IDs and ACA types
|
||||
* - Error type mapping table for bank-specific error codes
|
||||
* - GFX error mapping tables for XCD and AID errors
|
||||
* - Lookup functions to find bank names and error types
|
||||
*/
|
||||
|
||||
#include "aca_tables.h"
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
#include <string.h>
|
||||
|
||||
/**
|
||||
* @brief Mapping table for hardware IDs and ACA types to bank names
|
||||
*/
|
||||
const aca_bank_entry_t bank_table[] = {
|
||||
{0x2E, 0x02, "cs"},
|
||||
{0x2E, 0x01, "pie"},
|
||||
{0x96, 0x00, "umc"},
|
||||
{0xFF, 0x01, "psp"},
|
||||
{0x01, 0x01, "smu"},
|
||||
{0x18, 0x00, "nbio"},
|
||||
{0x46, 0x01, "pcie"},
|
||||
{0x05, 0x00, "pb"},
|
||||
{0x259, 0x00, "kpx_serdes"},
|
||||
{0x2E, 0x04, "mall"},
|
||||
{0x267, 0x00, "kpx_wafl"},
|
||||
{0x50, 0x00, "pcs_xgmi"},
|
||||
{0x6C, 0x00, "nbif"},
|
||||
{0x80, 0x00, "shub"},
|
||||
{0x170, 0x00, "usr_dp"},
|
||||
{0x180, 0x00, "usr_cp"}};
|
||||
|
||||
/**
|
||||
* @brief Mapping table for bank-specific error codes to error types
|
||||
*/
|
||||
const aca_error_type_t error_table[] = {
|
||||
{"cs", 0x0, "FTI_ILL_REQ"},
|
||||
{"cs", 0x1, "FTI_ADDR_VIOL"},
|
||||
{"cs", 0x2, "FTI_SEC_VIOL"},
|
||||
{"cs", 0x3, "FTI_ILL_RSP"},
|
||||
{"cs", 0x4, "FTI_RSP_NO_MTCH"},
|
||||
{"cs", 0x5, "FTI_PAR_ERR"},
|
||||
{"cs", 0x6, "SDP_PAR_ERR"},
|
||||
{"cs", 0x7, "ATM_PAR_ERR"},
|
||||
{"cs", 0x8, "SDP_RSP_NO_MTCH"},
|
||||
{"cs", 0x9, "SPF_PRT_ERR"},
|
||||
{"cs", 0xa, "SPF_ECC_ERR"},
|
||||
{"cs", 0xb, "SDP_UNEXP_RETRY"},
|
||||
{"cs", 0xc, "CNTR_OVFL"},
|
||||
{"cs", 0xd, "CNTR_UNFL"},
|
||||
{"cs", 0xe, "FTI_ND_ILL_REQ"},
|
||||
{"cs", 0xf, "FTI_ND_ADDR_VIOL"},
|
||||
{"cs", 0x10, "FTI_ND_SEC_VIOL"},
|
||||
{"cs", 0x11, "Hardware Assertion (HWA)"},
|
||||
{"cs", 0x12, "ST_PRT_ERR"},
|
||||
{"cs", 0x13, "ST_ECC_ERR"},
|
||||
{"cs", 0x14, "ST_TXN_ERR"},
|
||||
{"pie", 0x0, "Hardware Assertion (HWA)"},
|
||||
{"pie", 0x1, "CSW"},
|
||||
{"pie", 0x2, "GMI"},
|
||||
{"pie", 0x3, "FTI_DAT_STAT"},
|
||||
{"pie", 0x4, "DEF"},
|
||||
{"pie", 0x5, "Watchdog Timeout (WDT)"},
|
||||
{"pie", 0x6, "CNLI"},
|
||||
{"pie", 0x7, "RSLVFCI"},
|
||||
{"umc", 0x0, "On-die ECC"},
|
||||
{"umc", 0x1, "WriteDataPoisonErr"},
|
||||
{"umc", 0x2, "SdpParityErr"},
|
||||
{"umc", 0x4, "AddressCommandParityErr"},
|
||||
{"umc", 0x5, "WriteDataCrcErr"},
|
||||
{"umc", 0x6, "SramEccErr"},
|
||||
{"umc", 0x9, "EcsErr"},
|
||||
{"umc", 0xa, "ThrttlErr"},
|
||||
{"umc", 0xb, "RdCrcErr"},
|
||||
{"umc", 0xd, "MpFwErr"},
|
||||
{"umc", 0xe, "MpParErr"},
|
||||
{"umc", 0xf, "End-to-end CRC"},
|
||||
{"psp", 0x0, "Mp0HighSramError"},
|
||||
{"psp", 0x1, "Mp0LowSramError"},
|
||||
{"psp", 0x2, "Mp0IDataBank0Error"},
|
||||
{"psp", 0x3, "Mp0IDataBank1Error"},
|
||||
{"psp", 0x4, "Mp0ITagRam0Error"},
|
||||
{"psp", 0x5, "Mp0ITagRam1Error"},
|
||||
{"psp", 0x6, "Mp0DDataBank0Error"},
|
||||
{"psp", 0x7, "Mp0DDataBank1Error"},
|
||||
{"psp", 0x8, "Mp0DDataBank2Error"},
|
||||
{"psp", 0x9, "Mp0DDataBank3Error"},
|
||||
{"psp", 0xa, "Mp0DTagBank0Error"},
|
||||
{"psp", 0xb, "Mp0DTagBank1Error"},
|
||||
{"psp", 0xc, "Mp0DTagBank2Error"},
|
||||
{"psp", 0xd, "Mp0DTagBank3Error"},
|
||||
{"psp", 0xe, "Mp0DDirtyRamError"},
|
||||
{"psp", 0xf, "Mp0TlbBank0Error"},
|
||||
{"psp", 0x10, "Mp0TlbBank1Error"},
|
||||
{"psp", 0x11, "Mp0SHubIfRdBufError"},
|
||||
{"psp", 0x12, "PhyRamEccError"},
|
||||
{"psp", 0x3a, "PoisonDataConsumption"},
|
||||
{"psp", 0x3b, "SRAM_EDC"},
|
||||
{"psp", 0x3c, "SMN_Parity"},
|
||||
{"psp", 0x3d, "SMN_Timeout"},
|
||||
{"psp", 0x3f, "WAFL"},
|
||||
{"smu", 0x0, "Mp5HighSramError"},
|
||||
{"smu", 0x1, "Mp5LowSramError"},
|
||||
{"smu", 0x2, "Mp5DCacheAError"},
|
||||
{"smu", 0x3, "Mp5DCacheBError"},
|
||||
{"smu", 0x4, "Mp5DTagAError"},
|
||||
{"smu", 0x5, "Mp5DTagBError"},
|
||||
{"smu", 0x6, "Mp5ICacheAError"},
|
||||
{"smu", 0x7, "Mp5ICacheBError"},
|
||||
{"smu", 0x8, "Mp5ITagAError"},
|
||||
{"smu", 0x9, "Mp5ITagBError"},
|
||||
{"smu", 0xb, "PhyRamEccError"},
|
||||
{"smu", 0x3a, "GFX_IP_Correctable_Error"},
|
||||
{"smu", 0x3b, "GFX_IP_Fatal_Error"},
|
||||
{"smu", 0x3d, "Reserved"},
|
||||
{"smu", 0x3e, "GFX_IP_Poison_Error"},
|
||||
{"nbio", 0x0, "EccParityError"},
|
||||
{"nbio", 0x1, "PCIE_Sideband"},
|
||||
{"nbio", 0x2, "Ext_ErrEvent"},
|
||||
{"nbio", 0x3, "Egress_Poison"},
|
||||
{"nbio", 0x4, "IOHC_Internal_Poison"},
|
||||
{"nbio", 0x5, "Int_ErrEvent"},
|
||||
{"pcie", 0x0, "SDP_PARITY_ERR_LOG"},
|
||||
{"pb", 0x0, "EccError"},
|
||||
{"kpx_serdes", 0x0, "RAMECC"},
|
||||
{"kpx_serdes", 0x1, "ARCIns"},
|
||||
{"kpx_serdes", 0x2, "ARCData"},
|
||||
{"kpx_serdes", 0x3, "APB"},
|
||||
{"mall", 0x0, "CNTR_OVFL"},
|
||||
{"mall", 0x1, "CNTR_UNFL"},
|
||||
{"mall", 0x2, "CSDP_PAR_ERR"},
|
||||
{"mall", 0x3, "USDP_PAR_ERR"},
|
||||
{"mall", 0x4, "CACHE_TAG0_ERR"},
|
||||
{"mall", 0x5, "CACHE_TAG1_ERR"},
|
||||
{"mall", 0x6, "CACHE_DAT_ERR"},
|
||||
{"kpx_wafl", 0x0, "RAMECC"},
|
||||
{"kpx_wafl", 0x1, "ARCIns"},
|
||||
{"kpx_wafl", 0x2, "ARCData"},
|
||||
{"kpx_wafl", 0x3, "APB"},
|
||||
{"pcs_xgmi", 0x0, "DataLossErr"},
|
||||
{"pcs_xgmi", 0x1, "TrainingErr"},
|
||||
{"pcs_xgmi", 0x2, "FlowCtrlAckErr"},
|
||||
{"pcs_xgmi", 0x3, "RxFifoUnderflowErr"},
|
||||
{"pcs_xgmi", 0x4, "RxFifoOverflowErr"},
|
||||
{"pcs_xgmi", 0x5, "CRCErr"},
|
||||
{"pcs_xgmi", 0x6, "BERExceededErr"},
|
||||
{"pcs_xgmi", 0x7, "TxMetaDataErr_TxVcidDataErr"},
|
||||
{"pcs_xgmi", 0x8, "ReplayBufParityErr"},
|
||||
{"pcs_xgmi", 0x9, "DataParityErr"},
|
||||
{"pcs_xgmi", 0xa, "ReplayFifoOverflowErr"},
|
||||
{"pcs_xgmi", 0xb, "ReplaFifoUnderflowErr"},
|
||||
{"pcs_xgmi", 0xc, "ElasticFifoOverflowErr"},
|
||||
{"pcs_xgmi", 0xd, "DeskewErr"},
|
||||
{"pcs_xgmi", 0xe, "FlowCtrlCRCErr"},
|
||||
{"pcs_xgmi", 0xf, "DataStartupLimitErr"},
|
||||
{"pcs_xgmi", 0x10, "FCInitTimeoutErr"},
|
||||
{"pcs_xgmi", 0x11, "RecoveryTimeoutErr"},
|
||||
{"pcs_xgmi", 0x12, "ReadySerialTimeoutErr"},
|
||||
{"pcs_xgmi", 0x13, "ReadySerialAttemptErr"},
|
||||
{"pcs_xgmi", 0x14, "RecoveryAttemptErr"},
|
||||
{"pcs_xgmi", 0x15, "RecoveryRelockAttemptErr"},
|
||||
{"pcs_xgmi", 0x16, "ReplayAttemptErr"},
|
||||
{"pcs_xgmi", 0x17, "SyncHdrErr"},
|
||||
{"pcs_xgmi", 0x18, "TxReplayTimeoutErr"},
|
||||
{"pcs_xgmi", 0x19, "RxReplayTimeoutErr"},
|
||||
{"pcs_xgmi", 0x1a, "LinkSubTxTimeoutErr"},
|
||||
{"pcs_xgmi", 0x1b, "LinkSubRxTimeoutErr"},
|
||||
{"pcs_xgmi", 0x1c, "RxCMDPktErr"},
|
||||
{"nbif", 0x0, "TIMEOUT_ERR"},
|
||||
{"nbif", 0x1, "SRAM_ECC_ERR"},
|
||||
{"nbif", 0x2, "NTB_ERR_EVENT"},
|
||||
{"nbif", 0x3, "SDP_PARITY_ERR"},
|
||||
{"shub", 0x0, "TIMEOUT_ERR"},
|
||||
{"shub", 0x1, "SRAM_ECC_ERR"},
|
||||
{"shub", 0x2, "NTB_ERR_EVENT"},
|
||||
{"shub", 0x3, "SDP_PARITY_ERR"},
|
||||
{"usr_dp", 0x0, "MstCMDErr"},
|
||||
{"usr_dp", 0x1, "MstRxFIFOErr"},
|
||||
{"usr_dp", 0x2, "MstDeskewErr"},
|
||||
{"usr_dp", 0x3, "MstDetectTimeoutErr"},
|
||||
{"usr_dp", 0x4, "MstFlowControlErr"},
|
||||
{"usr_dp", 0x5, "MstDataValidFifoErr"},
|
||||
{"usr_dp", 0x6, "macLinkStateErr"},
|
||||
{"usr_dp", 0x7, "DeskewErr"},
|
||||
{"usr_dp", 0x8, "InitTimeoutErr"},
|
||||
{"usr_dp", 0x9, "InitAttemptErr"},
|
||||
{"usr_dp", 0xa, "RecoveryTimeoutErr"},
|
||||
{"usr_dp", 0xb, "RecoveryAttemptErr"},
|
||||
{"usr_dp", 0xc, "EyeTrainingTimeoutErr"},
|
||||
{"usr_dp", 0xd, "DataStartupLimitErr"},
|
||||
{"usr_dp", 0xe, "LS0ExitErr"},
|
||||
{"usr_dp", 0xf, "PLLpowerStateUpdateTimeoutErr"},
|
||||
{"usr_dp", 0x10, "RxFifoErr"},
|
||||
{"usr_dp", 0x11, "LcuErr"},
|
||||
{"usr_dp", 0x12, "convCECCErr"},
|
||||
{"usr_dp", 0x13, "convUECCErr"},
|
||||
{"usr_dp", 0x15, "rxDataLossErr"},
|
||||
{"usr_dp", 0x16, "ReplayCECCErr"},
|
||||
{"usr_dp", 0x17, "ReplayUECCErr"},
|
||||
{"usr_dp", 0x18, "CRCErr"},
|
||||
{"usr_dp", 0x19, "BERExceededErr"},
|
||||
{"usr_dp", 0x1a, "FCInitTimeoutErr"},
|
||||
{"usr_dp", 0x1b, "FCInitAttemptErr"},
|
||||
{"usr_dp", 0x1c, "ReplayTimoutErr"},
|
||||
{"usr_dp", 0x1d, "ReplayAttemptErr"},
|
||||
{"usr_dp", 0x1e, "ReplayUnderflowErr"},
|
||||
{"usr_dp", 0x1f, "ReplayOverflowErr"},
|
||||
{"usr_cp", 0x0, "PacketTypeErr"},
|
||||
{"usr_cp", 0x1, "RxFifoErr"},
|
||||
{"usr_cp", 0x2, "DeskewErr"},
|
||||
{"usr_cp", 0x3, "RxDetectTimeoutErr"},
|
||||
{"usr_cp", 0x4, "DataParityErr"},
|
||||
{"usr_cp", 0x5, "DataLossErr"},
|
||||
{"usr_cp", 0x6, "LcuErr"},
|
||||
{"usr_cp", 0x7, "HB1HandshakeTimeoutErr"},
|
||||
{"usr_cp", 0x8, "HB2HandshakeTimeoutErr"},
|
||||
{"usr_cp", 0x9, "ClkSleepRspTimeoutErr"},
|
||||
{"usr_cp", 0xa, "ClkWakeRspTimeoutErr"},
|
||||
{"usr_cp", 0xb, "resetAttackErr"},
|
||||
{"usr_cp", 0xc, "remoteLinkFatalErr"},
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Error GFX mapping table for XCD errors
|
||||
*/
|
||||
const aca_error_entry_t xcd_error_table[] = {
|
||||
{0x0, "GfxGcError"},
|
||||
{0x1, "GfxGcError"},
|
||||
{0x2, "GfxGcError"},
|
||||
{0x3, "GfxGcError"},
|
||||
{0x4, "GfxGcError"},
|
||||
{0x5, "GfxGcError"},
|
||||
{0x6, "GfxGcError"},
|
||||
{0x7, "GfxGcError"},
|
||||
{0x8, "GfxGcError"},
|
||||
{0x9, "GfxGcError"},
|
||||
{0xa, "GfxGcError"},
|
||||
{0xb, "GfxGcError"},
|
||||
{0xc, "GfxGcError"},
|
||||
{0xd, "GfxGcError"},
|
||||
{0xe, "GfxGcError"},
|
||||
{0xf, "GfxGcError"},
|
||||
{0x10, "GfxGcError"},
|
||||
{0x28, "Reserved"},
|
||||
{0x2a, "Reserved"}};
|
||||
|
||||
/**
|
||||
* @brief Error GFX mapping table for AID errors
|
||||
*/
|
||||
const aca_error_entry_t aid_error_table[] = {
|
||||
{0x0, "GfxGcError"},
|
||||
{0x1, "GfxGcError"},
|
||||
{0x2, "GfxGcError"},
|
||||
{0x3, "GfxGcError"},
|
||||
{0x4, "GfxGcError"},
|
||||
{0x5, "GfxMmhubError"},
|
||||
{0x6, "GfxMmhubError"},
|
||||
{0x7, "GfxMmhubError"},
|
||||
{0x8, "GfxMmhubError"},
|
||||
{0x9, "GfxMmhubError"},
|
||||
{0xa, "GfxMmhubError"},
|
||||
{0xb, "GfxMmhubError"},
|
||||
{0xc, "GfxMmhubError"},
|
||||
{0xd, "GfxGcError"},
|
||||
{0xe, "GfxVcnError"},
|
||||
{0xf, "GfxVcnError"},
|
||||
{0x10, "GfxVcnError"},
|
||||
{0x11, "GfxVcnError"},
|
||||
{0x12, "GfxVcnError"},
|
||||
{0x13, "GfxVcnError"},
|
||||
{0x14, "GfxVcnError"},
|
||||
{0x15, "GfxVcnError"},
|
||||
{0x16, "GfxVcnError"},
|
||||
{0x17, "GfxVcnError"},
|
||||
{0x18, "GfxVcnError"},
|
||||
{0x19, "GfxVcnError"},
|
||||
{0x1a, "GfxVcnError"},
|
||||
{0x1b, "GfxVcnError"},
|
||||
{0x1c, "GfxVcnError"},
|
||||
{0x1d, "GfxVcnError"},
|
||||
{0x1e, "GfxVcnError"},
|
||||
{0x1f, "GfxVcnError"},
|
||||
{0x20, "GfxVcnError"},
|
||||
{0x21, "GfxSdmaError"},
|
||||
{0x22, "GfxSdmaError"},
|
||||
{0x23, "GfxSdmaError"},
|
||||
{0x24, "GfxSdmaError"},
|
||||
{0x25, "GfxHdpError"},
|
||||
{0x26, "GfxAthubError"},
|
||||
{0x27, "GfxGcError"},
|
||||
{0x28, "Reserved"},
|
||||
{0x29, "Reserved"},
|
||||
{0x2a, "Reserved"},
|
||||
{0x2b, "Reserved"}};
|
||||
|
||||
const size_t NUM_BANKS = sizeof(bank_table) / sizeof(bank_table[0]);
|
||||
const size_t NUM_ERRORS = sizeof(error_table) / sizeof(error_table[0]);
|
||||
const size_t NUM_XCD_ERRORS = sizeof(xcd_error_table) / sizeof(xcd_error_table[0]);
|
||||
const size_t NUM_AID_ERRORS = sizeof(aid_error_table) / sizeof(aid_error_table[0]);
|
||||
|
||||
int find_bank_name(uint16_t hw_id, uint16_t aca_type, const char **bank_name)
|
||||
{
|
||||
if (!bank_name)
|
||||
{
|
||||
return -1;
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < NUM_BANKS; i++)
|
||||
{
|
||||
if (bank_table[i].hw_id == hw_id &&
|
||||
bank_table[i].aca_type == aca_type)
|
||||
{
|
||||
*bank_name = bank_table[i].name;
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
*bank_name = "UNKNOWN";
|
||||
return 1;
|
||||
}
|
||||
|
||||
int find_error_type_by_bank(const char *bank, uint32_t error_code, const char **error_type)
|
||||
{
|
||||
if (!bank || !error_type)
|
||||
{
|
||||
return -1;
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < NUM_ERRORS; i++)
|
||||
{
|
||||
if (error_code == error_table[i].error_code &&
|
||||
strcmp(bank, error_table[i].bank) == 0)
|
||||
{
|
||||
*error_type = error_table[i].type;
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
*error_type = "UNKNOWN";
|
||||
return 1;
|
||||
}
|
||||
|
||||
int find_error_in_table(const aca_error_entry_t *table, size_t table_size,
|
||||
uint32_t error_code, const char **error_type)
|
||||
{
|
||||
if (!table || !error_type)
|
||||
{
|
||||
return -1;
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < table_size; i++)
|
||||
{
|
||||
if (table[i].error_code == error_code)
|
||||
{
|
||||
*error_type = table[i].type;
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
*error_type = "UNKNOWN";
|
||||
return 1;
|
||||
}
|
||||
@@ -0,0 +1,59 @@
|
||||
#include "error_map.h"
|
||||
#include <string.h>
|
||||
|
||||
/*
 * Table mapping (category, type, severity) triples to numeric IDs.
 *
 * get_error_id() matches on the 2nd, 3rd and 5th initializers (accessed as
 * error_category, error_type and error_severity) and returns the 1st (id).
 * The 4th initializer is "CPER" on every row; entry 12 previously carried a
 * stray trailing space ("CPER "), which is normalized here.
 */
static const error_map_entry_t error_map[] = {
    {1, "Boot-Time Errors", "FW Load", "CPER", "Fail-to-init"},
    {2, "Boot-Time Errors", "HBM BIST Test", "CPER", "Fail-to-init"},
    {3, "Boot-Time Errors", "HBM Memory Test", "CPER", "Fail-to-init"},
    {4, "Boot-Time Errors", "HBM Training", "CPER", "Fail-to-init"},
    {5, "Boot-Time Errors", "Unhandled", "CPER", "Fail-to-init"},
    {6, "Boot-Time Errors", "Unknown", "CPER", "Fail-to-init"},
    {7, "Boot-Time Errors", "USR CP Link Training", "CPER", "Fail-to-init"},
    {8, "Boot-Time Errors", "USR DP Link Training", "CPER", "Fail-to-init"},
    {9, "Boot-Time Errors", "WAFL Link Training", "CPER", "Fail-to-init"},
    {10, "Boot-Time Errors", "XGMI Link Training", "CPER", "Fail-to-init"},
    {11, "Boot-Time Errors", "Boot Controller Data Abort", "CPER", "Fail-to-init"},
    {12, "Boot-Time Errors", "Boot Controller Generic", "CPER", "Fail-to-init"},
    {13, "Off-Package Link Errors", "PCIe AER", "CPER", "Corrected"},
    {14, "Off-Package Link Errors", "PCIe AER", "CPER", "Fatal"},
    {15, "Off-Package Link Errors", "WAFL", "CPER", "Corrected"},
    {16, "Off-Package Link Errors", "WAFL", "CPER", "Fatal"},
    {17, "Off-Package Link Errors", "XGMI", "CPER", "Corrected"},
    {18, "Off-Package Link Errors", "XGMI", "CPER", "Fatal"},
    {19, "HBM Errors", "Bad Page Retirement Threshold", "CPER", "Fatal"},
    {20, "HBM Errors", "On-die ECC", "CPER", "Fatal"},
    {21, "HBM Errors", "End-to-end CRC", "CPER", "Fatal"},
    {22, "HBM Errors", "On-die ECC", "CPER", "Uncorrected, Non-fatal"},
    {23, "HBM Errors", "End-to-end CRC", "CPER", "Uncorrected, Non-fatal"},
    {24, "HBM Errors", "All", "CPER", "Corrected"},
    {25, "HBM Errors", "All Others", "CPER", "Fatal"},
    {26, "Device Internal Errors", "Hardware Assertion (HWA)", "CPER", "Fatal"},
    {27, "Device Internal Errors", "Watchdog Timeout (WDT)", "CPER", "Fatal"},
    {28, "Device Internal Errors", "All Others", "CPER", "Uncorrected, Non-fatal"},
    {29, "Device Internal Errors", "All Others", "CPER", "Corrected"},
    {30, "Device Internal Errors", "All Others", "CPER", "Fatal"},
};

/* Number of rows in error_map, derived from the array size. */
static const size_t NUM_ERROR_ENTRIES = sizeof(error_map) / sizeof(error_map[0]);
|
||||
|
||||
int get_error_id(const char *error_category, const char *error_type, const char *error_severity)
|
||||
{
|
||||
if (!error_category || !error_type || !error_severity ||
|
||||
strcmp(error_category, "UNKNOWN") == 0 ||
|
||||
strcmp(error_type, "UNKNOWN") == 0 ||
|
||||
strcmp(error_severity, "UNKNOWN") == 0)
|
||||
{
|
||||
return -1;
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < NUM_ERROR_ENTRIES; i++)
|
||||
{
|
||||
if (strcmp(error_map[i].error_category, error_category) == 0 &&
|
||||
strcmp(error_map[i].error_type, error_type) == 0 &&
|
||||
strcmp(error_map[i].error_severity, error_severity) == 0)
|
||||
{
|
||||
return (int)error_map[i].id;
|
||||
}
|
||||
}
|
||||
|
||||
return -1;
|
||||
}
|
||||
@@ -46,6 +46,7 @@
|
||||
#include "amd_smi/amdsmi.h"
|
||||
#include "amd_smi/impl/fdinfo.h"
|
||||
#include "amd_smi/impl/amd_smi_common.h"
|
||||
#include "amd_smi/impl/amd_smi_cper.h"
|
||||
#include "amd_smi/impl/amd_smi_system.h"
|
||||
#include "amd_smi/impl/amd_smi_socket.h"
|
||||
#include "amd_smi/impl/amd_smi_gpu_device.h"
|
||||
@@ -3950,6 +3951,65 @@ amdsmi_get_gpu_cper_entries(
|
||||
cursor);
|
||||
}
|
||||
|
||||
/**
 * @brief Decode a single CPER record and return the AFIDs it maps to
 *
 * @param[in]     cper_buffer Buffer that starts with a CPER record header
 * @param[in]     buf_size    Number of valid bytes in cper_buffer
 * @param[out]    afids       Array receiving the decoded AFIDs
 * @param[in,out] num_afids   In: capacity of afids (must be non-zero).
 *                            Out: number of AFIDs produced by cper_decode(),
 *                            which may exceed the capacity passed in; only the
 *                            first "capacity" values are written to afids.
 *
 * @return AMDSMI_STATUS_SUCCESS when the record was decoded, or
 *         AMDSMI_STATUS_INVAL when an argument is null/zero, the buffer is
 *         too small to hold a CPER header or the advertised record length,
 *         or the "CPER" signature is missing.
 */
amdsmi_status_t amdsmi_get_afids_from_cper(
    char* cper_buffer, uint32_t buf_size, uint64_t* afids, uint32_t* num_afids) {

    AMDSMI_CHECK_INIT();

    std::ostringstream ss;
    ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] begin\n";
    LOG_DEBUG(ss);

    if(!cper_buffer) {
        ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] cper_buffer should be a valid memory address\n";
        LOG_ERROR(ss);
        return AMDSMI_STATUS_INVAL;
    }
    else if(!buf_size) {
        ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] buf_size should be greater than 0\n";
        LOG_ERROR(ss);
        return AMDSMI_STATUS_INVAL;
    }
    else if(!afids) {
        ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] afids should be a valid memory address\n";
        LOG_ERROR(ss);
        return AMDSMI_STATUS_INVAL;
    }
    else if(!num_afids) {
        ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] num_afids should be a valid memory address\n";
        LOG_ERROR(ss);
        return AMDSMI_STATUS_INVAL;
    }
    else if(!*num_afids) {
        ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] num_afids should be greater than 0\n";
        LOG_ERROR(ss);
        return AMDSMI_STATUS_INVAL;
    }

    // The header fields read below (record_length, signature) are only
    // inside the caller's buffer when the buffer holds a complete header.
    if(buf_size < sizeof(amdsmi_cper_hdr_t)) {
        ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] cper buffer size " << std::dec << buf_size << " is smaller than a cper header (" << sizeof(amdsmi_cper_hdr_t) << " bytes)\n";
        LOG_ERROR(ss);
        return AMDSMI_STATUS_INVAL;
    }

    const amdsmi_cper_hdr_t *cper = reinterpret_cast<const amdsmi_cper_hdr_t *>(cper_buffer);
    if(cper->record_length > buf_size) {
        ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] cper buffer size " << std::dec << buf_size << " is smaller than cper record length " << std::dec << cper->record_length << "\n";
        LOG_ERROR(ss);
        return AMDSMI_STATUS_INVAL;
    }
    else if(strncmp(cper->signature, "CPER", 4) != 0) {
        ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] cper buffer does not have the correct signature\n";
        LOG_ERROR(ss);
        return AMDSMI_STATUS_INVAL;
    }

    // Count every decoded AFID but store only as many as the caller has room
    // for; the total is reported back through num_afids.
    const uint32_t capacity = *num_afids;
    uint32_t produced = 0;
    for(int afid: cper_decode(cper)) {
        if(produced < capacity) {
            afids[produced] = afid;
        }
        ++produced;
    }
    *num_afids = produced;

    return AMDSMI_STATUS_SUCCESS;
}
|
||||
|
||||
amdsmi_status_t
|
||||
amdsmi_get_gpu_process_list(amdsmi_processor_handle processor_handle, uint32_t *max_processes, amdsmi_proc_info_t *list) {
|
||||
AMDSMI_CHECK_INIT();
|
||||
|
||||
@@ -0,0 +1,567 @@
|
||||
/*
|
||||
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
* THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <fcntl.h>
|
||||
#include <unistd.h>
|
||||
#include <sys/stat.h>
|
||||
|
||||
#include <memory>
|
||||
#include <cstring>
|
||||
#include <sstream>
|
||||
#include <vector>
|
||||
|
||||
#include "aca-decode/aca_decode.h"
|
||||
#include "amd_smi/impl/amd_smi_cper.h"
|
||||
#include "rocm_smi/rocm_smi_logger.h"
|
||||
|
||||
namespace {
|
||||
static std::vector<const amdsmi_cper_hdr_t *>
|
||||
amdsmi_get_gpu_cper_headers(const char *buffer, size_t buffer_sz) {
|
||||
|
||||
std::ostringstream ss;
|
||||
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__
|
||||
<< "[CPER] buffer_sz: " << buffer_sz;
|
||||
LOG_DEBUG(ss);
|
||||
|
||||
std::vector<const amdsmi_cper_hdr_t *> headers;
|
||||
if(!buffer) {
|
||||
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__
|
||||
<< "[CPER] buffer is null";
|
||||
LOG_ERROR(ss);
|
||||
return headers;
|
||||
}
|
||||
static constexpr char cper_signature[] = "CPER";
|
||||
static constexpr size_t cper_signature_size = sizeof(cper_signature) - 1;
|
||||
for(size_t data_idx = 0;
|
||||
buffer_sz >= cper_signature_size &&
|
||||
data_idx < buffer_sz - cper_signature_size;
|
||||
++data_idx) {
|
||||
|
||||
const amdsmi_cper_hdr_t *hdr = reinterpret_cast<const amdsmi_cper_hdr_t *>(
|
||||
&buffer[data_idx]);
|
||||
if(hdr->signature[0] != 'C' || hdr->signature[1] != 'P' ||
|
||||
hdr->signature[2] != 'E' || hdr->signature[3] != 'R' ) {
|
||||
continue;
|
||||
}
|
||||
if(hdr->signature_end != 0xFFFFFFFF) {
|
||||
continue;
|
||||
}
|
||||
if(hdr->record_length > buffer_sz) {
|
||||
continue;
|
||||
}
|
||||
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__
|
||||
<< "[CPER] add header at data_idx: " << data_idx
|
||||
<< ", sig: " << hdr->signature[0] << hdr->signature[1] << hdr->signature[2] << hdr->signature[3];
|
||||
LOG_DEBUG(ss);
|
||||
headers.emplace_back(hdr);
|
||||
}
|
||||
return headers;
|
||||
}
|
||||
|
||||
struct CperFileCtx {
|
||||
amdsmi_status_t status = AMDSMI_STATUS_FILE_ERROR;
|
||||
std::unique_ptr<char[]> buffer;
|
||||
long file_size = 0;
|
||||
};
|
||||
|
||||
/**
 * @brief Read a CPER file into a freshly allocated buffer
 *
 * The file is sized with stat(), must be a regular file, and is read with a
 * single read() call.
 *
 * The descriptor is closed on every path once it has been opened; the
 * previous version leaked it when read() failed or returned no data.
 *
 * @param filepath Path of the file to read
 *
 * @return A CperFileCtx whose status is
 *         - AMDSMI_STATUS_SUCCESS: buffer holds the data and file_size is the
 *           number of bytes actually read;
 *         - AMDSMI_STATUS_NOT_SUPPORTED: stat() failed;
 *         - AMDSMI_STATUS_FILE_ERROR: not a regular file, open() failed, or
 *           read() returned zero or an error.
 */
static auto amdsmi_read_cper_file(const std::string &filepath) -> CperFileCtx {

    std::ostringstream ss;

    CperFileCtx ctx;
    ctx.status = AMDSMI_STATUS_FILE_ERROR;
    ctx.file_size = 0;

    // NOTE(review): the messages built in the two stat() failure branches are
    // never passed to a LOG_* macro -- confirm whether that is intentional.
    struct stat file_stats;
    if (stat(filepath.c_str(), &file_stats) == 0) {
        if (!S_ISREG(file_stats.st_mode)) {
            ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] file is not a regular file: "
               << filepath << ", errno: " << errno << "): " << strerror(errno);
            return ctx;
        }
    } else {
        ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] file does not exist: "
           << filepath << ", errno: " << errno << "): " << strerror(errno);
        ctx.status = AMDSMI_STATUS_NOT_SUPPORTED;
        return ctx;
    }

    ctx.file_size = file_stats.st_size;
    ctx.buffer = std::make_unique<char[]>(ctx.file_size);
    int file = open(filepath.c_str(), O_RDONLY);
    if (file == -1) {
        ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] failed to open file: "
           << filepath << ", errno:()" << errno << "): " << strerror(errno);
        LOG_ERROR(ss);
        return ctx;
    }

    const long bytes_read = read(file, ctx.buffer.get(), ctx.file_size);
    // Release the descriptor before looking at the result so that the
    // failure path below cannot leak it.
    close(file);

    if (bytes_read <= 0) {
        ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__
           << "[CPER] failed to read complete file, read only "
           << bytes_read << " of " << ctx.file_size << " bytes";
        LOG_ERROR(ss);
        return ctx;
    }

    ctx.status = AMDSMI_STATUS_SUCCESS;
    ctx.file_size = bytes_read;
    return ctx;
}
|
||||
|
||||
#define GUID_INIT(a, b, c, d0, d1, d2, d3, d4, d5, d6, d7) \
|
||||
{ (a) & 0xff, ((a) >> 8) & 0xff, ((a) >> 16) & 0xff, ((a) >> 24) & 0xff, \
|
||||
(b) & 0xff, ((b) >> 8) & 0xff, \
|
||||
(c) & 0xff, ((c) >> 8) & 0xff, \
|
||||
(d0), (d1), (d2), (d3), (d4), (d5), (d6), (d7) };
|
||||
|
||||
/* Machine Check Exception */
|
||||
#define CPER_NOTIFY_MCE \
|
||||
GUID_INIT(0xE8F56FFE, 0x919C, 0x4cc5, 0xBA, 0x88, 0x65, 0xAB, \
|
||||
0xE1, 0x49, 0x13, 0xBB)
|
||||
#define CPER_NOTIFY_CMC \
|
||||
GUID_INIT(0x2DCE8BB1, 0xBDD7, 0x450e, 0xB9, 0xAD, 0x9C, 0xF4, \
|
||||
0xEB, 0xD4, 0xF8, 0x90)
|
||||
#define BOOT_TYPE \
|
||||
GUID_INIT(0x3D61A466, 0xAB40, 0x409a, 0xA6, 0x98,0xF3, 0x62, \
|
||||
0xD4, 0x64, 0xB3, 0x8F)
|
||||
#define AMD_OOB_CRASHDUMP \
|
||||
GUID_INIT(0x32AC0C78, 0x2623, 0x48F6, 0xB0, 0xD0, 0x73, 0x65, \
|
||||
0x72, 0x5F, 0xD6, 0xAE)
|
||||
#define AMD_GPU_NONSTANDARD_ERROR \
|
||||
GUID_INIT(0x32AC0C78, 0x2623, 0x48F6, 0x81, 0xA2, 0xAC, 0x69, \
|
||||
0x17, 0x80, 0x55, 0x1D)
|
||||
#define PROC_ERR_SECTION_TYPE \
|
||||
GUID_INIT(0xDC3EA0B0, 0xA144, 0x4797, 0xB9, 0x5B, 0x53, 0xFA, \
|
||||
0x24, 0x2B, 0x6E, 0x1D)
|
||||
|
||||
static amdsmi_cper_guid_t mce = CPER_NOTIFY_MCE;
|
||||
static amdsmi_cper_guid_t cmc = CPER_NOTIFY_CMC;
|
||||
static amdsmi_cper_guid_t bt = BOOT_TYPE;
|
||||
static amdsmi_cper_guid_t cr = AMD_OOB_CRASHDUMP;
|
||||
static amdsmi_cper_guid_t nonstd = AMD_GPU_NONSTANDARD_ERROR;
|
||||
static amdsmi_cper_guid_t proc_err = PROC_ERR_SECTION_TYPE;
|
||||
|
||||
static int cper_is_cr(const amdsmi_cper_guid_t *guid)
|
||||
{
|
||||
return !memcmp(&cr, guid, sizeof(amdsmi_cper_guid_t));
|
||||
}
|
||||
|
||||
static int cper_is_nonstd(const amdsmi_cper_guid_t *guid)
|
||||
{
|
||||
return !memcmp(&nonstd, guid, sizeof(amdsmi_cper_guid_t));
|
||||
}
|
||||
|
||||
static int cper_is_proc_err(const amdsmi_cper_guid_t *guid)
|
||||
{
|
||||
return !memcmp(&proc_err, guid, sizeof(amdsmi_cper_guid_t));
|
||||
}
|
||||
|
||||
static int cper_is_bt(const amdsmi_cper_guid_t *guid)
|
||||
{
|
||||
return !memcmp(&bt, guid, sizeof(amdsmi_cper_guid_t));
|
||||
}
|
||||
|
||||
static int cper_num_sec(const amdsmi_cper_hdr_t *hdr)
|
||||
{
|
||||
return hdr->sec_cnt;
|
||||
}
|
||||
|
||||
// Returns the address of the descriptor's sec_type GUID.
static const amdsmi_cper_guid_t *get_sec_desc_type(const struct cper_sec_desc *desc)
{
    const amdsmi_cper_guid_t *section_type = &desc->sec_type;
    return section_type;
}
|
||||
|
||||
// Returns the address of the header's notify_type GUID.
static const amdsmi_cper_guid_t *get_cper_type(const amdsmi_cper_hdr_t *hdr)
{
    const amdsmi_cper_guid_t *notify_type = &hdr->notify_type;
    return notify_type;
}
|
||||
|
||||
static void* cper_get_sec_desc_offset(const amdsmi_cper_hdr_t *hdr, int idx)
|
||||
{
|
||||
char *offset;
|
||||
|
||||
if (idx >= hdr->sec_cnt)
|
||||
return 0;
|
||||
|
||||
offset = (char *)hdr + sizeof(amdsmi_cper_hdr_t);
|
||||
offset += sizeof(struct cper_sec_desc) * idx;
|
||||
|
||||
return offset;
|
||||
}
|
||||
|
||||
static void* cper_get_sec_offset(const amdsmi_cper_hdr_t *hdr, int idx)
|
||||
{
|
||||
struct cper_sec_desc *tmp_desc;
|
||||
char *offset;
|
||||
|
||||
if (idx >= hdr->sec_cnt)
|
||||
return 0;
|
||||
|
||||
tmp_desc = reinterpret_cast<struct cper_sec_desc *>(
|
||||
(char *)hdr + sizeof(amdsmi_cper_hdr_t) + sizeof(struct cper_sec_desc) * idx
|
||||
);
|
||||
|
||||
return (char *)hdr + tmp_desc->sec_offset;
|
||||
}
|
||||
|
||||
static int cper_dump_sec_desc(const struct cper_sec_desc *desc)
|
||||
{
|
||||
std::ostringstream ss;
|
||||
|
||||
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS]\n~~~~SECTION DESCRIPTION~~~\n";
|
||||
|
||||
ss << "[SEC DESC] REV Major = 0x" << std::hex << static_cast<int>(desc->revision_major) << "\n";
|
||||
ss << "[SEC DESC] REV Minor = 0x" << std::hex << static_cast<int>(desc->revision_minor) << "\n";
|
||||
ss << "[SEC DESC] Length = 0x" << std::hex << desc->sec_length << "\n";
|
||||
ss << "[SEC DESC] Offset = 0x" << std::hex << desc->sec_offset << "\n";
|
||||
|
||||
ss << "[SEC DESC] fru_id = " << desc->fru_id << "\n";
|
||||
ss << "[SEC DESC] fru_text = " << desc->fru_text << "\n";
|
||||
|
||||
ss << std::dec << "\n";
|
||||
|
||||
if (cper_is_cr(&desc->sec_type))
|
||||
ss << "[SEC DESC] AMD CrashDump Section\n";
|
||||
else if (cper_is_nonstd(&desc->sec_type))
|
||||
ss << "[SEC DESC] AMD NonStandard Section\n";
|
||||
else if (cper_is_proc_err(&desc->sec_type))
|
||||
ss << "[SEC DESC] AMD Proc Error Section\n";
|
||||
else
|
||||
ss << "UNKNOWN ERROR TYPE!!\n";
|
||||
|
||||
ss << "~~~~SECTION DESCRIPTION~~~\n\n";
|
||||
|
||||
LOG_DEBUG(ss);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int aca_decode_fatal(const cper_sec_crashdump_data &data)
|
||||
{
|
||||
std::ostringstream ss;
|
||||
|
||||
const uint64_t *register_array = reinterpret_cast<const uint64_t *>(&data.dump.fatal_err);
|
||||
aca_raw_data_t raw_data;
|
||||
raw_data.aca_status = register_array[0];
|
||||
raw_data.aca_ipid = register_array[2];
|
||||
raw_data.aca_synd = register_array[3];
|
||||
|
||||
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] aca_status: 0x" << std::hex << raw_data.aca_status << "\n";
|
||||
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] aca_ipid: 0x" << std::hex << raw_data.aca_ipid << "\n";
|
||||
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] aca_synd: 0x" << std::hex << raw_data.aca_synd << "\n";
|
||||
|
||||
raw_data.flags = 0;
|
||||
raw_data.hw_revision = 1;
|
||||
|
||||
aca_error_info_t error_info = aca_decode(&raw_data);
|
||||
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] fatal error_info.afid: " << std::dec << error_info.afid << "\n";
|
||||
LOG_DEBUG(ss);
|
||||
|
||||
return error_info.afid;
|
||||
}
|
||||
|
||||
static int aca_decode_corrected_error(const uint32_t *reg_dump, size_t num_bytes) {
|
||||
|
||||
std::ostringstream ss;
|
||||
if(num_bytes != CPER_ACA_REG_COUNT * sizeof(uint32_t)) {
|
||||
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] Size of register array must be " << std::dec << (CPER_ACA_REG_COUNT * sizeof(uint32_t)) << " bytes\n";
|
||||
LOG_ERROR(ss);
|
||||
return AMDSMI_STATUS_INVAL;
|
||||
}
|
||||
const uint64_t *register_array = reinterpret_cast<const uint64_t *>(reg_dump);
|
||||
aca_raw_data_t raw_data;
|
||||
raw_data.aca_status = register_array[2];
|
||||
raw_data.aca_ipid = register_array[5];
|
||||
raw_data.aca_synd = register_array[6];
|
||||
|
||||
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] aca_status: 0x" << std::hex << raw_data.aca_status << "\n";
|
||||
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] aca_ipid: 0x" << std::hex << raw_data.aca_ipid << "\n";
|
||||
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] aca_synd: 0x" << std::hex << raw_data.aca_synd << "\n";
|
||||
|
||||
raw_data.flags = 0;
|
||||
raw_data.hw_revision = 1;
|
||||
|
||||
aca_error_info_t error_info = aca_decode(&raw_data);
|
||||
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] non-fatal error_info.afid: " << std::dec << error_info.afid << "\n";
|
||||
LOG_DEBUG(ss);
|
||||
|
||||
return error_info.afid;
|
||||
}
|
||||
|
||||
static int cper_dump_nonstd_err(const struct cper_sec_nonstd_err *nonstd_err)
|
||||
{
|
||||
std::ostringstream ss;
|
||||
|
||||
struct cper_sec_nonstd_err_body *body;
|
||||
char *offset;
|
||||
|
||||
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS]\n~~~~NON STANDARD SECTION~~~\n";
|
||||
|
||||
ss << "[NonSTD SEC] Err Info Count = 0x" << std::hex << nonstd_err->hdr.valid_bits.err_info_cnt << "\n";
|
||||
ss << "[NonSTD SEC] Err Context Count = 0x" << std::hex << nonstd_err->hdr.valid_bits.err_context_cnt << "\n";
|
||||
|
||||
if (nonstd_err->hdr.valid_bits.err_context_cnt != nonstd_err->hdr.valid_bits.err_context_cnt) {
|
||||
ss << "~~~~Malformed Non Standard Section!~~~~\n\n";
|
||||
goto exit;
|
||||
}
|
||||
|
||||
body = reinterpret_cast<struct cper_sec_nonstd_err_body *>(
|
||||
(char *)nonstd_err + sizeof(struct cper_sec_nonstd_err_hdr)
|
||||
);
|
||||
|
||||
ss << "[NonSTD SEC] Reg Ctx Type = 0x" << std::hex << body->err_ctx.reg_ctx_type << "\n";
|
||||
ss << "[NonSTD SEC] Reg Array Size = 0x" << std::hex << body->err_ctx.reg_arr_size << "\n";
|
||||
|
||||
for (int i = 0; i < CPER_ACA_REG_COUNT; i++) {
|
||||
ss << "[NonSTD SEC] reg_dump[" << std::dec << i << "] = 0x" << std::hex << body->err_ctx.reg_dump[i] << "\n";
|
||||
}
|
||||
|
||||
exit:
|
||||
ss << std::dec << "~~~~NON STANDARD SECTION~~~\n\n";
|
||||
|
||||
LOG_DEBUG(ss);
|
||||
|
||||
return aca_decode_corrected_error(body->err_ctx.reg_dump, sizeof(body->err_ctx.reg_dump));
|
||||
}
|
||||
|
||||
static int cper_dump_cr_fatal(const struct cper_sec_crashdump *crashdump)
|
||||
{
|
||||
std::ostringstream ss;
|
||||
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS]\n~~~~CRASH DUMP - FATAL~~~\n";
|
||||
|
||||
ss << "[Crash Dump - Fatal] status_lo = 0x" << std::hex << crashdump->data.dump.fatal_err.status_lo << "\n";
|
||||
ss << "[Crash Dump - Fatal] status_hi = 0x" << std::hex << crashdump->data.dump.fatal_err.status_hi << "\n";
|
||||
ss << "[Crash Dump - Fatal] addr_lo = 0x" << std::hex << crashdump->data.dump.fatal_err.addr_lo << "\n";
|
||||
ss << "[Crash Dump - Fatal] addr_hi = 0x" << std::hex << crashdump->data.dump.fatal_err.addr_hi << "\n";
|
||||
ss << "[Crash Dump - Fatal] ipid_lo = 0x" << std::hex << crashdump->data.dump.fatal_err.ipid_lo << "\n";
|
||||
ss << "[Crash Dump - Fatal] ipid_hi = 0x" << std::hex << crashdump->data.dump.fatal_err.ipid_hi << "\n";
|
||||
ss << "[Crash Dump - Fatal] synd_lo = 0x" << std::hex << crashdump->data.dump.fatal_err.synd_lo << "\n";
|
||||
ss << "[Crash Dump - Fatal] synd_hi = 0x" << std::hex << crashdump->data.dump.fatal_err.synd_hi << "\n";
|
||||
|
||||
ss << std::dec << "~~~~CRASH DUMP - FATAL~~~\n\n";
|
||||
|
||||
LOG_DEBUG(ss);
|
||||
|
||||
return aca_decode_fatal(crashdump->data);
|
||||
}
|
||||
|
||||
static int cper_dump_cr_boot(const struct cper_sec_crashdump *crashdump)
|
||||
{
|
||||
std::ostringstream ss;
|
||||
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS]\n~~~~CRASH DUMP - BOOT TIME~~~\n";
|
||||
|
||||
for (int i = 0; i < CPER_MAX_OAM_COUNT; i++) {
|
||||
ss << "[Crash Dump - Boot] bootmsg[" << std::dec << i << "] = 0x" << std::hex << crashdump->data.dump.boot_err.msg[i] << "\n";
|
||||
}
|
||||
|
||||
ss << "~~~~CRASH DUMP - BOOT TIME~~~\n\n";
|
||||
LOG_DEBUG(ss);
|
||||
|
||||
return aca_decode_fatal(crashdump->data);
|
||||
}
|
||||
|
||||
} //namespace
|
||||
|
||||
/**
 * Reads the CPER records stored in a ring file and copies those whose severity
 * is selected by @p severity_mask into a caller-supplied buffer.
 *
 * Records are copied back to back into @p cper_data, and cper_hdrs[k] is set
 * to the start of the k-th copied record inside @p cper_data.
 *
 * @param amdgpu_ring_cper_file path of the CPER file to read; must not be null.
 * @param severity_mask bit mask; a record is accepted when bit
 *        (1 << error_severity) is set in it.
 * @param cper_data destination buffer for the raw records.
 * @param buf_size in: capacity of @p cper_data in bytes (must be non-zero);
 *        out: number of bytes written (0 on error).
 * @param cper_hdrs array that receives one pointer per copied record.
 * @param entry_count in: capacity of @p cper_hdrs (must be non-zero);
 *        out: number of records copied (0 on error).
 * @param cursor in: index, counted over severity-matching records, of the
 *        first record to copy; out: advanced past every record copied.
 *
 * @return AMDSMI_STATUS_SUCCESS when every remaining matching record was
 *         copied; AMDSMI_STATUS_MORE_DATA when the buffer or the header array
 *         filled up after at least one record was copied;
 *         AMDSMI_STATUS_OUT_OF_RESOURCES when an argument is invalid or
 *         nothing could be copied; otherwise the status reported by
 *         amdsmi_read_cper_file().
 */
amdsmi_status_t amdsmi_get_gpu_cper_entries_by_path(
    const char *amdgpu_ring_cper_file,
    uint32_t severity_mask,
    char *cper_data,
    uint64_t *buf_size,
    amdsmi_cper_hdr_t **cper_hdrs,
    uint64_t *entry_count,
    uint64_t *cursor) {

  std::ostringstream ss;

  // The path is streamed into the log and converted to std::string below;
  // both are undefined for a null pointer, so reject it before any use.
  if (!amdgpu_ring_cper_file) {
    ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] amdgpu_ring_cper_file should be a valid memory address";
    LOG_ERROR(ss);
    if (entry_count) { *entry_count = 0; }
    if (buf_size) { *buf_size = 0; }
    return AMDSMI_STATUS_OUT_OF_RESOURCES;
  }

  ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] begin\n"
     << ", amdgpu_ring_cper_file: " << amdgpu_ring_cper_file
     << ", severity_mask: " << severity_mask;
  LOG_DEBUG(ss);

  // Argument validation. Each branch zeroes whichever out-parameters are
  // already known to be dereferenceable at that point.
  if (!cper_data) {
    ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] cper_data should be a valid memory address\n";
    LOG_ERROR(ss);
    if (entry_count) { *entry_count = 0; }
    if (buf_size) { *buf_size = 0; }
    return AMDSMI_STATUS_OUT_OF_RESOURCES;
  }
  else if (!buf_size) {
    ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] buf_size should be a valid memory address";
    LOG_ERROR(ss);
    if (entry_count) { *entry_count = 0; }
    return AMDSMI_STATUS_OUT_OF_RESOURCES;
  }
  else if (!entry_count) {
    ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] entry_count should be a valid memory address";
    LOG_ERROR(ss);
    *buf_size = 0;
    return AMDSMI_STATUS_OUT_OF_RESOURCES;
  }
  else if (!*buf_size) {
    ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] buf_size should be greater than zero";
    LOG_ERROR(ss);
    *entry_count = 0;
    return AMDSMI_STATUS_OUT_OF_RESOURCES;
  }
  else if (!*entry_count) {
    ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] entry_count should be greater than 0";
    LOG_ERROR(ss);
    *buf_size = 0;
    return AMDSMI_STATUS_OUT_OF_RESOURCES;
  }
  else if (!cper_hdrs) {
    ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] cper_hdrs should be a valid memory address";
    LOG_ERROR(ss);
    *entry_count = 0;
    *buf_size = 0;
    return AMDSMI_STATUS_OUT_OF_RESOURCES;
  }
  else if (!cursor) {
    ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] cursor should be a valid memory address";
    LOG_ERROR(ss);
    *entry_count = 0;
    *buf_size = 0;
    return AMDSMI_STATUS_OUT_OF_RESOURCES;
  }

  auto ctx = amdsmi_read_cper_file(amdgpu_ring_cper_file);
  if (ctx.status != AMDSMI_STATUS_SUCCESS) {
    *entry_count = 0;
    *buf_size = 0;
    return ctx.status;
  }

  auto headers = amdsmi_get_gpu_cper_headers(ctx.buffer.get(), ctx.file_size);
  ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] num headers: " << headers.size();
  LOG_DEBUG(ss);

  uint64_t data_idx = 0;          // bytes written to cper_data so far
  uint64_t header_idx = 0;        // index among severity-matching headers
  size_t num_headers_copied = 0;  // entries written to cper_hdrs so far

  for (const amdsmi_cper_hdr_t *header : headers) {
    // The severity comes from file contents. Shifting by 32 or more is
    // undefined, so such values map to an empty bit and are rejected.
    const auto severity = static_cast<uint32_t>(header->error_severity);
    const uint32_t severity_bit = (severity < 32U) ? (uint32_t{1} << severity) : 0U;
    const bool accepted =
        (severity_bit != 0U) && ((severity_bit & severity_mask) == severity_bit);

    if (!accepted) {
      ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] cper header rejected with severity: 0x"
         << std::hex << severity_bit << ", given severity_mask: 0x"
         << std::hex << severity_mask << ", record_length:"
         << std::dec << header->record_length;
      LOG_DEBUG(ss);
      continue;
    }
    else {
      ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] cper header accepted with severity: 0x"
         << std::hex << severity_bit << ", given severity_mask: 0x"
         << std::hex << severity_mask << ", record_length:"
         << std::dec << header->record_length;
      LOG_DEBUG(ss);
    }

    // Skip records that lie before the cursor *before* checking capacity:
    // they are not copied, so their size must not be able to fail the call.
    if (*cursor != header_idx) {
      ++header_idx;
      continue;
    }

    if ((*buf_size - data_idx) < header->record_length) {
      ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] buffer filled up without copying all cper entries, buf_size: " << std::dec << *buf_size;
      LOG_ERROR(ss);
      *entry_count = num_headers_copied;
      *buf_size = data_idx;
      return (data_idx == 0) ?
          AMDSMI_STATUS_OUT_OF_RESOURCES :
          AMDSMI_STATUS_MORE_DATA;
    }

    if (num_headers_copied == *entry_count) {
      ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] cper_hdrs filled up before finished with copying all header pointers, entry_count: " << std::dec << *entry_count;
      LOG_ERROR(ss);
      *entry_count = num_headers_copied;
      *buf_size = data_idx;
      return (data_idx == 0) ?
          AMDSMI_STATUS_OUT_OF_RESOURCES :
          AMDSMI_STATUS_MORE_DATA;
    }

    // Publish the pointer to the copy, advance the cursor, then copy the record.
    cper_hdrs[num_headers_copied] = reinterpret_cast<amdsmi_cper_hdr_t*>(&cper_data[data_idx]);
    ++num_headers_copied;
    *cursor = ++header_idx;
    std::memcpy(
        &cper_data[data_idx],
        reinterpret_cast<const char*>(header),
        header->record_length);
    data_idx += header->record_length;
  }

  *entry_count = num_headers_copied;
  *buf_size = data_idx;

  // Log the resulting values (not the pointer addresses).
  ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__
     << "[CPER] *entry_count: " << *entry_count
     << ", *cursor: " << *cursor
     << ", *buf_size: " << *buf_size;

  LOG_DEBUG(ss);
  return AMDSMI_STATUS_SUCCESS;
}
|
||||
|
||||
std::vector<int> cper_decode(const amdsmi_cper_hdr_t *cper) {
|
||||
|
||||
std::vector<int> afids;
|
||||
std::ostringstream ss;
|
||||
|
||||
for (int i = 0; i < cper_num_sec(cper); i ++) {
|
||||
void *sec_desc_offset = cper_get_sec_desc_offset(cper, i);
|
||||
void *sec_offset = cper_get_sec_offset(cper, i);
|
||||
const amdsmi_cper_guid_t *sec_guid = get_sec_desc_type(static_cast<struct cper_sec_desc *>(sec_desc_offset));
|
||||
const amdsmi_cper_guid_t *cper_guid = get_cper_type(cper);
|
||||
|
||||
cper_dump_sec_desc(static_cast<struct cper_sec_desc *>(sec_desc_offset));
|
||||
|
||||
if (cper_is_cr(sec_guid)) {
|
||||
if (cper_is_bt(cper_guid)) {
|
||||
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] decoding boot crash dump\n";
|
||||
LOG_DEBUG(ss);
|
||||
afids.emplace_back(cper_dump_cr_boot(static_cast<struct cper_sec_crashdump *>(sec_offset)));
|
||||
}
|
||||
else {
|
||||
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] decoding crash dump\n";
|
||||
LOG_DEBUG(ss);
|
||||
afids.emplace_back(cper_dump_cr_fatal(static_cast<struct cper_sec_crashdump *>(sec_offset)));
|
||||
}
|
||||
}
|
||||
else if (cper_is_nonstd(sec_guid)) {
|
||||
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] decoding non-standard error\n";
|
||||
LOG_DEBUG(ss);
|
||||
afids.emplace_back(cper_dump_nonstd_err(static_cast<struct cper_sec_nonstd_err *>(sec_offset)));
|
||||
}
|
||||
else if (cper_is_proc_err(sec_guid)) {
|
||||
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] decoding proc error section type\n";
|
||||
LOG_DEBUG(ss);
|
||||
afids.emplace_back(cper_dump_nonstd_err(static_cast<struct cper_sec_nonstd_err *>(sec_offset)));
|
||||
}
|
||||
else {
|
||||
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] Unknown error type!!\n";
|
||||
for(int i = 0; i < sizeof(sec_guid->b); ++i) {
|
||||
ss << std::hex << static_cast<int>(sec_guid->b[i]) << ":";
|
||||
}
|
||||
ss << "\n";
|
||||
LOG_ERROR(ss);
|
||||
}
|
||||
}
|
||||
|
||||
return afids;
|
||||
}
|
||||
|
||||
@@ -1031,142 +1031,6 @@ static auto amdsmi_read_cper_file(const std::string &filepath) -> CperFileCtx {
|
||||
ctx.file_size = bytes_read;
|
||||
return ctx;
|
||||
}
|
||||
|
||||
/**
 * Reads the CPER records stored in a ring file and copies those whose severity
 * is selected by @p severity_mask into a caller-supplied buffer.
 *
 * Records are copied back to back into @p cper_data, and cper_hdrs[k] is set
 * to the start of the k-th copied record inside @p cper_data.
 *
 * @param amdgpu_ring_cper_file path of the CPER file to read; must not be null.
 * @param severity_mask bit mask; a record is accepted when bit
 *        (1 << error_severity) is set in it.
 * @param cper_data destination buffer for the raw records.
 * @param buf_size in: capacity of @p cper_data in bytes (must be non-zero);
 *        out: number of bytes written (0 on error).
 * @param cper_hdrs array that receives one pointer per copied record.
 * @param entry_count in: capacity of @p cper_hdrs (must be non-zero);
 *        out: number of records copied (0 on error).
 * @param cursor in: index, counted over severity-matching records, of the
 *        first record to copy; out: advanced past every record copied.
 *
 * @return AMDSMI_STATUS_SUCCESS when every remaining matching record was
 *         copied; AMDSMI_STATUS_MORE_DATA when the buffer or the header array
 *         filled up after at least one record was copied;
 *         AMDSMI_STATUS_OUT_OF_RESOURCES when an argument is invalid or
 *         nothing could be copied; otherwise the status reported by
 *         amdsmi_read_cper_file().
 */
amdsmi_status_t amdsmi_get_gpu_cper_entries_by_path(
    const char *amdgpu_ring_cper_file,
    uint32_t severity_mask,
    char *cper_data,
    uint64_t *buf_size,
    amdsmi_cper_hdr_t **cper_hdrs,
    uint64_t *entry_count,
    uint64_t *cursor) {

  std::ostringstream ss;

  // The path is streamed into the log and converted to std::string below;
  // both are undefined for a null pointer, so reject it before any use.
  if (!amdgpu_ring_cper_file) {
    ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] amdgpu_ring_cper_file should be a valid memory address";
    LOG_ERROR(ss);
    if (entry_count) { *entry_count = 0; }
    if (buf_size) { *buf_size = 0; }
    return AMDSMI_STATUS_OUT_OF_RESOURCES;
  }

  ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] begin\n"
     << ", amdgpu_ring_cper_file: " << amdgpu_ring_cper_file
     << ", severity_mask: " << severity_mask;
  LOG_DEBUG(ss);

  // Argument validation. Each branch zeroes whichever out-parameters are
  // already known to be dereferenceable at that point.
  if (!cper_data) {
    ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] cper_data should be a valid memory address\n";
    LOG_ERROR(ss);
    if (entry_count) { *entry_count = 0; }
    if (buf_size) { *buf_size = 0; }
    return AMDSMI_STATUS_OUT_OF_RESOURCES;
  }
  else if (!buf_size) {
    ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] buf_size should be a valid memory address";
    LOG_ERROR(ss);
    if (entry_count) { *entry_count = 0; }
    return AMDSMI_STATUS_OUT_OF_RESOURCES;
  }
  else if (!entry_count) {
    ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] entry_count should be a valid memory address";
    LOG_ERROR(ss);
    *buf_size = 0;
    return AMDSMI_STATUS_OUT_OF_RESOURCES;
  }
  else if (!*buf_size) {
    ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] buf_size should be greater than zero";
    LOG_ERROR(ss);
    *entry_count = 0;
    return AMDSMI_STATUS_OUT_OF_RESOURCES;
  }
  else if (!*entry_count) {
    ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] entry_count should be greater than 0";
    LOG_ERROR(ss);
    *buf_size = 0;
    return AMDSMI_STATUS_OUT_OF_RESOURCES;
  }
  else if (!cper_hdrs) {
    ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] cper_hdrs should be a valid memory address";
    LOG_ERROR(ss);
    *entry_count = 0;
    *buf_size = 0;
    return AMDSMI_STATUS_OUT_OF_RESOURCES;
  }
  else if (!cursor) {
    ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] cursor should be a valid memory address";
    LOG_ERROR(ss);
    *entry_count = 0;
    *buf_size = 0;
    return AMDSMI_STATUS_OUT_OF_RESOURCES;
  }

  auto ctx = amdsmi_read_cper_file(amdgpu_ring_cper_file);
  if (ctx.status != AMDSMI_STATUS_SUCCESS) {
    *entry_count = 0;
    *buf_size = 0;
    return ctx.status;
  }

  auto headers = amdsmi_get_gpu_cper_headers(ctx.buffer.get(), ctx.file_size);
  ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] num headers: " << headers.size();
  LOG_DEBUG(ss);

  uint64_t data_idx = 0;          // bytes written to cper_data so far
  uint64_t header_idx = 0;        // index among severity-matching headers
  size_t num_headers_copied = 0;  // entries written to cper_hdrs so far

  for (const amdsmi_cper_hdr_t *header : headers) {
    // The severity comes from file contents. Shifting by 32 or more is
    // undefined, so such values map to an empty bit and are rejected.
    const auto severity = static_cast<uint32_t>(header->error_severity);
    const uint32_t severity_bit = (severity < 32U) ? (uint32_t{1} << severity) : 0U;
    const bool accepted =
        (severity_bit != 0U) && ((severity_bit & severity_mask) == severity_bit);

    if (!accepted) {
      ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] cper header rejected with severity: 0x"
         << std::hex << severity_bit << ", given severity_mask: 0x"
         << std::hex << severity_mask << ", record_length:"
         << std::dec << header->record_length;
      LOG_DEBUG(ss);
      continue;
    }
    else {
      ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] cper header accepted with severity: 0x"
         << std::hex << severity_bit << ", given severity_mask: 0x"
         << std::hex << severity_mask << ", record_length:"
         << std::dec << header->record_length;
      LOG_DEBUG(ss);
    }

    // Skip records that lie before the cursor *before* checking capacity:
    // they are not copied, so their size must not be able to fail the call.
    if (*cursor != header_idx) {
      ++header_idx;
      continue;
    }

    if ((*buf_size - data_idx) < header->record_length) {
      ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] buffer filled up without copying all cper entries, buf_size: " << std::dec << *buf_size;
      LOG_ERROR(ss);
      *entry_count = num_headers_copied;
      *buf_size = data_idx;
      return (data_idx == 0) ?
          AMDSMI_STATUS_OUT_OF_RESOURCES :
          AMDSMI_STATUS_MORE_DATA;
    }

    if (num_headers_copied == *entry_count) {
      ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] cper_hdrs filled up before finished with copying all header pointers, entry_count: " << std::dec << *entry_count;
      LOG_ERROR(ss);
      *entry_count = num_headers_copied;
      *buf_size = data_idx;
      return (data_idx == 0) ?
          AMDSMI_STATUS_OUT_OF_RESOURCES :
          AMDSMI_STATUS_MORE_DATA;
    }

    // Publish the pointer to the copy, advance the cursor, then copy the record.
    cper_hdrs[num_headers_copied] = reinterpret_cast<amdsmi_cper_hdr_t*>(&cper_data[data_idx]);
    ++num_headers_copied;
    *cursor = ++header_idx;
    std::memcpy(
        &cper_data[data_idx],
        reinterpret_cast<const char*>(header),
        header->record_length);
    data_idx += header->record_length;
  }

  *entry_count = num_headers_copied;
  *buf_size = data_idx;

  // Log the resulting values (not the pointer addresses).
  ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__
     << "[CPER] *entry_count: " << *entry_count
     << ", *cursor: " << *cursor
     << ", *buf_size: " << *buf_size;

  LOG_DEBUG(ss);
  return AMDSMI_STATUS_SUCCESS;
}
|
||||
|
||||
void amdsmi_wait_for_user_input(void) {
|
||||
for (;;) {
|
||||
std::cout << "\n\t**Press any key to continue**" << std::endl;
|
||||
|
||||
Ссылка в новой задаче
Block a user