[SWDEV-553168] Add support for decoding out of band boot time CPER files.

Change-Id: Ic4278698f9c5b5ae56bd56fd43150c0653c1ef05


[ROCm/amdsmi commit: c6698c9100]
Este commit está contenido en:
Oosman Saeed
2025-09-12 21:13:01 +00:00
cometido por Arif, Maisam
padre 25a6ac3585
commit 2214445327
Se han modificado 28 ficheros con 2438 adiciones y 649 borrados
@@ -1,119 +0,0 @@
/*
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
/**
* @file aca_constants.h
* @brief Shared constants for ACA error decoding
*
* This file contains string constants and numerical constants that are used
* across multiple source files to improve maintainability and prevent typos.
*/
#ifndef ACA_CONSTANTS_H
#define ACA_CONSTANTS_H
/* Error severity constants */
#define ACA_SEVERITY_UNKNOWN "UNKNOWN"
#define ACA_SEVERITY_FATAL "Fatal"
#define ACA_SEVERITY_CORRECTED "Corrected"
#define ACA_SEVERITY_UNCORRECTED_NON_FATAL "Uncorrected, Non-fatal"
#define ACA_SEVERITY_FAIL_TO_INIT "Fail-to-init"
#define ACA_SEVERITY_ALL_CAPS "ALL"
/* Error category constants */
#define ACA_CATEGORY_HBM_ERRORS "HBM Errors"
#define ACA_CATEGORY_DEVICE_INTERNAL_ERRORS "Device Internal Errors"
#define ACA_CATEGORY_OFF_PACKAGE_LINK_ERRORS "Off-Package Link Errors"
#define ACA_CATEGORY_BOOT_TIME_ERRORS "Boot-Time Errors"
#define ACA_CATEGORY_CPER_FORMAT "CPER Format"
#define ACA_CATEGORY_UNIDENTIFIED_ERRORS "Unidentified Errors"
/* Common error type constants */
#define ACA_ERROR_TYPE_ALL_OTHERS "All Others"
#define ACA_ERROR_TYPE_ALL "All"
#define ACA_ERROR_TYPE_DECODE_INAPPLICABLE "Decode Inapplicable"
#define ACA_ERROR_TYPE_BAD_PAGE_RETIREMENT_THRESHOLD "Bad Page Retirement Threshold"
#define ACA_ERROR_TYPE_HARDWARE_ASSERTION "Hardware Assertion (HWA)"
#define ACA_ERROR_TYPE_WATCHDOG_TIMEOUT "Watchdog Timeout (WDT)"
#define ACA_ERROR_TYPE_ON_DIE_ECC "On-die ECC"
#define ACA_ERROR_TYPE_END_TO_END_CRC "End-to-end CRC"
#define ACA_ERROR_TYPE_WAFL "WAFL"
#define ACA_ERROR_TYPE_XGMI "XGMI"
/* Boot-time error type constants */
#define ACA_ERROR_TYPE_FW_LOAD "FW Load"
#define ACA_ERROR_TYPE_HBM_BIST_TEST "HBM BIST Test"
#define ACA_ERROR_TYPE_HBM_MEMORY_TEST "HBM Memory Test"
#define ACA_ERROR_TYPE_HBM_TRAINING "HBM Training"
#define ACA_ERROR_TYPE_UNHANDLED "Unhandled"
#define ACA_ERROR_TYPE_UNKNOWN_ERROR "Unknown"
#define ACA_ERROR_TYPE_USR_CP_LINK_TRAINING "USR CP Link Training"
#define ACA_ERROR_TYPE_USR_DP_LINK_TRAINING "USR DP Link Training"
#define ACA_ERROR_TYPE_WAFL_LINK_TRAINING "WAFL Link Training"
#define ACA_ERROR_TYPE_XGMI_LINK_TRAINING "XGMI Link Training"
#define ACA_ERROR_TYPE_BOOT_CONTROLLER_DATA_ABORT "Boot Controller Data Abort"
#define ACA_ERROR_TYPE_BOOT_CONTROLLER_GENERIC "Boot Controller Generic"
/* Link error type constants */
#define ACA_ERROR_TYPE_PCIE_AER "PCIe AER"
/* CPER format error type constants */
#define ACA_ERROR_TYPE_MALFORMED_CPER "Malformed CPER"
#define ACA_ERROR_TYPE_INCOMPLETE_ACA_DATA "Incomplete ACA Data"
#define ACA_ERROR_TYPE_INVALID_ACA_DATA "Invalid ACA Data"
#define ACA_ERROR_TYPE_UNIDENTIFIED_ERROR "Unidentified Error"
/* Protocol constants */
#define ACA_PROTOCOL_CPER "CPER"
#define ACA_PROTOCOL_CPER_WITH_SPACE "CPER "
/* Bank name strings */
#define ACA_BANK_UMC "umc"
#define ACA_BANK_PSP "psp"
#define ACA_BANK_CS "cs"
#define ACA_BANK_PIE "pie"
#define ACA_BANK_PCS_XGMI "pcs_xgmi"
#define ACA_BANK_KPX_SERDES "kpx_serdes"
#define ACA_BANK_KPX_WAFL "kpx_wafl"
/* Numerical constants */
#define ACA_FLAG_THRESHOLD_EXCEEDED 0x8
#define ACA_REGISTER_ARRAY_SIZE_32_BYTES 4
#define ACA_REGISTER_ARRAY_SIZE_128_BYTES 16
/* Error code ranges */
#define ACA_ERROR_CODE_EXT_MIN 0x3A
#define ACA_ERROR_CODE_EXT_MAX 0x3E
/* Instance ID values for XCD and AID error decoding */
#define ACA_INSTANCE_ID_XCD0_400 0x36430400
#define ACA_INSTANCE_ID_XCD1_400 0x38430400
#define ACA_INSTANCE_ID_XCD0_401 0x36430401
#define ACA_INSTANCE_ID_XCD1_401 0x38430401
#define ACA_INSTANCE_ID_AID_400 0x3B30400
#define ACA_INSTANCE_ID_AID_401 0x3B30401
/* Error return codes */
#define ACA_ERROR_INVALID_ACA_DATA_ID 33
#define ACA_ERROR_UNIDENTIFIED_ERROR_ID 34
#endif /* ACA_CONSTANTS_H */
-60
Ver fichero
@@ -1,60 +0,0 @@
// SPDX-License-Identifier: MIT
/*
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
/**
* @file utils.h
* @brief Common utility functions
*/
#ifndef UTILS_H
#define UTILS_H
#include <stdint.h>
/**
 * @brief Reverse the byte order of a 64-bit value (little endian to big endian)
 * @param[in] value Value to convert
 * @return The input with its eight bytes in reversed order
 */
static inline uint64_t le64_to_be64(uint64_t value) {
    uint64_t swapped = 0;

    /* Peel bytes off the low end of the input and push them onto the low end
     * of the result: the first byte read ends up as the most significant. */
    for (int byte = 0; byte < 8; byte++) {
        swapped = (swapped << 8) | (value & 0xFFU);
        value >>= 8;
    }
    return swapped;
}
/**
 * @brief Byte-swap every element of an array of 64-bit values, in place
 * @param[in,out] array Array to convert
 * @param[in] len Number of elements in the array
 */
static inline void convert_array_le_to_be(uint64_t *array, size_t len) {
    uint64_t *cursor = array;

    /* Count the remaining elements down; nothing is touched when len is 0. */
    while (len-- > 0) {
        *cursor = le64_to_be64(*cursor);
        cursor++;
    }
}
#endif /* UTILS_H */
+9 -17
Ver fichero
@@ -1,4 +1,3 @@
// SPDX-License-Identifier: MIT
/*
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
@@ -21,20 +20,16 @@
* THE SOFTWARE.
*/
/**
/**
* @file aca_decode.h
* @brief Internal decoder interface and data structures
*/
#ifndef RAS_DECODE_DECODE_H
#define RAS_DECODE_DECODE_H
#ifdef __cplusplus
extern "C" {
#endif
#ifndef ACA_DECODE_H
#define ACA_DECODE_H
#include "aca_api.h"
#include "ras_decode_api.h"
#include "aca_fields.h"
#include "json_util.h"
/**
* @brief Internal decoder structure with parsed register fields
@@ -67,13 +62,10 @@ typedef struct
} aca_raw_data_t;
/**
* @brief Main decode function that processes raw ACA error data
* @brief Main decode function that processes raw ACA error data and returns JSON
* @param[in] raw_data Pointer to structure containing raw ACA error data
* @return Decoded error information structure
* @return JsonValue* containing the decoded error information, or NULL on failure
*/
aca_error_info_t aca_decode(const aca_raw_data_t *raw_data);
JsonValue* aca_decode(const aca_raw_data_t *raw_data);
#ifdef __cplusplus
}
#endif
#endif /* ACA_DECODE_H */
#endif /* RAS_DECODE_DECODE_H */
@@ -1,4 +1,3 @@
// SPDX-License-Identifier: MIT
/*
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
@@ -21,7 +20,7 @@
* THE SOFTWARE.
*/
/**
/**
* @file aca_fields.h
* @brief ACA register field definitions and manipulation functions
*
@@ -30,8 +29,8 @@
* definitions for status, IPID, and syndrome registers, along with
* functions to initialize and access these fields.
*/
#ifndef ACA_FIELDS_H
#define ACA_FIELDS_H
#ifndef RAS_DECODE_FIELDS_H
#define RAS_DECODE_FIELDS_H
#include <stdint.h>
@@ -1,4 +1,3 @@
// SPDX-License-Identifier: MIT
/*
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
@@ -28,8 +27,8 @@
* into their corresponding names and types.
*/
#ifndef ACA_TABLES_H
#define ACA_TABLES_H
#ifndef RAS_DECODE_TABLES_H
#define RAS_DECODE_TABLES_H
#include <stdint.h>
#include <stddef.h>
@@ -20,8 +20,8 @@
* THE SOFTWARE.
*/
#ifndef ACA_VERSION_H
#define ACA_VERSION_H
#ifndef RAS_DECODE_VERSION_H
#define RAS_DECODE_VERSION_H
#ifdef __cplusplus
extern "C"
@@ -40,12 +40,19 @@ extern "C"
*/
/* Version Components */
#define ACA_VERSION_MAJOR 1 /**< Major version number */
#define ACA_VERSION_MINOR 0 /**< Minor version number */
#define ACA_VERSION_PATCH 0 /**< Patch version number */
#define RAS_DECODE_VERSION_MAJOR 2 /**< Major version number */
#define RAS_DECODE_VERSION_MINOR 0 /**< Minor version number */
#define RAS_DECODE_VERSION_PATCH 0 /**< Patch version number */
/* Version String */
#define ACA_VERSION_STRING "1.0.0"
/* Helper macros for string concatenation */
#define RAS_DECODE_STRINGIFY(x) #x
#define RAS_DECODE_TOSTRING(x) RAS_DECODE_STRINGIFY(x)
/* Version string - assembled at compile time from the version components above */
#define RAS_DECODE_VERSION_STRING \
RAS_DECODE_TOSTRING(RAS_DECODE_VERSION_MAJOR) "." \
RAS_DECODE_TOSTRING(RAS_DECODE_VERSION_MINOR) "." \
RAS_DECODE_TOSTRING(RAS_DECODE_VERSION_PATCH)
/**
* @brief Structure containing version information
@@ -92,4 +99,4 @@ extern "C"
}
#endif
#endif /* ACA_VERSION_H */
#endif /* RAS_DECODE_VERSION_H */
@@ -0,0 +1,219 @@
/*
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef BOOT_DECODE_H
#define BOOT_DECODE_H
#include <stdint.h>
#include <stdbool.h>
#include <stddef.h>
#include "json_util.h"
#ifdef __cplusplus
extern "C" {
#endif
/**
 * @brief Wrapper around one raw 64-bit OAM boot message
 *
 * The accessor and decoder functions declared in this header take a pointer
 * to this wrapper rather than the bare integer.
 */
typedef struct {
uint64_t value; ///< 64-bit boot message value
} OamBootMsg;
/**
 * @brief Decoder function pointer type
 *
 * Every decode_*_v0 / decode_*_v1 function declared in this header has this
 * signature, and get_decoder_function() returns a value of this type.
 *
 * @param msg Boot message to decode
 * @return JsonValue containing decoded information or NULL on failure
 */
typedef JsonValue* (*boot_decoder_func_t)(OamBootMsg *msg);
/**
 * @brief Decoder mapping entry
 *
 * Associates one error-encoding byte with the function that decodes it.
 * NOTE(review): presumably used as a row of the lookup table consulted by
 * get_decoder_function() -- confirm in the implementation file.
 */
typedef struct {
uint8_t encoding; ///< Error encoding value
boot_decoder_func_t decoder; ///< Decoder function
} boot_decoder_entry_t;
/**
* @brief Boot message constants
*/
#define BOOT_ERROR_PRESENT_MARKER 0xA4
#define BOOT_IN_BOOT_MARKER 0xBA
#define BOOT_SUCCESS_ENCODING 0xBA
/**
* @brief Error encoding constants
*/
#define BOOT_ENCODING_HBM_TRAINING 0x01
#define BOOT_ENCODING_FW_LOAD 0x04
#define BOOT_ENCODING_WAFL_LINK 0x05
#define BOOT_ENCODING_XGMI_LINK 0x06
#define BOOT_ENCODING_USR_CP_LINK 0x07
#define BOOT_ENCODING_USR_DP_LINK 0x08
#define BOOT_ENCODING_HBM_MEM_TEST 0x09
#define BOOT_ENCODING_HBM_BIST_TEST 0x0A
#define BOOT_ENCODING_BOOT_CTRL_GEN_V0 0x0B
#define BOOT_ENCODING_BOOT_CTRL_GEN_V1 0x0C
#define BOOT_ENCODING_DATA_ABORT 0x0D
/**
 * @brief HBM stack decoder constants
 *
 * HBM_STACK_UNKNOWN is the sentinel that decode_hbm_stack() is documented to
 * return when the stack value is not recognized.
 */
#define HBM_STACK_0 0x01
#define HBM_STACK_1 0x02
/* Parenthesized so the negative literal expands as a single operand in any
 * surrounding expression. */
#define HBM_STACK_UNKNOWN (-1)
/**
 * @brief Extract specific byte from 64-bit value
 * @param value 64-bit value
 * @param byte_index Byte index (0-7); 0 selects the least significant byte
 * @return Extracted byte value, or 0 when byte_index is outside 0-7
 */
static inline uint8_t extract_byte(uint64_t value, int byte_index) {
    /* Shifting a 64-bit value by a negative amount or by 64 bits or more is
     * undefined behavior, so indexes outside the documented range are
     * rejected instead of being fed to the shift. */
    if (byte_index < 0 || byte_index > 7) {
        return 0;
    }
    return (uint8_t)((value >> (byte_index * 8)) & 0xFFU);
}
/**
 * @brief Build a bit mask with the lowest num_bits bits set
 * @param num_bits Number of low-order bits to set (0-32)
 * @return Bit mask; 0 when num_bits <= 0, all 32 bits set when num_bits >= 32
 */
static inline uint32_t extract_bits(int num_bits) {
    /* A shift count that is negative or >= the width of the shifted type is
     * undefined behavior, so both ends of the range are handled explicitly.
     * This also makes a full 32-bit mask obtainable. */
    if (num_bits <= 0) {
        return 0;
    }
    if (num_bits >= 32) {
        return UINT32_MAX;
    }
    return (UINT32_C(1) << num_bits) - 1U;
}
/**
* @brief Get boot version from boot message
* @param msg Boot message
* @return Boot version (0 or 1)
*/
int get_boot_version(OamBootMsg *msg);
/**
* @brief Get error encoding from boot message
* @param msg Boot message
* @return Error encoding value
*/
int get_error_encoding(OamBootMsg *msg);
/**
* @brief Check if error is present in boot message
* @param msg Boot message
* @return true if error present, false otherwise
*/
bool error_present(OamBootMsg *msg);
/**
* @brief Check if in boot mode
* @param msg Boot message
* @return true if in boot mode, false otherwise
*/
bool in_boot(OamBootMsg *msg);
/**
* @brief Get socket number from boot message
* @param msg Boot message
* @param version Boot version
* @return Socket number
*/
int get_socket(OamBootMsg *msg, int version);
/**
* @brief Get AID number from boot message
* @param msg Boot message
* @param version Boot version
* @return AID number
*/
int get_aid(OamBootMsg *msg, int version);
/**
* @brief Decode HBM stack value
* @param stack Stack value
* @return Decoded stack number or HBM_STACK_UNKNOWN
*/
int decode_hbm_stack(uint8_t stack);
/**
* @brief Create JSON array of failed links
* @param byte_value Byte containing link status bits
* @param max_links Maximum number of links to check
* @return JsonValue array or NULL on failure
*/
JsonValue* create_failed_links_array(uint8_t byte_value, int max_links);
/**
* @brief Create hex string representation
* @param value Value to convert
* @param width Width of hex string (with padding)
* @return Dynamically allocated hex string or NULL on failure
*/
char* create_hex_string(uint64_t value, int width);
// Decoder functions for Version 0.
// Each one takes the boot message and returns a JsonValue*, i.e. it matches
// the boot_decoder_func_t signature declared earlier in this header.
// Note: no data-abort or boot-success decoder is declared for Version 0.
JsonValue* decode_hbm_training_v0(OamBootMsg *msg);
JsonValue* decode_fw_load_v0(OamBootMsg *msg);
JsonValue* decode_wafl_link_training_v0(OamBootMsg *msg);
JsonValue* decode_xgmi_link_training_v0(OamBootMsg *msg);
JsonValue* decode_usr_cp_link_training_v0(OamBootMsg *msg);
JsonValue* decode_usr_dp_link_training_v0(OamBootMsg *msg);
JsonValue* decode_hbm_mem_test_v0(OamBootMsg *msg);
JsonValue* decode_hbm_bist_test_v0(OamBootMsg *msg);
JsonValue* decode_boot_controller_generic_v0(OamBootMsg *msg);
// Decoder functions for Version 1.
// Same signature as the Version 0 set, plus decoders for data abort and
// boot success that only exist for this version.
JsonValue* decode_hbm_training_v1(OamBootMsg *msg);
JsonValue* decode_fw_load_v1(OamBootMsg *msg);
JsonValue* decode_wafl_link_training_v1(OamBootMsg *msg);
JsonValue* decode_xgmi_link_training_v1(OamBootMsg *msg);
JsonValue* decode_usr_cp_link_training_v1(OamBootMsg *msg);
JsonValue* decode_usr_dp_link_training_v1(OamBootMsg *msg);
JsonValue* decode_hbm_mem_test_v1(OamBootMsg *msg);
JsonValue* decode_hbm_bist_test_v1(OamBootMsg *msg);
JsonValue* decode_boot_controller_generic_v1(OamBootMsg *msg);
JsonValue* decode_data_abort_v1(OamBootMsg *msg);
JsonValue* decode_boot_success_v1(OamBootMsg *msg);
// Unhandled error decoders, one per boot version.
// NOTE(review): presumably the fallback when no specific decoder matches the
// error encoding -- confirm in the implementation file.
JsonValue* decode_unhandled_error_v0(OamBootMsg *msg);
JsonValue* decode_unhandled_error_v1(OamBootMsg *msg);
/**
* @brief Get appropriate decoder function for boot message
* @param msg Boot message
* @return Decoder function pointer or NULL if no decoder found
*/
boot_decoder_func_t get_decoder_function(OamBootMsg *msg);
/**
* @brief Orchestrate decoding of multiple boot messages
* @param oam_boot_msgs Array of boot message values
* @param count Number of boot messages
* @return JsonValue object containing decoded results or NULL on failure
*/
JsonValue* boot_decode_orchestrator(uint64_t *oam_boot_msgs, size_t count);
#ifdef __cplusplus
}
#endif
#endif /* BOOT_DECODE_H */
@@ -1,4 +1,3 @@
// SPDX-License-Identifier: MIT
/*
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
@@ -0,0 +1,42 @@
/*
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef JSON_PRINTER_H
#define JSON_PRINTER_H
#include "json_util.h"
#ifdef __cplusplus
extern "C" {
#endif
/**
* @brief Print a JSON value to stdout in formatted form
* @param value JSON value to print
*/
void print_json_value(JsonValue *value);
#ifdef __cplusplus
}
#endif
#endif /* JSON_PRINTER_H */
@@ -0,0 +1,171 @@
/*
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef JSON_UTIL_H
#define JSON_UTIL_H
#include <stdbool.h>
#include <stddef.h>
#ifdef __cplusplus
extern "C" {
#endif
/**
 * @brief JSON value types enumeration
 *
 * Stored in the `type` member of struct JsonValue to record which kind of
 * JSON value a node holds. There is one enumerator per json_create_*()
 * constructor declared in this header.
 */
typedef enum {
JSON_NULL,
JSON_BOOL,
JSON_NUMBER,
JSON_STRING,
JSON_OBJECT,
JSON_ARRAY
} JsonType;
typedef struct JsonValue JsonValue;
typedef struct JsonPair JsonPair;
/**
 * @brief JSON key-value pair structure for objects
 *
 * One node of the singly linked list that a JSON object's members are kept
 * in; `next` points at the following pair.
 */
struct JsonPair {
char *key; // Member name. NOTE(review): presumably an owned copy, since json_object_set() documents that the key is copied -- confirm
JsonValue *value; // Value stored under the key
JsonPair *next; // Next pair in the list
};
/**
 * @brief JSON value structure
 *
 * `type` records the kind of value and the `data` union holds the payload.
 * NOTE(review): presumably only the `data` member matching `type` is
 * meaningful (e.g. `data.string` for JSON_STRING) -- confirm against the
 * implementation before reading any other member.
 */
struct JsonValue {
JsonType type; // Kind of JSON value held
union {
bool boolean; // Boolean payload
double number; // Numeric payload; double is the only numeric representation
char *string; // String payload
JsonPair *object; // Linked list of key-value pairs
struct {
JsonValue **items; // Array of pointers to the elements
size_t count; // NOTE(review): presumably the number of elements in use -- confirm
size_t capacity; // NOTE(review): presumably the number of slots allocated in items -- confirm
} array;
} data;
};
/**
* @brief Create a null JSON value
* @return Pointer to new JsonValue or NULL on failure
*/
JsonValue* json_create_null(void);
/**
* @brief Create a boolean JSON value
* @param b Boolean value
* @return Pointer to new JsonValue or NULL on failure
*/
JsonValue* json_create_bool(bool b);
/**
* @brief Create a number JSON value
* @param num Numeric value
* @return Pointer to new JsonValue or NULL on failure
*/
JsonValue* json_create_number(double num);
/**
* @brief Create a string JSON value
* @param str String value (will be copied)
* @return Pointer to new JsonValue or NULL on failure
*/
JsonValue* json_create_string(const char *str);
/**
* @brief Create an empty JSON object
* @return Pointer to new JsonValue or NULL on failure
*/
JsonValue* json_create_object(void);
/**
* @brief Create an empty JSON array
* @return Pointer to new JsonValue or NULL on failure
*/
JsonValue* json_create_array(void);
/**
* @brief Add a key-value pair to a JSON object
* @param obj JSON object to modify
* @param key Key string (will be copied)
* @param value Value to add
*/
void json_object_set(JsonValue *obj, const char *key, JsonValue *value);
/**
* @brief Get a value by key from a JSON object
* @param obj JSON object to search
* @param key Key to search for
* @return Pointer to JsonValue or NULL if not found
*/
JsonValue* json_object_get(JsonValue *obj, const char *key);
/**
* @brief Check if a key exists in a JSON object
* @param obj JSON object to check
* @param key Key to check for
* @return true if key exists, false otherwise
*/
bool json_object_has_key(JsonValue *obj, const char *key);
/**
* @brief Add a value to a JSON array
* @param arr JSON array to modify
* @param value Value to add
* @return true on success, false on failure
*/
bool json_array_push(JsonValue *arr, JsonValue *value);
/**
* @brief Get a value by index from a JSON array
* @param arr JSON array to access
* @param index Array index
* @return Pointer to JsonValue or NULL if index out of bounds
*/
JsonValue* json_array_get(JsonValue *arr, size_t index);
/**
* @brief Get the size of a JSON array
* @param arr JSON array
* @return Number of elements in array, or 0 if not an array
*/
size_t json_array_size(JsonValue *arr);
/**
* @brief Free a JSON value and all its contents
* @param val JSON value to free
*/
void json_free(JsonValue *val);
#ifdef __cplusplus
}
#endif
#endif /* JSON_UTIL_H */
@@ -1,4 +1,3 @@
// SPDX-License-Identifier: MIT
/*
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
@@ -21,11 +20,13 @@
* THE SOFTWARE.
*/
#ifndef ACA_API_H
#define ACA_API_H
#ifndef RAS_DECODE_API_H
#define RAS_DECODE_API_H
#include <stdint.h>
#include <stddef.h>
#include "aca_version.h"
#include "json_util.h"
/**
* @brief Structure containing decoded error information
@@ -39,12 +40,13 @@ typedef struct
const char *instance_ref; /**< Reference to instance name string */
int oam; /**< OAM value */
int aid; /**< AID value */
int afid; /**< AFID value (AMD Field ID) */
uint64_t raw_status; /**< Raw status register value */
uint64_t raw_addr; /**< Raw address register value */
uint64_t raw_ipid; /**< Raw IPID register value */
uint64_t raw_synd; /**< Raw syndrome register value */
uint8_t scrub; /**< Scrub bit from status */
uint8_t poison; /**< Poison bit from status */
uint8_t deferred; /**< Deferred bit from status */
uint8_t error_code_ext; /**< Extended error code from status */
} aca_error_info_t;
@@ -54,18 +56,27 @@ typedef struct
* @param[in] array_len Size of register array in elements
* @param[in] flag Decoder flags
* @param[in] hw_revision Hardware revision number
* @param[in] register_context_type Register context type (16-bit): 1 for ACA decode, 9 for boot decode
* @return AFID value or -1 if decoding fails
*/
int decode_afid(const uint64_t *register_array, size_t array_len, uint32_t flag, uint16_t hw_revision);
int decode_afid(const uint64_t *register_array, size_t array_len, uint32_t flag, uint16_t hw_revision, uint16_t register_context_type);
/**
* @brief Decodes and returns complete error information from a register array
* @brief Decodes and returns complete error information from a register array as JSON
* @param[in] register_array Pointer to an array of 64-bit register values
* @param[in] array_len Size of register array in elements
* @param[in] flag Decoder flags
* @param[in] hw_revision Hardware revision number
* @return Complete error information structure
* @param[in] register_context_type Register context type (16-bit): 1 for ACA decode, 9 for boot decode
* @return JsonValue* containing complete error information, or NULL on failure
*/
aca_error_info_t decode_error_info(const uint64_t *register_array, size_t array_len, uint32_t flag, uint16_t hw_revision);
JsonValue* decode_error_info(const uint64_t *register_array, size_t array_len, uint32_t flag, uint16_t hw_revision, uint16_t register_context_type);
#endif // ACA_API_H
/**
* @brief Decodes the AFID from a JSON error object based on error category, type, and severity
* @param[in] error_json Pointer to JSON object containing error information
* @return AFID value or -1 if decoding fails or JSON is NULL
*/
int decode_error_info_afid(JsonValue *error_json);
#endif // RAS_DECODE_API_H
@@ -0,0 +1,121 @@
/*
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
/**
* @file ras_decode_constants.h
* @brief Shared constants for RAS error decoding (ACA and boot-time errors)
*
* This file contains string constants and numerical constants that are used
* across multiple source files to improve maintainability and prevent typos.
*/
#ifndef RAS_DECODE_CONSTANTS_H
#define RAS_DECODE_CONSTANTS_H
/* Error severity constants */
#define RAS_DECODE_SEVERITY_UNKNOWN "UNKNOWN"
#define RAS_DECODE_SEVERITY_FATAL "Fatal"
#define RAS_DECODE_SEVERITY_CORRECTED "Corrected"
#define RAS_DECODE_SEVERITY_UNCORRECTED_NON_FATAL "Uncorrected, Non-fatal"
#define RAS_DECODE_SEVERITY_FAIL_TO_INIT "Fail-to-init"
#define RAS_DECODE_SEVERITY_ALL_CAPS "ALL"
/* Error category constants */
#define RAS_DECODE_CATEGORY_HBM_ERRORS "HBM Errors"
#define RAS_DECODE_CATEGORY_DEVICE_INTERNAL_ERRORS "Device Internal Errors"
#define RAS_DECODE_CATEGORY_OFF_PACKAGE_LINK_ERRORS "Off-Package Link Errors"
#define RAS_DECODE_CATEGORY_BOOT_TIME_ERRORS "Boot-Time Errors"
#define RAS_DECODE_CATEGORY_CPER_FORMAT "CPER Format"
#define RAS_DECODE_CATEGORY_UNIDENTIFIED_ERRORS "Unidentified Errors"
/* Common error type constants */
#define RAS_DECODE_ERROR_TYPE_ALL_OTHERS "All Others"
#define RAS_DECODE_ERROR_TYPE_ALL "All"
#define RAS_DECODE_ERROR_TYPE_DECODE_INAPPLICABLE "Decode Inapplicable"
#define RAS_DECODE_ERROR_TYPE_BAD_PAGE_RETIREMENT_THRESHOLD "Bad Page Retirement Threshold"
#define RAS_DECODE_ERROR_TYPE_HARDWARE_ASSERTION "Hardware Assertion (HWA)"
#define RAS_DECODE_ERROR_TYPE_WATCHDOG_TIMEOUT "Watchdog Timeout (WDT)"
#define RAS_DECODE_ERROR_TYPE_ON_DIE_ECC "On-die ECC"
#define RAS_DECODE_ERROR_TYPE_END_TO_END_CRC "End-to-end CRC"
#define RAS_DECODE_ERROR_TYPE_WAFL "WAFL"
#define RAS_DECODE_ERROR_TYPE_XGMI "XGMI"
/* Boot-time error type constants */
#define RAS_DECODE_ERROR_TYPE_FW_LOAD "FW Load"
#define RAS_DECODE_ERROR_TYPE_HBM_BIST_TEST "HBM BIST Test"
#define RAS_DECODE_ERROR_TYPE_HBM_MEMORY_TEST "HBM Memory Test"
#define RAS_DECODE_ERROR_TYPE_HBM_TRAINING "HBM Training"
#define RAS_DECODE_ERROR_TYPE_UNHANDLED "Unhandled"
#define RAS_DECODE_ERROR_TYPE_UNKNOWN_ERROR "Unknown"
#define RAS_DECODE_ERROR_TYPE_USR_CP_LINK_TRAINING "USR CP Link Training"
#define RAS_DECODE_ERROR_TYPE_USR_DP_LINK_TRAINING "USR DP Link Training"
#define RAS_DECODE_ERROR_TYPE_WAFL_LINK_TRAINING "WAFL Link Training"
#define RAS_DECODE_ERROR_TYPE_XGMI_LINK_TRAINING "XGMI Link Training"
#define RAS_DECODE_ERROR_TYPE_BOOT_CONTROLLER_DATA_ABORT "Boot Controller Data Abort"
#define RAS_DECODE_ERROR_TYPE_BOOT_CONTROLLER_GENERIC "Boot Controller Generic"
#define RAS_DECODE_ERROR_TYPE_BOOT_SUCCESS "Boot Success"
/* Link error type constants */
#define RAS_DECODE_ERROR_TYPE_PCIE_AER "PCIe AER"
/* CPER format error type constants */
#define RAS_DECODE_ERROR_TYPE_MALFORMED_CPER "Malformed CPER"
#define RAS_DECODE_ERROR_TYPE_INCOMPLETE_RAS_DECODE_DATA "Incomplete ACA Data"
#define RAS_DECODE_ERROR_TYPE_INVALID_RAS_DECODE_DATA "Invalid ACA Data"
#define RAS_DECODE_ERROR_TYPE_UNIDENTIFIED_ERROR "Unidentified Error"
/* Protocol constants */
#define RAS_DECODE_PROTOCOL_CPER "CPER"
#define RAS_DECODE_PROTOCOL_CPER_WITH_SPACE "CPER "
/* Bank name strings */
#define RAS_DECODE_BANK_UMC "umc"
#define RAS_DECODE_BANK_PSP "psp"
#define RAS_DECODE_BANK_CS "cs"
#define RAS_DECODE_BANK_PIE "pie"
#define RAS_DECODE_BANK_PCS_XGMI "pcs_xgmi"
#define RAS_DECODE_BANK_KPX_SERDES "kpx_serdes"
#define RAS_DECODE_BANK_KPX_WAFL "kpx_wafl"
/* Numerical constants */
/* NOTE(review): 0x8 is bit 3; presumably tested against the decoder `flag`
 * argument -- confirm at the call sites. */
#define RAS_DECODE_FLAG_THRESHOLD_EXCEEDED 0x8
/* The array sizes below are element counts, not byte counts. The names match
 * 8-byte (64-bit) elements: 4 * 8 = 32, 16 * 8 = 128, 8 * 8 = 64 bytes. */
#define RAS_DECODE_REGISTER_ARRAY_SIZE_32_BYTES 4
#define RAS_DECODE_REGISTER_ARRAY_SIZE_128_BYTES 16
#define BOOT_REGISTER_ARRAY_SIZE_64_BYTES 8
/* Error code ranges */
#define RAS_DECODE_ERROR_CODE_EXT_MIN 0x3A
#define RAS_DECODE_ERROR_CODE_EXT_MAX 0x3E
/* Instance ID values for XCD and AID error decoding */
#define RAS_DECODE_INSTANCE_ID_XCD0_400 0x36430400
#define RAS_DECODE_INSTANCE_ID_XCD1_400 0x38430400
#define RAS_DECODE_INSTANCE_ID_XCD0_401 0x36430401
#define RAS_DECODE_INSTANCE_ID_XCD1_401 0x38430401
#define RAS_DECODE_INSTANCE_ID_AID_400 0x3B30400
#define RAS_DECODE_INSTANCE_ID_AID_401 0x3B30401
/* Error return codes */
#define RAS_DECODE_ERROR_INVALID_RAS_DECODE_DATA_ID 33
#define RAS_DECODE_ERROR_UNIDENTIFIED_ERROR_ID 34
#endif /* RAS_DECODE_CONSTANTS_H */
+27 -7
Ver fichero
@@ -44,13 +44,33 @@ set(INC_LIST
"${PROJECT_SOURCE_DIR}/rocm_smi/include/rocm_smi/rocm_smi.h"
"${PROJECT_SOURCE_DIR}/rocm_smi/include/rocm_smi/rocm_smi_utils.h")
set(ACA_SRC_DIR "aca-decode")
set(SRC_LIST ${SRC_LIST} ${ACA_SRC_DIR}/aca_api.c ${ACA_SRC_DIR}/aca_decode.c ${ACA_SRC_DIR}/aca_fields.c
${ACA_SRC_DIR}/aca_tables.c ${ACA_SRC_DIR}/error_map.c)
set(ACA_INC_DIR "${PROJECT_SOURCE_DIR}/include/aca-decode")
set(INC_LIST ${INC_LIST} ${ACA_INC_DIR}/aca_decode.h ${ACA_INC_DIR}/aca_fields.h ${ACA_INC_DIR}/aca_tables.h
${ACA_INC_DIR}/error_map.h)
set(RAS_DECODE "ras-decode")
set(ACA_SRC_DIR "${PROJECT_SOURCE_DIR}/src/${RAS_DECODE}")
set(SRC_LIST ${SRC_LIST}
${ACA_SRC_DIR}/aca_decode.c
${ACA_SRC_DIR}/aca_fields.c
${ACA_SRC_DIR}/aca_tables.c
${ACA_SRC_DIR}/aca_version.c
${ACA_SRC_DIR}/boot_decode.c
${ACA_SRC_DIR}/error_map.c
${ACA_SRC_DIR}/json_printer.c
${ACA_SRC_DIR}/json_util.c
# ${ACA_SRC_DIR}/main.c
${ACA_SRC_DIR}/ras_decode_api.c
)
set(ACA_INC_DIR "${PROJECT_SOURCE_DIR}/include/${RAS_DECODE}")
set(INC_LIST ${INC_LIST}
${ACA_INC_DIR}/aca_decode.h
${ACA_INC_DIR}/aca_fields.h
${ACA_INC_DIR}/aca_tables.h
${ACA_INC_DIR}/aca_version.h
${ACA_INC_DIR}/boot_decode.h
${ACA_INC_DIR}/error_map.h
${ACA_INC_DIR}/json_printer.h
${ACA_INC_DIR}/json_util.h
${ACA_INC_DIR}/ras_decode_api.h
${ACA_INC_DIR}/ras_decode_constants.h
)
if(ENABLE_ESMI_LIB)
list(APPEND INC_LIST ${ESMI_INC_DIR}/e_smi/e_smi.h)
list(APPEND INC_LIST ${ESMI_INC_DIR}/e_smi/e_smi_monitor.h)
-94
Ver fichero
@@ -1,94 +0,0 @@
/*
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include "aca_decode.h"
#include "aca_constants.h"
/**
 * @brief Decode a raw ACA register dump and return only its AFID.
 *
 * Two register layouts are accepted, selected by @p array_len (a count of
 * 64-bit registers, not bytes):
 *  - ACA_REGISTER_ARRAY_SIZE_32_BYTES:  status, addr, ipid, synd at
 *    indices 0, 1, 2, 3.
 *  - ACA_REGISTER_ARRAY_SIZE_128_BYTES: status, addr, ipid, synd at
 *    indices 1, 2, 5, 6.
 *
 * @param[in] register_array Pointer to the register dump.
 * @param[in] array_len      Number of 64-bit registers in @p register_array.
 * @param[in] flag           Flags value forwarded to the decoder.
 * @param[in] hw_revision    Hardware revision forwarded to the decoder.
 * @return The decoded AFID, or -1 when @p register_array is NULL or
 *         @p array_len matches neither supported layout.
 */
int decode_afid(const uint64_t *register_array, size_t array_len, uint32_t flag, uint16_t hw_revision)
{
    if (!register_array)
    {
        return -1;
    }

    /*
     * Zero-initialize so that no field of the structure can reach
     * aca_decode() with an indeterminate value; this also matches what
     * decode_error_info() does.
     */
    aca_raw_data_t raw_data = {0};

    if (array_len == ACA_REGISTER_ARRAY_SIZE_32_BYTES) // 32 bytes
    {
        raw_data.aca_status = register_array[0];
        raw_data.aca_addr = register_array[1];
        raw_data.aca_ipid = register_array[2];
        raw_data.aca_synd = register_array[3];
    }
    else if (array_len == ACA_REGISTER_ARRAY_SIZE_128_BYTES) // 128 bytes
    {
        raw_data.aca_status = register_array[1];
        raw_data.aca_addr = register_array[2];
        raw_data.aca_ipid = register_array[5];
        raw_data.aca_synd = register_array[6];
    }
    else
    {
        return -1; // Unsupported size
    }

    raw_data.flags = flag;
    raw_data.hw_revision = hw_revision;

    aca_error_info_t error_info = aca_decode(&raw_data);
    return error_info.afid;
}
/**
 * @brief Decode a raw ACA register dump into a full error-info structure.
 *
 * @param[in] register_array Pointer to the register dump.
 * @param[in] array_len      Number of 64-bit registers in @p register_array;
 *                           must equal one of the two supported layout sizes.
 * @param[in] flag           Flags value forwarded to the decoder.
 * @param[in] hw_revision    Hardware revision forwarded to the decoder.
 * @return The structure produced by aca_decode(), or a zero-initialized
 *         structure when @p register_array is NULL or the size is
 *         unsupported.
 */
aca_error_info_t decode_error_info(const uint64_t *register_array, size_t array_len, uint32_t flag, uint16_t hw_revision)
{
    const aca_error_info_t empty_info = {0};
    aca_raw_data_t raw = {0};

    if (register_array == NULL)
    {
        return empty_info;
    }

    switch (array_len)
    {
    case ACA_REGISTER_ARRAY_SIZE_32_BYTES:
        /* Compact layout: the four registers are stored back to back */
        raw.aca_status = register_array[0];
        raw.aca_addr = register_array[1];
        raw.aca_ipid = register_array[2];
        raw.aca_synd = register_array[3];
        break;

    case ACA_REGISTER_ARRAY_SIZE_128_BYTES:
        /* Extended layout: the same registers live at indices 1, 2, 5, 6 */
        raw.aca_status = register_array[1];
        raw.aca_addr = register_array[2];
        raw.aca_ipid = register_array[5];
        raw.aca_synd = register_array[6];
        break;

    default:
        /* Unsupported size: hand back the zero-initialized structure */
        return empty_info;
    }

    raw.flags = flag;
    raw.hw_revision = hw_revision;

    return aca_decode(&raw);
}
-88
Ver fichero
@@ -1,88 +0,0 @@
/*
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include "error_map.h"
#include "aca_constants.h"
#include <string.h>
#define AFID_VERSION "0.7"
/*
 * Lookup table used by get_error_id().
 * Each row is {id, error category, error type, protocol, severity}; the ids
 * run contiguously from 1 to 34. get_error_id() matches on category, type
 * and severity and returns the id of the first matching row.
 */
static const error_map_entry_t error_map[] = {
{1, ACA_CATEGORY_BOOT_TIME_ERRORS, ACA_ERROR_TYPE_FW_LOAD, ACA_PROTOCOL_CPER, ACA_SEVERITY_FAIL_TO_INIT},
{2, ACA_CATEGORY_BOOT_TIME_ERRORS, ACA_ERROR_TYPE_HBM_BIST_TEST, ACA_PROTOCOL_CPER, ACA_SEVERITY_FAIL_TO_INIT},
{3, ACA_CATEGORY_BOOT_TIME_ERRORS, ACA_ERROR_TYPE_HBM_MEMORY_TEST, ACA_PROTOCOL_CPER, ACA_SEVERITY_FAIL_TO_INIT},
{4, ACA_CATEGORY_BOOT_TIME_ERRORS, ACA_ERROR_TYPE_HBM_TRAINING, ACA_PROTOCOL_CPER, ACA_SEVERITY_FAIL_TO_INIT},
{5, ACA_CATEGORY_BOOT_TIME_ERRORS, ACA_ERROR_TYPE_UNHANDLED, ACA_PROTOCOL_CPER, ACA_SEVERITY_FAIL_TO_INIT},
{6, ACA_CATEGORY_BOOT_TIME_ERRORS, ACA_ERROR_TYPE_UNKNOWN_ERROR, ACA_PROTOCOL_CPER, ACA_SEVERITY_FAIL_TO_INIT},
{7, ACA_CATEGORY_BOOT_TIME_ERRORS, ACA_ERROR_TYPE_USR_CP_LINK_TRAINING, ACA_PROTOCOL_CPER, ACA_SEVERITY_FAIL_TO_INIT},
{8, ACA_CATEGORY_BOOT_TIME_ERRORS, ACA_ERROR_TYPE_USR_DP_LINK_TRAINING, ACA_PROTOCOL_CPER, ACA_SEVERITY_FAIL_TO_INIT},
{9, ACA_CATEGORY_BOOT_TIME_ERRORS, ACA_ERROR_TYPE_WAFL_LINK_TRAINING, ACA_PROTOCOL_CPER, ACA_SEVERITY_FAIL_TO_INIT},
{10, ACA_CATEGORY_BOOT_TIME_ERRORS, ACA_ERROR_TYPE_XGMI_LINK_TRAINING, ACA_PROTOCOL_CPER, ACA_SEVERITY_FAIL_TO_INIT},
{11, ACA_CATEGORY_BOOT_TIME_ERRORS, ACA_ERROR_TYPE_BOOT_CONTROLLER_DATA_ABORT, ACA_PROTOCOL_CPER, ACA_SEVERITY_FAIL_TO_INIT},
/* NOTE(review): this is the only row using the _WITH_SPACE protocol constant - confirm it is intentional. */
{12, ACA_CATEGORY_BOOT_TIME_ERRORS, ACA_ERROR_TYPE_BOOT_CONTROLLER_GENERIC, ACA_PROTOCOL_CPER_WITH_SPACE, ACA_SEVERITY_FAIL_TO_INIT},
{13, ACA_CATEGORY_OFF_PACKAGE_LINK_ERRORS, ACA_ERROR_TYPE_PCIE_AER, ACA_PROTOCOL_CPER, ACA_SEVERITY_CORRECTED},
{14, ACA_CATEGORY_OFF_PACKAGE_LINK_ERRORS, ACA_ERROR_TYPE_PCIE_AER, ACA_PROTOCOL_CPER, ACA_SEVERITY_FATAL},
{15, ACA_CATEGORY_OFF_PACKAGE_LINK_ERRORS, ACA_ERROR_TYPE_WAFL, ACA_PROTOCOL_CPER, ACA_SEVERITY_CORRECTED},
{16, ACA_CATEGORY_OFF_PACKAGE_LINK_ERRORS, ACA_ERROR_TYPE_WAFL, ACA_PROTOCOL_CPER, ACA_SEVERITY_FATAL},
{17, ACA_CATEGORY_OFF_PACKAGE_LINK_ERRORS, ACA_ERROR_TYPE_XGMI, ACA_PROTOCOL_CPER, ACA_SEVERITY_CORRECTED},
{18, ACA_CATEGORY_OFF_PACKAGE_LINK_ERRORS, ACA_ERROR_TYPE_XGMI, ACA_PROTOCOL_CPER, ACA_SEVERITY_FATAL},
{19, ACA_CATEGORY_HBM_ERRORS, ACA_ERROR_TYPE_BAD_PAGE_RETIREMENT_THRESHOLD, ACA_PROTOCOL_CPER, ACA_SEVERITY_FATAL},
{20, ACA_CATEGORY_HBM_ERRORS, ACA_ERROR_TYPE_ON_DIE_ECC, ACA_PROTOCOL_CPER, ACA_SEVERITY_FATAL},
{21, ACA_CATEGORY_HBM_ERRORS, ACA_ERROR_TYPE_END_TO_END_CRC, ACA_PROTOCOL_CPER, ACA_SEVERITY_FATAL},
{22, ACA_CATEGORY_HBM_ERRORS, ACA_ERROR_TYPE_ON_DIE_ECC, ACA_PROTOCOL_CPER, ACA_SEVERITY_UNCORRECTED_NON_FATAL},
{23, ACA_CATEGORY_HBM_ERRORS, ACA_ERROR_TYPE_END_TO_END_CRC, ACA_PROTOCOL_CPER, ACA_SEVERITY_UNCORRECTED_NON_FATAL},
{24, ACA_CATEGORY_HBM_ERRORS, ACA_ERROR_TYPE_ALL, ACA_PROTOCOL_CPER, ACA_SEVERITY_CORRECTED},
{25, ACA_CATEGORY_HBM_ERRORS, ACA_ERROR_TYPE_ALL_OTHERS, ACA_PROTOCOL_CPER, ACA_SEVERITY_FATAL},
{26, ACA_CATEGORY_DEVICE_INTERNAL_ERRORS, ACA_ERROR_TYPE_HARDWARE_ASSERTION, ACA_PROTOCOL_CPER, ACA_SEVERITY_FATAL},
{27, ACA_CATEGORY_DEVICE_INTERNAL_ERRORS, ACA_ERROR_TYPE_WATCHDOG_TIMEOUT, ACA_PROTOCOL_CPER, ACA_SEVERITY_FATAL},
{28, ACA_CATEGORY_DEVICE_INTERNAL_ERRORS, ACA_ERROR_TYPE_ALL_OTHERS, ACA_PROTOCOL_CPER, ACA_SEVERITY_UNCORRECTED_NON_FATAL},
{29, ACA_CATEGORY_DEVICE_INTERNAL_ERRORS, ACA_ERROR_TYPE_ALL_OTHERS, ACA_PROTOCOL_CPER, ACA_SEVERITY_CORRECTED},
{30, ACA_CATEGORY_DEVICE_INTERNAL_ERRORS, ACA_ERROR_TYPE_ALL_OTHERS, ACA_PROTOCOL_CPER, ACA_SEVERITY_FATAL},
{31, ACA_CATEGORY_CPER_FORMAT, ACA_ERROR_TYPE_MALFORMED_CPER, ACA_PROTOCOL_CPER, ACA_SEVERITY_ALL_CAPS},
{32, ACA_CATEGORY_CPER_FORMAT, ACA_ERROR_TYPE_INCOMPLETE_ACA_DATA, ACA_PROTOCOL_CPER, ACA_SEVERITY_ALL_CAPS},
{33, ACA_CATEGORY_CPER_FORMAT, ACA_ERROR_TYPE_INVALID_ACA_DATA, ACA_PROTOCOL_CPER, ACA_SEVERITY_ALL_CAPS},
{34, ACA_CATEGORY_UNIDENTIFIED_ERRORS, ACA_ERROR_TYPE_UNIDENTIFIED_ERROR, ACA_PROTOCOL_CPER, ACA_SEVERITY_ALL_CAPS}};
/* Number of rows in error_map, derived from the array itself */
static const size_t NUM_ERROR_ENTRIES = sizeof(error_map) / sizeof(error_map[0]);
int get_error_id(const char *error_category, const char *error_type, const char *error_severity)
{
if (!error_category || !error_type || !error_severity ||
strcmp(error_category, ACA_SEVERITY_UNKNOWN) == 0 ||
strcmp(error_type, ACA_SEVERITY_UNKNOWN) == 0 ||
strcmp(error_severity, ACA_SEVERITY_UNKNOWN) == 0)
{
return ACA_ERROR_INVALID_ACA_DATA_ID; // Return ID for "Invalid Error" if any input is "UNKNOWN" or NULL
}
for (size_t i = 0; i < NUM_ERROR_ENTRIES; i++)
{
if (strcmp(error_map[i].error_category, error_category) == 0 &&
strcmp(error_map[i].error_type, error_type) == 0 &&
strcmp(error_map[i].error_severity, error_severity) == 0)
{
return (int)error_map[i].id;
}
}
return ACA_ERROR_UNIDENTIFIED_ERROR_ID; // Return ID for "Unidentified Errors" if no match found
}
-148
Ver fichero
@@ -1,148 +0,0 @@
/*
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
/**
* @file main.c
* @brief Demo program showing how to use the ACA decoder
*
* This is a demonstration program that shows how to use the ACA decoder
* with sample raw data to decode ACA error information.
*/
#include <stdio.h>
#include <aca_api.h>
#include <aca_version.h>
#include <aca_constants.h>
#include <stdint.h>
#include <inttypes.h>
// Forward declarations of the demo's printing helpers, defined below
void print_error_info(const aca_error_info_t *info);
void print_version_info(void);
/**
 * @brief Print a decoded error-info structure to stdout as a JSON object.
 *
 * All values, numeric ones included, are emitted as JSON strings.
 *
 * @param[in] info Decoded error information; nothing is printed when NULL.
 */
void print_error_info(const aca_error_info_t *info)
{
    /* Guard against a NULL pointer instead of dereferencing it below */
    if (!info)
    {
        return;
    }

    printf("{\n");
    printf(" \"bank\": \"%s\",\n", info->bank_ref);
    printf(" \"error_location\": {\n");
    printf(" \"oam\": \"%d\",\n", info->oam);
    printf(" \"aid\": \"%d\",\n", info->aid);
    printf(" \"instance\": \"%s\"\n", info->instance_ref);
    printf(" },\n");
    printf(" \"severity\": \"%s\",\n", info->severity_ref);
    printf(" \"afid\": \"%d\",\n", info->afid);
    printf(" \"scrub\": \"%u\",\n", info->scrub);
    printf(" \"err_ext\": \"%u\",\n", info->error_code_ext);
    printf(" \"error_category\": \"%s\",\n", info->category_ref);
    printf(" \"error_type\": \"%s\",\n", info->error_type_ref);
    /* PRIx64 keeps the 64-bit hex formatting portable across platforms */
    printf(" \"address\": \"0x%" PRIx64 "\",\n", info->raw_addr);
    printf(" \"syndrome\": \"0x%" PRIx64 "\"\n", info->raw_synd);
    printf("}\n");
}
/**
 * @brief Print the decoder library version to stdout.
 *
 * The version is shown twice: first through the individual accessor
 * functions, then through the aggregate returned by aca_get_version_info().
 */
void print_version_info(void)
{
    printf("=== ACA Decoder Library Version Information ===\n");

    /* Individual accessors */
    printf("Version: %s\n", aca_get_version_string());
    printf("Major: %d\n", aca_get_version_major());
    printf("Minor: %d\n", aca_get_version_minor());
    printf("Patch: %d\n", aca_get_version_patch());

    /* Aggregate structure */
    const aca_version_info_t ver = aca_get_version_info();
    printf("Complete version info:\n");
    printf(" Major: %d\n", ver.major);
    printf(" Minor: %d\n", ver.minor);
    printf(" Patch: %d\n", ver.patch);
    printf(" String: %s\n", ver.string);

    printf("===============================================\n\n");
}
/**
 * @brief Demo entry point: prints the library version, decodes four sample
 *        register dumps to an AFID, then prints the full decoded error
 *        information for two of them.
 *
 * NOTE(review): the "expected output" values quoted in the sample comments
 * below are carried over unchanged; confirm them against the current AFID
 * table before relying on them.
 *
 * @return Always 0.
 */
int main(void)
{
    // Display version information
    print_version_info();

    // Sample usage of decode_afid with 32-byte register array (HBM FATAL ERROR, expected output is 4)
    const uint64_t register_array_32[ACA_REGISTER_ARRAY_SIZE_32_BYTES] = {0xbaa000000004081b, 0x0, 0x209600090f00, 0x5d000000};
    int afid_32 = decode_afid(register_array_32, ACA_REGISTER_ARRAY_SIZE_32_BYTES, 0, 1);
    printf("Decoded AFID (32-byte array): %d\n", afid_32);

    // Sample usage of decode_afid with 32-byte register array (GC FATAL ERROR, expected output is 3)
    const uint64_t register_array_test[ACA_REGISTER_ARRAY_SIZE_32_BYTES] = {0xbea00000003b0000, 0x100000029, 0x1200136430400, 0x20b};
    int afid_test = decode_afid(register_array_test, ACA_REGISTER_ARRAY_SIZE_32_BYTES, 0, 1);
    printf("Decoded AFID (test array): %d\n", afid_test);

    // Sample usage of decode_afid with 128-byte register array (HBM CORRECTED ERROR, expected output is 1)
    const uint64_t register_array_128[ACA_REGISTER_ARRAY_SIZE_128_BYTES] = {
        0xffff,
        0xdc2040000000011b,
        0x0,
        0xd008000801000000,
        0x25000001ff,
        0x209600191f00,
        0xa000000,
        0x0,
        0x0,
        0x0,
        0xd008000801000000,
        0x0,
        0x0,
        0x0,
        0x0,
        0x0};
    int afid_128 = decode_afid(register_array_128, ACA_REGISTER_ARRAY_SIZE_128_BYTES, 0, 1);
    printf("Decoded AFID (128-byte array): %d\n", afid_128);

    // sample for bad page
    const uint64_t register_array_bad_page[ACA_REGISTER_ARRAY_SIZE_128_BYTES] = {
        0x1,
        0xb000000000000137,
        0x0,
        0x0,
        0x1ff00000002,
        0x9600000000,
        0x0,
        0x0,
        0x0,
        0x0,
        0x0,
        0x0,
        0x0,
        0x0,
        0x0,
        0x0};
    // when flag is 0b1000, it indicates that the error threshold has been exceeded
    // and is always a HBM error. The expected output is 19.
    int afid_bad_page = decode_afid(register_array_bad_page, ACA_REGISTER_ARRAY_SIZE_128_BYTES, ACA_FLAG_THRESHOLD_EXCEEDED, 1);
    printf("Decoded AFID (bad page): %d\n", afid_bad_page);

    // Full decode (not just the AFID) of the first and third samples, printed as JSON
    const aca_error_info_t error_info_32 = decode_error_info(register_array_32, ACA_REGISTER_ARRAY_SIZE_32_BYTES, 0, 1);
    print_error_info(&error_info_32);
    const aca_error_info_t error_info_128 = decode_error_info(register_array_128, ACA_REGISTER_ARRAY_SIZE_128_BYTES, 0, 1);
    print_error_info(&error_info_128);

    return 0;
}
+8 -8
Ver fichero
@@ -29,7 +29,7 @@
#include <sstream>
extern "C" {
#include "aca-decode/aca_decode.h"
#include "ras-decode/aca_decode.h"
}
#include "amd_smi/impl/amd_smi_cper.h"
#include "rocm_smi/rocm_smi_logger.h"
@@ -254,16 +254,16 @@ static int cper_dump_sec_desc(const struct cper_sec_desc *desc)
return 0;
}
static int aca_decode_fatal(const cper_sec_crashdump_data &data, uint32_t flag, uint16_t hw_revision)
static int aca_decode_fatal(const cper_sec_crashdump_data &data, uint32_t flag, uint16_t hw_revision, uint16_t register_context_type)
{
const uint64_t *register_array = reinterpret_cast<const uint64_t *>(&data.dump.fatal_err);
return decode_afid(register_array, sizeof(data.dump.fatal_err)/sizeof(uint64_t), flag, hw_revision);
return decode_afid(register_array, sizeof(data.dump.fatal_err)/sizeof(uint64_t), flag, hw_revision, register_context_type);
}
static int aca_decode_corrected_error(const uint32_t *reg_dump, size_t num_bytes, uint32_t flag, uint16_t hw_revision)
static int aca_decode_corrected_error(const uint32_t *reg_dump, size_t num_bytes, uint32_t flag, uint16_t hw_revision, uint16_t register_context_type)
{
const uint64_t *register_array = reinterpret_cast<const uint64_t *>(reg_dump);
return decode_afid(register_array, num_bytes, flag, hw_revision);
return decode_afid(register_array, num_bytes, flag, hw_revision, register_context_type);
}
static int cper_dump_nonstd_err(const struct cper_sec_nonstd_err *nonstd_err, const cper_sec_desc *section)
@@ -299,7 +299,7 @@ exit:
LOG_DEBUG(ss);
return aca_decode_corrected_error(body->err_ctx.reg_dump, sizeof(body->err_ctx.reg_dump)/sizeof(uint64_t),
section->flags_mask, section->revision_major);
section->flags_mask, section->revision_major, body->err_ctx.reg_ctx_type);
}
static int cper_dump_cr_fatal(const struct cper_sec_crashdump *crashdump, const cper_sec_desc *section)
@@ -320,7 +320,7 @@ static int cper_dump_cr_fatal(const struct cper_sec_crashdump *crashdump, const
LOG_DEBUG(ss);
return aca_decode_fatal(crashdump->data, section->flags_mask, section->revision_major);
return aca_decode_fatal(crashdump->data, section->flags_mask, section->revision_major, crashdump->data.reg_ctx_type);
}
static int cper_dump_cr_boot(const struct cper_sec_crashdump *crashdump, const cper_sec_desc *section)
@@ -335,7 +335,7 @@ static int cper_dump_cr_boot(const struct cper_sec_crashdump *crashdump, const c
ss << "~~~~CRASH DUMP - BOOT TIME~~~\n\n";
LOG_DEBUG(ss);
return aca_decode_fatal(crashdump->data, section->flags_mask, section->revision_major);
return aca_decode_fatal(crashdump->data, section->flags_mask, section->revision_major, crashdump->data.reg_ctx_type);
}
} //namespace
@@ -32,8 +32,11 @@
#include "aca_decode.h"
#include "aca_tables.h"
#include "error_map.h"
#include "aca_constants.h"
#include "ras_decode_constants.h"
#include "json_util.h"
#include <string.h>
#include <stdio.h>
#include <inttypes.h>
/**
* @brief Gets the bank name based on hardware ID and ACA type
@@ -61,18 +64,18 @@ aca_decoder_get_bank(const aca_decoder_t *decoder, const char **bank_name)
static const char *get_error_severity(const aca_status_fields_t *status)
{
if (status->poison)
return ACA_SEVERITY_UNCORRECTED_NON_FATAL;
return RAS_DECODE_SEVERITY_UNCORRECTED_NON_FATAL;
if (status->pcc)
return ACA_SEVERITY_FATAL;
return RAS_DECODE_SEVERITY_FATAL;
if (!status->pcc && status->uc && status->tcc)
return ACA_SEVERITY_FATAL;
return RAS_DECODE_SEVERITY_FATAL;
if (!status->pcc && status->uc && !status->tcc)
return ACA_SEVERITY_UNCORRECTED_NON_FATAL;
return RAS_DECODE_SEVERITY_UNCORRECTED_NON_FATAL;
if (!status->pcc && !status->uc && !status->tcc && status->deferred)
return ACA_SEVERITY_UNCORRECTED_NON_FATAL;
return RAS_DECODE_SEVERITY_UNCORRECTED_NON_FATAL;
if (!status->pcc && !status->uc && !status->tcc && !status->deferred)
return ACA_SEVERITY_CORRECTED;
return ACA_SEVERITY_UNKNOWN;
return RAS_DECODE_SEVERITY_CORRECTED;
return RAS_DECODE_SEVERITY_UNKNOWN;
}
/**
@@ -85,31 +88,31 @@ static const char *get_error_category(const char *bank, const char *error_type)
{
if (!bank || !error_type)
{
return ACA_SEVERITY_UNKNOWN;
return RAS_DECODE_SEVERITY_UNKNOWN;
}
if (strcmp(bank, ACA_BANK_UMC) == 0)
if (strcmp(bank, RAS_DECODE_BANK_UMC) == 0)
{
if (strcmp(error_type, ACA_ERROR_TYPE_ON_DIE_ECC) == 0 ||
if (strcmp(error_type, RAS_DECODE_ERROR_TYPE_ON_DIE_ECC) == 0 ||
strcmp(error_type, "WriteDataPoisonErr") == 0 ||
strcmp(error_type, "AddressCommandParityErr") == 0 ||
strcmp(error_type, "WriteDataCrcErr") == 0 ||
strcmp(error_type, "EcsErr") == 0 ||
strcmp(error_type, "RdCrcErr") == 0 ||
strcmp(error_type, ACA_ERROR_TYPE_END_TO_END_CRC) == 0)
strcmp(error_type, RAS_DECODE_ERROR_TYPE_END_TO_END_CRC) == 0)
{
return ACA_CATEGORY_HBM_ERRORS;
return RAS_DECODE_CATEGORY_HBM_ERRORS;
}
}
else if (strcmp(bank, ACA_BANK_PCS_XGMI) == 0 ||
strcmp(bank, ACA_BANK_KPX_SERDES) == 0 ||
strcmp(bank, ACA_BANK_KPX_WAFL) == 0 ||
(strcmp(bank, ACA_BANK_PSP) == 0 && strcmp(error_type, ACA_ERROR_TYPE_WAFL) == 0))
else if (strcmp(bank, RAS_DECODE_BANK_PCS_XGMI) == 0 ||
strcmp(bank, RAS_DECODE_BANK_KPX_SERDES) == 0 ||
strcmp(bank, RAS_DECODE_BANK_KPX_WAFL) == 0 ||
(strcmp(bank, RAS_DECODE_BANK_PSP) == 0 && strcmp(error_type, RAS_DECODE_ERROR_TYPE_WAFL) == 0))
{
return ACA_CATEGORY_OFF_PACKAGE_LINK_ERRORS;
return RAS_DECODE_CATEGORY_OFF_PACKAGE_LINK_ERRORS;
}
return ACA_CATEGORY_DEVICE_INTERNAL_ERRORS;
return RAS_DECODE_CATEGORY_DEVICE_INTERNAL_ERRORS;
}
/**
@@ -125,55 +128,55 @@ static int get_service_error_type(const char *error_category, const char *error_
const char *error_severity, const char **service_error_type)
{
if (!error_category || !error_type || !error_severity || !service_error_type ||
strcmp(error_category, ACA_SEVERITY_UNKNOWN) == 0 ||
strcmp(error_type, ACA_SEVERITY_UNKNOWN) == 0 ||
strcmp(error_severity, ACA_SEVERITY_UNKNOWN) == 0)
strcmp(error_category, RAS_DECODE_SEVERITY_UNKNOWN) == 0 ||
strcmp(error_type, RAS_DECODE_SEVERITY_UNKNOWN) == 0 ||
strcmp(error_severity, RAS_DECODE_SEVERITY_UNKNOWN) == 0)
{
return -1;
}
if (strcmp(error_type, ACA_ERROR_TYPE_BAD_PAGE_RETIREMENT_THRESHOLD) == 0)
if (strcmp(error_type, RAS_DECODE_ERROR_TYPE_BAD_PAGE_RETIREMENT_THRESHOLD) == 0)
{
*service_error_type = ACA_ERROR_TYPE_BAD_PAGE_RETIREMENT_THRESHOLD;
*service_error_type = RAS_DECODE_ERROR_TYPE_BAD_PAGE_RETIREMENT_THRESHOLD;
return 0;
}
if ((strcmp(error_category, ACA_CATEGORY_HBM_ERRORS) == 0) && (strcmp(error_severity, ACA_SEVERITY_CORRECTED) == 0))
if ((strcmp(error_category, RAS_DECODE_CATEGORY_HBM_ERRORS) == 0) && (strcmp(error_severity, RAS_DECODE_SEVERITY_CORRECTED) == 0))
{
*service_error_type = ACA_ERROR_TYPE_ALL;
*service_error_type = RAS_DECODE_ERROR_TYPE_ALL;
return 0;
}
if (strcmp(error_type, "RdCrcErr") == 0)
{
*service_error_type = ACA_ERROR_TYPE_END_TO_END_CRC;
*service_error_type = RAS_DECODE_ERROR_TYPE_END_TO_END_CRC;
return 0;
}
if ((strcmp(error_category, ACA_CATEGORY_HBM_ERRORS) == 0) && (strcmp(error_severity, ACA_SEVERITY_FATAL) == 0) &&
(strcmp(error_type, ACA_ERROR_TYPE_ON_DIE_ECC) != 0) && (strcmp(error_type, ACA_ERROR_TYPE_END_TO_END_CRC) != 0))
if ((strcmp(error_category, RAS_DECODE_CATEGORY_HBM_ERRORS) == 0) && (strcmp(error_severity, RAS_DECODE_SEVERITY_FATAL) == 0) &&
(strcmp(error_type, RAS_DECODE_ERROR_TYPE_ON_DIE_ECC) != 0) && (strcmp(error_type, RAS_DECODE_ERROR_TYPE_END_TO_END_CRC) != 0))
{
*service_error_type = ACA_ERROR_TYPE_ALL_OTHERS;
*service_error_type = RAS_DECODE_ERROR_TYPE_ALL_OTHERS;
return 0;
}
if (strcmp(error_category, ACA_CATEGORY_DEVICE_INTERNAL_ERRORS) == 0)
if (strcmp(error_category, RAS_DECODE_CATEGORY_DEVICE_INTERNAL_ERRORS) == 0)
{
if ((strcmp(error_severity, ACA_SEVERITY_UNCORRECTED_NON_FATAL) == 0 ||
strcmp(error_severity, ACA_SEVERITY_CORRECTED) == 0 ||
strcmp(error_severity, ACA_SEVERITY_FATAL) == 0) &&
strcmp(error_type, ACA_ERROR_TYPE_HARDWARE_ASSERTION) != 0 &&
strcmp(error_type, ACA_ERROR_TYPE_WATCHDOG_TIMEOUT) != 0)
if ((strcmp(error_severity, RAS_DECODE_SEVERITY_UNCORRECTED_NON_FATAL) == 0 ||
strcmp(error_severity, RAS_DECODE_SEVERITY_CORRECTED) == 0 ||
strcmp(error_severity, RAS_DECODE_SEVERITY_FATAL) == 0) &&
strcmp(error_type, RAS_DECODE_ERROR_TYPE_HARDWARE_ASSERTION) != 0 &&
strcmp(error_type, RAS_DECODE_ERROR_TYPE_WATCHDOG_TIMEOUT) != 0)
{
*service_error_type = ACA_ERROR_TYPE_ALL_OTHERS;
*service_error_type = RAS_DECODE_ERROR_TYPE_ALL_OTHERS;
return 0;
}
}
if (strcmp(error_category, ACA_CATEGORY_OFF_PACKAGE_LINK_ERRORS) == 0)
if (strcmp(error_category, RAS_DECODE_CATEGORY_OFF_PACKAGE_LINK_ERRORS) == 0)
{
if (strcmp(error_bank, ACA_BANK_PCS_XGMI) == 0)
if (strcmp(error_bank, RAS_DECODE_BANK_PCS_XGMI) == 0)
{
*service_error_type = ACA_ERROR_TYPE_XGMI;
*service_error_type = RAS_DECODE_ERROR_TYPE_XGMI;
return 0;
}
if (strcmp(error_bank, ACA_BANK_KPX_WAFL) == 0)
if (strcmp(error_bank, RAS_DECODE_BANK_KPX_WAFL) == 0)
{
*service_error_type = ACA_ERROR_TYPE_WAFL;
*service_error_type = RAS_DECODE_ERROR_TYPE_WAFL;
return 0;
}
}
@@ -199,13 +202,15 @@ static void aca_decoder_get_error_info(const aca_decoder_t *decoder, aca_error_i
info->raw_synd = decoder->aca_synd;
info->scrub = decoder->status.scrub;
info->poison = decoder->status.poison;
info->deferred = decoder->status.deferred;
info->error_code_ext = decoder->status.error_code_ext;
result = aca_decoder_get_bank(decoder, &bank);
if (result < 0)
{
bank = ACA_SEVERITY_UNKNOWN;
bank = RAS_DECODE_SEVERITY_UNKNOWN;
}
info->bank_ref = bank;
@@ -215,13 +220,13 @@ static void aca_decoder_get_error_info(const aca_decoder_t *decoder, aca_error_i
}
else
{
info->instance_ref = ACA_ERROR_TYPE_DECODE_INAPPLICABLE;
info->instance_ref = RAS_DECODE_ERROR_TYPE_DECODE_INAPPLICABLE;
}
// 0b1000 indicate error threshold has been exceeded, and is always fatal
if (decoder->flags & ACA_FLAG_THRESHOLD_EXCEEDED)
if (decoder->flags & RAS_DECODE_FLAG_THRESHOLD_EXCEEDED)
{
info->severity_ref = ACA_SEVERITY_FATAL;
info->severity_ref = RAS_DECODE_SEVERITY_FATAL;
}
else
{
@@ -242,31 +247,31 @@ static void aca_decoder_get_error_info(const aca_decoder_t *decoder, aca_error_i
info->aid = -1; // Invalid value
}
if (decoder->status.error_code_ext >= ACA_ERROR_CODE_EXT_MIN && decoder->status.error_code_ext <= ACA_ERROR_CODE_EXT_MAX)
if (decoder->status.error_code_ext >= RAS_DECODE_ERROR_CODE_EXT_MIN && decoder->status.error_code_ext <= RAS_DECODE_ERROR_CODE_EXT_MAX)
{
uint32_t instance_id = decoder->ipid.instance_id_lo;
uint32_t error_info = decoder->synd.error_information & 0xFF;
if ((instance_id == ACA_INSTANCE_ID_XCD0_400 || instance_id == ACA_INSTANCE_ID_XCD1_400 ||
instance_id == ACA_INSTANCE_ID_XCD0_401 || instance_id == ACA_INSTANCE_ID_XCD1_401) &&
if ((instance_id == RAS_DECODE_INSTANCE_ID_XCD0_400 || instance_id == RAS_DECODE_INSTANCE_ID_XCD1_400 ||
instance_id == RAS_DECODE_INSTANCE_ID_XCD0_401 || instance_id == RAS_DECODE_INSTANCE_ID_XCD1_401) &&
find_error_in_table(xcd_error_table, NUM_XCD_ERRORS, error_info, &error_type) == 0)
{
info->error_type_ref = error_type;
}
else if ((instance_id == ACA_INSTANCE_ID_AID_400 || instance_id == ACA_INSTANCE_ID_AID_401) &&
else if ((instance_id == RAS_DECODE_INSTANCE_ID_AID_400 || instance_id == RAS_DECODE_INSTANCE_ID_AID_401) &&
find_error_in_table(aid_error_table, NUM_AID_ERRORS, error_info, &error_type) == 0)
{
info->error_type_ref = error_type;
}
else
{
info->error_type_ref = ACA_SEVERITY_UNKNOWN;
info->error_type_ref = RAS_DECODE_SEVERITY_UNKNOWN;
}
}
// 0b1000 indicate error threshold has been exceeded
else if (decoder->flags & ACA_FLAG_THRESHOLD_EXCEEDED)
else if (decoder->flags & RAS_DECODE_FLAG_THRESHOLD_EXCEEDED)
{
info->error_type_ref = ACA_ERROR_TYPE_BAD_PAGE_RETIREMENT_THRESHOLD;
info->error_type_ref = RAS_DECODE_ERROR_TYPE_BAD_PAGE_RETIREMENT_THRESHOLD;
}
else
{
@@ -276,14 +281,14 @@ static void aca_decoder_get_error_info(const aca_decoder_t *decoder, aca_error_i
}
else
{
info->error_type_ref = ACA_SEVERITY_UNKNOWN;
info->error_type_ref = RAS_DECODE_SEVERITY_UNKNOWN;
}
}
// 0b1000 indicate error threshold has been exceeded, and is always a HBM error
if (decoder->flags & ACA_FLAG_THRESHOLD_EXCEEDED)
if (decoder->flags & RAS_DECODE_FLAG_THRESHOLD_EXCEEDED)
{
info->category_ref = ACA_CATEGORY_HBM_ERRORS;
info->category_ref = RAS_DECODE_CATEGORY_HBM_ERRORS;
}
else
{
@@ -295,8 +300,6 @@ static void aca_decoder_get_error_info(const aca_decoder_t *decoder, aca_error_i
{
service_error = info->error_type_ref;
}
info->afid = get_error_id(info->category_ref, service_error, info->severity_ref);
}
/**
@@ -324,8 +327,17 @@ static void aca_decoder_init(aca_decoder_t *decoder, uint16_t hw_revision, uint3
aca_synd_init(&decoder->synd, synd_reg);
}
aca_error_info_t aca_decode(const aca_raw_data_t *raw_data)
/**
* @brief Main decode function that processes raw ACA error data and returns JSON
* @param[in] raw_data Pointer to structure containing raw ACA error data
* @return JsonValue* containing the decoded error information, or NULL on failure
*/
JsonValue* aca_decode(const aca_raw_data_t *raw_data)
{
if (!raw_data) {
return NULL;
}
aca_decoder_t decoder = {0};
aca_error_info_t info = {0};
@@ -337,5 +349,68 @@ aca_error_info_t aca_decode(const aca_raw_data_t *raw_data)
raw_data->aca_synd);
aca_decoder_get_error_info(&decoder, &info);
return info;
// Create the main JSON object
JsonValue *json_obj = json_create_object();
if (!json_obj) {
return NULL;
}
// Add bank
json_object_set(json_obj, "bank", json_create_string(info.bank_ref));
// Create error_location object
JsonValue *error_location = json_create_object();
if (error_location) {
char oam_str[16], aid_str[16];
snprintf(oam_str, sizeof(oam_str), "%d", info.oam);
snprintf(aid_str, sizeof(aid_str), "%d", info.aid);
json_object_set(error_location, "oam", json_create_string(oam_str));
json_object_set(error_location, "aid", json_create_string(aid_str));
json_object_set(error_location, "instance", json_create_string(info.instance_ref));
json_object_set(json_obj, "error_location", error_location);
}
// Add severity
json_object_set(json_obj, "severity", json_create_string(info.severity_ref));
// Add scrub as string
char scrub_str[16];
snprintf(scrub_str, sizeof(scrub_str), "%u", info.scrub);
json_object_set(json_obj, "scrub", json_create_string(scrub_str));
// Add poison as string
char poison_str[16];
snprintf(poison_str, sizeof(poison_str), "%u", info.poison);
json_object_set(json_obj, "poison", json_create_string(poison_str));
// Add deferred as string
char deferred_str[16];
snprintf(deferred_str, sizeof(deferred_str), "%u", info.deferred);
json_object_set(json_obj, "deferred", json_create_string(deferred_str));
// Add err_ext as string
char err_ext_str[16];
snprintf(err_ext_str, sizeof(err_ext_str), "%u", info.error_code_ext);
json_object_set(json_obj, "err_ext", json_create_string(err_ext_str));
// Add error_category
json_object_set(json_obj, "error_category", json_create_string(info.category_ref));
// Add error_type
json_object_set(json_obj, "error_type", json_create_string(info.error_type_ref));
// Add address as hex string
char address_str[32];
snprintf(address_str, sizeof(address_str), "0x%" PRIx64, info.raw_addr);
json_object_set(json_obj, "address", json_create_string(address_str));
// Add syndrome as hex string
char syndrome_str[32];
snprintf(syndrome_str, sizeof(syndrome_str), "0x%" PRIx64, info.raw_synd);
json_object_set(json_obj, "syndrome", json_create_string(syndrome_str));
return json_obj;
}
@@ -1,4 +1,3 @@
// SPDX-License-Identifier: MIT
/*
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
@@ -21,7 +20,7 @@
* THE SOFTWARE.
*/
/**
/**
* @file aca_fields.c
* @brief Implementation of ACA register field handling
*
@@ -33,7 +33,7 @@
*/
#include "aca_tables.h"
#include "aca_constants.h"
#include "ras_decode_constants.h"
#include <stdint.h>
#include <stddef.h>
#include <string.h>
@@ -80,19 +80,19 @@ const aca_error_type_t error_table[] = {
{"cs", 0xe, "FTI_ND_ILL_REQ"},
{"cs", 0xf, "FTI_ND_ADDR_VIOL"},
{"cs", 0x10, "FTI_ND_SEC_VIOL"},
{"cs", 0x11, ACA_ERROR_TYPE_HARDWARE_ASSERTION},
{"cs", 0x11, RAS_DECODE_ERROR_TYPE_HARDWARE_ASSERTION},
{"cs", 0x12, "ST_PRT_ERR"},
{"cs", 0x13, "ST_ECC_ERR"},
{"cs", 0x14, "ST_TXN_ERR"},
{"pie", 0x0, ACA_ERROR_TYPE_HARDWARE_ASSERTION},
{"pie", 0x0, RAS_DECODE_ERROR_TYPE_HARDWARE_ASSERTION},
{"pie", 0x1, "CSW"},
{"pie", 0x2, "GMI"},
{"pie", 0x3, "FTI_DAT_STAT"},
{"pie", 0x4, "DEF"},
{"pie", 0x5, ACA_ERROR_TYPE_WATCHDOG_TIMEOUT},
{"pie", 0x5, RAS_DECODE_ERROR_TYPE_WATCHDOG_TIMEOUT},
{"pie", 0x6, "CNLI"},
{"pie", 0x7, "RSLVFCI"},
{"umc", 0x0, ACA_ERROR_TYPE_ON_DIE_ECC},
{"umc", 0x0, RAS_DECODE_ERROR_TYPE_ON_DIE_ECC},
{"umc", 0x1, "WriteDataPoisonErr"},
{"umc", 0x2, "SdpParityErr"},
{"umc", 0x4, "AddressCommandParityErr"},
@@ -103,7 +103,7 @@ const aca_error_type_t error_table[] = {
{"umc", 0xb, "RdCrcErr"},
{"umc", 0xd, "MpFwErr"},
{"umc", 0xe, "MpParErr"},
{"umc", 0xf, ACA_ERROR_TYPE_END_TO_END_CRC},
{"umc", 0xf, RAS_DECODE_ERROR_TYPE_END_TO_END_CRC},
{"psp", 0x0, "Mp0HighSramError"},
{"psp", 0x1, "Mp0LowSramError"},
{"psp", 0x2, "Mp0IDataBank0Error"},
@@ -127,7 +127,7 @@ const aca_error_type_t error_table[] = {
{"psp", 0x3b, "SRAM_EDC"},
{"psp", 0x3c, "SMN_Parity"},
{"psp", 0x3d, "SMN_Timeout"},
{"psp", 0x3f, ACA_ERROR_TYPE_WAFL},
{"psp", 0x3f, RAS_DECODE_ERROR_TYPE_WAFL},
{"smu", 0x0, "Mp5HighSramError"},
{"smu", 0x1, "Mp5LowSramError"},
{"smu", 0x2, "Mp5DCacheAError"},
@@ -452,7 +452,11 @@ static const aca_instance_entry_t instance_table[] = {
{"umc", 0x193F00, "ch7 umc0"},
{"umc", 0x393F00, "ch7 umc1"},
{"umc", 0x593F00, "ch7 umc2"},
{"umc", 0x793F00, "ch7 umc3"}};
{"umc", 0x793F00, "ch7 umc3"},
{"pcs_xgmi", 0x11A09200, "serdes a pcs0"},
{"pcs_xgmi", 0x12109200, "serdes b pcs7"},
{"pcs_xgmi", 0x12209200, "serdes b pcs8"},
{"pcs_xgmi", 0x11B09200, "xgmi pcs"}};
const size_t NUM_OAM_AID_ENTRIES = sizeof(oam_aid_table) / sizeof(oam_aid_table[0]);
const size_t NUM_BANKS = sizeof(bank_table) / sizeof(bank_table[0]);
@@ -478,7 +482,7 @@ int find_bank_name(uint16_t hw_id, uint16_t aca_type, const char **bank_name)
}
}
*bank_name = ACA_SEVERITY_UNKNOWN;
*bank_name = RAS_DECODE_SEVERITY_UNKNOWN;
return 1;
}
@@ -499,7 +503,7 @@ int find_error_type_by_bank(const char *bank, uint32_t error_code, const char **
}
}
*error_type = ACA_SEVERITY_UNKNOWN;
*error_type = RAS_DECODE_SEVERITY_UNKNOWN;
return 1;
}
@@ -520,7 +524,7 @@ int find_error_in_table(const aca_error_entry_t *table, size_t table_size,
}
}
*error_type = ACA_SEVERITY_UNKNOWN;
*error_type = RAS_DECODE_SEVERITY_UNKNOWN;
return 1;
}
@@ -556,6 +560,6 @@ int find_instance_name(const char *bank, uint32_t instance_id_lo, const char **i
}
}
*instance_name = ACA_SEVERITY_UNKNOWN;
*instance_name = RAS_DECODE_SEVERITY_UNKNOWN;
return 1;
}
@@ -26,32 +26,32 @@
int aca_get_version_major(void)
{
return ACA_VERSION_MAJOR;
return RAS_DECODE_VERSION_MAJOR;
}
int aca_get_version_minor(void)
{
return ACA_VERSION_MINOR;
return RAS_DECODE_VERSION_MINOR;
}
int aca_get_version_patch(void)
{
return ACA_VERSION_PATCH;
return RAS_DECODE_VERSION_PATCH;
}
const char *aca_get_version_string(void)
{
return ACA_VERSION_STRING;
return RAS_DECODE_VERSION_STRING;
}
aca_version_info_t aca_get_version_info(void)
{
aca_version_info_t info;
info.major = ACA_VERSION_MAJOR;
info.minor = ACA_VERSION_MINOR;
info.patch = ACA_VERSION_PATCH;
info.string = ACA_VERSION_STRING;
info.major = RAS_DECODE_VERSION_MAJOR;
info.minor = RAS_DECODE_VERSION_MINOR;
info.patch = RAS_DECODE_VERSION_PATCH;
info.string = RAS_DECODE_VERSION_STRING;
return info;
}
+862
Ver fichero
@@ -0,0 +1,862 @@
/*
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include "boot_decode.h"
#include "ras_decode_constants.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// Boot decoder mapping tables
static const boot_decoder_entry_t boot_decoder_map_v0[] = {
{BOOT_ENCODING_HBM_TRAINING, decode_hbm_training_v0},
{BOOT_ENCODING_FW_LOAD, decode_fw_load_v0},
{BOOT_ENCODING_WAFL_LINK, decode_wafl_link_training_v0},
{BOOT_ENCODING_XGMI_LINK, decode_xgmi_link_training_v0},
{BOOT_ENCODING_USR_CP_LINK, decode_usr_cp_link_training_v0},
{BOOT_ENCODING_USR_DP_LINK, decode_usr_dp_link_training_v0},
{BOOT_ENCODING_HBM_MEM_TEST, decode_hbm_mem_test_v0},
{BOOT_ENCODING_HBM_BIST_TEST, decode_hbm_bist_test_v0},
{BOOT_ENCODING_BOOT_CTRL_GEN_V0, decode_boot_controller_generic_v0},
{0, NULL} // Sentinel
};
static const boot_decoder_entry_t boot_decoder_map_v1[] = {
{BOOT_ENCODING_HBM_TRAINING, decode_hbm_training_v1},
{BOOT_ENCODING_FW_LOAD, decode_fw_load_v1},
{BOOT_ENCODING_WAFL_LINK, decode_wafl_link_training_v1},
{BOOT_ENCODING_XGMI_LINK, decode_xgmi_link_training_v1},
{BOOT_ENCODING_USR_CP_LINK, decode_usr_cp_link_training_v1},
{BOOT_ENCODING_USR_DP_LINK, decode_usr_dp_link_training_v1},
{BOOT_ENCODING_HBM_MEM_TEST, decode_hbm_mem_test_v1},
{BOOT_ENCODING_HBM_BIST_TEST, decode_hbm_bist_test_v1},
{BOOT_ENCODING_BOOT_CTRL_GEN_V1, decode_boot_controller_generic_v1},
{BOOT_ENCODING_DATA_ABORT, decode_data_abort_v1},
{BOOT_SUCCESS_ENCODING, decode_boot_success_v1},
{0, NULL} // Sentinel
};
/**
 * Read the format version carried by a boot message.
 *
 * The version is byte 1 of the message value shifted right by five bits.
 *
 * @param msg Boot message to inspect; may be NULL.
 * @return The version field, or 0 when msg is NULL.
 */
int get_boot_version(OamBootMsg *msg)
{
    int version = 0;

    if (msg != NULL)
    {
        version = extract_byte(msg->value, 1) >> 5;
    }
    return version;
}
/**
 * Read the error encoding carried by a boot message.
 *
 * The encoding is byte 1 of the message value masked with extract_bits(5),
 * i.e. the part of that byte not used by get_boot_version().
 *
 * @param msg Boot message to inspect; may be NULL.
 * @return The encoding field, or 0 when msg is NULL.
 */
int get_error_encoding(OamBootMsg *msg)
{
    int encoding = 0;

    if (msg != NULL)
    {
        encoding = (int)(extract_byte(msg->value, 1) & extract_bits(5));
    }
    return encoding;
}
/**
 * Tell whether a boot message carries the error-present marker.
 *
 * @param msg Boot message to inspect; may be NULL.
 * @return true when byte 0 equals BOOT_ERROR_PRESENT_MARKER; false otherwise,
 *         including when msg is NULL.
 */
bool error_present(OamBootMsg *msg)
{
    return msg != NULL && extract_byte(msg->value, 0) == BOOT_ERROR_PRESENT_MARKER;
}
/**
 * Tell whether a boot message carries the in-boot marker.
 *
 * @param msg Boot message to inspect; may be NULL.
 * @return true when byte 0 equals BOOT_IN_BOOT_MARKER; false otherwise,
 *         including when msg is NULL.
 */
bool in_boot(OamBootMsg *msg)
{
    return msg != NULL && extract_byte(msg->value, 0) == BOOT_IN_BOOT_MARKER;
}
/**
 * Read the socket number from a boot message.
 *
 * Version 0 messages keep the socket in byte 4. Any other version keeps it in
 * byte 2, shifted right by four and masked with extract_bits(4).
 *
 * @param msg     Boot message to inspect; may be NULL.
 * @param version Message format version selecting the layout.
 * @return The socket number, or 0 when msg is NULL.
 */
int get_socket(OamBootMsg *msg, int version)
{
    if (msg == NULL)
    {
        return 0;
    }
    if (version != 0)
    {
        return (int)((extract_byte(msg->value, 2) >> 4) & extract_bits(4));
    }
    return extract_byte(msg->value, 4);
}
/**
 * Read the AID number from a boot message.
 *
 * Version 0 messages keep the AID in byte 5. Any other version keeps it in
 * byte 2, masked with extract_bits(4).
 *
 * @param msg     Boot message to inspect; may be NULL.
 * @param version Message format version selecting the layout.
 * @return The AID number, or 0 when msg is NULL.
 */
int get_aid(OamBootMsg *msg, int version)
{
    if (msg == NULL)
    {
        return 0;
    }
    if (version != 0)
    {
        return (int)(extract_byte(msg->value, 2) & extract_bits(4));
    }
    return extract_byte(msg->value, 5);
}
/**
 * Translate a raw HBM stack code into a stack index.
 *
 * @param stack Raw stack byte taken from the boot message.
 * @return 0 for HBM_STACK_0, 1 for HBM_STACK_1, HBM_STACK_UNKNOWN for any
 *         other value.
 */
int decode_hbm_stack(uint8_t stack)
{
    int stack_index = HBM_STACK_UNKNOWN;

    switch (stack)
    {
    case HBM_STACK_0:
        stack_index = 0;
        break;
    case HBM_STACK_1:
        stack_index = 1;
        break;
    default:
        break;
    }
    return stack_index;
}
/**
 * Build a JSON array listing the indices of the bits set in a link bitmask.
 *
 * Bit i of byte_value set means link i is reported; indices are pushed in
 * ascending order.
 *
 * @param byte_value Bitmask, one bit per link.
 * @param max_links  Number of low-order bits to examine. Values above 8 are
 *                   treated as 8: the mask has only 8 bits, and shifting by
 *                   the width of int or more would be undefined behavior.
 * @return New JSON array (possibly empty), or NULL if it could not be created.
 */
JsonValue *create_failed_links_array(uint8_t byte_value, int max_links)
{
    enum { MASK_BITS = 8 };

    JsonValue *array = json_create_array();
    if (!array)
    {
        return NULL;
    }

    /* Bits at position 8 and above are always zero, so clamping never drops a link. */
    const int link_limit = (max_links > MASK_BITS) ? MASK_BITS : max_links;

    for (int i = 0; i < link_limit; i++)
    {
        if ((byte_value >> i) & 0x1)
        {
            JsonValue *link_num = json_create_number(i);
            if (link_num)
            {
                json_array_push(array, link_num);
            }
        }
    }
    return array;
}
/**
 * Format a value as an upper-case hexadecimal string with a "0x" prefix.
 *
 * The digits are zero-padded to at least `width` characters. Values that need
 * more digits than `width` are printed in full rather than truncated.
 *
 * @param value Value to format.
 * @param width Minimum number of hex digits; must not be negative.
 * @return Heap-allocated NUL-terminated string that the caller must free(),
 *         or NULL when width is negative, formatting fails, or allocation
 *         fails.
 */
char *create_hex_string(uint64_t value, int width)
{
    if (width < 0)
    {
        return NULL;
    }

    const unsigned long long wide_value = (unsigned long long)value;

    /* Ask snprintf for the real length so wide values are never cut short. */
    const int needed = snprintf(NULL, 0, "0x%0*llX", width, wide_value);
    if (needed < 0)
    {
        return NULL;
    }

    const size_t buffer_size = (size_t)needed + 1U; /* + NUL terminator */
    char *hex_str = malloc(buffer_size);
    if (!hex_str)
    {
        return NULL;
    }

    snprintf(hex_str, buffer_size, "0x%0*llX", width, wide_value);
    return hex_str;
}
// Version 0 decoder implementations
/**
 * Decode a version 0 HBM training failure message.
 *
 * Produces an object with "error_type", "socket", "aid", "hbm_stack" (byte 3
 * mapped through decode_hbm_stack) and "hbm_channel" (byte 2).
 *
 * @param msg Boot message to decode.
 * @return New JSON object, or NULL if msg is NULL or the object could not be
 *         created.
 */
JsonValue *decode_hbm_training_v0(OamBootMsg *msg)
{
    if (msg == NULL)
    {
        return NULL;
    }

    JsonValue *obj = json_create_object();
    if (obj == NULL)
    {
        return NULL;
    }

    const int ver = get_boot_version(msg);
    const uint8_t stack_byte = extract_byte(msg->value, 3);
    const uint8_t channel_byte = extract_byte(msg->value, 2);

    json_object_set(obj, "error_type", json_create_string(RAS_DECODE_ERROR_TYPE_HBM_TRAINING));
    json_object_set(obj, "socket", json_create_number(get_socket(msg, ver)));
    json_object_set(obj, "aid", json_create_number(get_aid(msg, ver)));
    json_object_set(obj, "hbm_stack", json_create_number(decode_hbm_stack(stack_byte)));
    json_object_set(obj, "hbm_channel", json_create_number(channel_byte));
    return obj;
}
/**
 * Decode a version 0 firmware load failure message.
 *
 * The firmware id is the 16-bit value formed by byte 3 (high) and byte 2
 * (low), emitted as a 4-digit hex string under "fw_id" alongside
 * "error_type", "socket" and "aid".
 *
 * @param msg Boot message to decode.
 * @return New JSON object, or NULL if msg is NULL or the object could not be
 *         created.
 */
JsonValue *decode_fw_load_v0(OamBootMsg *msg)
{
    if (msg == NULL)
    {
        return NULL;
    }

    JsonValue *obj = json_create_object();
    if (obj == NULL)
    {
        return NULL;
    }

    const int ver = get_boot_version(msg);
    const uint8_t id_high = extract_byte(msg->value, 3);
    const uint8_t id_low = extract_byte(msg->value, 2);
    const uint16_t fw_id = (id_high << 8) | id_low;
    char *fw_id_text = create_hex_string(fw_id, 4);

    json_object_set(obj, "error_type", json_create_string(RAS_DECODE_ERROR_TYPE_FW_LOAD));
    json_object_set(obj, "socket", json_create_number(get_socket(msg, ver)));
    json_object_set(obj, "aid", json_create_number(get_aid(msg, ver)));
    /* Fall back to a fixed placeholder when the hex string could not be allocated. */
    json_object_set(obj, "fw_id", json_create_string(fw_id_text != NULL ? fw_id_text : "0x0000"));

    free(fw_id_text);
    return obj;
}
/**
 * Decode a version 0 WAFL link training failure message.
 *
 * Byte 2 is treated as a bitmask of failed links; its two low-order bits are
 * expanded into the "failed_links" array, next to "error_type", "socket" and
 * "aid".
 *
 * @param msg Boot message to decode.
 * @return New JSON object, or NULL if msg is NULL or the object could not be
 *         created.
 */
JsonValue *decode_wafl_link_training_v0(OamBootMsg *msg)
{
    if (msg == NULL)
    {
        return NULL;
    }

    JsonValue *obj = json_create_object();
    if (obj == NULL)
    {
        return NULL;
    }

    const int ver = get_boot_version(msg);
    const uint8_t link_mask = extract_byte(msg->value, 2);
    JsonValue *links = create_failed_links_array(link_mask, 2);

    json_object_set(obj, "error_type", json_create_string(RAS_DECODE_ERROR_TYPE_WAFL_LINK_TRAINING));
    json_object_set(obj, "socket", json_create_number(get_socket(msg, ver)));
    json_object_set(obj, "aid", json_create_number(get_aid(msg, ver)));
    if (links == NULL)
    {
        /* Array creation failed earlier; retry so the key is still emitted. */
        links = json_create_array();
    }
    json_object_set(obj, "failed_links", links);
    return obj;
}
/**
 * Decode a version 0 XGMI link training failure message.
 *
 * Byte 2 is treated as a bitmask of failed links; all eight bits are expanded
 * into the "failed_links" array, next to "error_type", "socket" and "aid".
 *
 * @param msg Boot message to decode.
 * @return New JSON object, or NULL if msg is NULL or the object could not be
 *         created.
 */
JsonValue *decode_xgmi_link_training_v0(OamBootMsg *msg)
{
    if (msg == NULL)
    {
        return NULL;
    }

    JsonValue *obj = json_create_object();
    if (obj == NULL)
    {
        return NULL;
    }

    const int ver = get_boot_version(msg);
    const uint8_t link_mask = extract_byte(msg->value, 2);
    JsonValue *links = create_failed_links_array(link_mask, 8);

    json_object_set(obj, "error_type", json_create_string(RAS_DECODE_ERROR_TYPE_XGMI_LINK_TRAINING));
    json_object_set(obj, "socket", json_create_number(get_socket(msg, ver)));
    json_object_set(obj, "aid", json_create_number(get_aid(msg, ver)));
    if (links == NULL)
    {
        /* Array creation failed earlier; retry so the key is still emitted. */
        links = json_create_array();
    }
    json_object_set(obj, "failed_links", links);
    return obj;
}
/**
 * Decode a version 0 USR CP link training failure message.
 *
 * Byte 2 is treated as a bitmask of failed links; its two low-order bits are
 * expanded into the "failed_links" array, next to "error_type", "socket" and
 * "aid".
 *
 * @param msg Boot message to decode.
 * @return New JSON object, or NULL if msg is NULL or the object could not be
 *         created.
 */
JsonValue *decode_usr_cp_link_training_v0(OamBootMsg *msg)
{
    if (msg == NULL)
    {
        return NULL;
    }

    JsonValue *obj = json_create_object();
    if (obj == NULL)
    {
        return NULL;
    }

    const int ver = get_boot_version(msg);
    const uint8_t link_mask = extract_byte(msg->value, 2);
    JsonValue *links = create_failed_links_array(link_mask, 2);

    json_object_set(obj, "error_type", json_create_string(RAS_DECODE_ERROR_TYPE_USR_CP_LINK_TRAINING));
    json_object_set(obj, "socket", json_create_number(get_socket(msg, ver)));
    json_object_set(obj, "aid", json_create_number(get_aid(msg, ver)));
    if (links == NULL)
    {
        /* Array creation failed earlier; retry so the key is still emitted. */
        links = json_create_array();
    }
    json_object_set(obj, "failed_links", links);
    return obj;
}
/**
 * Decode a version 0 USR DP link training failure message.
 *
 * Byte 2 is treated as a bitmask of failed links; its four low-order bits are
 * expanded into the "failed_links" array, next to "error_type", "socket" and
 * "aid".
 *
 * @param msg Boot message to decode.
 * @return New JSON object, or NULL if msg is NULL or the object could not be
 *         created.
 */
JsonValue *decode_usr_dp_link_training_v0(OamBootMsg *msg)
{
    if (msg == NULL)
    {
        return NULL;
    }

    JsonValue *obj = json_create_object();
    if (obj == NULL)
    {
        return NULL;
    }

    const int ver = get_boot_version(msg);
    const uint8_t link_mask = extract_byte(msg->value, 2);
    JsonValue *links = create_failed_links_array(link_mask, 4);

    json_object_set(obj, "error_type", json_create_string(RAS_DECODE_ERROR_TYPE_USR_DP_LINK_TRAINING));
    json_object_set(obj, "socket", json_create_number(get_socket(msg, ver)));
    json_object_set(obj, "aid", json_create_number(get_aid(msg, ver)));
    if (links == NULL)
    {
        /* Array creation failed earlier; retry so the key is still emitted. */
        links = json_create_array();
    }
    json_object_set(obj, "failed_links", links);
    return obj;
}
/**
 * Decode a version 0 HBM memory test failure message.
 *
 * Produces an object with "error_type", "socket", "aid", "hbm_stack" (byte 3
 * mapped through decode_hbm_stack) and "hbm_channel" (byte 2).
 *
 * @param msg Boot message to decode.
 * @return New JSON object, or NULL if msg is NULL or the object could not be
 *         created.
 */
JsonValue *decode_hbm_mem_test_v0(OamBootMsg *msg)
{
    if (msg == NULL)
    {
        return NULL;
    }

    JsonValue *obj = json_create_object();
    if (obj == NULL)
    {
        return NULL;
    }

    const int ver = get_boot_version(msg);
    const uint8_t stack_byte = extract_byte(msg->value, 3);
    const uint8_t channel_byte = extract_byte(msg->value, 2);

    json_object_set(obj, "error_type", json_create_string(RAS_DECODE_ERROR_TYPE_HBM_MEMORY_TEST));
    json_object_set(obj, "socket", json_create_number(get_socket(msg, ver)));
    json_object_set(obj, "aid", json_create_number(get_aid(msg, ver)));
    json_object_set(obj, "hbm_stack", json_create_number(decode_hbm_stack(stack_byte)));
    json_object_set(obj, "hbm_channel", json_create_number(channel_byte));
    return obj;
}
/**
 * Decode a version 0 HBM BIST test failure message.
 *
 * Produces an object with "error_type", "socket", "aid", "hbm_stack" (byte 3
 * mapped through decode_hbm_stack) and "hbm_channel" (byte 2).
 *
 * @param msg Boot message to decode.
 * @return New JSON object, or NULL if msg is NULL or the object could not be
 *         created.
 */
JsonValue *decode_hbm_bist_test_v0(OamBootMsg *msg)
{
    if (msg == NULL)
    {
        return NULL;
    }

    JsonValue *obj = json_create_object();
    if (obj == NULL)
    {
        return NULL;
    }

    const int ver = get_boot_version(msg);
    const uint8_t stack_byte = extract_byte(msg->value, 3);
    const uint8_t channel_byte = extract_byte(msg->value, 2);

    json_object_set(obj, "error_type", json_create_string(RAS_DECODE_ERROR_TYPE_HBM_BIST_TEST));
    json_object_set(obj, "socket", json_create_number(get_socket(msg, ver)));
    json_object_set(obj, "aid", json_create_number(get_aid(msg, ver)));
    json_object_set(obj, "hbm_stack", json_create_number(decode_hbm_stack(stack_byte)));
    json_object_set(obj, "hbm_channel", json_create_number(channel_byte));
    return obj;
}
/**
 * Decode a version 0 generic boot controller error message.
 *
 * Only "error_type", "socket" and "aid" are reported for this encoding.
 *
 * @param msg Boot message to decode.
 * @return New JSON object, or NULL if msg is NULL or the object could not be
 *         created.
 */
JsonValue *decode_boot_controller_generic_v0(OamBootMsg *msg)
{
    if (msg == NULL)
    {
        return NULL;
    }

    JsonValue *obj = json_create_object();
    if (obj == NULL)
    {
        return NULL;
    }

    const int ver = get_boot_version(msg);

    json_object_set(obj, "error_type", json_create_string(RAS_DECODE_ERROR_TYPE_BOOT_CONTROLLER_GENERIC));
    json_object_set(obj, "socket", json_create_number(get_socket(msg, ver)));
    json_object_set(obj, "aid", json_create_number(get_aid(msg, ver)));
    return obj;
}
// Version 1 decoder implementations
/**
 * Decode a version 1 HBM training failure message.
 *
 * Produces an object with "error_type", "socket", "aid", "hbm_stack" (byte 5
 * mapped through decode_hbm_stack) and "hbm_channel" (byte 4).
 *
 * @param msg Boot message to decode.
 * @return New JSON object, or NULL if msg is NULL or the object could not be
 *         created.
 */
JsonValue *decode_hbm_training_v1(OamBootMsg *msg)
{
    if (msg == NULL)
    {
        return NULL;
    }

    JsonValue *obj = json_create_object();
    if (obj == NULL)
    {
        return NULL;
    }

    const int ver = get_boot_version(msg);
    const uint8_t stack_byte = extract_byte(msg->value, 5);
    const uint8_t channel_byte = extract_byte(msg->value, 4);

    json_object_set(obj, "error_type", json_create_string(RAS_DECODE_ERROR_TYPE_HBM_TRAINING));
    json_object_set(obj, "socket", json_create_number(get_socket(msg, ver)));
    json_object_set(obj, "aid", json_create_number(get_aid(msg, ver)));
    json_object_set(obj, "hbm_stack", json_create_number(decode_hbm_stack(stack_byte)));
    json_object_set(obj, "hbm_channel", json_create_number(channel_byte));
    return obj;
}
/**
 * Decode a version 1 firmware load failure message.
 *
 * The firmware id is the 16-bit value formed by byte 5 (high) and byte 4
 * (low), emitted as a 4-digit hex string under "fw_id" alongside
 * "error_type", "socket" and "aid".
 *
 * @param msg Boot message to decode.
 * @return New JSON object, or NULL if msg is NULL or the object could not be
 *         created.
 */
JsonValue *decode_fw_load_v1(OamBootMsg *msg)
{
    if (msg == NULL)
    {
        return NULL;
    }

    JsonValue *obj = json_create_object();
    if (obj == NULL)
    {
        return NULL;
    }

    const int ver = get_boot_version(msg);
    const uint8_t id_high = extract_byte(msg->value, 5);
    const uint8_t id_low = extract_byte(msg->value, 4);
    const uint16_t fw_id = (id_high << 8) | id_low;
    char *fw_id_text = create_hex_string(fw_id, 4);

    json_object_set(obj, "error_type", json_create_string(RAS_DECODE_ERROR_TYPE_FW_LOAD));
    json_object_set(obj, "socket", json_create_number(get_socket(msg, ver)));
    json_object_set(obj, "aid", json_create_number(get_aid(msg, ver)));
    /* Fall back to a fixed placeholder when the hex string could not be allocated. */
    json_object_set(obj, "fw_id", json_create_string(fw_id_text != NULL ? fw_id_text : "0x0000"));

    free(fw_id_text);
    return obj;
}
/**
 * Decode a version 1 WAFL link training failure message.
 *
 * Byte 4 is treated as a bitmask of failed links; its two low-order bits are
 * expanded into the "failed_links" array, next to "error_type", "socket" and
 * "aid".
 *
 * @param msg Boot message to decode.
 * @return New JSON object, or NULL if msg is NULL or the object could not be
 *         created.
 */
JsonValue *decode_wafl_link_training_v1(OamBootMsg *msg)
{
    if (msg == NULL)
    {
        return NULL;
    }

    JsonValue *obj = json_create_object();
    if (obj == NULL)
    {
        return NULL;
    }

    const int ver = get_boot_version(msg);
    const uint8_t link_mask = extract_byte(msg->value, 4);
    JsonValue *links = create_failed_links_array(link_mask, 2);

    json_object_set(obj, "error_type", json_create_string(RAS_DECODE_ERROR_TYPE_WAFL_LINK_TRAINING));
    json_object_set(obj, "socket", json_create_number(get_socket(msg, ver)));
    json_object_set(obj, "aid", json_create_number(get_aid(msg, ver)));
    if (links == NULL)
    {
        /* Array creation failed earlier; retry so the key is still emitted. */
        links = json_create_array();
    }
    json_object_set(obj, "failed_links", links);
    return obj;
}
/**
 * Decode a version 1 XGMI link training failure message.
 *
 * Byte 4 is treated as a bitmask of failed links; all eight bits are expanded
 * into the "failed_links" array, next to "error_type", "socket" and "aid".
 *
 * @param msg Boot message to decode.
 * @return New JSON object, or NULL if msg is NULL or the object could not be
 *         created.
 */
JsonValue *decode_xgmi_link_training_v1(OamBootMsg *msg)
{
    if (msg == NULL)
    {
        return NULL;
    }

    JsonValue *obj = json_create_object();
    if (obj == NULL)
    {
        return NULL;
    }

    const int ver = get_boot_version(msg);
    const uint8_t link_mask = extract_byte(msg->value, 4);
    JsonValue *links = create_failed_links_array(link_mask, 8);

    json_object_set(obj, "error_type", json_create_string(RAS_DECODE_ERROR_TYPE_XGMI_LINK_TRAINING));
    json_object_set(obj, "socket", json_create_number(get_socket(msg, ver)));
    json_object_set(obj, "aid", json_create_number(get_aid(msg, ver)));
    if (links == NULL)
    {
        /* Array creation failed earlier; retry so the key is still emitted. */
        links = json_create_array();
    }
    json_object_set(obj, "failed_links", links);
    return obj;
}
/**
 * Decode a version 1 USR CP link training failure message.
 *
 * Byte 4 is treated as a bitmask of failed links; its two low-order bits are
 * expanded into the "failed_links" array, next to "error_type", "socket" and
 * "aid".
 *
 * @param msg Boot message to decode.
 * @return New JSON object, or NULL if msg is NULL or the object could not be
 *         created.
 */
JsonValue *decode_usr_cp_link_training_v1(OamBootMsg *msg)
{
    if (msg == NULL)
    {
        return NULL;
    }

    JsonValue *obj = json_create_object();
    if (obj == NULL)
    {
        return NULL;
    }

    const int ver = get_boot_version(msg);
    const uint8_t link_mask = extract_byte(msg->value, 4);
    JsonValue *links = create_failed_links_array(link_mask, 2);

    json_object_set(obj, "error_type", json_create_string(RAS_DECODE_ERROR_TYPE_USR_CP_LINK_TRAINING));
    json_object_set(obj, "socket", json_create_number(get_socket(msg, ver)));
    json_object_set(obj, "aid", json_create_number(get_aid(msg, ver)));
    if (links == NULL)
    {
        /* Array creation failed earlier; retry so the key is still emitted. */
        links = json_create_array();
    }
    json_object_set(obj, "failed_links", links);
    return obj;
}
/**
 * Decode a version 1 USR DP link training failure message.
 *
 * Byte 4 is treated as a bitmask of failed links; its four low-order bits are
 * expanded into the "failed_links" array, next to "error_type", "socket" and
 * "aid".
 *
 * @param msg Boot message to decode.
 * @return New JSON object, or NULL if msg is NULL or the object could not be
 *         created.
 */
JsonValue *decode_usr_dp_link_training_v1(OamBootMsg *msg)
{
    if (msg == NULL)
    {
        return NULL;
    }

    JsonValue *obj = json_create_object();
    if (obj == NULL)
    {
        return NULL;
    }

    const int ver = get_boot_version(msg);
    const uint8_t link_mask = extract_byte(msg->value, 4);
    JsonValue *links = create_failed_links_array(link_mask, 4);

    json_object_set(obj, "error_type", json_create_string(RAS_DECODE_ERROR_TYPE_USR_DP_LINK_TRAINING));
    json_object_set(obj, "socket", json_create_number(get_socket(msg, ver)));
    json_object_set(obj, "aid", json_create_number(get_aid(msg, ver)));
    if (links == NULL)
    {
        /* Array creation failed earlier; retry so the key is still emitted. */
        links = json_create_array();
    }
    json_object_set(obj, "failed_links", links);
    return obj;
}
/**
 * Decode a version 1 HBM memory test failure message.
 *
 * Produces an object with "error_type", "socket", "aid", "hbm_stack" (byte 5
 * mapped through decode_hbm_stack) and "hbm_channel" (byte 4).
 *
 * @param msg Boot message to decode.
 * @return New JSON object, or NULL if msg is NULL or the object could not be
 *         created.
 */
JsonValue *decode_hbm_mem_test_v1(OamBootMsg *msg)
{
    if (msg == NULL)
    {
        return NULL;
    }

    JsonValue *obj = json_create_object();
    if (obj == NULL)
    {
        return NULL;
    }

    const int ver = get_boot_version(msg);
    const uint8_t stack_byte = extract_byte(msg->value, 5);
    const uint8_t channel_byte = extract_byte(msg->value, 4);

    json_object_set(obj, "error_type", json_create_string(RAS_DECODE_ERROR_TYPE_HBM_MEMORY_TEST));
    json_object_set(obj, "socket", json_create_number(get_socket(msg, ver)));
    json_object_set(obj, "aid", json_create_number(get_aid(msg, ver)));
    json_object_set(obj, "hbm_stack", json_create_number(decode_hbm_stack(stack_byte)));
    json_object_set(obj, "hbm_channel", json_create_number(channel_byte));
    return obj;
}
/**
 * Decode a version 1 HBM BIST test failure message.
 *
 * Produces an object with "error_type", "socket", "aid", "hbm_stack" (byte 5
 * mapped through decode_hbm_stack) and "hbm_channel" (byte 4).
 *
 * @param msg Boot message to decode.
 * @return New JSON object, or NULL if msg is NULL or the object could not be
 *         created.
 */
JsonValue *decode_hbm_bist_test_v1(OamBootMsg *msg)
{
    if (msg == NULL)
    {
        return NULL;
    }

    JsonValue *obj = json_create_object();
    if (obj == NULL)
    {
        return NULL;
    }

    const int ver = get_boot_version(msg);
    const uint8_t stack_byte = extract_byte(msg->value, 5);
    const uint8_t channel_byte = extract_byte(msg->value, 4);

    json_object_set(obj, "error_type", json_create_string(RAS_DECODE_ERROR_TYPE_HBM_BIST_TEST));
    json_object_set(obj, "socket", json_create_number(get_socket(msg, ver)));
    json_object_set(obj, "aid", json_create_number(get_aid(msg, ver)));
    json_object_set(obj, "hbm_stack", json_create_number(decode_hbm_stack(stack_byte)));
    json_object_set(obj, "hbm_channel", json_create_number(channel_byte));
    return obj;
}
/**
 * Decode a version 1 generic boot controller error message.
 *
 * Reports "error_type", "socket", "aid", the last successful boot step
 * (byte 4, as a 2-digit hex string) and the firmware boot status (bytes 7, 6,
 * 5 and 0 combined most-significant first, as an 8-digit hex string).
 *
 * NOTE(review): the low byte of the status comes from byte 0, which is also
 * the position error_present()/in_boot() read the marker from - confirm this
 * matches the message layout.
 *
 * @param msg Boot message to decode.
 * @return New JSON object, or NULL if msg is NULL or the object could not be
 *         created.
 */
JsonValue *decode_boot_controller_generic_v1(OamBootMsg *msg)
{
    if (!msg)
    {
        return NULL;
    }

    JsonValue *result = json_create_object();
    if (!result)
    {
        return NULL;
    }

    int version = get_boot_version(msg);
    uint8_t byte4 = extract_byte(msg->value, 4);
    uint8_t byte0 = extract_byte(msg->value, 0);
    uint8_t byte5 = extract_byte(msg->value, 5);
    uint8_t byte6 = extract_byte(msg->value, 6);
    uint8_t byte7 = extract_byte(msg->value, 7);

    char *boot_step_str = create_hex_string(byte4, 2);
    /* Widen before shifting: a uint8_t promotes to int, and shifting a value
     * >= 0x80 left by 24 would overflow the sign bit (undefined behavior). */
    uint32_t boot_status = ((uint32_t)byte7 << 24) | ((uint32_t)byte6 << 16) |
                           ((uint32_t)byte5 << 8) | (uint32_t)byte0;
    char *boot_status_str = create_hex_string(boot_status, 8);

    json_object_set(result, "error_type", json_create_string(RAS_DECODE_ERROR_TYPE_BOOT_CONTROLLER_GENERIC));
    json_object_set(result, "socket", json_create_number(get_socket(msg, version)));
    json_object_set(result, "aid", json_create_number(get_aid(msg, version)));
    /* Placeholders keep the keys present if a hex string could not be allocated. */
    json_object_set(result, "last_successful_boot_step_number",
                    json_create_string(boot_step_str ? boot_step_str : "0x00"));
    json_object_set(result, "fw_boot_status",
                    json_create_string(boot_status_str ? boot_status_str : "0x00000000"));

    free(boot_step_str);
    free(boot_status_str);
    return result;
}
/**
 * Decode a version 1 boot controller data abort message.
 *
 * Reports "error_type", "socket", "aid", the last successful boot step
 * (byte 3, as a 2-digit hex string) and the exception address (bytes 7..4
 * combined most-significant first, as an 8-digit hex string).
 *
 * @param msg Boot message to decode.
 * @return New JSON object, or NULL if msg is NULL or the object could not be
 *         created.
 */
JsonValue *decode_data_abort_v1(OamBootMsg *msg)
{
    if (!msg)
    {
        return NULL;
    }

    JsonValue *result = json_create_object();
    if (!result)
    {
        return NULL;
    }

    int version = get_boot_version(msg);
    uint8_t byte3 = extract_byte(msg->value, 3);
    uint8_t byte4 = extract_byte(msg->value, 4);
    uint8_t byte5 = extract_byte(msg->value, 5);
    uint8_t byte6 = extract_byte(msg->value, 6);
    uint8_t byte7 = extract_byte(msg->value, 7);

    char *boot_step_str = create_hex_string(byte3, 2);
    /* Widen before shifting: a uint8_t promotes to int, and shifting a value
     * >= 0x80 left by 24 would overflow the sign bit (undefined behavior). */
    uint32_t exception_addr = ((uint32_t)byte7 << 24) | ((uint32_t)byte6 << 16) |
                              ((uint32_t)byte5 << 8) | (uint32_t)byte4;
    char *exception_addr_str = create_hex_string(exception_addr, 8);

    json_object_set(result, "error_type", json_create_string(RAS_DECODE_ERROR_TYPE_BOOT_CONTROLLER_DATA_ABORT));
    json_object_set(result, "socket", json_create_number(get_socket(msg, version)));
    json_object_set(result, "aid", json_create_number(get_aid(msg, version)));
    /* Placeholders keep the keys present if a hex string could not be allocated. */
    json_object_set(result, "last_successful_boot_step_number",
                    json_create_string(boot_step_str ? boot_step_str : "0x00"));
    json_object_set(result, "exception_address",
                    json_create_string(exception_addr_str ? exception_addr_str : "0x00000000"));

    free(boot_step_str);
    free(exception_addr_str);
    return result;
}
/**
 * Decode a version 1 boot success message.
 *
 * Reports "error_type", the last successful boot step (byte 4, as a 2-digit
 * hex string) and the firmware boot status (bytes 7, 6, 5 and 0 combined
 * most-significant first, as an 8-digit hex string). No socket or AID is
 * emitted for this message kind.
 *
 * @param msg Boot message to decode.
 * @return New JSON object, or NULL if msg is NULL or the object could not be
 *         created.
 */
JsonValue *decode_boot_success_v1(OamBootMsg *msg)
{
    if (!msg)
    {
        return NULL;
    }

    JsonValue *result = json_create_object();
    if (!result)
    {
        return NULL;
    }

    uint8_t byte4 = extract_byte(msg->value, 4);
    uint8_t byte0 = extract_byte(msg->value, 0);
    uint8_t byte5 = extract_byte(msg->value, 5);
    uint8_t byte6 = extract_byte(msg->value, 6);
    uint8_t byte7 = extract_byte(msg->value, 7);

    char *boot_step_str = create_hex_string(byte4, 2);
    /* Widen before shifting: a uint8_t promotes to int, and shifting a value
     * >= 0x80 left by 24 would overflow the sign bit (undefined behavior). */
    uint32_t boot_status = ((uint32_t)byte7 << 24) | ((uint32_t)byte6 << 16) |
                           ((uint32_t)byte5 << 8) | (uint32_t)byte0;
    char *boot_status_str = create_hex_string(boot_status, 8);

    json_object_set(result, "error_type", json_create_string(RAS_DECODE_ERROR_TYPE_BOOT_SUCCESS));
    /* Placeholders keep the keys present if a hex string could not be allocated. */
    json_object_set(result, "last_successful_boot_step_number",
                    json_create_string(boot_step_str ? boot_step_str : "0x00"));
    json_object_set(result, "fw_boot_status",
                    json_create_string(boot_status_str ? boot_status_str : "0x00000000"));

    free(boot_step_str);
    free(boot_status_str);
    return result;
}
// Unhandled error decoders
/**
 * Produce the result for a version 0 message that no decoder handles.
 *
 * The message content is not inspected; the object only carries
 * "error_type" set to RAS_DECODE_ERROR_TYPE_UNHANDLED.
 *
 * @param msg Unused; accepted to match the decoder function signature.
 * @return New JSON object, or NULL if it could not be created.
 */
JsonValue *decode_unhandled_error_v0(OamBootMsg *msg)
{
    (void)msg; /* intentionally unused */

    JsonValue *obj = json_create_object();
    if (obj != NULL)
    {
        json_object_set(obj, "error_type", json_create_string(RAS_DECODE_ERROR_TYPE_UNHANDLED));
    }
    return obj;
}
/**
 * Produce the result for a version 1 message that no decoder handles.
 *
 * Reports "error_type" as RAS_DECODE_ERROR_TYPE_UNHANDLED together with the
 * last successful boot step (byte 4, as a 2-digit hex string) and the firmware
 * boot status (bytes 7, 6, 5 and 0 combined most-significant first, as an
 * 8-digit hex string).
 *
 * @param msg Boot message to decode.
 * @return New JSON object, or NULL if msg is NULL or the object could not be
 *         created.
 */
JsonValue *decode_unhandled_error_v1(OamBootMsg *msg)
{
    if (!msg)
    {
        return NULL;
    }

    JsonValue *result = json_create_object();
    if (!result)
    {
        return NULL;
    }

    uint8_t byte4 = extract_byte(msg->value, 4);
    uint8_t byte0 = extract_byte(msg->value, 0);
    uint8_t byte5 = extract_byte(msg->value, 5);
    uint8_t byte6 = extract_byte(msg->value, 6);
    uint8_t byte7 = extract_byte(msg->value, 7);

    char *boot_step_str = create_hex_string(byte4, 2);
    /* Widen before shifting: a uint8_t promotes to int, and shifting a value
     * >= 0x80 left by 24 would overflow the sign bit (undefined behavior). */
    uint32_t boot_status = ((uint32_t)byte7 << 24) | ((uint32_t)byte6 << 16) |
                           ((uint32_t)byte5 << 8) | (uint32_t)byte0;
    char *boot_status_str = create_hex_string(boot_status, 8);

    json_object_set(result, "error_type", json_create_string(RAS_DECODE_ERROR_TYPE_UNHANDLED));
    /* Placeholders keep the keys present if a hex string could not be allocated. */
    json_object_set(result, "last_successful_boot_step_number",
                    json_create_string(boot_step_str ? boot_step_str : "0x00"));
    json_object_set(result, "fw_boot_status",
                    json_create_string(boot_status_str ? boot_status_str : "0x00000000"));

    free(boot_step_str);
    free(boot_status_str);
    return result;
}
/**
 * Select the decoder for a boot message.
 *
 * A version 1 message whose byte 0 is BOOT_IN_BOOT_MARKER always maps to the
 * boot-success decoder. Otherwise the error encoding is looked up in the
 * version 0 table (version == 0) or the version 1 table (any other version).
 *
 * @param msg Boot message to inspect; may be NULL.
 * @return The matching decoder, or NULL when msg is NULL or no table entry
 *         matches the encoding.
 */
boot_decoder_func_t get_decoder_function(OamBootMsg *msg)
{
    if (msg == NULL)
    {
        return NULL;
    }

    const int version = get_boot_version(msg);
    const uint8_t marker = extract_byte(msg->value, 0);

    if (marker == BOOT_IN_BOOT_MARKER && version == 1)
    {
        return decode_boot_success_v1;
    }

    const int encoding = get_error_encoding(msg);
    const boot_decoder_entry_t *entry = (version == 0) ? boot_decoder_map_v0 : boot_decoder_map_v1;

    /* Walk the table until the NULL-decoder sentinel. */
    while (entry->decoder != NULL)
    {
        if (entry->encoding == encoding)
        {
            return entry->decoder;
        }
        entry++;
    }
    return NULL; /* no decoder registered for this encoding */
}
/*
 * Copy every member of the decoded object `src` into `dst`.
 *
 * Strings, numbers, booleans and nulls are duplicated. Arrays are duplicated
 * element by element, keeping only number elements. Members of any other type
 * (and members whose copy could not be created) are left out.
 */
static void copy_decoded_fields(JsonValue *dst, JsonValue *src)
{
    for (JsonPair *pair = src->data.object; pair != NULL; pair = pair->next)
    {
        JsonValue *value_copy = NULL;

        switch (pair->value->type)
        {
        case JSON_STRING:
            value_copy = json_create_string(pair->value->data.string);
            break;
        case JSON_NUMBER:
            value_copy = json_create_number(pair->value->data.number);
            break;
        case JSON_BOOL:
            value_copy = json_create_bool(pair->value->data.boolean);
            break;
        case JSON_NULL:
            value_copy = json_create_null();
            break;
        case JSON_ARRAY:
            value_copy = json_create_array();
            if (value_copy)
            {
                for (size_t j = 0; j < pair->value->data.array.count; j++)
                {
                    JsonValue *elem = pair->value->data.array.items[j];
                    JsonValue *elem_copy = NULL;
                    if (elem->type == JSON_NUMBER)
                    {
                        elem_copy = json_create_number(elem->data.number);
                    }
                    if (elem_copy)
                    {
                        json_array_push(value_copy, elem_copy);
                    }
                }
            }
            break;
        default:
            break;
        }

        if (value_copy)
        {
            json_object_set(dst, pair->key, value_copy);
        }
    }
}

/**
 * Decode a batch of raw OAM boot messages into one JSON object.
 *
 * Each decoded message is stored under the key "msg<i>", where i is its index
 * in the input. A message is "marked" when byte 0 is the error-present or the
 * in-boot marker. The strategy depends on how many messages are marked:
 *   - none marked:  every message is decoded with decode_unhandled_error_v1;
 *   - some marked:  only marked messages are decoded, the rest are skipped;
 *   - all marked:   every message is decoded.
 * When messages are marked but none of them resolves to a decoder, the
 * unhandled-error decoders are used for the marked messages instead. A marked
 * message with no decoder (while others have one) yields an empty object.
 *
 * @param oam_boot_msgs Array of raw 64-bit boot message values.
 * @param count         Number of entries in oam_boot_msgs.
 * @return New JSON object keyed by message index, or NULL when the input is
 *         NULL/empty or an allocation fails.
 */
JsonValue *boot_decode_orchestrator(uint64_t *oam_boot_msgs, size_t count)
{
    if (!oam_boot_msgs || count == 0)
    {
        return NULL;
    }

    JsonValue *results = json_create_object();
    if (!results)
    {
        return NULL;
    }

    /* calloc fails cleanly if count * element size would overflow, where a
     * hand-written multiplication inside malloc would under-allocate. */
    OamBootMsg *msgs = calloc(count, sizeof *msgs);
    bool *has_marker = calloc(count, sizeof *has_marker);
    if (!msgs || !has_marker)
    {
        free(msgs);
        free(has_marker);
        json_free(results);
        return NULL;
    }

    /* Wrap the raw values and count how many carry an error or in-boot marker. */
    size_t messages_with_markers = 0;
    for (size_t i = 0; i < count; i++)
    {
        msgs[i].value = oam_boot_msgs[i];
        has_marker[i] = error_present(&msgs[i]) || in_boot(&msgs[i]);
        if (has_marker[i])
        {
            messages_with_markers++;
        }
    }

    /* Determine decoding strategy based on the presence of markers. */
    bool decode_all_as_unhandled = (messages_with_markers == 0);
    bool decode_only_marked = (messages_with_markers > 0 && messages_with_markers < count);
    bool decode_all_normally = (messages_with_markers == count);

    /* True when no marked message resolves to a real decoder. */
    bool all_decoders_none = true;
    if (!decode_all_as_unhandled)
    {
        for (size_t i = 0; i < count; i++)
        {
            if (has_marker[i] && get_decoder_function(&msgs[i]) != NULL)
            {
                all_decoders_none = false;
                break;
            }
        }
    }

    for (size_t i = 0; i < count; i++)
    {
        char msg_key[32];
        snprintf(msg_key, sizeof(msg_key), "msg%zu", i);

        /* Selective mode: unmarked messages produce no entry at all. */
        if (decode_only_marked && !has_marker[i])
        {
            continue;
        }

        JsonValue *msg_result = json_create_object();
        if (!msg_result)
        {
            continue;
        }

        boot_decoder_func_t decoder_func = NULL;
        if (decode_all_as_unhandled)
        {
            decoder_func = decode_unhandled_error_v1;
        }
        else if (has_marker[i] || decode_all_normally)
        {
            if (all_decoders_none)
            {
                /* NOTE(review): the v0/v1 unhandled decoder is chosen from the
                 * error encoding, not the boot version - confirm this is intended. */
                int encoding = get_error_encoding(&msgs[i]);
                decoder_func = (encoding == 0) ? decode_unhandled_error_v0 : decode_unhandled_error_v1;
            }
            else
            {
                decoder_func = get_decoder_function(&msgs[i]);
            }
        }

        /* Without a decoder the entry stays an empty object. */
        if (decoder_func)
        {
            JsonValue *decoded = decoder_func(&msgs[i]);
            if (decoded)
            {
                copy_decoded_fields(msg_result, decoded);
                json_free(decoded);
            }
        }

        json_object_set(results, msg_key, msg_result);
    }

    free(msgs);
    free(has_marker);
    return results;
}
+88
Ver fichero
@@ -0,0 +1,88 @@
/*
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include "error_map.h"
#include "ras_decode_constants.h"
#include <string.h>
#define AFID_VERSION "0.7"
static const error_map_entry_t error_map[] = {
{1, RAS_DECODE_CATEGORY_BOOT_TIME_ERRORS, RAS_DECODE_ERROR_TYPE_FW_LOAD, RAS_DECODE_PROTOCOL_CPER, RAS_DECODE_SEVERITY_FAIL_TO_INIT},
{2, RAS_DECODE_CATEGORY_BOOT_TIME_ERRORS, RAS_DECODE_ERROR_TYPE_HBM_BIST_TEST, RAS_DECODE_PROTOCOL_CPER, RAS_DECODE_SEVERITY_FAIL_TO_INIT},
{3, RAS_DECODE_CATEGORY_BOOT_TIME_ERRORS, RAS_DECODE_ERROR_TYPE_HBM_MEMORY_TEST, RAS_DECODE_PROTOCOL_CPER, RAS_DECODE_SEVERITY_FAIL_TO_INIT},
{4, RAS_DECODE_CATEGORY_BOOT_TIME_ERRORS, RAS_DECODE_ERROR_TYPE_HBM_TRAINING, RAS_DECODE_PROTOCOL_CPER, RAS_DECODE_SEVERITY_FAIL_TO_INIT},
{5, RAS_DECODE_CATEGORY_BOOT_TIME_ERRORS, RAS_DECODE_ERROR_TYPE_UNHANDLED, RAS_DECODE_PROTOCOL_CPER, RAS_DECODE_SEVERITY_FAIL_TO_INIT},
{6, RAS_DECODE_CATEGORY_BOOT_TIME_ERRORS, RAS_DECODE_ERROR_TYPE_UNKNOWN_ERROR, RAS_DECODE_PROTOCOL_CPER, RAS_DECODE_SEVERITY_FAIL_TO_INIT},
{7, RAS_DECODE_CATEGORY_BOOT_TIME_ERRORS, RAS_DECODE_ERROR_TYPE_USR_CP_LINK_TRAINING, RAS_DECODE_PROTOCOL_CPER, RAS_DECODE_SEVERITY_FAIL_TO_INIT},
{8, RAS_DECODE_CATEGORY_BOOT_TIME_ERRORS, RAS_DECODE_ERROR_TYPE_USR_DP_LINK_TRAINING, RAS_DECODE_PROTOCOL_CPER, RAS_DECODE_SEVERITY_FAIL_TO_INIT},
{9, RAS_DECODE_CATEGORY_BOOT_TIME_ERRORS, RAS_DECODE_ERROR_TYPE_WAFL_LINK_TRAINING, RAS_DECODE_PROTOCOL_CPER, RAS_DECODE_SEVERITY_FAIL_TO_INIT},
{10, RAS_DECODE_CATEGORY_BOOT_TIME_ERRORS, RAS_DECODE_ERROR_TYPE_XGMI_LINK_TRAINING, RAS_DECODE_PROTOCOL_CPER, RAS_DECODE_SEVERITY_FAIL_TO_INIT},
{11, RAS_DECODE_CATEGORY_BOOT_TIME_ERRORS, RAS_DECODE_ERROR_TYPE_BOOT_CONTROLLER_DATA_ABORT, RAS_DECODE_PROTOCOL_CPER, RAS_DECODE_SEVERITY_FAIL_TO_INIT},
{12, RAS_DECODE_CATEGORY_BOOT_TIME_ERRORS, RAS_DECODE_ERROR_TYPE_BOOT_CONTROLLER_GENERIC, RAS_DECODE_PROTOCOL_CPER_WITH_SPACE, RAS_DECODE_SEVERITY_FAIL_TO_INIT},
{13, RAS_DECODE_CATEGORY_OFF_PACKAGE_LINK_ERRORS, RAS_DECODE_ERROR_TYPE_PCIE_AER, RAS_DECODE_PROTOCOL_CPER, RAS_DECODE_SEVERITY_CORRECTED},
{14, RAS_DECODE_CATEGORY_OFF_PACKAGE_LINK_ERRORS, RAS_DECODE_ERROR_TYPE_PCIE_AER, RAS_DECODE_PROTOCOL_CPER, RAS_DECODE_SEVERITY_FATAL},
{15, RAS_DECODE_CATEGORY_OFF_PACKAGE_LINK_ERRORS, RAS_DECODE_ERROR_TYPE_WAFL, RAS_DECODE_PROTOCOL_CPER, RAS_DECODE_SEVERITY_CORRECTED},
{16, RAS_DECODE_CATEGORY_OFF_PACKAGE_LINK_ERRORS, RAS_DECODE_ERROR_TYPE_WAFL, RAS_DECODE_PROTOCOL_CPER, RAS_DECODE_SEVERITY_FATAL},
{17, RAS_DECODE_CATEGORY_OFF_PACKAGE_LINK_ERRORS, RAS_DECODE_ERROR_TYPE_XGMI, RAS_DECODE_PROTOCOL_CPER, RAS_DECODE_SEVERITY_CORRECTED},
{18, RAS_DECODE_CATEGORY_OFF_PACKAGE_LINK_ERRORS, RAS_DECODE_ERROR_TYPE_XGMI, RAS_DECODE_PROTOCOL_CPER, RAS_DECODE_SEVERITY_FATAL},
{19, RAS_DECODE_CATEGORY_HBM_ERRORS, RAS_DECODE_ERROR_TYPE_BAD_PAGE_RETIREMENT_THRESHOLD, RAS_DECODE_PROTOCOL_CPER, RAS_DECODE_SEVERITY_FATAL},
{20, RAS_DECODE_CATEGORY_HBM_ERRORS, RAS_DECODE_ERROR_TYPE_ON_DIE_ECC, RAS_DECODE_PROTOCOL_CPER, RAS_DECODE_SEVERITY_FATAL},
{21, RAS_DECODE_CATEGORY_HBM_ERRORS, RAS_DECODE_ERROR_TYPE_END_TO_END_CRC, RAS_DECODE_PROTOCOL_CPER, RAS_DECODE_SEVERITY_FATAL},
{22, RAS_DECODE_CATEGORY_HBM_ERRORS, RAS_DECODE_ERROR_TYPE_ON_DIE_ECC, RAS_DECODE_PROTOCOL_CPER, RAS_DECODE_SEVERITY_UNCORRECTED_NON_FATAL},
{23, RAS_DECODE_CATEGORY_HBM_ERRORS, RAS_DECODE_ERROR_TYPE_END_TO_END_CRC, RAS_DECODE_PROTOCOL_CPER, RAS_DECODE_SEVERITY_UNCORRECTED_NON_FATAL},
{24, RAS_DECODE_CATEGORY_HBM_ERRORS, RAS_DECODE_ERROR_TYPE_ALL, RAS_DECODE_PROTOCOL_CPER, RAS_DECODE_SEVERITY_CORRECTED},
{25, RAS_DECODE_CATEGORY_HBM_ERRORS, RAS_DECODE_ERROR_TYPE_ALL_OTHERS, RAS_DECODE_PROTOCOL_CPER, RAS_DECODE_SEVERITY_FATAL},
{26, RAS_DECODE_CATEGORY_DEVICE_INTERNAL_ERRORS, RAS_DECODE_ERROR_TYPE_HARDWARE_ASSERTION, RAS_DECODE_PROTOCOL_CPER, RAS_DECODE_SEVERITY_FATAL},
{27, RAS_DECODE_CATEGORY_DEVICE_INTERNAL_ERRORS, RAS_DECODE_ERROR_TYPE_WATCHDOG_TIMEOUT, RAS_DECODE_PROTOCOL_CPER, RAS_DECODE_SEVERITY_FATAL},
{28, RAS_DECODE_CATEGORY_DEVICE_INTERNAL_ERRORS, RAS_DECODE_ERROR_TYPE_ALL_OTHERS, RAS_DECODE_PROTOCOL_CPER, RAS_DECODE_SEVERITY_UNCORRECTED_NON_FATAL},
{29, RAS_DECODE_CATEGORY_DEVICE_INTERNAL_ERRORS, RAS_DECODE_ERROR_TYPE_ALL_OTHERS, RAS_DECODE_PROTOCOL_CPER, RAS_DECODE_SEVERITY_CORRECTED},
{30, RAS_DECODE_CATEGORY_DEVICE_INTERNAL_ERRORS, RAS_DECODE_ERROR_TYPE_ALL_OTHERS, RAS_DECODE_PROTOCOL_CPER, RAS_DECODE_SEVERITY_FATAL},
{31, RAS_DECODE_CATEGORY_CPER_FORMAT, RAS_DECODE_ERROR_TYPE_MALFORMED_CPER, RAS_DECODE_PROTOCOL_CPER, RAS_DECODE_SEVERITY_ALL_CAPS},
{32, RAS_DECODE_CATEGORY_CPER_FORMAT, RAS_DECODE_ERROR_TYPE_INCOMPLETE_RAS_DECODE_DATA, RAS_DECODE_PROTOCOL_CPER, RAS_DECODE_SEVERITY_ALL_CAPS},
{33, RAS_DECODE_CATEGORY_CPER_FORMAT, RAS_DECODE_ERROR_TYPE_INVALID_RAS_DECODE_DATA, RAS_DECODE_PROTOCOL_CPER, RAS_DECODE_SEVERITY_ALL_CAPS},
{34, RAS_DECODE_CATEGORY_UNIDENTIFIED_ERRORS, RAS_DECODE_ERROR_TYPE_UNIDENTIFIED_ERROR, RAS_DECODE_PROTOCOL_CPER, RAS_DECODE_SEVERITY_ALL_CAPS}};
/* Number of rows in error_map, derived from the table itself. */
static const size_t NUM_ERROR_ENTRIES = sizeof(error_map) / sizeof(error_map[0]);

/**
 * Look up the numeric ID of an error in error_map.
 *
 * @param error_category  Category string to match against the table.
 * @param error_type      Error-type string to match against the table.
 * @param error_severity  Severity string to match against the table.
 * @return The id of the first table row whose category, type and severity all
 *         compare equal; RAS_DECODE_ERROR_INVALID_RAS_DECODE_DATA_ID when any
 *         argument is NULL or equals RAS_DECODE_SEVERITY_UNKNOWN;
 *         RAS_DECODE_ERROR_UNIDENTIFIED_ERROR_ID when no row matches.
 */
int get_error_id(const char *error_category, const char *error_type, const char *error_severity)
{
    /* NULL inputs are rejected first so the string comparisons below are safe. */
    if (error_category == NULL || error_type == NULL || error_severity == NULL)
    {
        return RAS_DECODE_ERROR_INVALID_RAS_DECODE_DATA_ID;
    }

    /* Any field carrying the "unknown" marker makes the whole triple invalid. */
    if (strcmp(error_category, RAS_DECODE_SEVERITY_UNKNOWN) == 0 ||
        strcmp(error_type, RAS_DECODE_SEVERITY_UNKNOWN) == 0 ||
        strcmp(error_severity, RAS_DECODE_SEVERITY_UNKNOWN) == 0)
    {
        return RAS_DECODE_ERROR_INVALID_RAS_DECODE_DATA_ID;
    }

    /* Linear scan; the first fully matching row wins. */
    size_t row = 0;
    while (row < NUM_ERROR_ENTRIES)
    {
        const int category_differs = strcmp(error_map[row].error_category, error_category) != 0;
        if (!category_differs &&
            strcmp(error_map[row].error_type, error_type) == 0 &&
            strcmp(error_map[row].error_severity, error_severity) == 0)
        {
            return (int)error_map[row].id;
        }
        row++;
    }

    return RAS_DECODE_ERROR_UNIDENTIFIED_ERROR_ID;
}
@@ -0,0 +1,74 @@
/*
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include "json_printer.h"
#include <stdio.h>
/**
 * Write a C string to stdout as a quoted JSON string.
 *
 * Quotes, backslashes and control characters below 0x20 are escaped so the
 * output stays valid JSON. A NULL pointer is written as an empty string.
 */
static void print_json_escaped_string(const char *text) {
    putchar('"');
    if (text) {
        for (const unsigned char *p = (const unsigned char *)text; *p; p++) {
            switch (*p) {
                case '"':  fputs("\\\"", stdout); break;
                case '\\': fputs("\\\\", stdout); break;
                case '\b': fputs("\\b", stdout);  break;
                case '\f': fputs("\\f", stdout);  break;
                case '\n': fputs("\\n", stdout);  break;
                case '\r': fputs("\\r", stdout);  break;
                case '\t': fputs("\\t", stdout);  break;
                default:
                    if (*p < 0x20) {
                        /* Remaining control characters need the \uXXXX form. */
                        printf("\\u%04x", (unsigned)*p);
                    } else {
                        putchar(*p);
                    }
                    break;
            }
        }
    }
    putchar('"');
}

/**
 * Recursively print a JSON value to stdout.
 *
 * @param value   Value to print; NULL prints nothing.
 * @param indent  Current indentation width. Object members are printed at
 *                indent + 3 and the closing brace at indent.
 *
 * Numbers are printed with "%.0f", i.e. without a fractional part.
 * Arrays are printed on a single line, elements separated by ", ".
 */
static void print_json_value_internal(JsonValue *value, int indent) {
    if (!value) return;

    switch (value->type) {
        case JSON_NULL:
            printf("null");
            break;
        case JSON_BOOL:
            printf("%s", value->data.boolean ? "true" : "false");
            break;
        case JSON_NUMBER:
            printf("%.0f", value->data.number);
            break;
        case JSON_STRING:
            print_json_escaped_string(value->data.string);
            break;
        case JSON_OBJECT: {
            printf("{\n");
            bool first = true;
            for (JsonPair *pair = value->data.object; pair; pair = pair->next) {
                if (!first) printf(",\n");
                for (int i = 0; i < indent + 3; i++) printf(" ");
                print_json_escaped_string(pair->key);
                printf(": ");
                print_json_value_internal(pair->value, indent + 3);
                first = false;
            }
            printf("\n");
            for (int i = 0; i < indent; i++) printf(" ");
            printf("}");
            break;
        }
        case JSON_ARRAY: {
            printf("[");
            for (size_t i = 0; i < value->data.array.count; i++) {
                if (i > 0) printf(", ");
                print_json_value_internal(value->data.array.items[i], indent);
            }
            printf("]");
            break;
        }
        default:
            /* Unrecognised type tag: print nothing. */
            break;
    }
}
/**
 * Print a JSON value to stdout starting at indentation 0, followed by a
 * newline.
 *
 * @param value  Root value to print.
 */
void print_json_value(JsonValue *value) {
    const int top_level_indent = 0;

    print_json_value_internal(value, top_level_indent);
    fputs("\n", stdout);
}
+205
Ver fichero
@@ -0,0 +1,205 @@
/*
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include "json_util.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define JSON_ARRAY_INITIAL_CAPACITY 16
/**
 * Allocate a JSON null value.
 *
 * @return Newly allocated value of type JSON_NULL, or NULL if allocation fails.
 */
JsonValue* json_create_null(void) {
    JsonValue *node = calloc(1, sizeof *node);

    if (node != NULL) {
        node->type = JSON_NULL;
    }
    return node;
}
/**
 * Allocate a JSON boolean value.
 *
 * @param b  Boolean payload to store.
 * @return Newly allocated value of type JSON_BOOL, or NULL if allocation fails.
 */
JsonValue* json_create_bool(bool b) {
    JsonValue *node = calloc(1, sizeof *node);

    if (node != NULL) {
        node->type = JSON_BOOL;
        node->data.boolean = b;
    }
    return node;
}
/**
 * Allocate a JSON number value.
 *
 * @param num  Numeric payload to store.
 * @return Newly allocated value of type JSON_NUMBER, or NULL if allocation fails.
 */
JsonValue* json_create_number(double num) {
    JsonValue *node = calloc(1, sizeof *node);

    if (node != NULL) {
        node->type = JSON_NUMBER;
        node->data.number = num;
    }
    return node;
}
/**
 * Allocate a JSON string value holding a private copy of str.
 *
 * @param str  Text to copy; NULL is rejected.
 * @return Newly allocated value of type JSON_STRING, or NULL if str is NULL
 *         or either allocation fails.
 */
JsonValue* json_create_string(const char *str) {
    if (str == NULL) {
        return NULL;
    }

    JsonValue *node = calloc(1, sizeof *node);
    if (node == NULL) {
        return NULL;
    }

    node->type = JSON_STRING;
    node->data.string = strdup(str);
    if (node->data.string != NULL) {
        return node;
    }

    /* The copy failed: release the half-built node. */
    free(node);
    return NULL;
}
/**
 * Allocate an empty JSON object.
 *
 * @return Newly allocated value of type JSON_OBJECT with no members, or NULL
 *         if allocation fails.
 */
JsonValue* json_create_object(void) {
    JsonValue *node = calloc(1, sizeof *node);

    if (node != NULL) {
        node->type = JSON_OBJECT;
        node->data.object = NULL;   /* empty member list */
    }
    return node;
}
/**
 * Allocate an empty JSON array with room for JSON_ARRAY_INITIAL_CAPACITY
 * elements.
 *
 * @return Newly allocated value of type JSON_ARRAY with count 0, or NULL if
 *         either allocation fails.
 */
JsonValue* json_create_array(void) {
    JsonValue *node = calloc(1, sizeof *node);
    if (node == NULL) {
        return NULL;
    }

    node->type = JSON_ARRAY;
    node->data.array.items = malloc(sizeof(JsonValue*) * JSON_ARRAY_INITIAL_CAPACITY);
    if (node->data.array.items == NULL) {
        /* Without element storage the array is unusable; undo the node. */
        free(node);
        return NULL;
    }

    node->data.array.capacity = JSON_ARRAY_INITIAL_CAPACITY;
    node->data.array.count = 0;
    return node;
}
/**
 * Insert or replace a member of a JSON object.
 *
 * @param obj    Target value; must be of type JSON_OBJECT.
 * @param key    Member name; copied with strdup when a new member is created.
 * @param value  Value to store. On success the object holds this pointer and
 *               json_free(obj) will release it.
 *
 * If key already exists, its previous value is freed and replaced. Setting a
 * key to the value it already holds is a no-op, so the stored pointer is never
 * freed out from under the object. New members are appended, which keeps
 * insertion order.
 *
 * The call silently does nothing when any argument is NULL, obj is not an
 * object, or an allocation fails; in those cases value is NOT stored and is
 * not freed.
 */
void json_object_set(JsonValue *obj, const char *key, JsonValue *value) {
    if (!obj || obj->type != JSON_OBJECT || !key || !value) return;

    /* Walk the list once: either find the key or end at the tail link. */
    JsonPair **link = &obj->data.object;
    while (*link) {
        JsonPair *existing = *link;
        if (strcmp(existing->key, key) == 0) {
            if (existing->value != value) {
                json_free(existing->value);
                existing->value = value;
            }
            return;
        }
        link = &existing->next;
    }

    /* Key doesn't exist, create a new pair and hook it onto the tail. */
    JsonPair *pair = malloc(sizeof *pair);
    if (!pair) return;

    pair->key = strdup(key);
    if (!pair->key) {
        free(pair);
        return;
    }
    pair->value = value;
    pair->next = NULL;
    *link = pair;
}
/**
 * Find a member of a JSON object by name.
 *
 * @param obj  Value to search; must be of type JSON_OBJECT.
 * @param key  Member name to look for.
 * @return The stored value of the first member whose key equals key, or NULL
 *         when obj/key is NULL, obj is not an object, or no member matches.
 *         The returned pointer is the one held by the object, not a copy.
 */
JsonValue* json_object_get(JsonValue *obj, const char *key) {
    if (obj == NULL || key == NULL || obj->type != JSON_OBJECT) {
        return NULL;
    }

    for (JsonPair *member = obj->data.object; member != NULL; member = member->next) {
        if (strcmp(member->key, key) == 0) {
            return member->value;
        }
    }
    return NULL;
}
/**
 * Report whether a JSON object has a member with the given name.
 *
 * @param obj  Value to search.
 * @param key  Member name to look for.
 * @return true when json_object_get(obj, key) yields a non-NULL value.
 */
bool json_object_has_key(JsonValue *obj, const char *key) {
    JsonValue *found = json_object_get(obj, key);

    return found ? true : false;
}
/**
 * Append a value to a JSON array, growing the storage when it is full.
 *
 * @param arr    Target value; must be of type JSON_ARRAY.
 * @param value  Value to append; on success the array holds this pointer and
 *               json_free(arr) will release it.
 * @return true on success; false when an argument is NULL, arr is not an
 *         array, the capacity cannot grow without overflowing, or realloc
 *         fails. On failure the array is unchanged and value is not freed.
 */
bool json_array_push(JsonValue *arr, JsonValue *value) {
    if (!arr || arr->type != JSON_ARRAY || !value) return false;

    // Resize array if needed
    if (arr->data.array.count >= arr->data.array.capacity) {
        /* Largest element count whose byte size still fits in size_t. */
        const size_t max_items = (size_t)-1 / sizeof(JsonValue*);
        const size_t old_capacity = arr->data.array.capacity;
        size_t new_capacity;

        if (old_capacity == 0) {
            /* Doubling zero would never grow; start from the default size. */
            new_capacity = JSON_ARRAY_INITIAL_CAPACITY;
        } else if (old_capacity > max_items / 2) {
            return false;   /* doubling would overflow the allocation size */
        } else {
            new_capacity = old_capacity * 2;
        }

        /* Keep the old pointer until realloc succeeds so nothing leaks. */
        JsonValue **new_items = realloc(arr->data.array.items,
                                        sizeof(JsonValue*) * new_capacity);
        if (!new_items) return false;

        arr->data.array.items = new_items;
        arr->data.array.capacity = new_capacity;
    }

    arr->data.array.items[arr->data.array.count] = value;
    arr->data.array.count++;
    return true;
}
/**
 * Fetch an element of a JSON array by position.
 *
 * @param arr    Value to read; must be of type JSON_ARRAY.
 * @param index  Zero-based element position.
 * @return The stored element, or NULL when arr is NULL, is not an array, or
 *         index is not below the element count.
 */
JsonValue* json_array_get(JsonValue *arr, size_t index) {
    const bool is_array = (arr != NULL) && (arr->type == JSON_ARRAY);

    if (is_array && index < arr->data.array.count) {
        return arr->data.array.items[index];
    }
    return NULL;
}
/**
 * Return the number of elements stored in a JSON array.
 *
 * @param arr  Value to inspect.
 * @return The element count, or 0 when arr is NULL or not of type JSON_ARRAY.
 */
size_t json_array_size(JsonValue *arr) {
    const bool is_array = (arr != NULL) && (arr->type == JSON_ARRAY);

    return is_array ? arr->data.array.count : 0;
}
/**
 * Release a JSON value and everything it holds.
 *
 * Strings free their text, objects free every key and recursively free every
 * member value, arrays recursively free every element and then the element
 * storage. Other types hold no extra memory. NULL is accepted and ignored.
 *
 * @param val  Value to destroy.
 */
void json_free(JsonValue *val) {
    if (val == NULL) {
        return;
    }

    if (val->type == JSON_STRING) {
        free(val->data.string);
    } else if (val->type == JSON_OBJECT) {
        JsonPair *member = val->data.object;
        while (member != NULL) {
            /* Remember the successor before the node itself is released. */
            JsonPair *following = member->next;
            free(member->key);
            json_free(member->value);
            free(member);
            member = following;
        }
    } else if (val->type == JSON_ARRAY) {
        const size_t total = val->data.array.count;
        for (size_t idx = 0; idx < total; ++idx) {
            json_free(val->data.array.items[idx]);
        }
        free(val->data.array.items);
    }

    free(val);
}
+207
Ver fichero
@@ -0,0 +1,207 @@
/*
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
/**
* @file main.c
* @brief Demo program showing how to use the ACA decoder
*
* This is a demonstration program that shows how to use the ACA decoder
* with sample raw data to decode ACA error information.
*/
#include <stdio.h>
#include <ras_decode_api.h>
#include <aca_version.h>
#include <ras_decode_constants.h>
#include <aca_decode.h>
#include <json_printer.h>
#include <stdint.h>
#include <inttypes.h>
// Forward declarations of the demo routines defined below in this file
void print_version_info(void);
void demonstrate_json_decoding(void);
// Print the library version: first through the individual accessor
// functions, then through the aggregate aca_get_version_info() structure.
void print_version_info(void)
{
    fputs("=== ACA Decoder Library Version Information ===\n", stdout);

    // Values from the individual accessors
    printf("Version: %s\n", aca_get_version_string());
    printf("Major: %d\n", aca_get_version_major());
    printf("Minor: %d\n", aca_get_version_minor());
    printf("Patch: %d\n", aca_get_version_patch());

    // The same information from the aggregate structure
    aca_version_info_t info = aca_get_version_info();
    fputs("Complete version info:\n", stdout);
    printf(" Major: %d\n", info.major);
    printf(" Minor: %d\n", info.minor);
    printf(" Patch: %d\n", info.patch);
    printf(" String: %s\n", info.string);

    fputs("===============================================\n\n", stdout);
}
// Run one demo case: print its banner, print the AFID from decode_afid(),
// then decode the same registers with decode_error_info() and print and free
// the JSON result. hw_revision is always 1 in these demos. When decoding
// yields no JSON and failure_text is non-NULL, failure_text is printed instead.
static void run_decode_example(const char *banner, const uint64_t *registers, size_t register_count,
                               uint32_t flag, uint16_t context_type, const char *failure_text)
{
    fputs(banner, stdout);
    printf("Decoded AFID: %d\n", decode_afid(registers, register_count, flag, 1, context_type));

    JsonValue *decoded = decode_error_info(registers, register_count, flag, 1, context_type);
    if (decoded) {
        print_json_value(decoded);
        json_free(decoded);
    } else if (failure_text) {
        fputs(failure_text, stdout);
    }
}

// Walk through six sample register dumps and print the decoder output for each.
// Examples 1-5 use register context type 1; example 6 uses context type 9.
void demonstrate_json_decoding(void)
{
    fputs("=== ACA Decoder - JSON Output Examples ===\n", stdout);

    // Example 1: HBM FATAL ERROR (32-byte array)
    uint64_t hbm_fatal_regs[RAS_DECODE_REGISTER_ARRAY_SIZE_32_BYTES] = {
        0xbaa000000004081b, 0x0, 0x209600090f00, 0x5d000000};
    run_decode_example("\n--- HBM FATAL ERROR (32-byte array) ---\n",
                       hbm_fatal_regs, RAS_DECODE_REGISTER_ARRAY_SIZE_32_BYTES, 0, 1, NULL);

    // Example 2: GC FATAL ERROR
    uint64_t gc_fatal_regs[RAS_DECODE_REGISTER_ARRAY_SIZE_32_BYTES] = {
        0xbea00000003b0000, 0x100000029, 0x1200136430400, 0x20b};
    run_decode_example("\n--- GC FATAL ERROR ---\n",
                       gc_fatal_regs, RAS_DECODE_REGISTER_ARRAY_SIZE_32_BYTES, 0, 1, NULL);

    // Example 3: HBM CORRECTED ERROR (128-byte array)
    uint64_t hbm_corrected_regs[RAS_DECODE_REGISTER_ARRAY_SIZE_128_BYTES] = {
        0xffff, 0xdc2040000000011b, 0x0, 0xd008000801000000,
        0x25000001ff, 0x209600191f00, 0xa000000, 0x0,
        0x0, 0x0, 0xd008000801000000, 0x0,
        0x0, 0x0, 0x0, 0x0};
    run_decode_example("\n--- HBM CORRECTED ERROR (128-byte array) ---\n",
                       hbm_corrected_regs, RAS_DECODE_REGISTER_ARRAY_SIZE_128_BYTES, 0, 1, NULL);

    // Example 4: PCS XGMI Error
    uint64_t pcs_xgmi_regs[RAS_DECODE_REGISTER_ARRAY_SIZE_128_BYTES] = {
        0xffffffff, 0x9820000000060150, 0x0, 0xd008000200000000,
        0x27000001f9, 0xe05012109201, 0xaf812d4a000000, 0x0,
        0x0, 0x0, 0x0, 0x0,
        0x0, 0x0, 0x0, 0x0};
    run_decode_example("\n--- PCS XGMI Error ---\n",
                       pcs_xgmi_regs, RAS_DECODE_REGISTER_ARRAY_SIZE_128_BYTES, 0, 1, NULL);

    // Example 5: Bad page (threshold exceeded flag)
    uint64_t bad_page_regs[RAS_DECODE_REGISTER_ARRAY_SIZE_128_BYTES] = {
        0x1, 0xb000000000000137, 0x0, 0x0,
        0x1ff00000002, 0x9600000000, 0x0, 0x0,
        0x0, 0x0, 0x0, 0x0,
        0x0, 0x0, 0x0, 0x0};
    run_decode_example("\n--- Bad Page (Threshold Exceeded) ---\n",
                       bad_page_regs, RAS_DECODE_REGISTER_ARRAY_SIZE_128_BYTES,
                       RAS_DECODE_FLAG_THRESHOLD_EXCEEDED, 1, NULL);

    // Example 6: Boot Error Demo
    uint64_t boot_msgs[8] = {
        0x3c000228a4, // Oam0bootmsg
        0x3c001228a4, // Oam1bootmsg
        0x3c002228a4, // Oam2bootmsg
        0x3c003128a4, // Oam3bootmsg
        0x3c004328a4, // Oam4bootmsg
        0x3c005228a4, // Oam5bootmsg
        0x3c006228a4, // Oam6bootmsg
        0x3c007228a4  // Oam7bootmsg
    };
    run_decode_example("\n--- Boot Error Demo ---\n",
                       boot_msgs, sizeof(boot_msgs)/sizeof(boot_msgs[0]), 0, 9,
                       "Failed to decode boot messages\n");

    fputs("\n===========================================\n", stdout);
}
// Demo entry point: prints the library version, then runs the decoding
// examples. Takes no arguments and always returns 0.
int main(void)
{
    // Display version information
    print_version_info();

    // Demonstrate the new JSON-based ACA decoding functionality
    demonstrate_json_decoding();

    return 0;
}
@@ -0,0 +1,204 @@
/*
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include "aca_decode.h"
#include "ras_decode_constants.h"
#include "boot_decode.h"
#include "error_map.h"
#include "json_util.h"
#include <limits.h>
#include <stdlib.h>
#include <string.h>
/**
 * Decode a register dump and return only its AFID.
 *
 * Runs decode_error_info() on the arguments, extracts the AFID from the
 * resulting JSON with decode_error_info_afid(), and frees the JSON.
 *
 * @param register_array         Raw register values; NULL is rejected.
 * @param array_len              Number of entries in register_array.
 * @param flag                   Flag value forwarded to decode_error_info().
 * @param hw_revision            Hardware revision forwarded to decode_error_info().
 * @param register_context_type  Context type forwarded to decode_error_info().
 * @return The AFID, or -1 when register_array is NULL or decoding produced no JSON.
 */
int decode_afid(const uint64_t *register_array, size_t array_len, uint32_t flag, uint16_t hw_revision, uint16_t register_context_type)
{
    JsonValue *decoded = NULL;

    if (register_array != NULL)
    {
        decoded = decode_error_info(register_array, array_len, flag, hw_revision, register_context_type);
    }
    if (decoded == NULL)
    {
        return -1;
    }

    /* The JSON is only an intermediate here; release it once the ID is known. */
    const int afid = decode_error_info_afid(decoded);
    json_free(decoded);
    return afid;
}
/**
 * Decode a register dump into a JSON description of the error.
 *
 * @param register_array         Raw register values; NULL is rejected.
 * @param array_len              Number of entries in register_array.
 * @param flag                   Stored in the ACA raw data; not used for boot decode.
 * @param hw_revision            Stored in the ACA raw data; not used for boot decode.
 * @param register_context_type  9 selects boot decode, 1 selects ACA decode.
 * @return The result of boot_decode_orchestrator() or aca_decode(); NULL when
 *         register_array is NULL, the context type is neither 1 nor 9, or an
 *         ACA dump has an unsupported length.
 */
JsonValue* decode_error_info(const uint64_t *register_array, size_t array_len, uint32_t flag, uint16_t hw_revision, uint16_t register_context_type)
{
    if (register_array == NULL)
    {
        return NULL;
    }

    switch (register_context_type)
    {
    case 9:
        /* Boot decode consumes the whole array; flag and hw_revision are unused. */
        return boot_decode_orchestrator((uint64_t*)register_array, array_len);
    case 1:
        break; /* ACA decode, handled below */
    default:
        return NULL; /* Invalid register context type */
    }

    /* The two supported ACA layouts keep status/addr/ipid/synd at different slots. */
    size_t status_at, addr_at, ipid_at, synd_at;
    if (array_len == RAS_DECODE_REGISTER_ARRAY_SIZE_32_BYTES) // 32 bytes
    {
        status_at = 0;
        addr_at = 1;
        ipid_at = 2;
        synd_at = 3;
    }
    else if (array_len == RAS_DECODE_REGISTER_ARRAY_SIZE_128_BYTES) // 128 bytes
    {
        status_at = 1;
        addr_at = 2;
        ipid_at = 5;
        synd_at = 6;
    }
    else
    {
        return NULL; // Unsupported size
    }

    aca_raw_data_t raw_data = {0};
    raw_data.aca_status = register_array[status_at];
    raw_data.aca_addr = register_array[addr_at];
    raw_data.aca_ipid = register_array[ipid_at];
    raw_data.aca_synd = register_array[synd_at];
    raw_data.flags = flag;
    raw_data.hw_revision = hw_revision;

    return aca_decode(&raw_data);
}
int decode_error_info_afid(JsonValue *error_json)
{
if (!error_json || error_json->type != JSON_OBJECT) {
return -1; // Invalid AFID for null or invalid JSON
}
// Check if this is MCA error
JsonValue *category_value = json_object_get(error_json, "error_category");
JsonValue *type_value = json_object_get(error_json, "error_type");
JsonValue *severity_value = json_object_get(error_json, "severity");
if (category_value && type_value && severity_value &&
category_value->type == JSON_STRING && type_value->type == JSON_STRING && severity_value->type == JSON_STRING) {
const char *error_category = category_value->data.string;
const char *error_type = type_value->data.string;
const char *error_severity = severity_value->data.string;
// Check for the specific case: HBM Errors + Bad Page Retirement Threshold + Fatal
if (strcmp(error_category, RAS_DECODE_CATEGORY_HBM_ERRORS) == 0 &&
strcmp(error_type, RAS_DECODE_ERROR_TYPE_BAD_PAGE_RETIREMENT_THRESHOLD) == 0 &&
strcmp(error_severity, RAS_DECODE_SEVERITY_FATAL) == 0) {
// Use the error_type directly as service_error for this case
return get_error_id(error_category, error_type, error_severity);
}
// For other cases, we need to determine the service_error_type based on the logic
// from get_service_error_type function
const char *service_error = NULL;
// Extract bank if needed for service error type determination
JsonValue *bank_value = json_object_get(error_json, "bank");
const char *error_bank = (bank_value && bank_value->type == JSON_STRING) ? bank_value->data.string : "";
if (strcmp(error_type, RAS_DECODE_ERROR_TYPE_BAD_PAGE_RETIREMENT_THRESHOLD) == 0) {
service_error = RAS_DECODE_ERROR_TYPE_BAD_PAGE_RETIREMENT_THRESHOLD;
}
else if (strcmp(error_category, RAS_DECODE_CATEGORY_HBM_ERRORS) == 0 && strcmp(error_severity, RAS_DECODE_SEVERITY_CORRECTED) == 0) {
service_error = RAS_DECODE_ERROR_TYPE_ALL;
}
else if (strcmp(error_type, "RdCrcErr") == 0) {
service_error = RAS_DECODE_ERROR_TYPE_END_TO_END_CRC;
}
else if (strcmp(error_category, RAS_DECODE_CATEGORY_HBM_ERRORS) == 0 && strcmp(error_severity, RAS_DECODE_SEVERITY_FATAL) == 0 &&
strcmp(error_type, RAS_DECODE_ERROR_TYPE_ON_DIE_ECC) != 0 && strcmp(error_type, RAS_DECODE_ERROR_TYPE_END_TO_END_CRC) != 0) {
service_error = RAS_DECODE_ERROR_TYPE_ALL_OTHERS;
}
else if (strcmp(error_category, RAS_DECODE_CATEGORY_DEVICE_INTERNAL_ERRORS) == 0) {
if ((strcmp(error_severity, RAS_DECODE_SEVERITY_UNCORRECTED_NON_FATAL) == 0 ||
strcmp(error_severity, RAS_DECODE_SEVERITY_CORRECTED) == 0 ||
strcmp(error_severity, RAS_DECODE_SEVERITY_FATAL) == 0) &&
strcmp(error_type, RAS_DECODE_ERROR_TYPE_HARDWARE_ASSERTION) != 0 &&
strcmp(error_type, RAS_DECODE_ERROR_TYPE_WATCHDOG_TIMEOUT) != 0) {
service_error = RAS_DECODE_ERROR_TYPE_ALL_OTHERS;
}
}
else if (strcmp(error_category, RAS_DECODE_CATEGORY_OFF_PACKAGE_LINK_ERRORS) == 0) {
if (strcmp(error_bank, RAS_DECODE_BANK_PCS_XGMI) == 0) {
service_error = RAS_DECODE_ERROR_TYPE_XGMI;
}
else if (strcmp(error_bank, RAS_DECODE_BANK_KPX_WAFL) == 0) {
service_error = RAS_DECODE_ERROR_TYPE_WAFL;
}
}
if (!service_error) {
service_error = error_type; // Fallback to error_type
}
return get_error_id(error_category, service_error, error_severity);
}
// Check if this is a boot error
// Find the first msg<i> key to get the error_type
JsonPair *current_pair = error_json->data.object;
JsonValue *first_msg = NULL;
int lowest_msg_index = INT_MAX;
while (current_pair) {
if (strncmp(current_pair->key, "msg", 3) == 0) {
// Extract the message index
int msg_index = atoi(current_pair->key + 3);
if (msg_index < lowest_msg_index) {
lowest_msg_index = msg_index;
first_msg = current_pair->value;
}
}
current_pair = current_pair->next;
}
if (first_msg && first_msg->type == JSON_OBJECT) {
// This is a boot error - extract error_type from the first message
JsonValue *boot_error_type = json_object_get(first_msg, "error_type");
if (boot_error_type && boot_error_type->type == JSON_STRING) {
const char *service_error = NULL;
service_error = boot_error_type->data.string;
// For boot errors, always use Boot-Time Errors category and Fail-to-init severity
return get_error_id(RAS_DECODE_CATEGORY_BOOT_TIME_ERRORS, service_error, RAS_DECODE_SEVERITY_FAIL_TO_INIT);
}
}
return -1; // Invalid AFID if neither MCA nor boot error format
}
+1 -1
Ver fichero
@@ -44,7 +44,7 @@ fi
# Find all source files
mapfile -t FILES < <(
find . \( -name build -o -name .git -o -path "./src/aca-decode" -o -path "./esmi_ib_library" -o -path "./rocm_smi/include/rocm_smi/kfd_ioctl.h" \) -prune -o \
find . \( -name build -o -name .git -o -path "./src/ras-decode-instinct-staging" -o -path "./esmi_ib_library" -o -path "./rocm_smi/include/rocm_smi/kfd_ioctl.h" \) -prune -o \
\( -name "*.cc" -o -name "*.cpp" -o -name "*.c" \) -print
)