diff --git a/projects/amdsmi/include/aca-decode/aca_constants.h b/projects/amdsmi/include/aca-decode/aca_constants.h deleted file mode 100644 index a0170a4e7f..0000000000 --- a/projects/amdsmi/include/aca-decode/aca_constants.h +++ /dev/null @@ -1,119 +0,0 @@ -/* - * Copyright (c) Advanced Micro Devices, Inc. All rights reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - */ - -/** - * @file aca_constants.h - * @brief Shared constants for ACA error decoding - * - * This file contains string constants and numerical constants that are used - * across multiple source files to improve maintainability and prevent typos. 
- */ - -#ifndef ACA_CONSTANTS_H -#define ACA_CONSTANTS_H - -/* Error severity constants */ -#define ACA_SEVERITY_UNKNOWN "UNKNOWN" -#define ACA_SEVERITY_FATAL "Fatal" -#define ACA_SEVERITY_CORRECTED "Corrected" -#define ACA_SEVERITY_UNCORRECTED_NON_FATAL "Uncorrected, Non-fatal" -#define ACA_SEVERITY_FAIL_TO_INIT "Fail-to-init" -#define ACA_SEVERITY_ALL_CAPS "ALL" - -/* Error category constants */ -#define ACA_CATEGORY_HBM_ERRORS "HBM Errors" -#define ACA_CATEGORY_DEVICE_INTERNAL_ERRORS "Device Internal Errors" -#define ACA_CATEGORY_OFF_PACKAGE_LINK_ERRORS "Off-Package Link Errors" -#define ACA_CATEGORY_BOOT_TIME_ERRORS "Boot-Time Errors" -#define ACA_CATEGORY_CPER_FORMAT "CPER Format" -#define ACA_CATEGORY_UNIDENTIFIED_ERRORS "Unidentified Errors" - -/* Common error type constants */ -#define ACA_ERROR_TYPE_ALL_OTHERS "All Others" -#define ACA_ERROR_TYPE_ALL "All" -#define ACA_ERROR_TYPE_DECODE_INAPPLICABLE "Decode Inapplicable" -#define ACA_ERROR_TYPE_BAD_PAGE_RETIREMENT_THRESHOLD "Bad Page Retirement Threshold" -#define ACA_ERROR_TYPE_HARDWARE_ASSERTION "Hardware Assertion (HWA)" -#define ACA_ERROR_TYPE_WATCHDOG_TIMEOUT "Watchdog Timeout (WDT)" -#define ACA_ERROR_TYPE_ON_DIE_ECC "On-die ECC" -#define ACA_ERROR_TYPE_END_TO_END_CRC "End-to-end CRC" -#define ACA_ERROR_TYPE_WAFL "WAFL" -#define ACA_ERROR_TYPE_XGMI "XGMI" - -/* Boot-time error type constants */ -#define ACA_ERROR_TYPE_FW_LOAD "FW Load" -#define ACA_ERROR_TYPE_HBM_BIST_TEST "HBM BIST Test" -#define ACA_ERROR_TYPE_HBM_MEMORY_TEST "HBM Memory Test" -#define ACA_ERROR_TYPE_HBM_TRAINING "HBM Training" -#define ACA_ERROR_TYPE_UNHANDLED "Unhandled" -#define ACA_ERROR_TYPE_UNKNOWN_ERROR "Unknown" -#define ACA_ERROR_TYPE_USR_CP_LINK_TRAINING "USR CP Link Training" -#define ACA_ERROR_TYPE_USR_DP_LINK_TRAINING "USR DP Link Training" -#define ACA_ERROR_TYPE_WAFL_LINK_TRAINING "WAFL Link Training" -#define ACA_ERROR_TYPE_XGMI_LINK_TRAINING "XGMI Link Training" -#define ACA_ERROR_TYPE_BOOT_CONTROLLER_DATA_ABORT 
"Boot Controller Data Abort" -#define ACA_ERROR_TYPE_BOOT_CONTROLLER_GENERIC "Boot Controller Generic" - -/* Link error type constants */ -#define ACA_ERROR_TYPE_PCIE_AER "PCIe AER" - -/* CPER format error type constants */ -#define ACA_ERROR_TYPE_MALFORMED_CPER "Malformed CPER" -#define ACA_ERROR_TYPE_INCOMPLETE_ACA_DATA "Incomplete ACA Data" -#define ACA_ERROR_TYPE_INVALID_ACA_DATA "Invalid ACA Data" -#define ACA_ERROR_TYPE_UNIDENTIFIED_ERROR "Unidentified Error" - -/* Protocol constants */ -#define ACA_PROTOCOL_CPER "CPER" -#define ACA_PROTOCOL_CPER_WITH_SPACE "CPER " - -/* Bank name strings */ -#define ACA_BANK_UMC "umc" -#define ACA_BANK_PSP "psp" -#define ACA_BANK_CS "cs" -#define ACA_BANK_PIE "pie" -#define ACA_BANK_PCS_XGMI "pcs_xgmi" -#define ACA_BANK_KPX_SERDES "kpx_serdes" -#define ACA_BANK_KPX_WAFL "kpx_wafl" - -/* Numerical constants */ -#define ACA_FLAG_THRESHOLD_EXCEEDED 0x8 -#define ACA_REGISTER_ARRAY_SIZE_32_BYTES 4 -#define ACA_REGISTER_ARRAY_SIZE_128_BYTES 16 - -/* Error code ranges */ -#define ACA_ERROR_CODE_EXT_MIN 0x3A -#define ACA_ERROR_CODE_EXT_MAX 0x3E - -/* Instance ID values for XCD and AID error decoding */ -#define ACA_INSTANCE_ID_XCD0_400 0x36430400 -#define ACA_INSTANCE_ID_XCD1_400 0x38430400 -#define ACA_INSTANCE_ID_XCD0_401 0x36430401 -#define ACA_INSTANCE_ID_XCD1_401 0x38430401 -#define ACA_INSTANCE_ID_AID_400 0x3B30400 -#define ACA_INSTANCE_ID_AID_401 0x3B30401 - -/* Error return codes */ -#define ACA_ERROR_INVALID_ACA_DATA_ID 33 -#define ACA_ERROR_UNIDENTIFIED_ERROR_ID 34 - -#endif /* ACA_CONSTANTS_H */ diff --git a/projects/amdsmi/include/aca-decode/utils.h b/projects/amdsmi/include/aca-decode/utils.h deleted file mode 100644 index 3b8ecc054e..0000000000 --- a/projects/amdsmi/include/aca-decode/utils.h +++ /dev/null @@ -1,60 +0,0 @@ -// SPDX-License-Identifier: MIT -/* - * Copyright (c) Advanced Micro Devices, Inc. All rights reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. 
- */ - -/** - * @file utils.h - * @brief Common utility functions - */ -#ifndef UTILS_H -#define UTILS_H - -#include - -/** - * @brief Convert a 64-bit value from little endian to big endian - * @param[in] value Value to convert - * @return Converted value in big endian - */ -static inline uint64_t le64_to_be64(uint64_t value) { - return ((value & 0xFF00000000000000ULL) >> 56) | - ((value & 0x00FF000000000000ULL) >> 40) | - ((value & 0x0000FF0000000000ULL) >> 24) | - ((value & 0x000000FF00000000ULL) >> 8) | - ((value & 0x00000000FF000000ULL) << 8) | - ((value & 0x0000000000FF0000ULL) << 24) | - ((value & 0x000000000000FF00ULL) << 40) | - ((value & 0x00000000000000FFULL) << 56); -} - -/** - * @brief Convert an array of 64-bit values from little endian to big endian - * @param[in,out] array Array to convert - * @param[in] len Length of the array - */ -static inline void convert_array_le_to_be(uint64_t *array, size_t len) { - for (size_t i = 0; i < len; i++) { - array[i] = le64_to_be64(array[i]); - } -} - -#endif /* UTILS_H */ \ No newline at end of file diff --git a/projects/amdsmi/include/aca-decode/aca_decode.h b/projects/amdsmi/include/ras-decode/aca_decode.h old mode 100755 new mode 100644 similarity index 86% rename from projects/amdsmi/include/aca-decode/aca_decode.h rename to projects/amdsmi/include/ras-decode/aca_decode.h index e038e2f86a..f61d8bbb01 --- a/projects/amdsmi/include/aca-decode/aca_decode.h +++ b/projects/amdsmi/include/ras-decode/aca_decode.h @@ -1,4 +1,3 @@ -// SPDX-License-Identifier: MIT /* * Copyright (c) Advanced Micro Devices, Inc. All rights reserved. * @@ -21,20 +20,16 @@ * THE SOFTWARE. 
*/ - /** +/** * @file aca_decode.h * @brief Internal decoder interface and data structures */ +#ifndef RAS_DECODE_DECODE_H +#define RAS_DECODE_DECODE_H -#ifdef __cplusplus -extern "C" { -#endif - -#ifndef ACA_DECODE_H -#define ACA_DECODE_H - -#include "aca_api.h" +#include "ras_decode_api.h" #include "aca_fields.h" +#include "json_util.h" /** * @brief Internal decoder structure with parsed register fields @@ -67,13 +62,10 @@ typedef struct } aca_raw_data_t; /** - * @brief Main decode function that processes raw ACA error data + * @brief Main decode function that processes raw ACA error data and returns JSON * @param[in] raw_data Pointer to structure containing raw ACA error data - * @return Decoded error information structure + * @return JsonValue* containing the decoded error information, or NULL on failure */ -aca_error_info_t aca_decode(const aca_raw_data_t *raw_data); +JsonValue* aca_decode(const aca_raw_data_t *raw_data); -#ifdef __cplusplus -} -#endif -#endif /* ACA_DECODE_H */ +#endif /* RAS_DECODE_DECODE_H */ diff --git a/projects/amdsmi/include/aca-decode/aca_fields.h b/projects/amdsmi/include/ras-decode/aca_fields.h similarity index 95% rename from projects/amdsmi/include/aca-decode/aca_fields.h rename to projects/amdsmi/include/ras-decode/aca_fields.h index df1a5e3cd7..9055586a37 100644 --- a/projects/amdsmi/include/aca-decode/aca_fields.h +++ b/projects/amdsmi/include/ras-decode/aca_fields.h @@ -1,4 +1,3 @@ -// SPDX-License-Identifier: MIT /* * Copyright (c) Advanced Micro Devices, Inc. All rights reserved. * @@ -21,7 +20,7 @@ * THE SOFTWARE. */ - /** +/** * @file aca_fields.h * @brief ACA register field definitions and manipulation functions * @@ -30,8 +29,8 @@ * definitions for status, IPID, and syndrome registers, along with * functions to initialize and access these fields. 
*/ -#ifndef ACA_FIELDS_H -#define ACA_FIELDS_H +#ifndef RAS_DECODE_FIELDS_H +#define RAS_DECODE_FIELDS_H #include diff --git a/projects/amdsmi/include/aca-decode/aca_tables.h b/projects/amdsmi/include/ras-decode/aca_tables.h similarity index 95% rename from projects/amdsmi/include/aca-decode/aca_tables.h rename to projects/amdsmi/include/ras-decode/aca_tables.h index 6686dd24e8..499637cb49 100644 --- a/projects/amdsmi/include/aca-decode/aca_tables.h +++ b/projects/amdsmi/include/ras-decode/aca_tables.h @@ -1,4 +1,3 @@ -// SPDX-License-Identifier: MIT /* * Copyright (c) Advanced Micro Devices, Inc. All rights reserved. * @@ -28,8 +27,8 @@ * into their corresponding names and types. */ -#ifndef ACA_TABLES_H -#define ACA_TABLES_H +#ifndef RAS_DECODE_TABLES_H +#define RAS_DECODE_TABLES_H #include #include diff --git a/projects/amdsmi/include/aca-decode/aca_version.h b/projects/amdsmi/include/ras-decode/aca_version.h similarity index 80% rename from projects/amdsmi/include/aca-decode/aca_version.h rename to projects/amdsmi/include/ras-decode/aca_version.h index 53a5dc2fa9..aafc55f276 100644 --- a/projects/amdsmi/include/aca-decode/aca_version.h +++ b/projects/amdsmi/include/ras-decode/aca_version.h @@ -20,8 +20,8 @@ * THE SOFTWARE. 
*/ -#ifndef ACA_VERSION_H -#define ACA_VERSION_H +#ifndef RAS_DECODE_VERSION_H +#define RAS_DECODE_VERSION_H #ifdef __cplusplus extern "C" @@ -40,12 +40,19 @@ extern "C" */ /* Version Components */ -#define ACA_VERSION_MAJOR 1 /**< Major version number */ -#define ACA_VERSION_MINOR 0 /**< Minor version number */ -#define ACA_VERSION_PATCH 0 /**< Patch version number */ +#define RAS_DECODE_VERSION_MAJOR 2 /**< Major version number */ +#define RAS_DECODE_VERSION_MINOR 0 /**< Minor version number */ +#define RAS_DECODE_VERSION_PATCH 0 /**< Patch version number */ -/* Version String */ -#define ACA_VERSION_STRING "1.0.0" +/* Helper macros that turn a macro's expanded value into a string literal */ +#define RAS_DECODE_STRINGIFY(x) #x +#define RAS_DECODE_TOSTRING(x) RAS_DECODE_STRINGIFY(x) + +/* Version String - assembled at compile time from the components above */ +#define RAS_DECODE_VERSION_STRING \ + RAS_DECODE_TOSTRING(RAS_DECODE_VERSION_MAJOR) "." \ + RAS_DECODE_TOSTRING(RAS_DECODE_VERSION_MINOR) "." \ + RAS_DECODE_TOSTRING(RAS_DECODE_VERSION_PATCH) /** * @brief Structure containing version information @@ -92,4 +99,4 @@ extern "C" } #endif -#endif /* ACA_VERSION_H */ +#endif /* RAS_DECODE_VERSION_H */ diff --git a/projects/amdsmi/include/ras-decode/boot_decode.h b/projects/amdsmi/include/ras-decode/boot_decode.h new file mode 100644 index 0000000000..6add864e53 --- /dev/null +++ b/projects/amdsmi/include/ras-decode/boot_decode.h @@ -0,0 +1,219 @@ +/* + * Copyright (c) Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#ifndef BOOT_DECODE_H +#define BOOT_DECODE_H + +#include +#include +#include +#include "json_util.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @brief Boot message structure representing OAM boot message + */ +typedef struct { + uint64_t value; ///< 64-bit boot message value +} OamBootMsg; + +/** + * @brief Decoder function pointer type + * @param msg Boot message to decode + * @return JsonValue containing decoded information or NULL on failure + */ +typedef JsonValue* (*boot_decoder_func_t)(OamBootMsg *msg); + +/** + * @brief Decoder mapping entry + */ +typedef struct { + uint8_t encoding; ///< Error encoding value + boot_decoder_func_t decoder; ///< Decoder function +} boot_decoder_entry_t; + +/** + * @brief Boot message constants + */ +#define BOOT_ERROR_PRESENT_MARKER 0xA4 +#define BOOT_IN_BOOT_MARKER 0xBA +#define BOOT_SUCCESS_ENCODING 0xBA + +/** + * @brief Error encoding constants + */ +#define BOOT_ENCODING_HBM_TRAINING 0x01 +#define BOOT_ENCODING_FW_LOAD 0x04 +#define BOOT_ENCODING_WAFL_LINK 0x05 +#define BOOT_ENCODING_XGMI_LINK 0x06 +#define BOOT_ENCODING_USR_CP_LINK 0x07 +#define BOOT_ENCODING_USR_DP_LINK 0x08 +#define BOOT_ENCODING_HBM_MEM_TEST 0x09 +#define BOOT_ENCODING_HBM_BIST_TEST 0x0A +#define BOOT_ENCODING_BOOT_CTRL_GEN_V0 0x0B +#define BOOT_ENCODING_BOOT_CTRL_GEN_V1 0x0C +#define BOOT_ENCODING_DATA_ABORT 0x0D + +/** + * @brief HBM stack decoder constants + */ +#define HBM_STACK_0 0x01 +#define HBM_STACK_1 0x02 +#define HBM_STACK_UNKNOWN (-1) + +/** + * @brief Extract one byte from a 64-bit value + * @param value 64-bit value + * @param byte_index Byte index (0-7, 0 = least significant byte) + * @return Extracted byte, or 0 if byte_index is outside 0-7 + */ +static inline uint8_t extract_byte(uint64_t value, int byte_index) { + return (byte_index < 0 || byte_index > 7) ? (uint8_t)0 : (uint8_t)((value >> (byte_index * 8)) & 0xFF); +} + +/** + * @brief Build a mask with the low num_bits bits set + * @param num_bits Number of low bits to set (<= 0 gives 0, >= 32 gives all ones) + * @return Bit mask + */ +static inline uint32_t extract_bits(int num_bits) { + return (num_bits <= 0) ? 0U : (num_bits >= 32) ? 0xFFFFFFFFU : (((uint32_t)1 << num_bits) - 1U); 
+} + +/** + * @brief Get boot version from boot message + * @param msg Boot message + * @return Boot version (0 or 1) + */ +int get_boot_version(OamBootMsg *msg); + +/** + * @brief Get error encoding from boot message + * @param msg Boot message + * @return Error encoding value + */ +int get_error_encoding(OamBootMsg *msg); + +/** + * @brief Check if error is present in boot message + * @param msg Boot message + * @return true if error present, false otherwise + */ +bool error_present(OamBootMsg *msg); + +/** + * @brief Check if in boot mode + * @param msg Boot message + * @return true if in boot mode, false otherwise + */ +bool in_boot(OamBootMsg *msg); + +/** + * @brief Get socket number from boot message + * @param msg Boot message + * @param version Boot version + * @return Socket number + */ +int get_socket(OamBootMsg *msg, int version); + +/** + * @brief Get AID number from boot message + * @param msg Boot message + * @param version Boot version + * @return AID number + */ +int get_aid(OamBootMsg *msg, int version); + +/** + * @brief Decode HBM stack value + * @param stack Stack value + * @return Decoded stack number or HBM_STACK_UNKNOWN + */ +int decode_hbm_stack(uint8_t stack); + +/** + * @brief Create JSON array of failed links + * @param byte_value Byte containing link status bits + * @param max_links Maximum number of links to check + * @return JsonValue array or NULL on failure + */ +JsonValue* create_failed_links_array(uint8_t byte_value, int max_links); + +/** + * @brief Create hex string representation + * @param value Value to convert + * @param width Width of hex string (with padding) + * @return Dynamically allocated hex string or NULL on failure + */ +char* create_hex_string(uint64_t value, int width); + +// Decoder functions for Version 0 +JsonValue* decode_hbm_training_v0(OamBootMsg *msg); +JsonValue* decode_fw_load_v0(OamBootMsg *msg); +JsonValue* decode_wafl_link_training_v0(OamBootMsg *msg); +JsonValue* 
decode_xgmi_link_training_v0(OamBootMsg *msg); +JsonValue* decode_usr_cp_link_training_v0(OamBootMsg *msg); +JsonValue* decode_usr_dp_link_training_v0(OamBootMsg *msg); +JsonValue* decode_hbm_mem_test_v0(OamBootMsg *msg); +JsonValue* decode_hbm_bist_test_v0(OamBootMsg *msg); +JsonValue* decode_boot_controller_generic_v0(OamBootMsg *msg); + +// Decoder functions for Version 1 +JsonValue* decode_hbm_training_v1(OamBootMsg *msg); +JsonValue* decode_fw_load_v1(OamBootMsg *msg); +JsonValue* decode_wafl_link_training_v1(OamBootMsg *msg); +JsonValue* decode_xgmi_link_training_v1(OamBootMsg *msg); +JsonValue* decode_usr_cp_link_training_v1(OamBootMsg *msg); +JsonValue* decode_usr_dp_link_training_v1(OamBootMsg *msg); +JsonValue* decode_hbm_mem_test_v1(OamBootMsg *msg); +JsonValue* decode_hbm_bist_test_v1(OamBootMsg *msg); +JsonValue* decode_boot_controller_generic_v1(OamBootMsg *msg); +JsonValue* decode_data_abort_v1(OamBootMsg *msg); +JsonValue* decode_boot_success_v1(OamBootMsg *msg); + +// Unhandled error decoders +JsonValue* decode_unhandled_error_v0(OamBootMsg *msg); +JsonValue* decode_unhandled_error_v1(OamBootMsg *msg); + +/** + * @brief Get appropriate decoder function for boot message + * @param msg Boot message + * @return Decoder function pointer or NULL if no decoder found + */ +boot_decoder_func_t get_decoder_function(OamBootMsg *msg); + +/** + * @brief Orchestrate decoding of multiple boot messages + * @param oam_boot_msgs Array of boot message values + * @param count Number of boot messages + * @return JsonValue object containing decoded results or NULL on failure + */ +JsonValue* boot_decode_orchestrator(uint64_t *oam_boot_msgs, size_t count); + +#ifdef __cplusplus +} +#endif + +#endif /* BOOT_DECODE_H */ diff --git a/projects/amdsmi/include/aca-decode/error_map.h b/projects/amdsmi/include/ras-decode/error_map.h similarity index 95% rename from projects/amdsmi/include/aca-decode/error_map.h rename to projects/amdsmi/include/ras-decode/error_map.h index 
318b823478..0c8c555390 100644 --- a/projects/amdsmi/include/aca-decode/error_map.h +++ b/projects/amdsmi/include/ras-decode/error_map.h @@ -1,4 +1,3 @@ -// SPDX-License-Identifier: MIT /* * Copyright (c) Advanced Micro Devices, Inc. All rights reserved. * diff --git a/projects/amdsmi/include/ras-decode/json_printer.h b/projects/amdsmi/include/ras-decode/json_printer.h new file mode 100644 index 0000000000..d82f264054 --- /dev/null +++ b/projects/amdsmi/include/ras-decode/json_printer.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#ifndef JSON_PRINTER_H +#define JSON_PRINTER_H + +#include "json_util.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @brief Print a JSON value to stdout in formatted form + * @param value JSON value to print + */ +void print_json_value(JsonValue *value); + +#ifdef __cplusplus +} +#endif + +#endif /* JSON_PRINTER_H */ diff --git a/projects/amdsmi/include/ras-decode/json_util.h b/projects/amdsmi/include/ras-decode/json_util.h new file mode 100644 index 0000000000..24e358262f --- /dev/null +++ b/projects/amdsmi/include/ras-decode/json_util.h @@ -0,0 +1,171 @@ +/* + * Copyright (c) Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#ifndef JSON_UTIL_H +#define JSON_UTIL_H + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @brief JSON value types enumeration + */ +typedef enum { + JSON_NULL, + JSON_BOOL, + JSON_NUMBER, + JSON_STRING, + JSON_OBJECT, + JSON_ARRAY +} JsonType; + +typedef struct JsonValue JsonValue; +typedef struct JsonPair JsonPair; + +/** + * @brief JSON key-value pair structure for objects + */ +struct JsonPair { + char *key; + JsonValue *value; + JsonPair *next; +}; + +/** + * @brief JSON value structure + */ +struct JsonValue { + JsonType type; + union { + bool boolean; + double number; + char *string; + JsonPair *object; // Linked list of key-value pairs + struct { + JsonValue **items; + size_t count; + size_t capacity; + } array; + } data; +}; + +/** + * @brief Create a null JSON value + * @return Pointer to new JsonValue or NULL on failure + */ +JsonValue* json_create_null(void); + +/** + * @brief Create a boolean JSON value + * @param b Boolean value + * @return Pointer to new JsonValue or NULL on failure + */ +JsonValue* json_create_bool(bool b); + +/** + * @brief Create a number JSON value + * @param num Numeric value + * @return Pointer to new JsonValue or NULL on failure + */ +JsonValue* json_create_number(double num); + +/** + * @brief Create a string JSON value + * @param str String value (will be copied) + * @return Pointer to new JsonValue or NULL on failure + */ +JsonValue* json_create_string(const char *str); + +/** + * @brief Create an empty JSON object + * @return Pointer to new JsonValue or NULL on failure + */ +JsonValue* json_create_object(void); + +/** + * @brief Create an empty JSON array + * @return Pointer to new JsonValue or NULL on failure + */ +JsonValue* json_create_array(void); + +/** + * @brief Add a key-value pair to a JSON object + * @param obj JSON object to modify + * @param key Key string (will be copied) + * @param value Value to add + */ +void json_object_set(JsonValue *obj, const char *key, JsonValue 
*value); + +/** + * @brief Get a value by key from a JSON object + * @param obj JSON object to search + * @param key Key to search for + * @return Pointer to JsonValue or NULL if not found + */ +JsonValue* json_object_get(JsonValue *obj, const char *key); + +/** + * @brief Check if a key exists in a JSON object + * @param obj JSON object to check + * @param key Key to check for + * @return true if key exists, false otherwise + */ +bool json_object_has_key(JsonValue *obj, const char *key); + +/** + * @brief Add a value to a JSON array + * @param arr JSON array to modify + * @param value Value to add + * @return true on success, false on failure + */ +bool json_array_push(JsonValue *arr, JsonValue *value); + +/** + * @brief Get a value by index from a JSON array + * @param arr JSON array to access + * @param index Array index + * @return Pointer to JsonValue or NULL if index out of bounds + */ +JsonValue* json_array_get(JsonValue *arr, size_t index); + +/** + * @brief Get the size of a JSON array + * @param arr JSON array + * @return Number of elements in array, or 0 if not an array + */ +size_t json_array_size(JsonValue *arr); + +/** + * @brief Free a JSON value and all its contents + * @param val JSON value to free + */ +void json_free(JsonValue *val); + +#ifdef __cplusplus +} +#endif + +#endif /* JSON_UTIL_H */ diff --git a/projects/amdsmi/include/aca-decode/aca_api.h b/projects/amdsmi/include/ras-decode/ras_decode_api.h similarity index 70% rename from projects/amdsmi/include/aca-decode/aca_api.h rename to projects/amdsmi/include/ras-decode/ras_decode_api.h index cd74a1a205..8477a60772 100644 --- a/projects/amdsmi/include/aca-decode/aca_api.h +++ b/projects/amdsmi/include/ras-decode/ras_decode_api.h @@ -1,4 +1,3 @@ -// SPDX-License-Identifier: MIT /* * Copyright (c) Advanced Micro Devices, Inc. All rights reserved. * @@ -21,11 +20,13 @@ * THE SOFTWARE. 
*/ -#ifndef ACA_API_H -#define ACA_API_H +#ifndef RAS_DECODE_API_H +#define RAS_DECODE_API_H #include #include +#include "aca_version.h" +#include "json_util.h" /** * @brief Structure containing decoded error information @@ -39,12 +40,13 @@ typedef struct const char *instance_ref; /**< Reference to instance name string */ int oam; /**< OAM value */ int aid; /**< AID value */ - int afid; /**< AFID value (AMD Field ID) */ uint64_t raw_status; /**< Raw status register value */ uint64_t raw_addr; /**< Raw address register value */ uint64_t raw_ipid; /**< Raw IPID register value */ uint64_t raw_synd; /**< Raw syndrome register value */ uint8_t scrub; /**< Scrub bit from status */ + uint8_t poison; /**< Poison bit from status */ + uint8_t deferred; /**< Deferred bit from status */ uint8_t error_code_ext; /**< Extended error code from status */ } aca_error_info_t; @@ -54,18 +56,27 @@ typedef struct * @param[in] array_len Size of register array in elements * @param[in] flag Decoder flags * @param[in] hw_revision Hardware revision number + * @param[in] register_context_type Register context type (16-bit): 1 for ACA decode, 9 for boot decode * @return AFID value or -1 if decoding fails */ -int decode_afid(const uint64_t *register_array, size_t array_len, uint32_t flag, uint16_t hw_revision); +int decode_afid(const uint64_t *register_array, size_t array_len, uint32_t flag, uint16_t hw_revision, uint16_t register_context_type); /** - * @brief Decodes and returns complete error information from a register array + * @brief Decodes and returns complete error information from a register array as JSON * @param[in] register_array Pointer to an array of 64-bit register values * @param[in] array_len Size of register array in elements * @param[in] flag Decoder flags * @param[in] hw_revision Hardware revision number - * @return Complete error information structure + * @param[in] register_context_type Register context type (16-bit): 1 for ACA decode, 9 for boot decode + * @return 
JsonValue* containing complete error information, or NULL on failure */ -aca_error_info_t decode_error_info(const uint64_t *register_array, size_t array_len, uint32_t flag, uint16_t hw_revision); +JsonValue* decode_error_info(const uint64_t *register_array, size_t array_len, uint32_t flag, uint16_t hw_revision, uint16_t register_context_type); -#endif // ACA_API_H +/** + * @brief Decodes the AFID from a JSON error object based on error category, type, and severity + * @param[in] error_json Pointer to JSON object containing error information + * @return AFID value or -1 if decoding fails or JSON is NULL + */ +int decode_error_info_afid(JsonValue *error_json); + +#endif // RAS_DECODE_API_H diff --git a/projects/amdsmi/include/ras-decode/ras_decode_constants.h b/projects/amdsmi/include/ras-decode/ras_decode_constants.h new file mode 100644 index 0000000000..504d4e08e3 --- /dev/null +++ b/projects/amdsmi/include/ras-decode/ras_decode_constants.h @@ -0,0 +1,121 @@ +/* + * Copyright (c) Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +/** + * @file ras_decode_constants.h + * @brief Shared constants for ACA error decoding + * + * This file contains string constants and numerical constants that are used + * across multiple source files to improve maintainability and prevent typos. + */ + +#ifndef RAS_DECODE_CONSTANTS_H +#define RAS_DECODE_CONSTANTS_H + +/* Error severity constants */ +#define RAS_DECODE_SEVERITY_UNKNOWN "UNKNOWN" +#define RAS_DECODE_SEVERITY_FATAL "Fatal" +#define RAS_DECODE_SEVERITY_CORRECTED "Corrected" +#define RAS_DECODE_SEVERITY_UNCORRECTED_NON_FATAL "Uncorrected, Non-fatal" +#define RAS_DECODE_SEVERITY_FAIL_TO_INIT "Fail-to-init" +#define RAS_DECODE_SEVERITY_ALL_CAPS "ALL" + +/* Error category constants */ +#define RAS_DECODE_CATEGORY_HBM_ERRORS "HBM Errors" +#define RAS_DECODE_CATEGORY_DEVICE_INTERNAL_ERRORS "Device Internal Errors" +#define RAS_DECODE_CATEGORY_OFF_PACKAGE_LINK_ERRORS "Off-Package Link Errors" +#define RAS_DECODE_CATEGORY_BOOT_TIME_ERRORS "Boot-Time Errors" +#define RAS_DECODE_CATEGORY_CPER_FORMAT "CPER Format" +#define RAS_DECODE_CATEGORY_UNIDENTIFIED_ERRORS "Unidentified Errors" + +/* Common error type constants */ +#define RAS_DECODE_ERROR_TYPE_ALL_OTHERS "All Others" +#define RAS_DECODE_ERROR_TYPE_ALL "All" +#define RAS_DECODE_ERROR_TYPE_DECODE_INAPPLICABLE "Decode Inapplicable" +#define RAS_DECODE_ERROR_TYPE_BAD_PAGE_RETIREMENT_THRESHOLD "Bad Page Retirement Threshold" +#define RAS_DECODE_ERROR_TYPE_HARDWARE_ASSERTION "Hardware Assertion (HWA)" +#define RAS_DECODE_ERROR_TYPE_WATCHDOG_TIMEOUT "Watchdog Timeout (WDT)" +#define RAS_DECODE_ERROR_TYPE_ON_DIE_ECC "On-die ECC" +#define RAS_DECODE_ERROR_TYPE_END_TO_END_CRC "End-to-end CRC" +#define RAS_DECODE_ERROR_TYPE_WAFL "WAFL" 
+#define RAS_DECODE_ERROR_TYPE_XGMI "XGMI" + +/* Boot-time error type constants */ +#define RAS_DECODE_ERROR_TYPE_FW_LOAD "FW Load" +#define RAS_DECODE_ERROR_TYPE_HBM_BIST_TEST "HBM BIST Test" +#define RAS_DECODE_ERROR_TYPE_HBM_MEMORY_TEST "HBM Memory Test" +#define RAS_DECODE_ERROR_TYPE_HBM_TRAINING "HBM Training" +#define RAS_DECODE_ERROR_TYPE_UNHANDLED "Unhandled" +#define RAS_DECODE_ERROR_TYPE_UNKNOWN_ERROR "Unknown" +#define RAS_DECODE_ERROR_TYPE_USR_CP_LINK_TRAINING "USR CP Link Training" +#define RAS_DECODE_ERROR_TYPE_USR_DP_LINK_TRAINING "USR DP Link Training" +#define RAS_DECODE_ERROR_TYPE_WAFL_LINK_TRAINING "WAFL Link Training" +#define RAS_DECODE_ERROR_TYPE_XGMI_LINK_TRAINING "XGMI Link Training" +#define RAS_DECODE_ERROR_TYPE_BOOT_CONTROLLER_DATA_ABORT "Boot Controller Data Abort" +#define RAS_DECODE_ERROR_TYPE_BOOT_CONTROLLER_GENERIC "Boot Controller Generic" +#define RAS_DECODE_ERROR_TYPE_BOOT_SUCCESS "Boot Success" + +/* Link error type constants */ +#define RAS_DECODE_ERROR_TYPE_PCIE_AER "PCIe AER" + +/* CPER format error type constants */ +#define RAS_DECODE_ERROR_TYPE_MALFORMED_CPER "Malformed CPER" +#define RAS_DECODE_ERROR_TYPE_INCOMPLETE_RAS_DECODE_DATA "Incomplete ACA Data" +#define RAS_DECODE_ERROR_TYPE_INVALID_RAS_DECODE_DATA "Invalid ACA Data" +#define RAS_DECODE_ERROR_TYPE_UNIDENTIFIED_ERROR "Unidentified Error" + +/* Protocol constants */ +#define RAS_DECODE_PROTOCOL_CPER "CPER" +#define RAS_DECODE_PROTOCOL_CPER_WITH_SPACE "CPER " + +/* Bank name strings */ +#define RAS_DECODE_BANK_UMC "umc" +#define RAS_DECODE_BANK_PSP "psp" +#define RAS_DECODE_BANK_CS "cs" +#define RAS_DECODE_BANK_PIE "pie" +#define RAS_DECODE_BANK_PCS_XGMI "pcs_xgmi" +#define RAS_DECODE_BANK_KPX_SERDES "kpx_serdes" +#define RAS_DECODE_BANK_KPX_WAFL "kpx_wafl" + +/* Numerical constants */ +#define RAS_DECODE_FLAG_THRESHOLD_EXCEEDED 0x8 +#define RAS_DECODE_REGISTER_ARRAY_SIZE_32_BYTES 4 +#define RAS_DECODE_REGISTER_ARRAY_SIZE_128_BYTES 16 +#define 
BOOT_REGISTER_ARRAY_SIZE_64_BYTES 8 + +/* Error code ranges */ +#define RAS_DECODE_ERROR_CODE_EXT_MIN 0x3A +#define RAS_DECODE_ERROR_CODE_EXT_MAX 0x3E + +/* Instance ID values for XCD and AID error decoding */ +#define RAS_DECODE_INSTANCE_ID_XCD0_400 0x36430400 +#define RAS_DECODE_INSTANCE_ID_XCD1_400 0x38430400 +#define RAS_DECODE_INSTANCE_ID_XCD0_401 0x36430401 +#define RAS_DECODE_INSTANCE_ID_XCD1_401 0x38430401 +#define RAS_DECODE_INSTANCE_ID_AID_400 0x3B30400 +#define RAS_DECODE_INSTANCE_ID_AID_401 0x3B30401 + +/* Error return codes */ +#define RAS_DECODE_ERROR_INVALID_RAS_DECODE_DATA_ID 33 +#define RAS_DECODE_ERROR_UNIDENTIFIED_ERROR_ID 34 + +#endif /* RAS_DECODE_CONSTANTS_H */ diff --git a/projects/amdsmi/src/CMakeLists.txt b/projects/amdsmi/src/CMakeLists.txt index e8ca98d0cf..21e1c331e2 100644 --- a/projects/amdsmi/src/CMakeLists.txt +++ b/projects/amdsmi/src/CMakeLists.txt @@ -44,13 +44,33 @@ set(INC_LIST "${PROJECT_SOURCE_DIR}/rocm_smi/include/rocm_smi/rocm_smi.h" "${PROJECT_SOURCE_DIR}/rocm_smi/include/rocm_smi/rocm_smi_utils.h") -set(ACA_SRC_DIR "aca-decode") -set(SRC_LIST ${SRC_LIST} ${ACA_SRC_DIR}/aca_api.c ${ACA_SRC_DIR}/aca_decode.c ${ACA_SRC_DIR}/aca_fields.c - ${ACA_SRC_DIR}/aca_tables.c ${ACA_SRC_DIR}/error_map.c) -set(ACA_INC_DIR "${PROJECT_SOURCE_DIR}/include/aca-decode") -set(INC_LIST ${INC_LIST} ${ACA_INC_DIR}/aca_decode.h ${ACA_INC_DIR}/aca_fields.h ${ACA_INC_DIR}/aca_tables.h - ${ACA_INC_DIR}/error_map.h) - +set(RAS_DECODE "ras-decode") +set(ACA_SRC_DIR "${PROJECT_SOURCE_DIR}/src/${RAS_DECODE}") +set(SRC_LIST ${SRC_LIST} + ${ACA_SRC_DIR}/aca_decode.c + ${ACA_SRC_DIR}/aca_fields.c + ${ACA_SRC_DIR}/aca_tables.c + ${ACA_SRC_DIR}/aca_version.c + ${ACA_SRC_DIR}/boot_decode.c + ${ACA_SRC_DIR}/error_map.c + ${ACA_SRC_DIR}/json_printer.c + ${ACA_SRC_DIR}/json_util.c + # ${ACA_SRC_DIR}/main.c + ${ACA_SRC_DIR}/ras_decode_api.c +) +set(ACA_INC_DIR "${PROJECT_SOURCE_DIR}/include/${RAS_DECODE}") +set(INC_LIST ${INC_LIST} + ${ACA_INC_DIR}/aca_decode.h + 
${ACA_INC_DIR}/aca_fields.h + ${ACA_INC_DIR}/aca_tables.h + ${ACA_INC_DIR}/aca_version.h + ${ACA_INC_DIR}/boot_decode.h + ${ACA_INC_DIR}/error_map.h + ${ACA_INC_DIR}/json_printer.h + ${ACA_INC_DIR}/json_util.h + ${ACA_INC_DIR}/ras_decode_api.h + ${ACA_INC_DIR}/ras_decode_constants.h +) if(ENABLE_ESMI_LIB) list(APPEND INC_LIST ${ESMI_INC_DIR}/e_smi/e_smi.h) list(APPEND INC_LIST ${ESMI_INC_DIR}/e_smi/e_smi_monitor.h) diff --git a/projects/amdsmi/src/aca-decode/aca_api.c b/projects/amdsmi/src/aca-decode/aca_api.c deleted file mode 100644 index 3aa0c3ec7f..0000000000 --- a/projects/amdsmi/src/aca-decode/aca_api.c +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Copyright (c) Advanced Micro Devices, Inc. All rights reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. 
- */ - -#include "aca_decode.h" -#include "aca_constants.h" - -int decode_afid(const uint64_t *register_array, size_t array_len, uint32_t flag, uint16_t hw_revision) -{ - if (!register_array) - { - return -1; - } - - aca_raw_data_t raw_data; - - if (array_len == ACA_REGISTER_ARRAY_SIZE_32_BYTES) // 32 bytes - { - raw_data.aca_status = register_array[0]; - raw_data.aca_addr = register_array[1]; - raw_data.aca_ipid = register_array[2]; - raw_data.aca_synd = register_array[3]; - } - else if (array_len == ACA_REGISTER_ARRAY_SIZE_128_BYTES) // 128 bytes - { - raw_data.aca_status = register_array[1]; - raw_data.aca_addr = register_array[2]; - raw_data.aca_ipid = register_array[5]; - raw_data.aca_synd = register_array[6]; - } - - else - { - return -1; // Unsupported size - } - - raw_data.flags = flag; - raw_data.hw_revision = hw_revision; - - aca_error_info_t error_info = aca_decode(&raw_data); - return error_info.afid; -} - -aca_error_info_t decode_error_info(const uint64_t *register_array, size_t array_len, uint32_t flag, uint16_t hw_revision) -{ - aca_raw_data_t raw_data = {0}; - aca_error_info_t error_info = {0}; - - if (!register_array) - { - return error_info; - } if (array_len == ACA_REGISTER_ARRAY_SIZE_32_BYTES) // 32 bytes - { - raw_data.aca_status = register_array[0]; - raw_data.aca_addr = register_array[1]; - raw_data.aca_ipid = register_array[2]; - raw_data.aca_synd = register_array[3]; - } - else if (array_len == ACA_REGISTER_ARRAY_SIZE_128_BYTES) // 128 bytes - { - raw_data.aca_status = register_array[1]; - raw_data.aca_addr = register_array[2]; - raw_data.aca_ipid = register_array[5]; - raw_data.aca_synd = register_array[6]; - } - else - { - return error_info; // Return zero-initialized structure for unsupported size - } - - raw_data.flags = flag; - raw_data.hw_revision = hw_revision; - - return aca_decode(&raw_data); -} - diff --git a/projects/amdsmi/src/aca-decode/error_map.c b/projects/amdsmi/src/aca-decode/error_map.c deleted file mode 100644 index 
edbd5312a9..0000000000 --- a/projects/amdsmi/src/aca-decode/error_map.c +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Copyright (c) Advanced Micro Devices, Inc. All rights reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. 
- */ - -#include "error_map.h" -#include "aca_constants.h" -#include - -#define AFID_VERSION "0.7" - -static const error_map_entry_t error_map[] = { - {1, ACA_CATEGORY_BOOT_TIME_ERRORS, ACA_ERROR_TYPE_FW_LOAD, ACA_PROTOCOL_CPER, ACA_SEVERITY_FAIL_TO_INIT}, - {2, ACA_CATEGORY_BOOT_TIME_ERRORS, ACA_ERROR_TYPE_HBM_BIST_TEST, ACA_PROTOCOL_CPER, ACA_SEVERITY_FAIL_TO_INIT}, - {3, ACA_CATEGORY_BOOT_TIME_ERRORS, ACA_ERROR_TYPE_HBM_MEMORY_TEST, ACA_PROTOCOL_CPER, ACA_SEVERITY_FAIL_TO_INIT}, - {4, ACA_CATEGORY_BOOT_TIME_ERRORS, ACA_ERROR_TYPE_HBM_TRAINING, ACA_PROTOCOL_CPER, ACA_SEVERITY_FAIL_TO_INIT}, - {5, ACA_CATEGORY_BOOT_TIME_ERRORS, ACA_ERROR_TYPE_UNHANDLED, ACA_PROTOCOL_CPER, ACA_SEVERITY_FAIL_TO_INIT}, - {6, ACA_CATEGORY_BOOT_TIME_ERRORS, ACA_ERROR_TYPE_UNKNOWN_ERROR, ACA_PROTOCOL_CPER, ACA_SEVERITY_FAIL_TO_INIT}, - {7, ACA_CATEGORY_BOOT_TIME_ERRORS, ACA_ERROR_TYPE_USR_CP_LINK_TRAINING, ACA_PROTOCOL_CPER, ACA_SEVERITY_FAIL_TO_INIT}, - {8, ACA_CATEGORY_BOOT_TIME_ERRORS, ACA_ERROR_TYPE_USR_DP_LINK_TRAINING, ACA_PROTOCOL_CPER, ACA_SEVERITY_FAIL_TO_INIT}, - {9, ACA_CATEGORY_BOOT_TIME_ERRORS, ACA_ERROR_TYPE_WAFL_LINK_TRAINING, ACA_PROTOCOL_CPER, ACA_SEVERITY_FAIL_TO_INIT}, - {10, ACA_CATEGORY_BOOT_TIME_ERRORS, ACA_ERROR_TYPE_XGMI_LINK_TRAINING, ACA_PROTOCOL_CPER, ACA_SEVERITY_FAIL_TO_INIT}, - {11, ACA_CATEGORY_BOOT_TIME_ERRORS, ACA_ERROR_TYPE_BOOT_CONTROLLER_DATA_ABORT, ACA_PROTOCOL_CPER, ACA_SEVERITY_FAIL_TO_INIT}, - {12, ACA_CATEGORY_BOOT_TIME_ERRORS, ACA_ERROR_TYPE_BOOT_CONTROLLER_GENERIC, ACA_PROTOCOL_CPER_WITH_SPACE, ACA_SEVERITY_FAIL_TO_INIT}, - {13, ACA_CATEGORY_OFF_PACKAGE_LINK_ERRORS, ACA_ERROR_TYPE_PCIE_AER, ACA_PROTOCOL_CPER, ACA_SEVERITY_CORRECTED}, - {14, ACA_CATEGORY_OFF_PACKAGE_LINK_ERRORS, ACA_ERROR_TYPE_PCIE_AER, ACA_PROTOCOL_CPER, ACA_SEVERITY_FATAL}, - {15, ACA_CATEGORY_OFF_PACKAGE_LINK_ERRORS, ACA_ERROR_TYPE_WAFL, ACA_PROTOCOL_CPER, ACA_SEVERITY_CORRECTED}, - {16, ACA_CATEGORY_OFF_PACKAGE_LINK_ERRORS, ACA_ERROR_TYPE_WAFL, ACA_PROTOCOL_CPER, 
ACA_SEVERITY_FATAL}, - {17, ACA_CATEGORY_OFF_PACKAGE_LINK_ERRORS, ACA_ERROR_TYPE_XGMI, ACA_PROTOCOL_CPER, ACA_SEVERITY_CORRECTED}, - {18, ACA_CATEGORY_OFF_PACKAGE_LINK_ERRORS, ACA_ERROR_TYPE_XGMI, ACA_PROTOCOL_CPER, ACA_SEVERITY_FATAL}, - {19, ACA_CATEGORY_HBM_ERRORS, ACA_ERROR_TYPE_BAD_PAGE_RETIREMENT_THRESHOLD, ACA_PROTOCOL_CPER, ACA_SEVERITY_FATAL}, - {20, ACA_CATEGORY_HBM_ERRORS, ACA_ERROR_TYPE_ON_DIE_ECC, ACA_PROTOCOL_CPER, ACA_SEVERITY_FATAL}, - {21, ACA_CATEGORY_HBM_ERRORS, ACA_ERROR_TYPE_END_TO_END_CRC, ACA_PROTOCOL_CPER, ACA_SEVERITY_FATAL}, - {22, ACA_CATEGORY_HBM_ERRORS, ACA_ERROR_TYPE_ON_DIE_ECC, ACA_PROTOCOL_CPER, ACA_SEVERITY_UNCORRECTED_NON_FATAL}, - {23, ACA_CATEGORY_HBM_ERRORS, ACA_ERROR_TYPE_END_TO_END_CRC, ACA_PROTOCOL_CPER, ACA_SEVERITY_UNCORRECTED_NON_FATAL}, - {24, ACA_CATEGORY_HBM_ERRORS, ACA_ERROR_TYPE_ALL, ACA_PROTOCOL_CPER, ACA_SEVERITY_CORRECTED}, - {25, ACA_CATEGORY_HBM_ERRORS, ACA_ERROR_TYPE_ALL_OTHERS, ACA_PROTOCOL_CPER, ACA_SEVERITY_FATAL}, - {26, ACA_CATEGORY_DEVICE_INTERNAL_ERRORS, ACA_ERROR_TYPE_HARDWARE_ASSERTION, ACA_PROTOCOL_CPER, ACA_SEVERITY_FATAL}, - {27, ACA_CATEGORY_DEVICE_INTERNAL_ERRORS, ACA_ERROR_TYPE_WATCHDOG_TIMEOUT, ACA_PROTOCOL_CPER, ACA_SEVERITY_FATAL}, - {28, ACA_CATEGORY_DEVICE_INTERNAL_ERRORS, ACA_ERROR_TYPE_ALL_OTHERS, ACA_PROTOCOL_CPER, ACA_SEVERITY_UNCORRECTED_NON_FATAL}, - {29, ACA_CATEGORY_DEVICE_INTERNAL_ERRORS, ACA_ERROR_TYPE_ALL_OTHERS, ACA_PROTOCOL_CPER, ACA_SEVERITY_CORRECTED}, - {30, ACA_CATEGORY_DEVICE_INTERNAL_ERRORS, ACA_ERROR_TYPE_ALL_OTHERS, ACA_PROTOCOL_CPER, ACA_SEVERITY_FATAL}, - {31, ACA_CATEGORY_CPER_FORMAT, ACA_ERROR_TYPE_MALFORMED_CPER, ACA_PROTOCOL_CPER, ACA_SEVERITY_ALL_CAPS}, - {32, ACA_CATEGORY_CPER_FORMAT, ACA_ERROR_TYPE_INCOMPLETE_ACA_DATA, ACA_PROTOCOL_CPER, ACA_SEVERITY_ALL_CAPS}, - {33, ACA_CATEGORY_CPER_FORMAT, ACA_ERROR_TYPE_INVALID_ACA_DATA, ACA_PROTOCOL_CPER, ACA_SEVERITY_ALL_CAPS}, - {34, ACA_CATEGORY_UNIDENTIFIED_ERRORS, ACA_ERROR_TYPE_UNIDENTIFIED_ERROR, ACA_PROTOCOL_CPER, 
ACA_SEVERITY_ALL_CAPS}}; - -static const size_t NUM_ERROR_ENTRIES = sizeof(error_map) / sizeof(error_map[0]); - -int get_error_id(const char *error_category, const char *error_type, const char *error_severity) -{ - if (!error_category || !error_type || !error_severity || - strcmp(error_category, ACA_SEVERITY_UNKNOWN) == 0 || - strcmp(error_type, ACA_SEVERITY_UNKNOWN) == 0 || - strcmp(error_severity, ACA_SEVERITY_UNKNOWN) == 0) - { - return ACA_ERROR_INVALID_ACA_DATA_ID; // Return ID for "Invalid Error" if any input is "UNKNOWN" or NULL - } - - for (size_t i = 0; i < NUM_ERROR_ENTRIES; i++) - { - if (strcmp(error_map[i].error_category, error_category) == 0 && - strcmp(error_map[i].error_type, error_type) == 0 && - strcmp(error_map[i].error_severity, error_severity) == 0) - { - return (int)error_map[i].id; - } - } - - return ACA_ERROR_UNIDENTIFIED_ERROR_ID; // Return ID for "Unidentified Errors" if no match found -} diff --git a/projects/amdsmi/src/aca-decode/main.c b/projects/amdsmi/src/aca-decode/main.c deleted file mode 100644 index 2dfbc41426..0000000000 --- a/projects/amdsmi/src/aca-decode/main.c +++ /dev/null @@ -1,148 +0,0 @@ -/* - * Copyright (c) Advanced Micro Devices, Inc. All rights reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - */ - -/** - * @file main.c - * @brief Demo program showing how to use the ACA decoder - * - * This is a demonstration program that shows how to use the ACA decoder - * with sample raw data to decode ACA error information. - */ - -#include -#include -#include -#include -#include -#include - -// Function prototype -void print_error_info(const aca_error_info_t *info); -void print_version_info(void); - -// Function to print error info in JSON format -void print_error_info(const aca_error_info_t *info) -{ - printf("{\n"); - printf(" \"bank\": \"%s\",\n", info->bank_ref); - printf(" \"error_location\": {\n"); - printf(" \"oam\": \"%d\",\n", info->oam); - printf(" \"aid\": \"%d\",\n", info->aid); - printf(" \"instance\": \"%s\"\n", info->instance_ref); - printf(" },\n"); - printf(" \"severity\": \"%s\",\n", info->severity_ref); - printf(" \"afid\": \"%d\",\n", info->afid); - printf(" \"scrub\": \"%u\",\n", info->scrub); - printf(" \"err_ext\": \"%u\",\n", info->error_code_ext); - printf(" \"error_category\": \"%s\",\n", info->category_ref); - printf(" \"error_type\": \"%s\",\n", info->error_type_ref); - printf(" \"address\": \"0x%" PRIx64 "\",\n", info->raw_addr); - printf(" \"syndrome\": \"0x%" PRIx64 "\"\n", info->raw_synd); - printf("}\n"); -} - -// Function to print version information -void print_version_info(void) -{ - printf("=== ACA Decoder Library Version Information ===\n"); - printf("Version: %s\n", aca_get_version_string()); - printf("Major: %d\n", aca_get_version_major()); 
- printf("Minor: %d\n", aca_get_version_minor()); - printf("Patch: %d\n", aca_get_version_patch()); - - aca_version_info_t version_info = aca_get_version_info(); - printf("Complete version info:\n"); - printf(" Major: %d\n", version_info.major); - printf(" Minor: %d\n", version_info.minor); - printf(" Patch: %d\n", version_info.patch); - printf(" String: %s\n", version_info.string); - printf("===============================================\n\n"); -} - -int main() -{ - // Display version information - print_version_info(); - - // Sample usage of decode_afid with 32-byte register array (HBM FATAL ERROR, expected output is 4) - uint64_t register_array_32[ACA_REGISTER_ARRAY_SIZE_32_BYTES] = {0xbaa000000004081b, 0x0, 0x209600090f00, 0x5d000000}; - int afid_32 = decode_afid(register_array_32, ACA_REGISTER_ARRAY_SIZE_32_BYTES, 0, 1); - printf("Decoded AFID (32-byte array): %d\n", afid_32); - - // Sample usage of decode_afid with 32-byte register array (GC FATAL ERROR, expected output is 3) - uint64_t register_array_test[ACA_REGISTER_ARRAY_SIZE_32_BYTES] = {0xbea00000003b0000, 0x100000029, 0x1200136430400, 0x20b}; - int afid_test = decode_afid(register_array_test, ACA_REGISTER_ARRAY_SIZE_32_BYTES, 0, 1); - printf("Decoded AFID (test array): %d\n", afid_test); - - // Sample usage of decode_afid with 128-byte register array (HBM CORRECTED ERROR, expected output is 1) - uint64_t register_array_128[ACA_REGISTER_ARRAY_SIZE_128_BYTES] = { - 0xffff, - 0xdc2040000000011b, - 0x0, - 0xd008000801000000, - 0x25000001ff, - 0x209600191f00, - 0xa000000, - 0x0, - 0x0, - 0x0, - 0xd008000801000000, - 0x0, - 0x0, - 0x0, - 0x0, - 0x0}; - int afid_128 = decode_afid(register_array_128, ACA_REGISTER_ARRAY_SIZE_128_BYTES, 0, 1); - printf("Decoded AFID (128-byte array): %d\n", afid_128); - - // sample for bad page - uint64_t register_array_bad_page[ACA_REGISTER_ARRAY_SIZE_128_BYTES] = { - 0x1, - 0xb000000000000137, - 0x0, - 0x0, - 0x1ff00000002, - 0x9600000000, - 0x0, - 0x0, - 0x0, - 0x0, - 0x0, - 
0x0, - 0x0, - 0x0, - 0x0, - 0x0}; - - // when flag is 0b1000, it indicates that the error threshold has been exceeded - // and is always a HBM error. The expected output is 19. - int afid_bad_page = decode_afid(register_array_bad_page, ACA_REGISTER_ARRAY_SIZE_128_BYTES, ACA_FLAG_THRESHOLD_EXCEEDED, 1); - printf("Decoded AFID (bad page): %d\n", afid_bad_page); - - const aca_error_info_t error_info_32 = decode_error_info(register_array_32, ACA_REGISTER_ARRAY_SIZE_32_BYTES, 0, 1); - print_error_info(&error_info_32); - - const aca_error_info_t error_info_128 = decode_error_info(register_array_128, ACA_REGISTER_ARRAY_SIZE_128_BYTES, 0, 1); - print_error_info(&error_info_128); - - return 0; -} diff --git a/projects/amdsmi/src/amd_smi/amd_smi_cper.cc b/projects/amdsmi/src/amd_smi/amd_smi_cper.cc index 4e11256f4b..79516f63f6 100644 --- a/projects/amdsmi/src/amd_smi/amd_smi_cper.cc +++ b/projects/amdsmi/src/amd_smi/amd_smi_cper.cc @@ -29,7 +29,7 @@ #include extern "C" { -#include "aca-decode/aca_decode.h" +#include "ras-decode/aca_decode.h" } #include "amd_smi/impl/amd_smi_cper.h" #include "rocm_smi/rocm_smi_logger.h" @@ -254,16 +254,16 @@ static int cper_dump_sec_desc(const struct cper_sec_desc *desc) return 0; } -static int aca_decode_fatal(const cper_sec_crashdump_data &data, uint32_t flag, uint16_t hw_revision) +static int aca_decode_fatal(const cper_sec_crashdump_data &data, uint32_t flag, uint16_t hw_revision, uint16_t register_context_type) { const uint64_t *register_array = reinterpret_cast(&data.dump.fatal_err); - return decode_afid(register_array, sizeof(data.dump.fatal_err)/sizeof(uint64_t), flag, hw_revision); + return decode_afid(register_array, sizeof(data.dump.fatal_err)/sizeof(uint64_t), flag, hw_revision, register_context_type); } -static int aca_decode_corrected_error(const uint32_t *reg_dump, size_t num_bytes, uint32_t flag, uint16_t hw_revision) +static int aca_decode_corrected_error(const uint32_t *reg_dump, size_t num_bytes, uint32_t flag, uint16_t 
hw_revision, uint16_t register_context_type) { const uint64_t *register_array = reinterpret_cast(reg_dump); - return decode_afid(register_array, num_bytes, flag, hw_revision); + return decode_afid(register_array, num_bytes, flag, hw_revision, register_context_type); } static int cper_dump_nonstd_err(const struct cper_sec_nonstd_err *nonstd_err, const cper_sec_desc *section) @@ -299,7 +299,7 @@ exit: LOG_DEBUG(ss); return aca_decode_corrected_error(body->err_ctx.reg_dump, sizeof(body->err_ctx.reg_dump)/sizeof(uint64_t), - section->flags_mask, section->revision_major); + section->flags_mask, section->revision_major, body->err_ctx.reg_ctx_type); } static int cper_dump_cr_fatal(const struct cper_sec_crashdump *crashdump, const cper_sec_desc *section) @@ -320,7 +320,7 @@ static int cper_dump_cr_fatal(const struct cper_sec_crashdump *crashdump, const LOG_DEBUG(ss); - return aca_decode_fatal(crashdump->data, section->flags_mask, section->revision_major); + return aca_decode_fatal(crashdump->data, section->flags_mask, section->revision_major, crashdump->data.reg_ctx_type); } static int cper_dump_cr_boot(const struct cper_sec_crashdump *crashdump, const cper_sec_desc *section) @@ -335,7 +335,7 @@ static int cper_dump_cr_boot(const struct cper_sec_crashdump *crashdump, const c ss << "~~~~CRASH DUMP - BOOT TIME~~~\n\n"; LOG_DEBUG(ss); - return aca_decode_fatal(crashdump->data, section->flags_mask, section->revision_major); + return aca_decode_fatal(crashdump->data, section->flags_mask, section->revision_major, crashdump->data.reg_ctx_type); } } //namespace diff --git a/projects/amdsmi/src/aca-decode/aca_decode.c b/projects/amdsmi/src/ras-decode/aca_decode.c similarity index 54% rename from projects/amdsmi/src/aca-decode/aca_decode.c rename to projects/amdsmi/src/ras-decode/aca_decode.c index fd96c43b64..a7d91ea919 100644 --- a/projects/amdsmi/src/aca-decode/aca_decode.c +++ b/projects/amdsmi/src/ras-decode/aca_decode.c @@ -32,8 +32,11 @@ #include "aca_decode.h" #include 
"aca_tables.h" #include "error_map.h" -#include "aca_constants.h" +#include "ras_decode_constants.h" +#include "json_util.h" #include +#include +#include /** * @brief Gets the bank name based on hardware ID and ACA type @@ -61,18 +64,18 @@ aca_decoder_get_bank(const aca_decoder_t *decoder, const char **bank_name) static const char *get_error_severity(const aca_status_fields_t *status) { if (status->poison) - return ACA_SEVERITY_UNCORRECTED_NON_FATAL; + return RAS_DECODE_SEVERITY_UNCORRECTED_NON_FATAL; if (status->pcc) - return ACA_SEVERITY_FATAL; + return RAS_DECODE_SEVERITY_FATAL; if (!status->pcc && status->uc && status->tcc) - return ACA_SEVERITY_FATAL; + return RAS_DECODE_SEVERITY_FATAL; if (!status->pcc && status->uc && !status->tcc) - return ACA_SEVERITY_UNCORRECTED_NON_FATAL; + return RAS_DECODE_SEVERITY_UNCORRECTED_NON_FATAL; if (!status->pcc && !status->uc && !status->tcc && status->deferred) - return ACA_SEVERITY_UNCORRECTED_NON_FATAL; + return RAS_DECODE_SEVERITY_UNCORRECTED_NON_FATAL; if (!status->pcc && !status->uc && !status->tcc && !status->deferred) - return ACA_SEVERITY_CORRECTED; - return ACA_SEVERITY_UNKNOWN; + return RAS_DECODE_SEVERITY_CORRECTED; + return RAS_DECODE_SEVERITY_UNKNOWN; } /** @@ -85,31 +88,31 @@ static const char *get_error_category(const char *bank, const char *error_type) { if (!bank || !error_type) { - return ACA_SEVERITY_UNKNOWN; + return RAS_DECODE_SEVERITY_UNKNOWN; } - if (strcmp(bank, ACA_BANK_UMC) == 0) + if (strcmp(bank, RAS_DECODE_BANK_UMC) == 0) { - if (strcmp(error_type, ACA_ERROR_TYPE_ON_DIE_ECC) == 0 || + if (strcmp(error_type, RAS_DECODE_ERROR_TYPE_ON_DIE_ECC) == 0 || strcmp(error_type, "WriteDataPoisonErr") == 0 || strcmp(error_type, "AddressCommandParityErr") == 0 || strcmp(error_type, "WriteDataCrcErr") == 0 || strcmp(error_type, "EcsErr") == 0 || strcmp(error_type, "RdCrcErr") == 0 || - strcmp(error_type, ACA_ERROR_TYPE_END_TO_END_CRC) == 0) + strcmp(error_type, RAS_DECODE_ERROR_TYPE_END_TO_END_CRC) == 0) { - 
return ACA_CATEGORY_HBM_ERRORS; + return RAS_DECODE_CATEGORY_HBM_ERRORS; } } - else if (strcmp(bank, ACA_BANK_PCS_XGMI) == 0 || - strcmp(bank, ACA_BANK_KPX_SERDES) == 0 || - strcmp(bank, ACA_BANK_KPX_WAFL) == 0 || - (strcmp(bank, ACA_BANK_PSP) == 0 && strcmp(error_type, ACA_ERROR_TYPE_WAFL) == 0)) + else if (strcmp(bank, RAS_DECODE_BANK_PCS_XGMI) == 0 || + strcmp(bank, RAS_DECODE_BANK_KPX_SERDES) == 0 || + strcmp(bank, RAS_DECODE_BANK_KPX_WAFL) == 0 || + (strcmp(bank, RAS_DECODE_BANK_PSP) == 0 && strcmp(error_type, RAS_DECODE_ERROR_TYPE_WAFL) == 0)) { - return ACA_CATEGORY_OFF_PACKAGE_LINK_ERRORS; + return RAS_DECODE_CATEGORY_OFF_PACKAGE_LINK_ERRORS; } - return ACA_CATEGORY_DEVICE_INTERNAL_ERRORS; + return RAS_DECODE_CATEGORY_DEVICE_INTERNAL_ERRORS; } /** @@ -125,55 +128,55 @@ static int get_service_error_type(const char *error_category, const char *error_ const char *error_severity, const char **service_error_type) { if (!error_category || !error_type || !error_severity || !service_error_type || - strcmp(error_category, ACA_SEVERITY_UNKNOWN) == 0 || - strcmp(error_type, ACA_SEVERITY_UNKNOWN) == 0 || - strcmp(error_severity, ACA_SEVERITY_UNKNOWN) == 0) + strcmp(error_category, RAS_DECODE_SEVERITY_UNKNOWN) == 0 || + strcmp(error_type, RAS_DECODE_SEVERITY_UNKNOWN) == 0 || + strcmp(error_severity, RAS_DECODE_SEVERITY_UNKNOWN) == 0) { return -1; } - if (strcmp(error_type, ACA_ERROR_TYPE_BAD_PAGE_RETIREMENT_THRESHOLD) == 0) + if (strcmp(error_type, RAS_DECODE_ERROR_TYPE_BAD_PAGE_RETIREMENT_THRESHOLD) == 0) { - *service_error_type = ACA_ERROR_TYPE_BAD_PAGE_RETIREMENT_THRESHOLD; + *service_error_type = RAS_DECODE_ERROR_TYPE_BAD_PAGE_RETIREMENT_THRESHOLD; return 0; } - if ((strcmp(error_category, ACA_CATEGORY_HBM_ERRORS) == 0) && (strcmp(error_severity, ACA_SEVERITY_CORRECTED) == 0)) + if ((strcmp(error_category, RAS_DECODE_CATEGORY_HBM_ERRORS) == 0) && (strcmp(error_severity, RAS_DECODE_SEVERITY_CORRECTED) == 0)) { - *service_error_type = ACA_ERROR_TYPE_ALL; + 
*service_error_type = RAS_DECODE_ERROR_TYPE_ALL; return 0; } if (strcmp(error_type, "RdCrcErr") == 0) { - *service_error_type = ACA_ERROR_TYPE_END_TO_END_CRC; + *service_error_type = RAS_DECODE_ERROR_TYPE_END_TO_END_CRC; return 0; } - if ((strcmp(error_category, ACA_CATEGORY_HBM_ERRORS) == 0) && (strcmp(error_severity, ACA_SEVERITY_FATAL) == 0) && - (strcmp(error_type, ACA_ERROR_TYPE_ON_DIE_ECC) != 0) && (strcmp(error_type, ACA_ERROR_TYPE_END_TO_END_CRC) != 0)) + if ((strcmp(error_category, RAS_DECODE_CATEGORY_HBM_ERRORS) == 0) && (strcmp(error_severity, RAS_DECODE_SEVERITY_FATAL) == 0) && + (strcmp(error_type, RAS_DECODE_ERROR_TYPE_ON_DIE_ECC) != 0) && (strcmp(error_type, RAS_DECODE_ERROR_TYPE_END_TO_END_CRC) != 0)) { - *service_error_type = ACA_ERROR_TYPE_ALL_OTHERS; + *service_error_type = RAS_DECODE_ERROR_TYPE_ALL_OTHERS; return 0; } - if (strcmp(error_category, ACA_CATEGORY_DEVICE_INTERNAL_ERRORS) == 0) + if (strcmp(error_category, RAS_DECODE_CATEGORY_DEVICE_INTERNAL_ERRORS) == 0) { - if ((strcmp(error_severity, ACA_SEVERITY_UNCORRECTED_NON_FATAL) == 0 || - strcmp(error_severity, ACA_SEVERITY_CORRECTED) == 0 || - strcmp(error_severity, ACA_SEVERITY_FATAL) == 0) && - strcmp(error_type, ACA_ERROR_TYPE_HARDWARE_ASSERTION) != 0 && - strcmp(error_type, ACA_ERROR_TYPE_WATCHDOG_TIMEOUT) != 0) + if ((strcmp(error_severity, RAS_DECODE_SEVERITY_UNCORRECTED_NON_FATAL) == 0 || + strcmp(error_severity, RAS_DECODE_SEVERITY_CORRECTED) == 0 || + strcmp(error_severity, RAS_DECODE_SEVERITY_FATAL) == 0) && + strcmp(error_type, RAS_DECODE_ERROR_TYPE_HARDWARE_ASSERTION) != 0 && + strcmp(error_type, RAS_DECODE_ERROR_TYPE_WATCHDOG_TIMEOUT) != 0) { - *service_error_type = ACA_ERROR_TYPE_ALL_OTHERS; + *service_error_type = RAS_DECODE_ERROR_TYPE_ALL_OTHERS; return 0; } } - if (strcmp(error_category, ACA_CATEGORY_OFF_PACKAGE_LINK_ERRORS) == 0) + if (strcmp(error_category, RAS_DECODE_CATEGORY_OFF_PACKAGE_LINK_ERRORS) == 0) { - if (strcmp(error_bank, ACA_BANK_PCS_XGMI) == 0) + if 
(strcmp(error_bank, RAS_DECODE_BANK_PCS_XGMI) == 0) { - *service_error_type = ACA_ERROR_TYPE_XGMI; + *service_error_type = RAS_DECODE_ERROR_TYPE_XGMI; return 0; } - if (strcmp(error_bank, ACA_BANK_KPX_WAFL) == 0) + if (strcmp(error_bank, RAS_DECODE_BANK_KPX_WAFL) == 0) { - *service_error_type = ACA_ERROR_TYPE_WAFL; + *service_error_type = RAS_DECODE_ERROR_TYPE_WAFL; return 0; } } @@ -199,13 +202,15 @@ static void aca_decoder_get_error_info(const aca_decoder_t *decoder, aca_error_i info->raw_synd = decoder->aca_synd; info->scrub = decoder->status.scrub; + info->poison = decoder->status.poison; + info->deferred = decoder->status.deferred; info->error_code_ext = decoder->status.error_code_ext; result = aca_decoder_get_bank(decoder, &bank); if (result < 0) { - bank = ACA_SEVERITY_UNKNOWN; + bank = RAS_DECODE_SEVERITY_UNKNOWN; } info->bank_ref = bank; @@ -215,13 +220,13 @@ static void aca_decoder_get_error_info(const aca_decoder_t *decoder, aca_error_i } else { - info->instance_ref = ACA_ERROR_TYPE_DECODE_INAPPLICABLE; + info->instance_ref = RAS_DECODE_ERROR_TYPE_DECODE_INAPPLICABLE; } // 0b1000 indicate error threshold has been exceeded, and is always fatal - if (decoder->flags & ACA_FLAG_THRESHOLD_EXCEEDED) + if (decoder->flags & RAS_DECODE_FLAG_THRESHOLD_EXCEEDED) { - info->severity_ref = ACA_SEVERITY_FATAL; + info->severity_ref = RAS_DECODE_SEVERITY_FATAL; } else { @@ -242,31 +247,31 @@ static void aca_decoder_get_error_info(const aca_decoder_t *decoder, aca_error_i info->aid = -1; // Invalid value } - if (decoder->status.error_code_ext >= ACA_ERROR_CODE_EXT_MIN && decoder->status.error_code_ext <= ACA_ERROR_CODE_EXT_MAX) + if (decoder->status.error_code_ext >= RAS_DECODE_ERROR_CODE_EXT_MIN && decoder->status.error_code_ext <= RAS_DECODE_ERROR_CODE_EXT_MAX) { uint32_t instance_id = decoder->ipid.instance_id_lo; uint32_t error_info = decoder->synd.error_information & 0xFF; - if ((instance_id == ACA_INSTANCE_ID_XCD0_400 || instance_id == ACA_INSTANCE_ID_XCD1_400 || - 
instance_id == ACA_INSTANCE_ID_XCD0_401 || instance_id == ACA_INSTANCE_ID_XCD1_401) && + if ((instance_id == RAS_DECODE_INSTANCE_ID_XCD0_400 || instance_id == RAS_DECODE_INSTANCE_ID_XCD1_400 || + instance_id == RAS_DECODE_INSTANCE_ID_XCD0_401 || instance_id == RAS_DECODE_INSTANCE_ID_XCD1_401) && find_error_in_table(xcd_error_table, NUM_XCD_ERRORS, error_info, &error_type) == 0) { info->error_type_ref = error_type; } - else if ((instance_id == ACA_INSTANCE_ID_AID_400 || instance_id == ACA_INSTANCE_ID_AID_401) && + else if ((instance_id == RAS_DECODE_INSTANCE_ID_AID_400 || instance_id == RAS_DECODE_INSTANCE_ID_AID_401) && find_error_in_table(aid_error_table, NUM_AID_ERRORS, error_info, &error_type) == 0) { info->error_type_ref = error_type; } else { - info->error_type_ref = ACA_SEVERITY_UNKNOWN; + info->error_type_ref = RAS_DECODE_SEVERITY_UNKNOWN; } } // 0b1000 indicate error threshold has been exceeded - else if (decoder->flags & ACA_FLAG_THRESHOLD_EXCEEDED) + else if (decoder->flags & RAS_DECODE_FLAG_THRESHOLD_EXCEEDED) { - info->error_type_ref = ACA_ERROR_TYPE_BAD_PAGE_RETIREMENT_THRESHOLD; + info->error_type_ref = RAS_DECODE_ERROR_TYPE_BAD_PAGE_RETIREMENT_THRESHOLD; } else { @@ -276,14 +281,14 @@ static void aca_decoder_get_error_info(const aca_decoder_t *decoder, aca_error_i } else { - info->error_type_ref = ACA_SEVERITY_UNKNOWN; + info->error_type_ref = RAS_DECODE_SEVERITY_UNKNOWN; } } // 0b1000 indicate error threshold has been exceeded, and is always a HBM error - if (decoder->flags & ACA_FLAG_THRESHOLD_EXCEEDED) + if (decoder->flags & RAS_DECODE_FLAG_THRESHOLD_EXCEEDED) { - info->category_ref = ACA_CATEGORY_HBM_ERRORS; + info->category_ref = RAS_DECODE_CATEGORY_HBM_ERRORS; } else { @@ -295,8 +300,6 @@ static void aca_decoder_get_error_info(const aca_decoder_t *decoder, aca_error_i { service_error = info->error_type_ref; } - - info->afid = get_error_id(info->category_ref, service_error, info->severity_ref); } /** @@ -324,8 +327,17 @@ static void 
aca_decoder_init(aca_decoder_t *decoder, uint16_t hw_revision, uint3 aca_synd_init(&decoder->synd, synd_reg); } -aca_error_info_t aca_decode(const aca_raw_data_t *raw_data) +/** + * @brief Main decode function that processes raw ACA error data and returns JSON + * @param[in] raw_data Pointer to structure containing raw ACA error data + * @return JsonValue* containing the decoded error information, or NULL on failure + */ +JsonValue* aca_decode(const aca_raw_data_t *raw_data) { + if (!raw_data) { + return NULL; + } + aca_decoder_t decoder = {0}; aca_error_info_t info = {0}; @@ -337,5 +349,68 @@ aca_error_info_t aca_decode(const aca_raw_data_t *raw_data) raw_data->aca_synd); aca_decoder_get_error_info(&decoder, &info); - return info; + + // Create the main JSON object + JsonValue *json_obj = json_create_object(); + if (!json_obj) { + return NULL; + } + + // Add bank + json_object_set(json_obj, "bank", json_create_string(info.bank_ref)); + + // Create error_location object + JsonValue *error_location = json_create_object(); + if (error_location) { + char oam_str[16], aid_str[16]; + snprintf(oam_str, sizeof(oam_str), "%d", info.oam); + snprintf(aid_str, sizeof(aid_str), "%d", info.aid); + + json_object_set(error_location, "oam", json_create_string(oam_str)); + json_object_set(error_location, "aid", json_create_string(aid_str)); + json_object_set(error_location, "instance", json_create_string(info.instance_ref)); + + json_object_set(json_obj, "error_location", error_location); + } + + // Add severity + json_object_set(json_obj, "severity", json_create_string(info.severity_ref)); + + // Add scrub as string + char scrub_str[16]; + snprintf(scrub_str, sizeof(scrub_str), "%u", info.scrub); + json_object_set(json_obj, "scrub", json_create_string(scrub_str)); + + // Add poison as string + char poison_str[16]; + snprintf(poison_str, sizeof(poison_str), "%u", info.poison); + json_object_set(json_obj, "poison", json_create_string(poison_str)); + + // Add deferred as string + 
char deferred_str[16]; + snprintf(deferred_str, sizeof(deferred_str), "%u", info.deferred); + json_object_set(json_obj, "deferred", json_create_string(deferred_str)); + + // Add err_ext as string + char err_ext_str[16]; + snprintf(err_ext_str, sizeof(err_ext_str), "%u", info.error_code_ext); + json_object_set(json_obj, "err_ext", json_create_string(err_ext_str)); + + // Add error_category + json_object_set(json_obj, "error_category", json_create_string(info.category_ref)); + + // Add error_type + json_object_set(json_obj, "error_type", json_create_string(info.error_type_ref)); + + // Add address as hex string + char address_str[32]; + snprintf(address_str, sizeof(address_str), "0x%" PRIx64, info.raw_addr); + json_object_set(json_obj, "address", json_create_string(address_str)); + + // Add syndrome as hex string + char syndrome_str[32]; + snprintf(syndrome_str, sizeof(syndrome_str), "0x%" PRIx64, info.raw_synd); + json_object_set(json_obj, "syndrome", json_create_string(syndrome_str)); + + return json_obj; } diff --git a/projects/amdsmi/src/aca-decode/aca_fields.c b/projects/amdsmi/src/ras-decode/aca_fields.c similarity index 97% rename from projects/amdsmi/src/aca-decode/aca_fields.c rename to projects/amdsmi/src/ras-decode/aca_fields.c index 6b008600ef..bff878df0e 100644 --- a/projects/amdsmi/src/aca-decode/aca_fields.c +++ b/projects/amdsmi/src/ras-decode/aca_fields.c @@ -1,4 +1,3 @@ -// SPDX-License-Identifier: MIT /* * Copyright (c) Advanced Micro Devices, Inc. All rights reserved. * @@ -21,7 +20,7 @@ * THE SOFTWARE. 
*/ - /** +/** * @file aca_fields.c * @brief Implementation of ACA register field handling * diff --git a/projects/amdsmi/src/aca-decode/aca_tables.c b/projects/amdsmi/src/ras-decode/aca_tables.c similarity index 92% rename from projects/amdsmi/src/aca-decode/aca_tables.c rename to projects/amdsmi/src/ras-decode/aca_tables.c index fc742ed39d..3ea4f84e3f 100644 --- a/projects/amdsmi/src/aca-decode/aca_tables.c +++ b/projects/amdsmi/src/ras-decode/aca_tables.c @@ -33,7 +33,7 @@ */ #include "aca_tables.h" -#include "aca_constants.h" +#include "ras_decode_constants.h" #include #include #include @@ -80,19 +80,19 @@ const aca_error_type_t error_table[] = { {"cs", 0xe, "FTI_ND_ILL_REQ"}, {"cs", 0xf, "FTI_ND_ADDR_VIOL"}, {"cs", 0x10, "FTI_ND_SEC_VIOL"}, - {"cs", 0x11, ACA_ERROR_TYPE_HARDWARE_ASSERTION}, + {"cs", 0x11, RAS_DECODE_ERROR_TYPE_HARDWARE_ASSERTION}, {"cs", 0x12, "ST_PRT_ERR"}, {"cs", 0x13, "ST_ECC_ERR"}, {"cs", 0x14, "ST_TXN_ERR"}, - {"pie", 0x0, ACA_ERROR_TYPE_HARDWARE_ASSERTION}, + {"pie", 0x0, RAS_DECODE_ERROR_TYPE_HARDWARE_ASSERTION}, {"pie", 0x1, "CSW"}, {"pie", 0x2, "GMI"}, {"pie", 0x3, "FTI_DAT_STAT"}, {"pie", 0x4, "DEF"}, - {"pie", 0x5, ACA_ERROR_TYPE_WATCHDOG_TIMEOUT}, + {"pie", 0x5, RAS_DECODE_ERROR_TYPE_WATCHDOG_TIMEOUT}, {"pie", 0x6, "CNLI"}, {"pie", 0x7, "RSLVFCI"}, - {"umc", 0x0, ACA_ERROR_TYPE_ON_DIE_ECC}, + {"umc", 0x0, RAS_DECODE_ERROR_TYPE_ON_DIE_ECC}, {"umc", 0x1, "WriteDataPoisonErr"}, {"umc", 0x2, "SdpParityErr"}, {"umc", 0x4, "AddressCommandParityErr"}, @@ -103,7 +103,7 @@ const aca_error_type_t error_table[] = { {"umc", 0xb, "RdCrcErr"}, {"umc", 0xd, "MpFwErr"}, {"umc", 0xe, "MpParErr"}, - {"umc", 0xf, ACA_ERROR_TYPE_END_TO_END_CRC}, + {"umc", 0xf, RAS_DECODE_ERROR_TYPE_END_TO_END_CRC}, {"psp", 0x0, "Mp0HighSramError"}, {"psp", 0x1, "Mp0LowSramError"}, {"psp", 0x2, "Mp0IDataBank0Error"}, @@ -127,7 +127,7 @@ const aca_error_type_t error_table[] = { {"psp", 0x3b, "SRAM_EDC"}, {"psp", 0x3c, "SMN_Parity"}, {"psp", 0x3d, "SMN_Timeout"}, - 
{"psp", 0x3f, ACA_ERROR_TYPE_WAFL}, + {"psp", 0x3f, RAS_DECODE_ERROR_TYPE_WAFL}, {"smu", 0x0, "Mp5HighSramError"}, {"smu", 0x1, "Mp5LowSramError"}, {"smu", 0x2, "Mp5DCacheAError"}, @@ -452,7 +452,11 @@ static const aca_instance_entry_t instance_table[] = { {"umc", 0x193F00, "ch7 umc0"}, {"umc", 0x393F00, "ch7 umc1"}, {"umc", 0x593F00, "ch7 umc2"}, - {"umc", 0x793F00, "ch7 umc3"}}; + {"umc", 0x793F00, "ch7 umc3"}, + {"pcs_xgmi", 0x11A09200, "serdes a pcs0"}, + {"pcs_xgmi", 0x12109200, "serdes b pcs7"}, + {"pcs_xgmi", 0x12209200, "serdes b pcs8"}, + {"pcs_xgmi", 0x11B09200, "xgmi pcs"}}; const size_t NUM_OAM_AID_ENTRIES = sizeof(oam_aid_table) / sizeof(oam_aid_table[0]); const size_t NUM_BANKS = sizeof(bank_table) / sizeof(bank_table[0]); @@ -478,7 +482,7 @@ int find_bank_name(uint16_t hw_id, uint16_t aca_type, const char **bank_name) } } - *bank_name = ACA_SEVERITY_UNKNOWN; + *bank_name = RAS_DECODE_SEVERITY_UNKNOWN; return 1; } @@ -499,7 +503,7 @@ int find_error_type_by_bank(const char *bank, uint32_t error_code, const char ** } } - *error_type = ACA_SEVERITY_UNKNOWN; + *error_type = RAS_DECODE_SEVERITY_UNKNOWN; return 1; } @@ -520,7 +524,7 @@ int find_error_in_table(const aca_error_entry_t *table, size_t table_size, } } - *error_type = ACA_SEVERITY_UNKNOWN; + *error_type = RAS_DECODE_SEVERITY_UNKNOWN; return 1; } @@ -556,6 +560,6 @@ int find_instance_name(const char *bank, uint32_t instance_id_lo, const char **i } } - *instance_name = ACA_SEVERITY_UNKNOWN; + *instance_name = RAS_DECODE_SEVERITY_UNKNOWN; return 1; } diff --git a/projects/amdsmi/src/aca-decode/aca_version.c b/projects/amdsmi/src/ras-decode/aca_version.c similarity index 82% rename from projects/amdsmi/src/aca-decode/aca_version.c rename to projects/amdsmi/src/ras-decode/aca_version.c index 348039e184..db9ae72925 100644 --- a/projects/amdsmi/src/aca-decode/aca_version.c +++ b/projects/amdsmi/src/ras-decode/aca_version.c @@ -26,32 +26,32 @@ int aca_get_version_major(void) { - return ACA_VERSION_MAJOR; 
+ return RAS_DECODE_VERSION_MAJOR; } int aca_get_version_minor(void) { - return ACA_VERSION_MINOR; + return RAS_DECODE_VERSION_MINOR; } int aca_get_version_patch(void) { - return ACA_VERSION_PATCH; + return RAS_DECODE_VERSION_PATCH; } const char *aca_get_version_string(void) { - return ACA_VERSION_STRING; + return RAS_DECODE_VERSION_STRING; } aca_version_info_t aca_get_version_info(void) { aca_version_info_t info; - info.major = ACA_VERSION_MAJOR; - info.minor = ACA_VERSION_MINOR; - info.patch = ACA_VERSION_PATCH; - info.string = ACA_VERSION_STRING; + info.major = RAS_DECODE_VERSION_MAJOR; + info.minor = RAS_DECODE_VERSION_MINOR; + info.patch = RAS_DECODE_VERSION_PATCH; + info.string = RAS_DECODE_VERSION_STRING; return info; } diff --git a/projects/amdsmi/src/ras-decode/boot_decode.c b/projects/amdsmi/src/ras-decode/boot_decode.c new file mode 100644 index 0000000000..b5f51a1b47 --- /dev/null +++ b/projects/amdsmi/src/ras-decode/boot_decode.c @@ -0,0 +1,862 @@ +/* + * Copyright (c) Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "boot_decode.h" +#include "ras_decode_constants.h" +#include +#include +#include + +// Boot decoder mapping tables +static const boot_decoder_entry_t boot_decoder_map_v0[] = { + {BOOT_ENCODING_HBM_TRAINING, decode_hbm_training_v0}, + {BOOT_ENCODING_FW_LOAD, decode_fw_load_v0}, + {BOOT_ENCODING_WAFL_LINK, decode_wafl_link_training_v0}, + {BOOT_ENCODING_XGMI_LINK, decode_xgmi_link_training_v0}, + {BOOT_ENCODING_USR_CP_LINK, decode_usr_cp_link_training_v0}, + {BOOT_ENCODING_USR_DP_LINK, decode_usr_dp_link_training_v0}, + {BOOT_ENCODING_HBM_MEM_TEST, decode_hbm_mem_test_v0}, + {BOOT_ENCODING_HBM_BIST_TEST, decode_hbm_bist_test_v0}, + {BOOT_ENCODING_BOOT_CTRL_GEN_V0, decode_boot_controller_generic_v0}, + {0, NULL} // Sentinel +}; + +static const boot_decoder_entry_t boot_decoder_map_v1[] = { + {BOOT_ENCODING_HBM_TRAINING, decode_hbm_training_v1}, + {BOOT_ENCODING_FW_LOAD, decode_fw_load_v1}, + {BOOT_ENCODING_WAFL_LINK, decode_wafl_link_training_v1}, + {BOOT_ENCODING_XGMI_LINK, decode_xgmi_link_training_v1}, + {BOOT_ENCODING_USR_CP_LINK, decode_usr_cp_link_training_v1}, + {BOOT_ENCODING_USR_DP_LINK, decode_usr_dp_link_training_v1}, + {BOOT_ENCODING_HBM_MEM_TEST, decode_hbm_mem_test_v1}, + {BOOT_ENCODING_HBM_BIST_TEST, decode_hbm_bist_test_v1}, + {BOOT_ENCODING_BOOT_CTRL_GEN_V1, decode_boot_controller_generic_v1}, + {BOOT_ENCODING_DATA_ABORT, decode_data_abort_v1}, + {BOOT_SUCCESS_ENCODING, decode_boot_success_v1}, + {0, NULL} // Sentinel +}; + +int get_boot_version(OamBootMsg *msg) +{ + if (!msg) + return 0; + return extract_byte(msg->value, 1) >> 5; +} + +int get_error_encoding(OamBootMsg *msg) +{ + if (!msg) + return 0; + return (int)(extract_byte(msg->value, 1) & extract_bits(5)); 
+} + +bool error_present(OamBootMsg *msg) +{ + if (!msg) + return false; + return extract_byte(msg->value, 0) == BOOT_ERROR_PRESENT_MARKER; +} + +bool in_boot(OamBootMsg *msg) +{ + if (!msg) + return false; + return extract_byte(msg->value, 0) == BOOT_IN_BOOT_MARKER; +} + +int get_socket(OamBootMsg *msg, int version) +{ + if (!msg) + return 0; + + if (version == 0) + { + return extract_byte(msg->value, 4); + } + else + { + return (int)((extract_byte(msg->value, 2) >> 4) & extract_bits(4)); + } +} + +int get_aid(OamBootMsg *msg, int version) +{ + if (!msg) + return 0; + + if (version == 0) + { + return extract_byte(msg->value, 5); + } + else + { + return (int)(extract_byte(msg->value, 2) & extract_bits(4)); + } +} + +int decode_hbm_stack(uint8_t stack) +{ + switch (stack) + { + case HBM_STACK_0: + return 0; + case HBM_STACK_1: + return 1; + default: + return HBM_STACK_UNKNOWN; + } +} + +JsonValue *create_failed_links_array(uint8_t byte_value, int max_links) +{ + JsonValue *array = json_create_array(); + if (!array) + return NULL; + + for (int i = 0; i < max_links; i++) + { + if ((byte_value >> i) & 0x1) + { + JsonValue *link_num = json_create_number(i); + if (link_num) + { + json_array_push(array, link_num); + } + } + } + + return array; +} + +char *create_hex_string(uint64_t value, int width) +{ + if (width < 0) + return NULL; + size_t buffer_size = (size_t)width + 3U; // "0x" + digits + null terminator + char *hex_str = malloc(buffer_size); + if (!hex_str) + return NULL; + + snprintf(hex_str, buffer_size, "0x%0*llX", width, (unsigned long long)value); + return hex_str; +} + +// Version 0 decoder implementations +JsonValue *decode_hbm_training_v0(OamBootMsg *msg) +{ + if (!msg) + return NULL; + + JsonValue *result = json_create_object(); + if (!result) + return NULL; + + int version = get_boot_version(msg); + uint8_t byte3 = extract_byte(msg->value, 3); + uint8_t byte2 = extract_byte(msg->value, 2); + + json_object_set(result, "error_type", 
json_create_string(RAS_DECODE_ERROR_TYPE_HBM_TRAINING)); + json_object_set(result, "socket", json_create_number(get_socket(msg, version))); + json_object_set(result, "aid", json_create_number(get_aid(msg, version))); + json_object_set(result, "hbm_stack", json_create_number(decode_hbm_stack(byte3))); + json_object_set(result, "hbm_channel", json_create_number(byte2)); + + return result; +} + +JsonValue *decode_fw_load_v0(OamBootMsg *msg) +{ + if (!msg) + return NULL; + + JsonValue *result = json_create_object(); + if (!result) + return NULL; + + int version = get_boot_version(msg); + uint8_t byte3 = extract_byte(msg->value, 3); + uint8_t byte2 = extract_byte(msg->value, 2); + uint16_t fw_id = (byte3 << 8) | byte2; + + char *fw_id_str = create_hex_string(fw_id, 4); + + json_object_set(result, "error_type", json_create_string(RAS_DECODE_ERROR_TYPE_FW_LOAD)); + json_object_set(result, "socket", json_create_number(get_socket(msg, version))); + json_object_set(result, "aid", json_create_number(get_aid(msg, version))); + json_object_set(result, "fw_id", json_create_string(fw_id_str ? fw_id_str : "0x0000")); + + free(fw_id_str); + return result; +} + +JsonValue *decode_wafl_link_training_v0(OamBootMsg *msg) +{ + if (!msg) + return NULL; + + JsonValue *result = json_create_object(); + if (!result) + return NULL; + + int version = get_boot_version(msg); + uint8_t byte2 = extract_byte(msg->value, 2); + JsonValue *failed_links = create_failed_links_array(byte2, 2); + + json_object_set(result, "error_type", json_create_string(RAS_DECODE_ERROR_TYPE_WAFL_LINK_TRAINING)); + json_object_set(result, "socket", json_create_number(get_socket(msg, version))); + json_object_set(result, "aid", json_create_number(get_aid(msg, version))); + json_object_set(result, "failed_links", failed_links ? 
failed_links : json_create_array()); + + return result; +} + +JsonValue *decode_xgmi_link_training_v0(OamBootMsg *msg) +{ + if (!msg) + return NULL; + + JsonValue *result = json_create_object(); + if (!result) + return NULL; + + int version = get_boot_version(msg); + uint8_t byte2 = extract_byte(msg->value, 2); + JsonValue *failed_links = create_failed_links_array(byte2, 8); + + json_object_set(result, "error_type", json_create_string(RAS_DECODE_ERROR_TYPE_XGMI_LINK_TRAINING)); + json_object_set(result, "socket", json_create_number(get_socket(msg, version))); + json_object_set(result, "aid", json_create_number(get_aid(msg, version))); + json_object_set(result, "failed_links", failed_links ? failed_links : json_create_array()); + + return result; +} + +JsonValue *decode_usr_cp_link_training_v0(OamBootMsg *msg) +{ + if (!msg) + return NULL; + + JsonValue *result = json_create_object(); + if (!result) + return NULL; + + int version = get_boot_version(msg); + uint8_t byte2 = extract_byte(msg->value, 2); + JsonValue *failed_links = create_failed_links_array(byte2, 2); + + json_object_set(result, "error_type", json_create_string(RAS_DECODE_ERROR_TYPE_USR_CP_LINK_TRAINING)); + json_object_set(result, "socket", json_create_number(get_socket(msg, version))); + json_object_set(result, "aid", json_create_number(get_aid(msg, version))); + json_object_set(result, "failed_links", failed_links ? 
failed_links : json_create_array()); + + return result; +} + +JsonValue *decode_usr_dp_link_training_v0(OamBootMsg *msg) +{ + if (!msg) + return NULL; + + JsonValue *result = json_create_object(); + if (!result) + return NULL; + + int version = get_boot_version(msg); + uint8_t byte2 = extract_byte(msg->value, 2); + JsonValue *failed_links = create_failed_links_array(byte2, 4); + + json_object_set(result, "error_type", json_create_string(RAS_DECODE_ERROR_TYPE_USR_DP_LINK_TRAINING)); + json_object_set(result, "socket", json_create_number(get_socket(msg, version))); + json_object_set(result, "aid", json_create_number(get_aid(msg, version))); + json_object_set(result, "failed_links", failed_links ? failed_links : json_create_array()); + + return result; +} + +JsonValue *decode_hbm_mem_test_v0(OamBootMsg *msg) +{ + if (!msg) + return NULL; + + JsonValue *result = json_create_object(); + if (!result) + return NULL; + + int version = get_boot_version(msg); + uint8_t byte3 = extract_byte(msg->value, 3); + uint8_t byte2 = extract_byte(msg->value, 2); + + json_object_set(result, "error_type", json_create_string(RAS_DECODE_ERROR_TYPE_HBM_MEMORY_TEST)); + json_object_set(result, "socket", json_create_number(get_socket(msg, version))); + json_object_set(result, "aid", json_create_number(get_aid(msg, version))); + json_object_set(result, "hbm_stack", json_create_number(decode_hbm_stack(byte3))); + json_object_set(result, "hbm_channel", json_create_number(byte2)); + + return result; +} + +JsonValue *decode_hbm_bist_test_v0(OamBootMsg *msg) +{ + if (!msg) + return NULL; + + JsonValue *result = json_create_object(); + if (!result) + return NULL; + + int version = get_boot_version(msg); + uint8_t byte3 = extract_byte(msg->value, 3); + uint8_t byte2 = extract_byte(msg->value, 2); + + json_object_set(result, "error_type", json_create_string(RAS_DECODE_ERROR_TYPE_HBM_BIST_TEST)); + json_object_set(result, "socket", json_create_number(get_socket(msg, version))); + 
json_object_set(result, "aid", json_create_number(get_aid(msg, version))); + json_object_set(result, "hbm_stack", json_create_number(decode_hbm_stack(byte3))); + json_object_set(result, "hbm_channel", json_create_number(byte2)); + + return result; +} + +JsonValue *decode_boot_controller_generic_v0(OamBootMsg *msg) +{ + if (!msg) + return NULL; + + JsonValue *result = json_create_object(); + if (!result) + return NULL; + + int version = get_boot_version(msg); + + json_object_set(result, "error_type", json_create_string(RAS_DECODE_ERROR_TYPE_BOOT_CONTROLLER_GENERIC)); + json_object_set(result, "socket", json_create_number(get_socket(msg, version))); + json_object_set(result, "aid", json_create_number(get_aid(msg, version))); + + return result; +} + +// Version 1 decoder implementations +JsonValue *decode_hbm_training_v1(OamBootMsg *msg) +{ + if (!msg) + return NULL; + + JsonValue *result = json_create_object(); + if (!result) + return NULL; + + int version = get_boot_version(msg); + uint8_t byte5 = extract_byte(msg->value, 5); + uint8_t byte4 = extract_byte(msg->value, 4); + + json_object_set(result, "error_type", json_create_string(RAS_DECODE_ERROR_TYPE_HBM_TRAINING)); + json_object_set(result, "socket", json_create_number(get_socket(msg, version))); + json_object_set(result, "aid", json_create_number(get_aid(msg, version))); + json_object_set(result, "hbm_stack", json_create_number(decode_hbm_stack(byte5))); + json_object_set(result, "hbm_channel", json_create_number(byte4)); + + return result; +} + +JsonValue *decode_fw_load_v1(OamBootMsg *msg) +{ + if (!msg) + return NULL; + + JsonValue *result = json_create_object(); + if (!result) + return NULL; + + int version = get_boot_version(msg); + uint8_t byte5 = extract_byte(msg->value, 5); + uint8_t byte4 = extract_byte(msg->value, 4); + uint16_t fw_id = (byte5 << 8) | byte4; + + char *fw_id_str = create_hex_string(fw_id, 4); + + json_object_set(result, "error_type", json_create_string(RAS_DECODE_ERROR_TYPE_FW_LOAD)); 
+ json_object_set(result, "socket", json_create_number(get_socket(msg, version))); + json_object_set(result, "aid", json_create_number(get_aid(msg, version))); + json_object_set(result, "fw_id", json_create_string(fw_id_str ? fw_id_str : "0x0000")); + + free(fw_id_str); + return result; +} + +JsonValue *decode_wafl_link_training_v1(OamBootMsg *msg) +{ + if (!msg) + return NULL; + + JsonValue *result = json_create_object(); + if (!result) + return NULL; + + int version = get_boot_version(msg); + uint8_t byte4 = extract_byte(msg->value, 4); + JsonValue *failed_links = create_failed_links_array(byte4, 2); + + json_object_set(result, "error_type", json_create_string(RAS_DECODE_ERROR_TYPE_WAFL_LINK_TRAINING)); + json_object_set(result, "socket", json_create_number(get_socket(msg, version))); + json_object_set(result, "aid", json_create_number(get_aid(msg, version))); + json_object_set(result, "failed_links", failed_links ? failed_links : json_create_array()); + + return result; +} + +JsonValue *decode_xgmi_link_training_v1(OamBootMsg *msg) +{ + if (!msg) + return NULL; + + JsonValue *result = json_create_object(); + if (!result) + return NULL; + + int version = get_boot_version(msg); + uint8_t byte4 = extract_byte(msg->value, 4); + JsonValue *failed_links = create_failed_links_array(byte4, 8); + + json_object_set(result, "error_type", json_create_string(RAS_DECODE_ERROR_TYPE_XGMI_LINK_TRAINING)); + json_object_set(result, "socket", json_create_number(get_socket(msg, version))); + json_object_set(result, "aid", json_create_number(get_aid(msg, version))); + json_object_set(result, "failed_links", failed_links ? 
failed_links : json_create_array()); + + return result; +} + +JsonValue *decode_usr_cp_link_training_v1(OamBootMsg *msg) +{ + if (!msg) + return NULL; + + JsonValue *result = json_create_object(); + if (!result) + return NULL; + + int version = get_boot_version(msg); + uint8_t byte4 = extract_byte(msg->value, 4); + JsonValue *failed_links = create_failed_links_array(byte4, 2); + + json_object_set(result, "error_type", json_create_string(RAS_DECODE_ERROR_TYPE_USR_CP_LINK_TRAINING)); + json_object_set(result, "socket", json_create_number(get_socket(msg, version))); + json_object_set(result, "aid", json_create_number(get_aid(msg, version))); + json_object_set(result, "failed_links", failed_links ? failed_links : json_create_array()); + + return result; +} + +JsonValue *decode_usr_dp_link_training_v1(OamBootMsg *msg) +{ + if (!msg) + return NULL; + + JsonValue *result = json_create_object(); + if (!result) + return NULL; + + int version = get_boot_version(msg); + uint8_t byte4 = extract_byte(msg->value, 4); + JsonValue *failed_links = create_failed_links_array(byte4, 4); + + json_object_set(result, "error_type", json_create_string(RAS_DECODE_ERROR_TYPE_USR_DP_LINK_TRAINING)); + json_object_set(result, "socket", json_create_number(get_socket(msg, version))); + json_object_set(result, "aid", json_create_number(get_aid(msg, version))); + json_object_set(result, "failed_links", failed_links ? 
failed_links : json_create_array()); + + return result; +} + +JsonValue *decode_hbm_mem_test_v1(OamBootMsg *msg) +{ + if (!msg) + return NULL; + + JsonValue *result = json_create_object(); + if (!result) + return NULL; + + int version = get_boot_version(msg); + uint8_t byte5 = extract_byte(msg->value, 5); + uint8_t byte4 = extract_byte(msg->value, 4); + + json_object_set(result, "error_type", json_create_string(RAS_DECODE_ERROR_TYPE_HBM_MEMORY_TEST)); + json_object_set(result, "socket", json_create_number(get_socket(msg, version))); + json_object_set(result, "aid", json_create_number(get_aid(msg, version))); + json_object_set(result, "hbm_stack", json_create_number(decode_hbm_stack(byte5))); + json_object_set(result, "hbm_channel", json_create_number(byte4)); + + return result; +} + +JsonValue *decode_hbm_bist_test_v1(OamBootMsg *msg) +{ + if (!msg) + return NULL; + + JsonValue *result = json_create_object(); + if (!result) + return NULL; + + int version = get_boot_version(msg); + uint8_t byte5 = extract_byte(msg->value, 5); + uint8_t byte4 = extract_byte(msg->value, 4); + + json_object_set(result, "error_type", json_create_string(RAS_DECODE_ERROR_TYPE_HBM_BIST_TEST)); + json_object_set(result, "socket", json_create_number(get_socket(msg, version))); + json_object_set(result, "aid", json_create_number(get_aid(msg, version))); + json_object_set(result, "hbm_stack", json_create_number(decode_hbm_stack(byte5))); + json_object_set(result, "hbm_channel", json_create_number(byte4)); + + return result; +} + +JsonValue *decode_boot_controller_generic_v1(OamBootMsg *msg) +{ + if (!msg) + return NULL; + + JsonValue *result = json_create_object(); + if (!result) + return NULL; + + int version = get_boot_version(msg); + uint8_t byte4 = extract_byte(msg->value, 4); + uint8_t byte0 = extract_byte(msg->value, 0); + uint8_t byte5 = extract_byte(msg->value, 5); + uint8_t byte6 = extract_byte(msg->value, 6); + uint8_t byte7 = extract_byte(msg->value, 7); + + char *boot_step_str = 
create_hex_string(byte4, 2); + uint32_t boot_status = (byte7 << 24) | (byte6 << 16) | (byte5 << 8) | byte0; + char *boot_status_str = create_hex_string(boot_status, 8); + + json_object_set(result, "error_type", json_create_string(RAS_DECODE_ERROR_TYPE_BOOT_CONTROLLER_GENERIC)); + json_object_set(result, "socket", json_create_number(get_socket(msg, version))); + json_object_set(result, "aid", json_create_number(get_aid(msg, version))); + json_object_set(result, "last_successful_boot_step_number", + json_create_string(boot_step_str ? boot_step_str : "0x00")); + json_object_set(result, "fw_boot_status", + json_create_string(boot_status_str ? boot_status_str : "0x00000000")); + + free(boot_step_str); + free(boot_status_str); + return result; +} + +JsonValue *decode_data_abort_v1(OamBootMsg *msg) +{ + if (!msg) + return NULL; + + JsonValue *result = json_create_object(); + if (!result) + return NULL; + + int version = get_boot_version(msg); + uint8_t byte3 = extract_byte(msg->value, 3); + uint8_t byte4 = extract_byte(msg->value, 4); + uint8_t byte5 = extract_byte(msg->value, 5); + uint8_t byte6 = extract_byte(msg->value, 6); + uint8_t byte7 = extract_byte(msg->value, 7); + + char *boot_step_str = create_hex_string(byte3, 2); + uint32_t exception_addr = (byte7 << 24) | (byte6 << 16) | (byte5 << 8) | byte4; + char *exception_addr_str = create_hex_string(exception_addr, 8); + + json_object_set(result, "error_type", json_create_string(RAS_DECODE_ERROR_TYPE_BOOT_CONTROLLER_DATA_ABORT)); + json_object_set(result, "socket", json_create_number(get_socket(msg, version))); + json_object_set(result, "aid", json_create_number(get_aid(msg, version))); + json_object_set(result, "last_successful_boot_step_number", + json_create_string(boot_step_str ? boot_step_str : "0x00")); + json_object_set(result, "exception_address", + json_create_string(exception_addr_str ? 
exception_addr_str : "0x00000000")); + + free(boot_step_str); + free(exception_addr_str); + return result; +} + +JsonValue *decode_boot_success_v1(OamBootMsg *msg) +{ + if (!msg) + return NULL; + + JsonValue *result = json_create_object(); + if (!result) + return NULL; + + uint8_t byte4 = extract_byte(msg->value, 4); + uint8_t byte0 = extract_byte(msg->value, 0); + uint8_t byte5 = extract_byte(msg->value, 5); + uint8_t byte6 = extract_byte(msg->value, 6); + uint8_t byte7 = extract_byte(msg->value, 7); + + char *boot_step_str = create_hex_string(byte4, 2); + uint32_t boot_status = (byte7 << 24) | (byte6 << 16) | (byte5 << 8) | byte0; + char *boot_status_str = create_hex_string(boot_status, 8); + + json_object_set(result, "error_type", json_create_string(RAS_DECODE_ERROR_TYPE_BOOT_SUCCESS)); + json_object_set(result, "last_successful_boot_step_number", + json_create_string(boot_step_str ? boot_step_str : "0x00")); + json_object_set(result, "fw_boot_status", + json_create_string(boot_status_str ? 
boot_status_str : "0x00000000")); + + free(boot_step_str); + free(boot_status_str); + return result; +} + +// Unhandled error decoders +JsonValue *decode_unhandled_error_v0(OamBootMsg *msg) +{ + (void)msg; // Suppress unused parameter warning + JsonValue *result = json_create_object(); + if (!result) + return NULL; + + json_object_set(result, "error_type", json_create_string(RAS_DECODE_ERROR_TYPE_UNHANDLED)); + + return result; +} + +JsonValue *decode_unhandled_error_v1(OamBootMsg *msg) +{ + if (!msg) + return NULL; + + JsonValue *result = json_create_object(); + if (!result) + return NULL; + + uint8_t byte4 = extract_byte(msg->value, 4); + uint8_t byte0 = extract_byte(msg->value, 0); + uint8_t byte5 = extract_byte(msg->value, 5); + uint8_t byte6 = extract_byte(msg->value, 6); + uint8_t byte7 = extract_byte(msg->value, 7); + + char *boot_step_str = create_hex_string(byte4, 2); + uint32_t boot_status = (byte7 << 24) | (byte6 << 16) | (byte5 << 8) | byte0; + char *boot_status_str = create_hex_string(boot_status, 8); + + json_object_set(result, "error_type", json_create_string(RAS_DECODE_ERROR_TYPE_UNHANDLED)); + json_object_set(result, "last_successful_boot_step_number", + json_create_string(boot_step_str ? boot_step_str : "0x00")); + json_object_set(result, "fw_boot_status", + json_create_string(boot_status_str ? boot_status_str : "0x00000000")); + + free(boot_step_str); + free(boot_status_str); + return result; +} + +boot_decoder_func_t get_decoder_function(OamBootMsg *msg) +{ + if (!msg) + return NULL; + + uint8_t byte0 = extract_byte(msg->value, 0); + if (byte0 == BOOT_IN_BOOT_MARKER) + { + int version = get_boot_version(msg); + if (version == 1) + { + return decode_boot_success_v1; + } + } + + int version = get_boot_version(msg); + int encoding = get_error_encoding(msg); + + const boot_decoder_entry_t *decoder_map = (version == 0) ? 
boot_decoder_map_v0 : boot_decoder_map_v1; + + for (int i = 0; decoder_map[i].decoder != NULL; i++) + { + if (decoder_map[i].encoding == encoding) + { + return decoder_map[i].decoder; + } + } + + return NULL; // No decoder found +} + +JsonValue *boot_decode_orchestrator(uint64_t *oam_boot_msgs, size_t count) +{ + if (!oam_boot_msgs || count == 0) + return NULL; + + JsonValue *results = json_create_object(); + if (!results) + return NULL; + + // Convert to OamBootMsg structures + OamBootMsg *msgs = malloc(count * sizeof(OamBootMsg)); + if (!msgs) + { + json_free(results); + return NULL; + } + + for (size_t i = 0; i < count; i++) + { + msgs[i].value = oam_boot_msgs[i]; + } + + // Check error markers across all messages + size_t messages_with_markers = 0; + bool *has_marker = malloc(count * sizeof(bool)); + if (!has_marker) { + free(msgs); + json_free(results); + return NULL; + } + + // Count messages with error markers (0xA4) or boot markers (0xBA) + for (size_t i = 0; i < count; i++) + { + has_marker[i] = error_present(&msgs[i]) || in_boot(&msgs[i]); + if (has_marker[i]) { + messages_with_markers++; + } + } + + // Determine decoding strategy based on the presence of error markers + bool decode_all_as_unhandled = (messages_with_markers == 0); + bool decode_only_marked = (messages_with_markers > 0 && messages_with_markers < count); + bool decode_all_normally = (messages_with_markers == count); + + // Check if all decoders are NULL (for unhandled error handling) + bool all_decoders_none = true; + if (!decode_all_as_unhandled) { + for (size_t i = 0; i < count; i++) + { + if (has_marker[i] && get_decoder_function(&msgs[i]) != NULL) + { + all_decoders_none = false; + break; + } + } + } + + // Process each message + for (size_t i = 0; i < count; i++) + { + char msg_key[32]; + snprintf(msg_key, sizeof(msg_key), "msg%zu", i); + + // Skip messages without markers if we're in selective decode mode + if (decode_only_marked && !has_marker[i]) { + continue; + } + + JsonValue 
*msg_result = json_create_object(); + if (!msg_result) + continue; + + boot_decoder_func_t decoder_func = NULL; + + if (decode_all_as_unhandled) + { + // Rule 3: No messages have markers, decode all as UNHANDLED + decoder_func = decode_unhandled_error_v1; + } + else if (has_marker[i] || decode_all_normally) + { + // Rule 1 & 2: Decode messages with markers (or all if all have markers) + if (all_decoders_none) + { + // Use unhandled error decoders + int encoding = get_error_encoding(&msgs[i]); + decoder_func = (encoding == 0) ? decode_unhandled_error_v0 : decode_unhandled_error_v1; + } + else + { + decoder_func = get_decoder_function(&msgs[i]); + } + } + // If no decoder function is found, skip this message + + if (decoder_func) + { + JsonValue *decoded = decoder_func(&msgs[i]); + if (decoded) + { + // Copy all fields from decoded result to msg_result + for (JsonPair *pair = decoded->data.object; pair != NULL; pair = pair->next) + { + // Create a copy of the value for the new object + JsonValue *value_copy = NULL; + switch (pair->value->type) + { + case JSON_STRING: + value_copy = json_create_string(pair->value->data.string); + break; + case JSON_NUMBER: + value_copy = json_create_number(pair->value->data.number); + break; + case JSON_BOOL: + value_copy = json_create_bool(pair->value->data.boolean); + break; + case JSON_NULL: + value_copy = json_create_null(); + break; + case JSON_ARRAY: + // For arrays, we need to copy each element + value_copy = json_create_array(); + if (value_copy) + { + for (size_t j = 0; j < pair->value->data.array.count; j++) + { + JsonValue *elem = pair->value->data.array.items[j]; + JsonValue *elem_copy = NULL; + if (elem->type == JSON_NUMBER) + { + elem_copy = json_create_number(elem->data.number); + } + if (elem_copy) + { + json_array_push(value_copy, elem_copy); + } + } + } + break; + default: + break; + } + + if (value_copy) + { + json_object_set(msg_result, pair->key, value_copy); + } + } + json_free(decoded); + } + } + + 
json_object_set(results, msg_key, msg_result); + } + + free(msgs); + free(has_marker); + return results; +} diff --git a/projects/amdsmi/src/ras-decode/error_map.c b/projects/amdsmi/src/ras-decode/error_map.c new file mode 100644 index 0000000000..4acc9fb11a --- /dev/null +++ b/projects/amdsmi/src/ras-decode/error_map.c @@ -0,0 +1,88 @@ +/* + * Copyright (c) Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include "error_map.h" +#include "ras_decode_constants.h" +#include + +#define AFID_VERSION "0.7" + +static const error_map_entry_t error_map[] = { + {1, RAS_DECODE_CATEGORY_BOOT_TIME_ERRORS, RAS_DECODE_ERROR_TYPE_FW_LOAD, RAS_DECODE_PROTOCOL_CPER, RAS_DECODE_SEVERITY_FAIL_TO_INIT}, + {2, RAS_DECODE_CATEGORY_BOOT_TIME_ERRORS, RAS_DECODE_ERROR_TYPE_HBM_BIST_TEST, RAS_DECODE_PROTOCOL_CPER, RAS_DECODE_SEVERITY_FAIL_TO_INIT}, + {3, RAS_DECODE_CATEGORY_BOOT_TIME_ERRORS, RAS_DECODE_ERROR_TYPE_HBM_MEMORY_TEST, RAS_DECODE_PROTOCOL_CPER, RAS_DECODE_SEVERITY_FAIL_TO_INIT}, + {4, RAS_DECODE_CATEGORY_BOOT_TIME_ERRORS, RAS_DECODE_ERROR_TYPE_HBM_TRAINING, RAS_DECODE_PROTOCOL_CPER, RAS_DECODE_SEVERITY_FAIL_TO_INIT}, + {5, RAS_DECODE_CATEGORY_BOOT_TIME_ERRORS, RAS_DECODE_ERROR_TYPE_UNHANDLED, RAS_DECODE_PROTOCOL_CPER, RAS_DECODE_SEVERITY_FAIL_TO_INIT}, + {6, RAS_DECODE_CATEGORY_BOOT_TIME_ERRORS, RAS_DECODE_ERROR_TYPE_UNKNOWN_ERROR, RAS_DECODE_PROTOCOL_CPER, RAS_DECODE_SEVERITY_FAIL_TO_INIT}, + {7, RAS_DECODE_CATEGORY_BOOT_TIME_ERRORS, RAS_DECODE_ERROR_TYPE_USR_CP_LINK_TRAINING, RAS_DECODE_PROTOCOL_CPER, RAS_DECODE_SEVERITY_FAIL_TO_INIT}, + {8, RAS_DECODE_CATEGORY_BOOT_TIME_ERRORS, RAS_DECODE_ERROR_TYPE_USR_DP_LINK_TRAINING, RAS_DECODE_PROTOCOL_CPER, RAS_DECODE_SEVERITY_FAIL_TO_INIT}, + {9, RAS_DECODE_CATEGORY_BOOT_TIME_ERRORS, RAS_DECODE_ERROR_TYPE_WAFL_LINK_TRAINING, RAS_DECODE_PROTOCOL_CPER, RAS_DECODE_SEVERITY_FAIL_TO_INIT}, + {10, RAS_DECODE_CATEGORY_BOOT_TIME_ERRORS, RAS_DECODE_ERROR_TYPE_XGMI_LINK_TRAINING, RAS_DECODE_PROTOCOL_CPER, RAS_DECODE_SEVERITY_FAIL_TO_INIT}, + {11, RAS_DECODE_CATEGORY_BOOT_TIME_ERRORS, RAS_DECODE_ERROR_TYPE_BOOT_CONTROLLER_DATA_ABORT, RAS_DECODE_PROTOCOL_CPER, RAS_DECODE_SEVERITY_FAIL_TO_INIT}, + {12, RAS_DECODE_CATEGORY_BOOT_TIME_ERRORS, RAS_DECODE_ERROR_TYPE_BOOT_CONTROLLER_GENERIC, RAS_DECODE_PROTOCOL_CPER_WITH_SPACE, RAS_DECODE_SEVERITY_FAIL_TO_INIT}, + {13, RAS_DECODE_CATEGORY_OFF_PACKAGE_LINK_ERRORS, RAS_DECODE_ERROR_TYPE_PCIE_AER, 
RAS_DECODE_PROTOCOL_CPER, RAS_DECODE_SEVERITY_CORRECTED}, + {14, RAS_DECODE_CATEGORY_OFF_PACKAGE_LINK_ERRORS, RAS_DECODE_ERROR_TYPE_PCIE_AER, RAS_DECODE_PROTOCOL_CPER, RAS_DECODE_SEVERITY_FATAL}, + {15, RAS_DECODE_CATEGORY_OFF_PACKAGE_LINK_ERRORS, RAS_DECODE_ERROR_TYPE_WAFL, RAS_DECODE_PROTOCOL_CPER, RAS_DECODE_SEVERITY_CORRECTED}, + {16, RAS_DECODE_CATEGORY_OFF_PACKAGE_LINK_ERRORS, RAS_DECODE_ERROR_TYPE_WAFL, RAS_DECODE_PROTOCOL_CPER, RAS_DECODE_SEVERITY_FATAL}, + {17, RAS_DECODE_CATEGORY_OFF_PACKAGE_LINK_ERRORS, RAS_DECODE_ERROR_TYPE_XGMI, RAS_DECODE_PROTOCOL_CPER, RAS_DECODE_SEVERITY_CORRECTED}, + {18, RAS_DECODE_CATEGORY_OFF_PACKAGE_LINK_ERRORS, RAS_DECODE_ERROR_TYPE_XGMI, RAS_DECODE_PROTOCOL_CPER, RAS_DECODE_SEVERITY_FATAL}, + {19, RAS_DECODE_CATEGORY_HBM_ERRORS, RAS_DECODE_ERROR_TYPE_BAD_PAGE_RETIREMENT_THRESHOLD, RAS_DECODE_PROTOCOL_CPER, RAS_DECODE_SEVERITY_FATAL}, + {20, RAS_DECODE_CATEGORY_HBM_ERRORS, RAS_DECODE_ERROR_TYPE_ON_DIE_ECC, RAS_DECODE_PROTOCOL_CPER, RAS_DECODE_SEVERITY_FATAL}, + {21, RAS_DECODE_CATEGORY_HBM_ERRORS, RAS_DECODE_ERROR_TYPE_END_TO_END_CRC, RAS_DECODE_PROTOCOL_CPER, RAS_DECODE_SEVERITY_FATAL}, + {22, RAS_DECODE_CATEGORY_HBM_ERRORS, RAS_DECODE_ERROR_TYPE_ON_DIE_ECC, RAS_DECODE_PROTOCOL_CPER, RAS_DECODE_SEVERITY_UNCORRECTED_NON_FATAL}, + {23, RAS_DECODE_CATEGORY_HBM_ERRORS, RAS_DECODE_ERROR_TYPE_END_TO_END_CRC, RAS_DECODE_PROTOCOL_CPER, RAS_DECODE_SEVERITY_UNCORRECTED_NON_FATAL}, + {24, RAS_DECODE_CATEGORY_HBM_ERRORS, RAS_DECODE_ERROR_TYPE_ALL, RAS_DECODE_PROTOCOL_CPER, RAS_DECODE_SEVERITY_CORRECTED}, + {25, RAS_DECODE_CATEGORY_HBM_ERRORS, RAS_DECODE_ERROR_TYPE_ALL_OTHERS, RAS_DECODE_PROTOCOL_CPER, RAS_DECODE_SEVERITY_FATAL}, + {26, RAS_DECODE_CATEGORY_DEVICE_INTERNAL_ERRORS, RAS_DECODE_ERROR_TYPE_HARDWARE_ASSERTION, RAS_DECODE_PROTOCOL_CPER, RAS_DECODE_SEVERITY_FATAL}, + {27, RAS_DECODE_CATEGORY_DEVICE_INTERNAL_ERRORS, RAS_DECODE_ERROR_TYPE_WATCHDOG_TIMEOUT, RAS_DECODE_PROTOCOL_CPER, RAS_DECODE_SEVERITY_FATAL}, + {28, 
RAS_DECODE_CATEGORY_DEVICE_INTERNAL_ERRORS, RAS_DECODE_ERROR_TYPE_ALL_OTHERS, RAS_DECODE_PROTOCOL_CPER, RAS_DECODE_SEVERITY_UNCORRECTED_NON_FATAL}, + {29, RAS_DECODE_CATEGORY_DEVICE_INTERNAL_ERRORS, RAS_DECODE_ERROR_TYPE_ALL_OTHERS, RAS_DECODE_PROTOCOL_CPER, RAS_DECODE_SEVERITY_CORRECTED}, + {30, RAS_DECODE_CATEGORY_DEVICE_INTERNAL_ERRORS, RAS_DECODE_ERROR_TYPE_ALL_OTHERS, RAS_DECODE_PROTOCOL_CPER, RAS_DECODE_SEVERITY_FATAL}, + {31, RAS_DECODE_CATEGORY_CPER_FORMAT, RAS_DECODE_ERROR_TYPE_MALFORMED_CPER, RAS_DECODE_PROTOCOL_CPER, RAS_DECODE_SEVERITY_ALL_CAPS}, + {32, RAS_DECODE_CATEGORY_CPER_FORMAT, RAS_DECODE_ERROR_TYPE_INCOMPLETE_RAS_DECODE_DATA, RAS_DECODE_PROTOCOL_CPER, RAS_DECODE_SEVERITY_ALL_CAPS}, + {33, RAS_DECODE_CATEGORY_CPER_FORMAT, RAS_DECODE_ERROR_TYPE_INVALID_RAS_DECODE_DATA, RAS_DECODE_PROTOCOL_CPER, RAS_DECODE_SEVERITY_ALL_CAPS}, + {34, RAS_DECODE_CATEGORY_UNIDENTIFIED_ERRORS, RAS_DECODE_ERROR_TYPE_UNIDENTIFIED_ERROR, RAS_DECODE_PROTOCOL_CPER, RAS_DECODE_SEVERITY_ALL_CAPS}}; + +static const size_t NUM_ERROR_ENTRIES = sizeof(error_map) / sizeof(error_map[0]); + +int get_error_id(const char *error_category, const char *error_type, const char *error_severity) +{ + if (!error_category || !error_type || !error_severity || + strcmp(error_category, RAS_DECODE_SEVERITY_UNKNOWN) == 0 || + strcmp(error_type, RAS_DECODE_SEVERITY_UNKNOWN) == 0 || + strcmp(error_severity, RAS_DECODE_SEVERITY_UNKNOWN) == 0) + { + return RAS_DECODE_ERROR_INVALID_RAS_DECODE_DATA_ID; // Return ID for "Invalid Error" if any input is "UNKNOWN" or NULL + } + + for (size_t i = 0; i < NUM_ERROR_ENTRIES; i++) + { + if (strcmp(error_map[i].error_category, error_category) == 0 && + strcmp(error_map[i].error_type, error_type) == 0 && + strcmp(error_map[i].error_severity, error_severity) == 0) + { + return (int)error_map[i].id; + } + } + + return RAS_DECODE_ERROR_UNIDENTIFIED_ERROR_ID; // Return ID for "Unidentified Errors" if no match found +} diff --git 
a/projects/amdsmi/src/ras-decode/json_printer.c b/projects/amdsmi/src/ras-decode/json_printer.c new file mode 100644 index 0000000000..a63c72d30d --- /dev/null +++ b/projects/amdsmi/src/ras-decode/json_printer.c @@ -0,0 +1,74 @@ +/* + * Copyright (c) Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "json_printer.h" +#include + +static void print_json_value_internal(JsonValue *value, int indent) { + if (!value) return; + + switch (value->type) { + case JSON_NULL: + printf("null"); + break; + case JSON_BOOL: + printf("%s", value->data.boolean ? "true" : "false"); + break; + case JSON_NUMBER: + printf("%.0f", value->data.number); + break; + case JSON_STRING: + printf("\"%s\"", value->data.string ? 
value->data.string : ""); + break; + case JSON_OBJECT: { + printf("{\n"); + JsonPair *pair = value->data.object; + bool first = true; + while (pair) { + if (!first) printf(",\n"); + for (int i = 0; i < indent + 3; i++) printf(" "); + printf("\"%s\": ", pair->key); + print_json_value_internal(pair->value, indent + 3); + pair = pair->next; + first = false; + } + printf("\n"); + for (int i = 0; i < indent; i++) printf(" "); + printf("}"); + break; + } + case JSON_ARRAY: { + printf("["); + for (size_t i = 0; i < value->data.array.count; i++) { + if (i > 0) printf(", "); + print_json_value_internal(value->data.array.items[i], indent); + } + printf("]"); + break; + } + } +} + +void print_json_value(JsonValue *value) { + print_json_value_internal(value, 0); + printf("\n"); +} diff --git a/projects/amdsmi/src/ras-decode/json_util.c b/projects/amdsmi/src/ras-decode/json_util.c new file mode 100644 index 0000000000..3e57a0d11a --- /dev/null +++ b/projects/amdsmi/src/ras-decode/json_util.c @@ -0,0 +1,205 @@ +/* + * Copyright (c) Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "json_util.h" +#include +#include +#include + +#define JSON_ARRAY_INITIAL_CAPACITY 16 + +JsonValue* json_create_null(void) { + JsonValue *val = calloc(1, sizeof(JsonValue)); + if (!val) return NULL; + val->type = JSON_NULL; + return val; +} + +JsonValue* json_create_bool(bool b) { + JsonValue *val = calloc(1, sizeof(JsonValue)); + if (!val) return NULL; + val->type = JSON_BOOL; + val->data.boolean = b; + return val; +} + +JsonValue* json_create_number(double num) { + JsonValue *val = calloc(1, sizeof(JsonValue)); + if (!val) return NULL; + val->type = JSON_NUMBER; + val->data.number = num; + return val; +} + +JsonValue* json_create_string(const char *str) { + if (!str) return NULL; + + JsonValue *val = calloc(1, sizeof(JsonValue)); + if (!val) return NULL; + + val->type = JSON_STRING; + val->data.string = strdup(str); + if (!val->data.string) { + free(val); + return NULL; + } + return val; +} + +JsonValue* json_create_object(void) { + JsonValue *val = calloc(1, sizeof(JsonValue)); + if (!val) return NULL; + val->type = JSON_OBJECT; + val->data.object = NULL; + return val; +} + +JsonValue* json_create_array(void) { + JsonValue *val = calloc(1, sizeof(JsonValue)); + if (!val) return NULL; + + val->type = JSON_ARRAY; + val->data.array.items = malloc(sizeof(JsonValue*) * JSON_ARRAY_INITIAL_CAPACITY); + if (!val->data.array.items) { + free(val); + return NULL; + } + val->data.array.count = 0; + val->data.array.capacity = JSON_ARRAY_INITIAL_CAPACITY; + return val; +} + +void json_object_set(JsonValue *obj, const char *key, JsonValue *value) { + if (!obj || obj->type != JSON_OBJECT || !key || !value) return; + + // Check if key already exists and update it + JsonPair *current = 
obj->data.object; + while (current) { + if (strcmp(current->key, key) == 0) { + json_free(current->value); + current->value = value; + return; + } + current = current->next; + } + + // Key doesn't exist, create new pair + JsonPair *pair = malloc(sizeof(JsonPair)); + if (!pair) return; + + pair->key = strdup(key); + if (!pair->key) { + free(pair); + return; + } + + pair->value = value; + pair->next = NULL; + + if (!obj->data.object) { + obj->data.object = pair; + } else { + JsonPair *last = obj->data.object; + while (last->next) { + last = last->next; + } + last->next = pair; + } +} + +JsonValue* json_object_get(JsonValue *obj, const char *key) { + if (!obj || obj->type != JSON_OBJECT || !key) return NULL; + + JsonPair *current = obj->data.object; + while (current) { + if (strcmp(current->key, key) == 0) { + return current->value; + } + current = current->next; + } + return NULL; +} + +bool json_object_has_key(JsonValue *obj, const char *key) { + return json_object_get(obj, key) != NULL; +} + +bool json_array_push(JsonValue *arr, JsonValue *value) { + if (!arr || arr->type != JSON_ARRAY || !value) return false; + + // Resize array if needed + if (arr->data.array.count >= arr->data.array.capacity) { + size_t new_capacity = arr->data.array.capacity * 2; + JsonValue **new_items = realloc(arr->data.array.items, + sizeof(JsonValue*) * new_capacity); + if (!new_items) return false; + + arr->data.array.items = new_items; + arr->data.array.capacity = new_capacity; + } + + arr->data.array.items[arr->data.array.count] = value; + arr->data.array.count++; + return true; +} + +JsonValue* json_array_get(JsonValue *arr, size_t index) { + if (!arr || arr->type != JSON_ARRAY || index >= arr->data.array.count) { + return NULL; + } + return arr->data.array.items[index]; +} + +size_t json_array_size(JsonValue *arr) { + if (!arr || arr->type != JSON_ARRAY) return 0; + return arr->data.array.count; +} + +void json_free(JsonValue *val) { + if (!val) return; + + switch (val->type) { + case 
JSON_STRING: + free(val->data.string); + break; + case JSON_OBJECT: { + JsonPair *current = val->data.object; + while (current) { + JsonPair *next = current->next; + free(current->key); + json_free(current->value); + free(current); + current = next; + } + break; + } + case JSON_ARRAY: + for (size_t i = 0; i < val->data.array.count; i++) { + json_free(val->data.array.items[i]); + } + free(val->data.array.items); + break; + default: + break; + } + free(val); +} diff --git a/projects/amdsmi/src/ras-decode/main.c b/projects/amdsmi/src/ras-decode/main.c new file mode 100644 index 0000000000..c0b007f859 --- /dev/null +++ b/projects/amdsmi/src/ras-decode/main.c @@ -0,0 +1,207 @@ +/* + * Copyright (c) Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +/** + * @file main.c + * @brief Demo program showing how to use the ACA decoder + * + * This is a demonstration program that shows how to use the ACA decoder + * with sample raw data to decode ACA error information. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +// Function prototype +void print_version_info(void); +void demonstrate_json_decoding(void); + +void print_version_info(void) +{ + printf("=== ACA Decoder Library Version Information ===\n"); + printf("Version: %s\n", aca_get_version_string()); + printf("Major: %d\n", aca_get_version_major()); + printf("Minor: %d\n", aca_get_version_minor()); + printf("Patch: %d\n", aca_get_version_patch()); + + aca_version_info_t version_info = aca_get_version_info(); + printf("Complete version info:\n"); + printf(" Major: %d\n", version_info.major); + printf(" Minor: %d\n", version_info.minor); + printf(" Patch: %d\n", version_info.patch); + printf(" String: %s\n", version_info.string); + printf("===============================================\n\n"); +} + +// Function to demonstrate JSON decoding functionality +void demonstrate_json_decoding(void) +{ + printf("=== ACA Decoder - JSON Output Examples ===\n"); + + // Example 1: HBM FATAL ERROR (32-byte array) + uint64_t register_array_32[RAS_DECODE_REGISTER_ARRAY_SIZE_32_BYTES] = {0xbaa000000004081b, 0x0, 0x209600090f00, 0x5d000000}; + printf("\n--- HBM FATAL ERROR (32-byte array) ---\n"); + printf("Decoded AFID: %d\n", decode_afid(register_array_32, RAS_DECODE_REGISTER_ARRAY_SIZE_32_BYTES, 0, 1, 1)); + + JsonValue *json_result_32 = decode_error_info(register_array_32, RAS_DECODE_REGISTER_ARRAY_SIZE_32_BYTES, 0, 1, 1); + if (json_result_32) { + print_json_value(json_result_32); + json_free(json_result_32); + } + + // Example 2: GC FATAL ERROR + uint64_t register_array_test[RAS_DECODE_REGISTER_ARRAY_SIZE_32_BYTES] = {0xbea00000003b0000, 0x100000029, 0x1200136430400, 0x20b}; + printf("\n--- GC FATAL ERROR ---\n"); + 
printf("Decoded AFID: %d\n", decode_afid(register_array_test, RAS_DECODE_REGISTER_ARRAY_SIZE_32_BYTES, 0, 1, 1)); + + JsonValue *json_result_test = decode_error_info(register_array_test, RAS_DECODE_REGISTER_ARRAY_SIZE_32_BYTES, 0, 1, 1); + if (json_result_test) { + print_json_value(json_result_test); + json_free(json_result_test); + } + + // Example 3: HBM CORRECTED ERROR (128-byte array) + uint64_t register_array_128[RAS_DECODE_REGISTER_ARRAY_SIZE_128_BYTES] = { + 0xffff, + 0xdc2040000000011b, + 0x0, + 0xd008000801000000, + 0x25000001ff, + 0x209600191f00, + 0xa000000, + 0x0, + 0x0, + 0x0, + 0xd008000801000000, + 0x0, + 0x0, + 0x0, + 0x0, + 0x0}; + + printf("\n--- HBM CORRECTED ERROR (128-byte array) ---\n"); + printf("Decoded AFID: %d\n", decode_afid(register_array_128, RAS_DECODE_REGISTER_ARRAY_SIZE_128_BYTES, 0, 1, 1)); + + JsonValue *json_result_128 = decode_error_info(register_array_128, RAS_DECODE_REGISTER_ARRAY_SIZE_128_BYTES, 0, 1, 1); + if (json_result_128) { + print_json_value(json_result_128); + json_free(json_result_128); + } + + // Example 4: PCS XGMI Error + uint64_t register_array_pcs_xgmi[RAS_DECODE_REGISTER_ARRAY_SIZE_128_BYTES] = { + 0xffffffff, + 0x9820000000060150, + 0x0, + 0xd008000200000000, + 0x27000001f9, + 0xe05012109201, + 0xaf812d4a000000, + 0x0, + 0x0, + 0x0, + 0x0, + 0x0, + 0x0, + 0x0, + 0x0, + 0x0}; + + printf("\n--- PCS XGMI Error ---\n"); + printf("Decoded AFID: %d\n", decode_afid(register_array_pcs_xgmi, RAS_DECODE_REGISTER_ARRAY_SIZE_128_BYTES, 0, 1, 1)); + + JsonValue *json_result_pcs = decode_error_info(register_array_pcs_xgmi, RAS_DECODE_REGISTER_ARRAY_SIZE_128_BYTES, 0, 1, 1); + if (json_result_pcs) { + print_json_value(json_result_pcs); + json_free(json_result_pcs); + } + + // Example 5: Bad page (threshold exceeded flag) + uint64_t register_array_bad_page[RAS_DECODE_REGISTER_ARRAY_SIZE_128_BYTES] = { + 0x1, + 0xb000000000000137, + 0x0, + 0x0, + 0x1ff00000002, + 0x9600000000, + 0x0, + 0x0, + 0x0, + 0x0, + 0x0, + 0x0, + 0x0, + 
0x0, + 0x0, + 0x0}; + + printf("\n--- Bad Page (Threshold Exceeded) ---\n"); + printf("Decoded AFID: %d\n", decode_afid(register_array_bad_page, RAS_DECODE_REGISTER_ARRAY_SIZE_128_BYTES, RAS_DECODE_FLAG_THRESHOLD_EXCEEDED, 1, 1)); + + JsonValue *json_result_bad_page = decode_error_info(register_array_bad_page, RAS_DECODE_REGISTER_ARRAY_SIZE_128_BYTES, RAS_DECODE_FLAG_THRESHOLD_EXCEEDED, 1, 1); + if (json_result_bad_page) { + print_json_value(json_result_bad_page); + json_free(json_result_bad_page); + } + + // Example 6: Boot Error Demo + uint64_t boot_messages[8] = { + 0x3c000228a4, // Oam0bootmsg + 0x3c001228a4, // Oam1bootmsg + 0x3c002228a4, // Oam2bootmsg + 0x3c003128a4, // Oam3bootmsg + 0x3c004328a4, // Oam4bootmsg + 0x3c005228a4, // Oam5bootmsg + 0x3c006228a4, // Oam6bootmsg + 0x3c007228a4 // Oam7bootmsg + }; + + printf("\n--- Boot Error Demo ---\n"); + printf("Decoded AFID: %d\n", decode_afid(boot_messages, sizeof(boot_messages)/sizeof(boot_messages[0]), 0, 1, 9)); + + JsonValue *json_result_boot = decode_error_info(boot_messages, sizeof(boot_messages)/sizeof(boot_messages[0]), 0, 1, 9); + if (json_result_boot) { + print_json_value(json_result_boot); + json_free(json_result_boot); + } else { + printf("Failed to decode boot messages\n"); + } + + printf("\n===========================================\n"); +} + +int main() +{ + // Display version information + print_version_info(); + + // Demonstrate the new JSON-based ACA decoding functionality + demonstrate_json_decoding(); + + return 0; +} diff --git a/projects/amdsmi/src/ras-decode/ras_decode_api.c b/projects/amdsmi/src/ras-decode/ras_decode_api.c new file mode 100644 index 0000000000..3078dbb4fa --- /dev/null +++ b/projects/amdsmi/src/ras-decode/ras_decode_api.c @@ -0,0 +1,204 @@ +/* + * Copyright (c) Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include "aca_decode.h" +#include "ras_decode_constants.h" +#include "boot_decode.h" +#include "error_map.h" +#include "json_util.h" +#include +#include +#include + +int decode_afid(const uint64_t *register_array, size_t array_len, uint32_t flag, uint16_t hw_revision, uint16_t register_context_type) +{ + if (!register_array) + { + return -1; + } + + // Use decode_error_info to get the JSON result + JsonValue *json_result = decode_error_info(register_array, array_len, flag, hw_revision, register_context_type); + if (!json_result) { + return -1; + } + + // Use the decode_error_info_afid function to extract AFID + int afid = decode_error_info_afid(json_result); + + json_free(json_result); + return afid; +} + +JsonValue* decode_error_info(const uint64_t *register_array, size_t array_len, uint32_t flag, uint16_t hw_revision, uint16_t register_context_type) +{ + if (!register_array) + { + return NULL; + } + + // Check register context type parameter + if (register_context_type == 9) + { + // For boot decode, use boot_decode_orchestrator with register_array and array_len + // Flag is not used in boot decode + return boot_decode_orchestrator((uint64_t*)register_array, array_len); + } + else if (register_context_type == 1) + { + // For ACA decode, use existing logic + aca_raw_data_t raw_data = {0}; + + if (array_len == RAS_DECODE_REGISTER_ARRAY_SIZE_32_BYTES) // 32 bytes + { + raw_data.aca_status = register_array[0]; + raw_data.aca_addr = register_array[1]; + raw_data.aca_ipid = register_array[2]; + raw_data.aca_synd = register_array[3]; + } + else if (array_len == RAS_DECODE_REGISTER_ARRAY_SIZE_128_BYTES) // 128 bytes + { + raw_data.aca_status = register_array[1]; + raw_data.aca_addr = register_array[2]; + raw_data.aca_ipid = register_array[5]; + raw_data.aca_synd = register_array[6]; + } + else + { + return NULL; // Unsupported size + } + + raw_data.flags = flag; + raw_data.hw_revision = hw_revision; + + return aca_decode(&raw_data); + } + else + { + return NULL; 
// Invalid register context type + } +} + +int decode_error_info_afid(JsonValue *error_json) +{ + if (!error_json || error_json->type != JSON_OBJECT) { + return -1; // Invalid AFID for null or invalid JSON + } + + // Check if this is MCA error + JsonValue *category_value = json_object_get(error_json, "error_category"); + JsonValue *type_value = json_object_get(error_json, "error_type"); + JsonValue *severity_value = json_object_get(error_json, "severity"); + + if (category_value && type_value && severity_value && + category_value->type == JSON_STRING && type_value->type == JSON_STRING && severity_value->type == JSON_STRING) { + const char *error_category = category_value->data.string; + const char *error_type = type_value->data.string; + const char *error_severity = severity_value->data.string; + + // Check for the specific case: HBM Errors + Bad Page Retirement Threshold + Fatal + if (strcmp(error_category, RAS_DECODE_CATEGORY_HBM_ERRORS) == 0 && + strcmp(error_type, RAS_DECODE_ERROR_TYPE_BAD_PAGE_RETIREMENT_THRESHOLD) == 0 && + strcmp(error_severity, RAS_DECODE_SEVERITY_FATAL) == 0) { + + // Use the error_type directly as service_error for this case + return get_error_id(error_category, error_type, error_severity); + } + + // For other cases, we need to determine the service_error_type based on the logic + // from get_service_error_type function + const char *service_error = NULL; + + // Extract bank if needed for service error type determination + JsonValue *bank_value = json_object_get(error_json, "bank"); + const char *error_bank = (bank_value && bank_value->type == JSON_STRING) ? 
bank_value->data.string : ""; + + if (strcmp(error_type, RAS_DECODE_ERROR_TYPE_BAD_PAGE_RETIREMENT_THRESHOLD) == 0) { + service_error = RAS_DECODE_ERROR_TYPE_BAD_PAGE_RETIREMENT_THRESHOLD; + } + else if (strcmp(error_category, RAS_DECODE_CATEGORY_HBM_ERRORS) == 0 && strcmp(error_severity, RAS_DECODE_SEVERITY_CORRECTED) == 0) { + service_error = RAS_DECODE_ERROR_TYPE_ALL; + } + else if (strcmp(error_type, "RdCrcErr") == 0) { + service_error = RAS_DECODE_ERROR_TYPE_END_TO_END_CRC; + } + else if (strcmp(error_category, RAS_DECODE_CATEGORY_HBM_ERRORS) == 0 && strcmp(error_severity, RAS_DECODE_SEVERITY_FATAL) == 0 && + strcmp(error_type, RAS_DECODE_ERROR_TYPE_ON_DIE_ECC) != 0 && strcmp(error_type, RAS_DECODE_ERROR_TYPE_END_TO_END_CRC) != 0) { + service_error = RAS_DECODE_ERROR_TYPE_ALL_OTHERS; + } + else if (strcmp(error_category, RAS_DECODE_CATEGORY_DEVICE_INTERNAL_ERRORS) == 0) { + if ((strcmp(error_severity, RAS_DECODE_SEVERITY_UNCORRECTED_NON_FATAL) == 0 || + strcmp(error_severity, RAS_DECODE_SEVERITY_CORRECTED) == 0 || + strcmp(error_severity, RAS_DECODE_SEVERITY_FATAL) == 0) && + strcmp(error_type, RAS_DECODE_ERROR_TYPE_HARDWARE_ASSERTION) != 0 && + strcmp(error_type, RAS_DECODE_ERROR_TYPE_WATCHDOG_TIMEOUT) != 0) { + service_error = RAS_DECODE_ERROR_TYPE_ALL_OTHERS; + } + } + else if (strcmp(error_category, RAS_DECODE_CATEGORY_OFF_PACKAGE_LINK_ERRORS) == 0) { + if (strcmp(error_bank, RAS_DECODE_BANK_PCS_XGMI) == 0) { + service_error = RAS_DECODE_ERROR_TYPE_XGMI; + } + else if (strcmp(error_bank, RAS_DECODE_BANK_KPX_WAFL) == 0) { + service_error = RAS_DECODE_ERROR_TYPE_WAFL; + } + } + + if (!service_error) { + service_error = error_type; // Fallback to error_type + } + + return get_error_id(error_category, service_error, error_severity); + } + + // Check if this is a boot error + // Find the first msg key to get the error_type + JsonPair *current_pair = error_json->data.object; + JsonValue *first_msg = NULL; + int lowest_msg_index = INT_MAX; + + while 
(current_pair) { + if (strncmp(current_pair->key, "msg", 3) == 0) { + // Extract the message index + int msg_index = atoi(current_pair->key + 3); + if (msg_index < lowest_msg_index) { + lowest_msg_index = msg_index; + first_msg = current_pair->value; + } + } + current_pair = current_pair->next; + } + + if (first_msg && first_msg->type == JSON_OBJECT) { + // This is a boot error - extract error_type from the first message + JsonValue *boot_error_type = json_object_get(first_msg, "error_type"); + if (boot_error_type && boot_error_type->type == JSON_STRING) { + const char *service_error = NULL; + service_error = boot_error_type->data.string; + + // For boot errors, always use Boot-Time Errors category and Fail-to-init severity + return get_error_id(RAS_DECODE_CATEGORY_BOOT_TIME_ERRORS, service_error, RAS_DECODE_SEVERITY_FAIL_TO_INIT); + } + } + + return -1; // Invalid AFID if neither MCA nor boot error format +} diff --git a/projects/amdsmi/tools/run-clang-tidy.sh b/projects/amdsmi/tools/run-clang-tidy.sh index c9f1cb4cd0..6c6bde0f06 100755 --- a/projects/amdsmi/tools/run-clang-tidy.sh +++ b/projects/amdsmi/tools/run-clang-tidy.sh @@ -44,7 +44,7 @@ fi # Find all source files mapfile -t FILES < <( - find . \( -name build -o -name .git -o -path "./src/aca-decode" -o -path "./esmi_ib_library" -o -path "./rocm_smi/include/rocm_smi/kfd_ioctl.h" \) -prune -o \ + find . \( -name build -o -name .git -o -path "./src/ras-decode-instinct-staging" -o -path "./esmi_ib_library" -o -path "./rocm_smi/include/rocm_smi/kfd_ioctl.h" \) -prune -o \ \( -name "*.cc" -o -name "*.cpp" -o -name "*.c" \) -print )