diff --git a/projects/amdsmi/include/aca-decode/aca_constants.h b/projects/amdsmi/include/aca-decode/aca_constants.h new file mode 100644 index 0000000000..a0170a4e7f --- /dev/null +++ b/projects/amdsmi/include/aca-decode/aca_constants.h @@ -0,0 +1,119 @@ +/* + * Copyright (c) Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +/** + * @file aca_constants.h + * @brief Shared constants for ACA error decoding + * + * This file contains string constants and numerical constants that are used + * across multiple source files to improve maintainability and prevent typos. 
+ */ + +#ifndef ACA_CONSTANTS_H +#define ACA_CONSTANTS_H + +/* Error severity constants */ +#define ACA_SEVERITY_UNKNOWN "UNKNOWN" +#define ACA_SEVERITY_FATAL "Fatal" +#define ACA_SEVERITY_CORRECTED "Corrected" +#define ACA_SEVERITY_UNCORRECTED_NON_FATAL "Uncorrected, Non-fatal" +#define ACA_SEVERITY_FAIL_TO_INIT "Fail-to-init" +#define ACA_SEVERITY_ALL_CAPS "ALL" + +/* Error category constants */ +#define ACA_CATEGORY_HBM_ERRORS "HBM Errors" +#define ACA_CATEGORY_DEVICE_INTERNAL_ERRORS "Device Internal Errors" +#define ACA_CATEGORY_OFF_PACKAGE_LINK_ERRORS "Off-Package Link Errors" +#define ACA_CATEGORY_BOOT_TIME_ERRORS "Boot-Time Errors" +#define ACA_CATEGORY_CPER_FORMAT "CPER Format" +#define ACA_CATEGORY_UNIDENTIFIED_ERRORS "Unidentified Errors" + +/* Common error type constants */ +#define ACA_ERROR_TYPE_ALL_OTHERS "All Others" +#define ACA_ERROR_TYPE_ALL "All" +#define ACA_ERROR_TYPE_DECODE_INAPPLICABLE "Decode Inapplicable" +#define ACA_ERROR_TYPE_BAD_PAGE_RETIREMENT_THRESHOLD "Bad Page Retirement Threshold" +#define ACA_ERROR_TYPE_HARDWARE_ASSERTION "Hardware Assertion (HWA)" +#define ACA_ERROR_TYPE_WATCHDOG_TIMEOUT "Watchdog Timeout (WDT)" +#define ACA_ERROR_TYPE_ON_DIE_ECC "On-die ECC" +#define ACA_ERROR_TYPE_END_TO_END_CRC "End-to-end CRC" +#define ACA_ERROR_TYPE_WAFL "WAFL" +#define ACA_ERROR_TYPE_XGMI "XGMI" + +/* Boot-time error type constants */ +#define ACA_ERROR_TYPE_FW_LOAD "FW Load" +#define ACA_ERROR_TYPE_HBM_BIST_TEST "HBM BIST Test" +#define ACA_ERROR_TYPE_HBM_MEMORY_TEST "HBM Memory Test" +#define ACA_ERROR_TYPE_HBM_TRAINING "HBM Training" +#define ACA_ERROR_TYPE_UNHANDLED "Unhandled" +#define ACA_ERROR_TYPE_UNKNOWN_ERROR "Unknown" +#define ACA_ERROR_TYPE_USR_CP_LINK_TRAINING "USR CP Link Training" +#define ACA_ERROR_TYPE_USR_DP_LINK_TRAINING "USR DP Link Training" +#define ACA_ERROR_TYPE_WAFL_LINK_TRAINING "WAFL Link Training" +#define ACA_ERROR_TYPE_XGMI_LINK_TRAINING "XGMI Link Training" +#define ACA_ERROR_TYPE_BOOT_CONTROLLER_DATA_ABORT 
"Boot Controller Data Abort" +#define ACA_ERROR_TYPE_BOOT_CONTROLLER_GENERIC "Boot Controller Generic" + +/* Link error type constants */ +#define ACA_ERROR_TYPE_PCIE_AER "PCIe AER" + +/* CPER format error type constants */ +#define ACA_ERROR_TYPE_MALFORMED_CPER "Malformed CPER" +#define ACA_ERROR_TYPE_INCOMPLETE_ACA_DATA "Incomplete ACA Data" +#define ACA_ERROR_TYPE_INVALID_ACA_DATA "Invalid ACA Data" +#define ACA_ERROR_TYPE_UNIDENTIFIED_ERROR "Unidentified Error" + +/* Protocol constants */ +#define ACA_PROTOCOL_CPER "CPER" +#define ACA_PROTOCOL_CPER_WITH_SPACE "CPER " + +/* Bank name strings */ +#define ACA_BANK_UMC "umc" +#define ACA_BANK_PSP "psp" +#define ACA_BANK_CS "cs" +#define ACA_BANK_PIE "pie" +#define ACA_BANK_PCS_XGMI "pcs_xgmi" +#define ACA_BANK_KPX_SERDES "kpx_serdes" +#define ACA_BANK_KPX_WAFL "kpx_wafl" + +/* Numerical constants */ +#define ACA_FLAG_THRESHOLD_EXCEEDED 0x8 +#define ACA_REGISTER_ARRAY_SIZE_32_BYTES 4 +#define ACA_REGISTER_ARRAY_SIZE_128_BYTES 16 + +/* Error code ranges */ +#define ACA_ERROR_CODE_EXT_MIN 0x3A +#define ACA_ERROR_CODE_EXT_MAX 0x3E + +/* Instance ID values for XCD and AID error decoding */ +#define ACA_INSTANCE_ID_XCD0_400 0x36430400 +#define ACA_INSTANCE_ID_XCD1_400 0x38430400 +#define ACA_INSTANCE_ID_XCD0_401 0x36430401 +#define ACA_INSTANCE_ID_XCD1_401 0x38430401 +#define ACA_INSTANCE_ID_AID_400 0x3B30400 +#define ACA_INSTANCE_ID_AID_401 0x3B30401 + +/* Error return codes */ +#define ACA_ERROR_INVALID_ACA_DATA_ID 33 +#define ACA_ERROR_UNIDENTIFIED_ERROR_ID 34 + +#endif /* ACA_CONSTANTS_H */ diff --git a/projects/amdsmi/include/aca-decode/aca_version.h b/projects/amdsmi/include/aca-decode/aca_version.h new file mode 100644 index 0000000000..53a5dc2fa9 --- /dev/null +++ b/projects/amdsmi/include/aca-decode/aca_version.h @@ -0,0 +1,95 @@ +/* + * Copyright (c) Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#ifndef ACA_VERSION_H +#define ACA_VERSION_H + +#ifdef __cplusplus +extern "C" +{ +#endif + +/** + * @brief ACA Decoder Library Version Information + * + * This header defines version constants and functions for the ACA Decoder library. 
+ * Version follows Semantic Versioning (SemVer) specification: MAJOR.MINOR.PATCH + * + * - MAJOR: Incremented for incompatible API changes + * - MINOR: Incremented for backward-compatible functionality additions + * - PATCH: Incremented for backward-compatible bug fixes + */ + +/* Version Components */ +#define ACA_VERSION_MAJOR 1 /**< Major version number */ +#define ACA_VERSION_MINOR 0 /**< Minor version number */ +#define ACA_VERSION_PATCH 0 /**< Patch version number */ + +/* Version String */ +#define ACA_VERSION_STRING "1.0.0" + + /** + * @brief Structure containing version information + */ + typedef struct + { + int major; /**< Major version number */ + int minor; /**< Minor version number */ + int patch; /**< Patch version number */ + const char *string; /**< Version string (e.g., "1.0.0") */ + } aca_version_info_t; + + /** + * @brief Get the major version number + * @return Major version number + */ + int aca_get_version_major(void); + + /** + * @brief Get the minor version number + * @return Minor version number + */ + int aca_get_version_minor(void); + + /** + * @brief Get the patch version number + * @return Patch version number + */ + int aca_get_version_patch(void); + + /** + * @brief Get the version string + * @return Pointer to version string (e.g., "1.0.0") + */ + const char *aca_get_version_string(void); + + /** + * @brief Get complete version information + * @return Structure containing all version information + */ + aca_version_info_t aca_get_version_info(void); + +#ifdef __cplusplus +} +#endif + +#endif /* ACA_VERSION_H */ diff --git a/projects/amdsmi/src/aca-decode/aca_api.c b/projects/amdsmi/src/aca-decode/aca_api.c index ed507dadab..3aa0c3ec7f 100644 --- a/projects/amdsmi/src/aca-decode/aca_api.c +++ b/projects/amdsmi/src/aca-decode/aca_api.c @@ -1,4 +1,3 @@ -// SPDX-License-Identifier: MIT /* * Copyright (c) Advanced Micro Devices, Inc. All rights reserved. * @@ -21,8 +20,8 @@ * THE SOFTWARE. 
*/ - #include "aca_decode.h" -#include +#include "aca_decode.h" +#include "aca_constants.h" int decode_afid(const uint64_t *register_array, size_t array_len, uint32_t flag, uint16_t hw_revision) { @@ -33,20 +32,21 @@ int decode_afid(const uint64_t *register_array, size_t array_len, uint32_t flag, aca_raw_data_t raw_data; - if (array_len == 4) // 32 bytes + if (array_len == ACA_REGISTER_ARRAY_SIZE_32_BYTES) // 32 bytes { raw_data.aca_status = register_array[0]; raw_data.aca_addr = register_array[1]; raw_data.aca_ipid = register_array[2]; raw_data.aca_synd = register_array[3]; } - else if (array_len == 16) // 128 bytes + else if (array_len == ACA_REGISTER_ARRAY_SIZE_128_BYTES) // 128 bytes { raw_data.aca_status = register_array[1]; raw_data.aca_addr = register_array[2]; raw_data.aca_ipid = register_array[5]; raw_data.aca_synd = register_array[6]; } + else { return -1; // Unsupported size @@ -67,32 +67,20 @@ aca_error_info_t decode_error_info(const uint64_t *register_array, size_t array_ if (!register_array) { return error_info; - } - - // Create a copy of the register array to avoid modifying the original - uint64_t converted_array[16]; - if (array_len > 16) { - return error_info; - } - - // Copy and convert the array - for (size_t i = 0; i < array_len; i++) { - converted_array[i] = le64_to_be64(register_array[i]); - } - - if (array_len == 4) // 32 bytes + } + if (array_len == ACA_REGISTER_ARRAY_SIZE_32_BYTES) // 32 bytes { - raw_data.aca_status = converted_array[0]; - raw_data.aca_addr = converted_array[1]; - raw_data.aca_ipid = converted_array[2]; - raw_data.aca_synd = converted_array[3]; + raw_data.aca_status = register_array[0]; + raw_data.aca_addr = register_array[1]; + raw_data.aca_ipid = register_array[2]; + raw_data.aca_synd = register_array[3]; } - else if (array_len == 16) // 128 bytes + else if (array_len == ACA_REGISTER_ARRAY_SIZE_128_BYTES) // 128 bytes { - raw_data.aca_status = converted_array[1]; - raw_data.aca_addr = converted_array[2]; - 
raw_data.aca_ipid = converted_array[5]; - raw_data.aca_synd = converted_array[6]; + raw_data.aca_status = register_array[1]; + raw_data.aca_addr = register_array[2]; + raw_data.aca_ipid = register_array[5]; + raw_data.aca_synd = register_array[6]; } else { diff --git a/projects/amdsmi/src/aca-decode/aca_decode.c b/projects/amdsmi/src/aca-decode/aca_decode.c index f09d7519f9..fd96c43b64 100644 --- a/projects/amdsmi/src/aca-decode/aca_decode.c +++ b/projects/amdsmi/src/aca-decode/aca_decode.c @@ -1,4 +1,3 @@ -// SPDX-License-Identifier: MIT /* * Copyright (c) Advanced Micro Devices, Inc. All rights reserved. * @@ -33,6 +32,7 @@ #include "aca_decode.h" #include "aca_tables.h" #include "error_map.h" +#include "aca_constants.h" #include /** @@ -61,18 +61,18 @@ aca_decoder_get_bank(const aca_decoder_t *decoder, const char **bank_name) static const char *get_error_severity(const aca_status_fields_t *status) { if (status->poison) - return "Uncorrected, Non-fatal"; + return ACA_SEVERITY_UNCORRECTED_NON_FATAL; if (status->pcc) - return "Fatal"; + return ACA_SEVERITY_FATAL; if (!status->pcc && status->uc && status->tcc) - return "Fatal"; + return ACA_SEVERITY_FATAL; if (!status->pcc && status->uc && !status->tcc) - return "Uncorrected, Non-fatal"; + return ACA_SEVERITY_UNCORRECTED_NON_FATAL; if (!status->pcc && !status->uc && !status->tcc && status->deferred) - return "Uncorrected, Non-fatal"; + return ACA_SEVERITY_UNCORRECTED_NON_FATAL; if (!status->pcc && !status->uc && !status->tcc && !status->deferred) - return "Corrected"; - return "UNKNOWN"; + return ACA_SEVERITY_CORRECTED; + return ACA_SEVERITY_UNKNOWN; } /** @@ -85,31 +85,31 @@ static const char *get_error_category(const char *bank, const char *error_type) { if (!bank || !error_type) { - return "UNKNOWN"; + return ACA_SEVERITY_UNKNOWN; } - if (strcmp(bank, "umc") == 0) + if (strcmp(bank, ACA_BANK_UMC) == 0) { - if (strcmp(error_type, "On-die ECC") == 0 || + if (strcmp(error_type, ACA_ERROR_TYPE_ON_DIE_ECC) == 0 || 
strcmp(error_type, "WriteDataPoisonErr") == 0 || strcmp(error_type, "AddressCommandParityErr") == 0 || strcmp(error_type, "WriteDataCrcErr") == 0 || strcmp(error_type, "EcsErr") == 0 || strcmp(error_type, "RdCrcErr") == 0 || - strcmp(error_type, "End-to-end CRC") == 0) + strcmp(error_type, ACA_ERROR_TYPE_END_TO_END_CRC) == 0) { - return "HBM Errors"; + return ACA_CATEGORY_HBM_ERRORS; } } - else if (strcmp(bank, "pcs_xgmi") == 0 || - strcmp(bank, "kpx_serdes") == 0 || - strcmp(bank, "kpx_wafl") == 0 || - (strcmp(bank, "psp") == 0 && strcmp(error_type, "WAFL") == 0)) + else if (strcmp(bank, ACA_BANK_PCS_XGMI) == 0 || + strcmp(bank, ACA_BANK_KPX_SERDES) == 0 || + strcmp(bank, ACA_BANK_KPX_WAFL) == 0 || + (strcmp(bank, ACA_BANK_PSP) == 0 && strcmp(error_type, ACA_ERROR_TYPE_WAFL) == 0)) { - return "Off-Package Link Errors"; + return ACA_CATEGORY_OFF_PACKAGE_LINK_ERRORS; } - return "Device Internal Errors"; + return ACA_CATEGORY_DEVICE_INTERNAL_ERRORS; } /** @@ -125,55 +125,55 @@ static int get_service_error_type(const char *error_category, const char *error_ const char *error_severity, const char **service_error_type) { if (!error_category || !error_type || !error_severity || !service_error_type || - strcmp(error_category, "UNKNOWN") == 0 || - strcmp(error_type, "UNKNOWN") == 0 || - strcmp(error_severity, "UNKNOWN") == 0) + strcmp(error_category, ACA_SEVERITY_UNKNOWN) == 0 || + strcmp(error_type, ACA_SEVERITY_UNKNOWN) == 0 || + strcmp(error_severity, ACA_SEVERITY_UNKNOWN) == 0) { return -1; } - if (strcmp(error_type, "Bad Page Retirement Threshold") == 0) + if (strcmp(error_type, ACA_ERROR_TYPE_BAD_PAGE_RETIREMENT_THRESHOLD) == 0) { - *service_error_type = "Bad Page Retirement Threshold"; + *service_error_type = ACA_ERROR_TYPE_BAD_PAGE_RETIREMENT_THRESHOLD; + return 0; + } + if ((strcmp(error_category, ACA_CATEGORY_HBM_ERRORS) == 0) && (strcmp(error_severity, ACA_SEVERITY_CORRECTED) == 0)) + { + *service_error_type = ACA_ERROR_TYPE_ALL; return 0; } if 
(strcmp(error_type, "RdCrcErr") == 0) { - *service_error_type = "End-to-end CRC"; + *service_error_type = ACA_ERROR_TYPE_END_TO_END_CRC; return 0; } - if ((strcmp(error_category, "HBM Errors") == 0) && (strcmp(error_severity, "Corrected") == 0)) + if ((strcmp(error_category, ACA_CATEGORY_HBM_ERRORS) == 0) && (strcmp(error_severity, ACA_SEVERITY_FATAL) == 0) && + (strcmp(error_type, ACA_ERROR_TYPE_ON_DIE_ECC) != 0) && (strcmp(error_type, ACA_ERROR_TYPE_END_TO_END_CRC) != 0)) { - *service_error_type = "All"; + *service_error_type = ACA_ERROR_TYPE_ALL_OTHERS; return 0; } - if ((strcmp(error_category, "HBM Errors") == 0) && (strcmp(error_severity, "Fatal") == 0) && - (strcmp(error_type, "On-die ECC") != 0) && (strcmp(error_type, "End-to-end CRC") != 0)) + if (strcmp(error_category, ACA_CATEGORY_DEVICE_INTERNAL_ERRORS) == 0) { - *service_error_type = "All Others"; - return 0; - } - if (strcmp(error_category, "Device Internal Errors") == 0) - { - if ((strcmp(error_severity, "Uncorrected, Non-fatal") == 0 || - strcmp(error_severity, "Corrected") == 0 || - strcmp(error_severity, "Fatal") == 0) && - strcmp(error_type, "Hardware Assertion (HWA)") != 0 && - strcmp(error_type, "Watchdog Timeout (WDT)") != 0) + if ((strcmp(error_severity, ACA_SEVERITY_UNCORRECTED_NON_FATAL) == 0 || + strcmp(error_severity, ACA_SEVERITY_CORRECTED) == 0 || + strcmp(error_severity, ACA_SEVERITY_FATAL) == 0) && + strcmp(error_type, ACA_ERROR_TYPE_HARDWARE_ASSERTION) != 0 && + strcmp(error_type, ACA_ERROR_TYPE_WATCHDOG_TIMEOUT) != 0) { - *service_error_type = "All Others"; + *service_error_type = ACA_ERROR_TYPE_ALL_OTHERS; return 0; } } - if (strcmp(error_category, "Off-Package Link Errors") == 0) + if (strcmp(error_category, ACA_CATEGORY_OFF_PACKAGE_LINK_ERRORS) == 0) { - if (strcmp(error_bank, "pcs_xgmi") == 0) + if (strcmp(error_bank, ACA_BANK_PCS_XGMI) == 0) { - *service_error_type = "XGMI"; + *service_error_type = ACA_ERROR_TYPE_XGMI; return 0; } - if (strcmp(error_bank, "kpx_wafl") == 0) + if 
(strcmp(error_bank, ACA_BANK_KPX_WAFL) == 0) { - *service_error_type = "WAFL"; + *service_error_type = ACA_ERROR_TYPE_WAFL; return 0; } } @@ -205,7 +205,7 @@ static void aca_decoder_get_error_info(const aca_decoder_t *decoder, aca_error_i result = aca_decoder_get_bank(decoder, &bank); if (result < 0) { - bank = "UNKNOWN"; + bank = ACA_SEVERITY_UNKNOWN; } info->bank_ref = bank; @@ -215,13 +215,13 @@ static void aca_decoder_get_error_info(const aca_decoder_t *decoder, aca_error_i } else { - info->instance_ref = "Decode Inapplicable"; + info->instance_ref = ACA_ERROR_TYPE_DECODE_INAPPLICABLE; } // 0b1000 indicate error threshold has been exceeded, and is always fatal - if (decoder->flags & 0x8) + if (decoder->flags & ACA_FLAG_THRESHOLD_EXCEEDED) { - info->severity_ref = "Fatal"; + info->severity_ref = ACA_SEVERITY_FATAL; } else { @@ -242,31 +242,31 @@ static void aca_decoder_get_error_info(const aca_decoder_t *decoder, aca_error_i info->aid = -1; // Invalid value } - if (decoder->status.error_code_ext >= 0x3A && decoder->status.error_code_ext <= 0x3E) + if (decoder->status.error_code_ext >= ACA_ERROR_CODE_EXT_MIN && decoder->status.error_code_ext <= ACA_ERROR_CODE_EXT_MAX) { uint32_t instance_id = decoder->ipid.instance_id_lo; uint32_t error_info = decoder->synd.error_information & 0xFF; - if ((instance_id == 0x36430400 || instance_id == 0x38430400 || - instance_id == 0x36430401 || instance_id == 0x38430401) && + if ((instance_id == ACA_INSTANCE_ID_XCD0_400 || instance_id == ACA_INSTANCE_ID_XCD1_400 || + instance_id == ACA_INSTANCE_ID_XCD0_401 || instance_id == ACA_INSTANCE_ID_XCD1_401) && find_error_in_table(xcd_error_table, NUM_XCD_ERRORS, error_info, &error_type) == 0) { info->error_type_ref = error_type; } - else if ((instance_id == 0x3B30400 || instance_id == 0x3B30401) && + else if ((instance_id == ACA_INSTANCE_ID_AID_400 || instance_id == ACA_INSTANCE_ID_AID_401) && find_error_in_table(aid_error_table, NUM_AID_ERRORS, error_info, &error_type) == 0) { 
info->error_type_ref = error_type; } else { - info->error_type_ref = "UNKNOWN"; + info->error_type_ref = ACA_SEVERITY_UNKNOWN; } } // 0b1000 indicate error threshold has been exceeded - else if (decoder->flags & 0x8) + else if (decoder->flags & ACA_FLAG_THRESHOLD_EXCEEDED) { - info->error_type_ref = "Bad Page Retirement Threshold"; + info->error_type_ref = ACA_ERROR_TYPE_BAD_PAGE_RETIREMENT_THRESHOLD; } else { @@ -276,14 +276,14 @@ static void aca_decoder_get_error_info(const aca_decoder_t *decoder, aca_error_i } else { - info->error_type_ref = "UNKNOWN"; + info->error_type_ref = ACA_SEVERITY_UNKNOWN; } } // 0b1000 indicate error threshold has been exceeded, and is always a HBM error - if (decoder->flags & 0x8) + if (decoder->flags & ACA_FLAG_THRESHOLD_EXCEEDED) { - info->category_ref = "HBM Errors"; + info->category_ref = ACA_CATEGORY_HBM_ERRORS; } else { diff --git a/projects/amdsmi/src/aca-decode/aca_tables.c b/projects/amdsmi/src/aca-decode/aca_tables.c index e9276a9c33..fc742ed39d 100644 --- a/projects/amdsmi/src/aca-decode/aca_tables.c +++ b/projects/amdsmi/src/aca-decode/aca_tables.c @@ -1,4 +1,3 @@ -// SPDX-License-Identifier: MIT /* * Copyright (c) Advanced Micro Devices, Inc. All rights reserved. 
* @@ -34,6 +33,7 @@ */ #include "aca_tables.h" +#include "aca_constants.h" #include #include #include @@ -80,19 +80,19 @@ const aca_error_type_t error_table[] = { {"cs", 0xe, "FTI_ND_ILL_REQ"}, {"cs", 0xf, "FTI_ND_ADDR_VIOL"}, {"cs", 0x10, "FTI_ND_SEC_VIOL"}, - {"cs", 0x11, "Hardware Assertion (HWA)"}, + {"cs", 0x11, ACA_ERROR_TYPE_HARDWARE_ASSERTION}, {"cs", 0x12, "ST_PRT_ERR"}, {"cs", 0x13, "ST_ECC_ERR"}, {"cs", 0x14, "ST_TXN_ERR"}, - {"pie", 0x0, "Hardware Assertion (HWA)"}, + {"pie", 0x0, ACA_ERROR_TYPE_HARDWARE_ASSERTION}, {"pie", 0x1, "CSW"}, {"pie", 0x2, "GMI"}, {"pie", 0x3, "FTI_DAT_STAT"}, {"pie", 0x4, "DEF"}, - {"pie", 0x5, "Watchdog Timeout (WDT)"}, + {"pie", 0x5, ACA_ERROR_TYPE_WATCHDOG_TIMEOUT}, {"pie", 0x6, "CNLI"}, {"pie", 0x7, "RSLVFCI"}, - {"umc", 0x0, "On-die ECC"}, + {"umc", 0x0, ACA_ERROR_TYPE_ON_DIE_ECC}, {"umc", 0x1, "WriteDataPoisonErr"}, {"umc", 0x2, "SdpParityErr"}, {"umc", 0x4, "AddressCommandParityErr"}, @@ -103,7 +103,7 @@ const aca_error_type_t error_table[] = { {"umc", 0xb, "RdCrcErr"}, {"umc", 0xd, "MpFwErr"}, {"umc", 0xe, "MpParErr"}, - {"umc", 0xf, "End-to-end CRC"}, + {"umc", 0xf, ACA_ERROR_TYPE_END_TO_END_CRC}, {"psp", 0x0, "Mp0HighSramError"}, {"psp", 0x1, "Mp0LowSramError"}, {"psp", 0x2, "Mp0IDataBank0Error"}, @@ -127,7 +127,7 @@ const aca_error_type_t error_table[] = { {"psp", 0x3b, "SRAM_EDC"}, {"psp", 0x3c, "SMN_Parity"}, {"psp", 0x3d, "SMN_Timeout"}, - {"psp", 0x3f, "WAFL"}, + {"psp", 0x3f, ACA_ERROR_TYPE_WAFL}, {"smu", 0x0, "Mp5HighSramError"}, {"smu", 0x1, "Mp5LowSramError"}, {"smu", 0x2, "Mp5DCacheAError"}, @@ -478,7 +478,7 @@ int find_bank_name(uint16_t hw_id, uint16_t aca_type, const char **bank_name) } } - *bank_name = "UNKNOWN"; + *bank_name = ACA_SEVERITY_UNKNOWN; return 1; } @@ -499,7 +499,7 @@ int find_error_type_by_bank(const char *bank, uint32_t error_code, const char ** } } - *error_type = "UNKNOWN"; + *error_type = ACA_SEVERITY_UNKNOWN; return 1; } @@ -520,7 +520,7 @@ int find_error_in_table(const 
aca_error_entry_t *table, size_t table_size, } } - *error_type = "UNKNOWN"; + *error_type = ACA_SEVERITY_UNKNOWN; return 1; } @@ -556,6 +556,6 @@ int find_instance_name(const char *bank, uint32_t instance_id_lo, const char **i } } - *instance_name = "UNKNOWN"; + *instance_name = ACA_SEVERITY_UNKNOWN; return 1; } diff --git a/projects/amdsmi/src/aca-decode/aca_version.c b/projects/amdsmi/src/aca-decode/aca_version.c new file mode 100644 index 0000000000..348039e184 --- /dev/null +++ b/projects/amdsmi/src/aca-decode/aca_version.c @@ -0,0 +1,57 @@ +/* + * Copyright (c) Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include "aca_version.h" + +/* Implementation of version functions */ + +int aca_get_version_major(void) +{ + return ACA_VERSION_MAJOR; +} + +int aca_get_version_minor(void) +{ + return ACA_VERSION_MINOR; +} + +int aca_get_version_patch(void) +{ + return ACA_VERSION_PATCH; +} + +const char *aca_get_version_string(void) +{ + return ACA_VERSION_STRING; +} + +aca_version_info_t aca_get_version_info(void) +{ + aca_version_info_t info; + + info.major = ACA_VERSION_MAJOR; + info.minor = ACA_VERSION_MINOR; + info.patch = ACA_VERSION_PATCH; + info.string = ACA_VERSION_STRING; + + return info; +} diff --git a/projects/amdsmi/src/aca-decode/error_map.c b/projects/amdsmi/src/aca-decode/error_map.c index 35694c5d0f..edbd5312a9 100644 --- a/projects/amdsmi/src/aca-decode/error_map.c +++ b/projects/amdsmi/src/aca-decode/error_map.c @@ -1,4 +1,3 @@ -// SPDX-License-Identifier: MIT /* * Copyright (c) Advanced Micro Devices, Inc. All rights reserved. * @@ -22,56 +21,57 @@ */ #include "error_map.h" +#include "aca_constants.h" #include #define AFID_VERSION "0.7" static const error_map_entry_t error_map[] = { - {1, "Boot-Time Errors", "FW Load", "CPER", "Fail-to-init"}, - {2, "Boot-Time Errors", "HBM BIST Test", "CPER", "Fail-to-init"}, - {3, "Boot-Time Errors", "HBM Memory Test", "CPER", "Fail-to-init"}, - {4, "Boot-Time Errors", "HBM Training", "CPER", "Fail-to-init"}, - {5, "Boot-Time Errors", "Unhandled", "CPER", "Fail-to-init"}, - {6, "Boot-Time Errors", "Unknown", "CPER", "Fail-to-init"}, - {7, "Boot-Time Errors", "USR CP Link Training", "CPER", "Fail-to-init"}, - {8, "Boot-Time Errors", "USR DP Link Training", "CPER", "Fail-to-init"}, - {9, "Boot-Time Errors", "WAFL Link Training", "CPER", "Fail-to-init"}, - {10, "Boot-Time Errors", "XGMI Link Training", "CPER", "Fail-to-init"}, - {11, "Boot-Time Errors", "Boot Controller Data Abort", "CPER", "Fail-to-init"}, - {12, "Boot-Time Errors", "Boot Controller Generic", "CPER ", "Fail-to-init"}, - {13, "Off-Package Link 
Errors", "PCIe AER", "CPER", "Corrected"}, - {14, "Off-Package Link Errors", "PCIe AER", "CPER", "Fatal"}, - {15, "Off-Package Link Errors", "WAFL", "CPER", "Corrected"}, - {16, "Off-Package Link Errors", "WAFL", "CPER", "Fatal"}, - {17, "Off-Package Link Errors", "XGMI", "CPER", "Corrected"}, - {18, "Off-Package Link Errors", "XGMI", "CPER", "Fatal"}, - {19, "HBM Errors", "Bad Page Retirement Threshold", "CPER", "Fatal"}, - {20, "HBM Errors", "On-die ECC", "CPER", "Fatal"}, - {21, "HBM Errors", "End-to-end CRC", "CPER", "Fatal"}, - {22, "HBM Errors", "On-die ECC", "CPER", "Uncorrected, Non-fatal"}, - {23, "HBM Errors", "End-to-end CRC", "CPER", "Uncorrected, Non-fatal"}, - {24, "HBM Errors", "All", "CPER", "Corrected"}, - {25, "HBM Errors", "All Others", "CPER", "Fatal"}, - {26, "Device Internal Errors", "Hardware Assertion (HWA)", "CPER", "Fatal"}, - {27, "Device Internal Errors", "Watchdog Timeout (WDT)", "CPER", "Fatal"}, - {28, "Device Internal Errors", "All Others", "CPER", "Uncorrected, Non-fatal"}, - {29, "Device Internal Errors", "All Others", "CPER", "Corrected"}, - {30, "Device Internal Errors", "All Others", "CPER", "Fatal"}, - {31, "CPER Format", "Malformed CPER", "CPER", "ALL"}, - {32, "CPER Format", "Incomplete ACA Data", "CPER", "ALL"}, - {33, "CPER Format", "Invalid ACA Data", "CPER", "ALL"}, - {34, "Unidentified Errors", "Unidentified Error", "CPER", "ALL"}}; + {1, ACA_CATEGORY_BOOT_TIME_ERRORS, ACA_ERROR_TYPE_FW_LOAD, ACA_PROTOCOL_CPER, ACA_SEVERITY_FAIL_TO_INIT}, + {2, ACA_CATEGORY_BOOT_TIME_ERRORS, ACA_ERROR_TYPE_HBM_BIST_TEST, ACA_PROTOCOL_CPER, ACA_SEVERITY_FAIL_TO_INIT}, + {3, ACA_CATEGORY_BOOT_TIME_ERRORS, ACA_ERROR_TYPE_HBM_MEMORY_TEST, ACA_PROTOCOL_CPER, ACA_SEVERITY_FAIL_TO_INIT}, + {4, ACA_CATEGORY_BOOT_TIME_ERRORS, ACA_ERROR_TYPE_HBM_TRAINING, ACA_PROTOCOL_CPER, ACA_SEVERITY_FAIL_TO_INIT}, + {5, ACA_CATEGORY_BOOT_TIME_ERRORS, ACA_ERROR_TYPE_UNHANDLED, ACA_PROTOCOL_CPER, ACA_SEVERITY_FAIL_TO_INIT}, + {6, ACA_CATEGORY_BOOT_TIME_ERRORS, 
ACA_ERROR_TYPE_UNKNOWN_ERROR, ACA_PROTOCOL_CPER, ACA_SEVERITY_FAIL_TO_INIT}, + {7, ACA_CATEGORY_BOOT_TIME_ERRORS, ACA_ERROR_TYPE_USR_CP_LINK_TRAINING, ACA_PROTOCOL_CPER, ACA_SEVERITY_FAIL_TO_INIT}, + {8, ACA_CATEGORY_BOOT_TIME_ERRORS, ACA_ERROR_TYPE_USR_DP_LINK_TRAINING, ACA_PROTOCOL_CPER, ACA_SEVERITY_FAIL_TO_INIT}, + {9, ACA_CATEGORY_BOOT_TIME_ERRORS, ACA_ERROR_TYPE_WAFL_LINK_TRAINING, ACA_PROTOCOL_CPER, ACA_SEVERITY_FAIL_TO_INIT}, + {10, ACA_CATEGORY_BOOT_TIME_ERRORS, ACA_ERROR_TYPE_XGMI_LINK_TRAINING, ACA_PROTOCOL_CPER, ACA_SEVERITY_FAIL_TO_INIT}, + {11, ACA_CATEGORY_BOOT_TIME_ERRORS, ACA_ERROR_TYPE_BOOT_CONTROLLER_DATA_ABORT, ACA_PROTOCOL_CPER, ACA_SEVERITY_FAIL_TO_INIT}, + {12, ACA_CATEGORY_BOOT_TIME_ERRORS, ACA_ERROR_TYPE_BOOT_CONTROLLER_GENERIC, ACA_PROTOCOL_CPER_WITH_SPACE, ACA_SEVERITY_FAIL_TO_INIT}, + {13, ACA_CATEGORY_OFF_PACKAGE_LINK_ERRORS, ACA_ERROR_TYPE_PCIE_AER, ACA_PROTOCOL_CPER, ACA_SEVERITY_CORRECTED}, + {14, ACA_CATEGORY_OFF_PACKAGE_LINK_ERRORS, ACA_ERROR_TYPE_PCIE_AER, ACA_PROTOCOL_CPER, ACA_SEVERITY_FATAL}, + {15, ACA_CATEGORY_OFF_PACKAGE_LINK_ERRORS, ACA_ERROR_TYPE_WAFL, ACA_PROTOCOL_CPER, ACA_SEVERITY_CORRECTED}, + {16, ACA_CATEGORY_OFF_PACKAGE_LINK_ERRORS, ACA_ERROR_TYPE_WAFL, ACA_PROTOCOL_CPER, ACA_SEVERITY_FATAL}, + {17, ACA_CATEGORY_OFF_PACKAGE_LINK_ERRORS, ACA_ERROR_TYPE_XGMI, ACA_PROTOCOL_CPER, ACA_SEVERITY_CORRECTED}, + {18, ACA_CATEGORY_OFF_PACKAGE_LINK_ERRORS, ACA_ERROR_TYPE_XGMI, ACA_PROTOCOL_CPER, ACA_SEVERITY_FATAL}, + {19, ACA_CATEGORY_HBM_ERRORS, ACA_ERROR_TYPE_BAD_PAGE_RETIREMENT_THRESHOLD, ACA_PROTOCOL_CPER, ACA_SEVERITY_FATAL}, + {20, ACA_CATEGORY_HBM_ERRORS, ACA_ERROR_TYPE_ON_DIE_ECC, ACA_PROTOCOL_CPER, ACA_SEVERITY_FATAL}, + {21, ACA_CATEGORY_HBM_ERRORS, ACA_ERROR_TYPE_END_TO_END_CRC, ACA_PROTOCOL_CPER, ACA_SEVERITY_FATAL}, + {22, ACA_CATEGORY_HBM_ERRORS, ACA_ERROR_TYPE_ON_DIE_ECC, ACA_PROTOCOL_CPER, ACA_SEVERITY_UNCORRECTED_NON_FATAL}, + {23, ACA_CATEGORY_HBM_ERRORS, ACA_ERROR_TYPE_END_TO_END_CRC, ACA_PROTOCOL_CPER, 
ACA_SEVERITY_UNCORRECTED_NON_FATAL}, + {24, ACA_CATEGORY_HBM_ERRORS, ACA_ERROR_TYPE_ALL, ACA_PROTOCOL_CPER, ACA_SEVERITY_CORRECTED}, + {25, ACA_CATEGORY_HBM_ERRORS, ACA_ERROR_TYPE_ALL_OTHERS, ACA_PROTOCOL_CPER, ACA_SEVERITY_FATAL}, + {26, ACA_CATEGORY_DEVICE_INTERNAL_ERRORS, ACA_ERROR_TYPE_HARDWARE_ASSERTION, ACA_PROTOCOL_CPER, ACA_SEVERITY_FATAL}, + {27, ACA_CATEGORY_DEVICE_INTERNAL_ERRORS, ACA_ERROR_TYPE_WATCHDOG_TIMEOUT, ACA_PROTOCOL_CPER, ACA_SEVERITY_FATAL}, + {28, ACA_CATEGORY_DEVICE_INTERNAL_ERRORS, ACA_ERROR_TYPE_ALL_OTHERS, ACA_PROTOCOL_CPER, ACA_SEVERITY_UNCORRECTED_NON_FATAL}, + {29, ACA_CATEGORY_DEVICE_INTERNAL_ERRORS, ACA_ERROR_TYPE_ALL_OTHERS, ACA_PROTOCOL_CPER, ACA_SEVERITY_CORRECTED}, + {30, ACA_CATEGORY_DEVICE_INTERNAL_ERRORS, ACA_ERROR_TYPE_ALL_OTHERS, ACA_PROTOCOL_CPER, ACA_SEVERITY_FATAL}, + {31, ACA_CATEGORY_CPER_FORMAT, ACA_ERROR_TYPE_MALFORMED_CPER, ACA_PROTOCOL_CPER, ACA_SEVERITY_ALL_CAPS}, + {32, ACA_CATEGORY_CPER_FORMAT, ACA_ERROR_TYPE_INCOMPLETE_ACA_DATA, ACA_PROTOCOL_CPER, ACA_SEVERITY_ALL_CAPS}, + {33, ACA_CATEGORY_CPER_FORMAT, ACA_ERROR_TYPE_INVALID_ACA_DATA, ACA_PROTOCOL_CPER, ACA_SEVERITY_ALL_CAPS}, + {34, ACA_CATEGORY_UNIDENTIFIED_ERRORS, ACA_ERROR_TYPE_UNIDENTIFIED_ERROR, ACA_PROTOCOL_CPER, ACA_SEVERITY_ALL_CAPS}}; static const size_t NUM_ERROR_ENTRIES = sizeof(error_map) / sizeof(error_map[0]); int get_error_id(const char *error_category, const char *error_type, const char *error_severity) { if (!error_category || !error_type || !error_severity || - strcmp(error_category, "UNKNOWN") == 0 || - strcmp(error_type, "UNKNOWN") == 0 || - strcmp(error_severity, "UNKNOWN") == 0) + strcmp(error_category, ACA_SEVERITY_UNKNOWN) == 0 || + strcmp(error_type, ACA_SEVERITY_UNKNOWN) == 0 || + strcmp(error_severity, ACA_SEVERITY_UNKNOWN) == 0) { - return 33; // Return ID for "Invalid Error" if any input is "UNKNOWN" or NULL + return ACA_ERROR_INVALID_ACA_DATA_ID; // Return ID for "Invalid ACA Data" if any input is "UNKNOWN" or NULL } for (size_t i = 
0; i < NUM_ERROR_ENTRIES; i++) @@ -84,5 +84,5 @@ int get_error_id(const char *error_category, const char *error_type, const char } } - return 34; // Return ID for "Unidentified Errors" if no match found + return ACA_ERROR_UNIDENTIFIED_ERROR_ID; // Return ID for "Unidentified Errors" if no match found } diff --git a/projects/amdsmi/src/aca-decode/main.c b/projects/amdsmi/src/aca-decode/main.c new file mode 100644 index 0000000000..2dfbc41426 --- /dev/null +++ b/projects/amdsmi/src/aca-decode/main.c @@ -0,0 +1,148 @@ +/* + * Copyright (c) Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +/** + * @file main.c + * @brief Demo program showing how to use the ACA decoder + * + * This is a demonstration program that shows how to use the ACA decoder + * with sample raw data to decode ACA error information. 
+ */ + +#include +#include +#include +#include +#include +#include + +// Function prototype +void print_error_info(const aca_error_info_t *info); +void print_version_info(void); + +// Function to print error info in JSON format +void print_error_info(const aca_error_info_t *info) +{ + printf("{\n"); + printf(" \"bank\": \"%s\",\n", info->bank_ref); + printf(" \"error_location\": {\n"); + printf(" \"oam\": \"%d\",\n", info->oam); + printf(" \"aid\": \"%d\",\n", info->aid); + printf(" \"instance\": \"%s\"\n", info->instance_ref); + printf(" },\n"); + printf(" \"severity\": \"%s\",\n", info->severity_ref); + printf(" \"afid\": \"%d\",\n", info->afid); + printf(" \"scrub\": \"%u\",\n", info->scrub); + printf(" \"err_ext\": \"%u\",\n", info->error_code_ext); + printf(" \"error_category\": \"%s\",\n", info->category_ref); + printf(" \"error_type\": \"%s\",\n", info->error_type_ref); + printf(" \"address\": \"0x%" PRIx64 "\",\n", info->raw_addr); + printf(" \"syndrome\": \"0x%" PRIx64 "\"\n", info->raw_synd); + printf("}\n"); +} + +// Function to print version information +void print_version_info(void) +{ + printf("=== ACA Decoder Library Version Information ===\n"); + printf("Version: %s\n", aca_get_version_string()); + printf("Major: %d\n", aca_get_version_major()); + printf("Minor: %d\n", aca_get_version_minor()); + printf("Patch: %d\n", aca_get_version_patch()); + + aca_version_info_t version_info = aca_get_version_info(); + printf("Complete version info:\n"); + printf(" Major: %d\n", version_info.major); + printf(" Minor: %d\n", version_info.minor); + printf(" Patch: %d\n", version_info.patch); + printf(" String: %s\n", version_info.string); + printf("===============================================\n\n"); +} + +int main() +{ + // Display version information + print_version_info(); + + // Sample usage of decode_afid with 32-byte register array (HBM FATAL ERROR, expected output is 4) + uint64_t register_array_32[ACA_REGISTER_ARRAY_SIZE_32_BYTES] = {0xbaa000000004081b, 
0x0, 0x209600090f00, 0x5d000000}; + int afid_32 = decode_afid(register_array_32, ACA_REGISTER_ARRAY_SIZE_32_BYTES, 0, 1); + printf("Decoded AFID (32-byte array): %d\n", afid_32); + + // Sample usage of decode_afid with 32-byte register array (GC FATAL ERROR, expected output is 3) + uint64_t register_array_test[ACA_REGISTER_ARRAY_SIZE_32_BYTES] = {0xbea00000003b0000, 0x100000029, 0x1200136430400, 0x20b}; + int afid_test = decode_afid(register_array_test, ACA_REGISTER_ARRAY_SIZE_32_BYTES, 0, 1); + printf("Decoded AFID (test array): %d\n", afid_test); + + // Sample usage of decode_afid with 128-byte register array (HBM CORRECTED ERROR, expected output is 1) + uint64_t register_array_128[ACA_REGISTER_ARRAY_SIZE_128_BYTES] = { + 0xffff, + 0xdc2040000000011b, + 0x0, + 0xd008000801000000, + 0x25000001ff, + 0x209600191f00, + 0xa000000, + 0x0, + 0x0, + 0x0, + 0xd008000801000000, + 0x0, + 0x0, + 0x0, + 0x0, + 0x0}; + int afid_128 = decode_afid(register_array_128, ACA_REGISTER_ARRAY_SIZE_128_BYTES, 0, 1); + printf("Decoded AFID (128-byte array): %d\n", afid_128); + + // sample for bad page + uint64_t register_array_bad_page[ACA_REGISTER_ARRAY_SIZE_128_BYTES] = { + 0x1, + 0xb000000000000137, + 0x0, + 0x0, + 0x1ff00000002, + 0x9600000000, + 0x0, + 0x0, + 0x0, + 0x0, + 0x0, + 0x0, + 0x0, + 0x0, + 0x0, + 0x0}; + + // when flag is 0b1000, it indicates that the error threshold has been exceeded + // and is always a HBM error. The expected output is 19. 
+ int afid_bad_page = decode_afid(register_array_bad_page, ACA_REGISTER_ARRAY_SIZE_128_BYTES, ACA_FLAG_THRESHOLD_EXCEEDED, 1); + printf("Decoded AFID (bad page): %d\n", afid_bad_page); + + const aca_error_info_t error_info_32 = decode_error_info(register_array_32, ACA_REGISTER_ARRAY_SIZE_32_BYTES, 0, 1); + print_error_info(&error_info_32); + + const aca_error_info_t error_info_128 = decode_error_info(register_array_128, ACA_REGISTER_ARRAY_SIZE_128_BYTES, 0, 1); + print_error_info(&error_info_128); + + return 0; +} diff --git a/projects/amdsmi/src/amd_smi/amd_smi_cper.cc b/projects/amdsmi/src/amd_smi/amd_smi_cper.cc index 83b3ac42e7..d00d83e204 100644 --- a/projects/amdsmi/src/amd_smi/amd_smi_cper.cc +++ b/projects/amdsmi/src/amd_smi/amd_smi_cper.cc @@ -29,7 +29,9 @@ #include #include +extern "C" { #include "aca-decode/aca_decode.h" +} #include "amd_smi/impl/amd_smi_cper.h" #include "rocm_smi/rocm_smi_logger.h"