[SWDEV-547223] RAS HBM CRC Read CE failed due to missing AFID 24
Cherry-pick of aca-decode repo changeset f9e5ad5 (HEAD -> main, origin/main, origin/HEAD): Fix bug in Corrected HBM Error being decoded as AFID 34 (#5)
[ROCm/amdsmi commit: ffca095246]
Этот коммит содержится в:
коммит произвёл
Arif, Maisam
родитель
3779562abb
Коммит
7c83dac63d
@@ -0,0 +1,119 @@
|
||||
/*
|
||||
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
* THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
* @file aca_constants.h
|
||||
* @brief Shared constants for ACA error decoding
|
||||
*
|
||||
* This file contains string constants and numerical constants that are used
|
||||
* across multiple source files to improve maintainability and prevent typos.
|
||||
*/
|
||||
|
||||
#ifndef ACA_CONSTANTS_H
#define ACA_CONSTANTS_H

/*
 * Severity strings.
 *
 * ACA_SEVERITY_UNKNOWN is also the generic "UNKNOWN" placeholder that the
 * decoder and table lookups store for an unresolved bank, error type,
 * category or instance name, not only for an unresolved severity.
 */
#define ACA_SEVERITY_UNKNOWN                "UNKNOWN"
#define ACA_SEVERITY_FATAL                  "Fatal"
#define ACA_SEVERITY_CORRECTED              "Corrected"
#define ACA_SEVERITY_UNCORRECTED_NON_FATAL  "Uncorrected, Non-fatal"
#define ACA_SEVERITY_FAIL_TO_INIT           "Fail-to-init"
#define ACA_SEVERITY_ALL_CAPS               "ALL"

/* Category strings (first text column of the AFID error map). */
#define ACA_CATEGORY_HBM_ERRORS               "HBM Errors"
#define ACA_CATEGORY_DEVICE_INTERNAL_ERRORS   "Device Internal Errors"
#define ACA_CATEGORY_OFF_PACKAGE_LINK_ERRORS  "Off-Package Link Errors"
#define ACA_CATEGORY_BOOT_TIME_ERRORS         "Boot-Time Errors"
#define ACA_CATEGORY_CPER_FORMAT              "CPER Format"
#define ACA_CATEGORY_UNIDENTIFIED_ERRORS      "Unidentified Errors"

/* Error-type strings shared by the decoder, the bank tables and the error map. */
#define ACA_ERROR_TYPE_ALL_OTHERS                     "All Others"
#define ACA_ERROR_TYPE_ALL                            "All"
#define ACA_ERROR_TYPE_DECODE_INAPPLICABLE            "Decode Inapplicable"
#define ACA_ERROR_TYPE_BAD_PAGE_RETIREMENT_THRESHOLD  "Bad Page Retirement Threshold"
#define ACA_ERROR_TYPE_HARDWARE_ASSERTION             "Hardware Assertion (HWA)"
#define ACA_ERROR_TYPE_WATCHDOG_TIMEOUT               "Watchdog Timeout (WDT)"
#define ACA_ERROR_TYPE_ON_DIE_ECC                     "On-die ECC"
#define ACA_ERROR_TYPE_END_TO_END_CRC                 "End-to-end CRC"
#define ACA_ERROR_TYPE_WAFL                           "WAFL"
#define ACA_ERROR_TYPE_XGMI                           "XGMI"

/* Error-type strings for the "Boot-Time Errors" category. */
#define ACA_ERROR_TYPE_FW_LOAD                     "FW Load"
#define ACA_ERROR_TYPE_HBM_BIST_TEST               "HBM BIST Test"
#define ACA_ERROR_TYPE_HBM_MEMORY_TEST             "HBM Memory Test"
#define ACA_ERROR_TYPE_HBM_TRAINING                "HBM Training"
#define ACA_ERROR_TYPE_UNHANDLED                   "Unhandled"
#define ACA_ERROR_TYPE_UNKNOWN_ERROR               "Unknown"
#define ACA_ERROR_TYPE_USR_CP_LINK_TRAINING        "USR CP Link Training"
#define ACA_ERROR_TYPE_USR_DP_LINK_TRAINING        "USR DP Link Training"
#define ACA_ERROR_TYPE_WAFL_LINK_TRAINING          "WAFL Link Training"
#define ACA_ERROR_TYPE_XGMI_LINK_TRAINING          "XGMI Link Training"
#define ACA_ERROR_TYPE_BOOT_CONTROLLER_DATA_ABORT  "Boot Controller Data Abort"
#define ACA_ERROR_TYPE_BOOT_CONTROLLER_GENERIC     "Boot Controller Generic"

/* Error-type string for PCIe link errors. */
#define ACA_ERROR_TYPE_PCIE_AER  "PCIe AER"

/* Error-type strings for the "CPER Format" and "Unidentified Errors" categories. */
#define ACA_ERROR_TYPE_MALFORMED_CPER       "Malformed CPER"
#define ACA_ERROR_TYPE_INCOMPLETE_ACA_DATA  "Incomplete ACA Data"
#define ACA_ERROR_TYPE_INVALID_ACA_DATA     "Invalid ACA Data"
#define ACA_ERROR_TYPE_UNIDENTIFIED_ERROR   "Unidentified Error"

/*
 * Protocol strings.
 *
 * The _WITH_SPACE variant keeps the trailing blank of the literal "CPER "
 * that the AFID 12 error-map entry has always carried, so the table contents
 * stay byte-identical.
 * NOTE(review): the trailing blank looks unintentional - confirm before
 * folding this into ACA_PROTOCOL_CPER.
 */
#define ACA_PROTOCOL_CPER             "CPER"
#define ACA_PROTOCOL_CPER_WITH_SPACE  "CPER "

/* Bank names as they appear in the bank/error lookup tables. */
#define ACA_BANK_UMC         "umc"
#define ACA_BANK_PSP         "psp"
#define ACA_BANK_CS          "cs"
#define ACA_BANK_PIE         "pie"
#define ACA_BANK_PCS_XGMI    "pcs_xgmi"
#define ACA_BANK_KPX_SERDES  "kpx_serdes"
#define ACA_BANK_KPX_WAFL    "kpx_wafl"

/*
 * Numeric constants.
 *
 * ACA_FLAG_THRESHOLD_EXCEEDED is bit 3 (0b1000) of the decoder flags; when
 * set, the error threshold has been exceeded.
 * The register array sizes are element counts of 64-bit registers:
 * 4 elements = 32 bytes, 16 elements = 128 bytes.
 */
#define ACA_FLAG_THRESHOLD_EXCEEDED        0x8
#define ACA_REGISTER_ARRAY_SIZE_32_BYTES   4
#define ACA_REGISTER_ARRAY_SIZE_128_BYTES  16

/* Inclusive range of extended error codes resolved through the XCD/AID tables. */
#define ACA_ERROR_CODE_EXT_MIN  0x3A
#define ACA_ERROR_CODE_EXT_MAX  0x3E

/* instance_id_lo values that select the XCD or the AID error table. */
#define ACA_INSTANCE_ID_XCD0_400  0x36430400
#define ACA_INSTANCE_ID_XCD1_400  0x38430400
#define ACA_INSTANCE_ID_XCD0_401  0x36430401
#define ACA_INSTANCE_ID_XCD1_401  0x38430401
#define ACA_INSTANCE_ID_AID_400   0x3B30400
#define ACA_INSTANCE_ID_AID_401   0x3B30401

/*
 * Fallback AFIDs returned by get_error_id():
 *   33 - "Invalid ACA Data": an input was NULL or "UNKNOWN"
 *   34 - "Unidentified Error": no error-map entry matched
 */
#define ACA_ERROR_INVALID_ACA_DATA_ID    33
#define ACA_ERROR_UNIDENTIFIED_ERROR_ID  34

#endif /* ACA_CONSTANTS_H */
|
||||
@@ -0,0 +1,95 @@
|
||||
/*
|
||||
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
* THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef ACA_VERSION_H
#define ACA_VERSION_H

#ifdef __cplusplus
extern "C"
{
#endif

/**
 * @brief ACA Decoder Library Version Information
 *
 * This header defines version constants and functions for the ACA Decoder library.
 * Version follows Semantic Versioning (SemVer) specification: MAJOR.MINOR.PATCH
 *
 * - MAJOR: Incremented for incompatible API changes
 * - MINOR: Incremented for backward-compatible functionality additions
 * - PATCH: Incremented for backward-compatible bug fixes
 */

/* Version Components - the single source of truth for the library version */
#define ACA_VERSION_MAJOR 1 /**< Major version number */
#define ACA_VERSION_MINOR 0 /**< Minor version number */
#define ACA_VERSION_PATCH 0 /**< Patch version number */

/*
 * Two-level stringification: the outer macro expands its argument first, so
 * ACA_VERSION_STRINGIFY(ACA_VERSION_MAJOR) yields "1" rather than
 * "ACA_VERSION_MAJOR".
 */
#define ACA_VERSION_STRINGIFY_(x) #x
#define ACA_VERSION_STRINGIFY(x) ACA_VERSION_STRINGIFY_(x)

/**
 * Version String ("MAJOR.MINOR.PATCH", currently "1.0.0").
 *
 * Built from the component macros by adjacent string-literal concatenation,
 * so it cannot drift out of sync when a component is bumped. The component
 * macros must therefore stay plain decimal integer literals (no parentheses
 * or suffixes), otherwise those characters would appear in the string.
 */
#define ACA_VERSION_STRING                       \
    ACA_VERSION_STRINGIFY(ACA_VERSION_MAJOR) "." \
    ACA_VERSION_STRINGIFY(ACA_VERSION_MINOR) "." \
    ACA_VERSION_STRINGIFY(ACA_VERSION_PATCH)

/**
 * @brief Structure containing version information
 */
typedef struct
{
    int major;          /**< Major version number */
    int minor;          /**< Minor version number */
    int patch;          /**< Patch version number */
    const char *string; /**< Version string (e.g., "1.0.0") */
} aca_version_info_t;

/**
 * @brief Get the major version number
 * @return Major version number (ACA_VERSION_MAJOR)
 */
int aca_get_version_major(void);

/**
 * @brief Get the minor version number
 * @return Minor version number (ACA_VERSION_MINOR)
 */
int aca_get_version_minor(void);

/**
 * @brief Get the patch version number
 * @return Patch version number (ACA_VERSION_PATCH)
 */
int aca_get_version_patch(void);

/**
 * @brief Get the version string
 * @return Pointer to version string (e.g., "1.0.0"). The string is a
 *         literal: it must not be modified or freed by the caller.
 */
const char *aca_get_version_string(void);

/**
 * @brief Get complete version information
 * @return Structure (by value) containing all version information; its
 *         @c string member points to the same literal returned by
 *         aca_get_version_string().
 */
aca_version_info_t aca_get_version_info(void);

#ifdef __cplusplus
}
#endif

#endif /* ACA_VERSION_H */
|
||||
@@ -1,4 +1,3 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
/*
|
||||
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
@@ -21,8 +20,8 @@
|
||||
* THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "aca_decode.h"
|
||||
#include <utils.h>
|
||||
#include "aca_decode.h"
|
||||
#include "aca_constants.h"
|
||||
|
||||
int decode_afid(const uint64_t *register_array, size_t array_len, uint32_t flag, uint16_t hw_revision)
|
||||
{
|
||||
@@ -33,20 +32,21 @@ int decode_afid(const uint64_t *register_array, size_t array_len, uint32_t flag,
|
||||
|
||||
aca_raw_data_t raw_data;
|
||||
|
||||
if (array_len == 4) // 32 bytes
|
||||
if (array_len == ACA_REGISTER_ARRAY_SIZE_32_BYTES) // 32 bytes
|
||||
{
|
||||
raw_data.aca_status = register_array[0];
|
||||
raw_data.aca_addr = register_array[1];
|
||||
raw_data.aca_ipid = register_array[2];
|
||||
raw_data.aca_synd = register_array[3];
|
||||
}
|
||||
else if (array_len == 16) // 128 bytes
|
||||
else if (array_len == ACA_REGISTER_ARRAY_SIZE_128_BYTES) // 128 bytes
|
||||
{
|
||||
raw_data.aca_status = register_array[1];
|
||||
raw_data.aca_addr = register_array[2];
|
||||
raw_data.aca_ipid = register_array[5];
|
||||
raw_data.aca_synd = register_array[6];
|
||||
}
|
||||
|
||||
else
|
||||
{
|
||||
return -1; // Unsupported size
|
||||
@@ -67,32 +67,19 @@ aca_error_info_t decode_error_info(const uint64_t *register_array, size_t array_
|
||||
if (!register_array)
|
||||
{
|
||||
return error_info;
|
||||
}
|
||||
|
||||
// Create a copy of the register array to avoid modifying the original
|
||||
uint64_t converted_array[16];
|
||||
if (array_len > 16) {
|
||||
return error_info;
|
||||
}
|
||||
|
||||
// Copy and convert the array
|
||||
for (size_t i = 0; i < array_len; i++) {
|
||||
converted_array[i] = le64_to_be64(register_array[i]);
|
||||
}
|
||||
|
||||
if (array_len == 4) // 32 bytes
|
||||
} if (array_len == ACA_REGISTER_ARRAY_SIZE_32_BYTES) // 32 bytes
|
||||
{
|
||||
raw_data.aca_status = converted_array[0];
|
||||
raw_data.aca_addr = converted_array[1];
|
||||
raw_data.aca_ipid = converted_array[2];
|
||||
raw_data.aca_synd = converted_array[3];
|
||||
raw_data.aca_status = register_array[0];
|
||||
raw_data.aca_addr = register_array[1];
|
||||
raw_data.aca_ipid = register_array[2];
|
||||
raw_data.aca_synd = register_array[3];
|
||||
}
|
||||
else if (array_len == 16) // 128 bytes
|
||||
else if (array_len == ACA_REGISTER_ARRAY_SIZE_128_BYTES) // 128 bytes
|
||||
{
|
||||
raw_data.aca_status = converted_array[1];
|
||||
raw_data.aca_addr = converted_array[2];
|
||||
raw_data.aca_ipid = converted_array[5];
|
||||
raw_data.aca_synd = converted_array[6];
|
||||
raw_data.aca_status = register_array[1];
|
||||
raw_data.aca_addr = register_array[2];
|
||||
raw_data.aca_ipid = register_array[5];
|
||||
raw_data.aca_synd = register_array[6];
|
||||
}
|
||||
else
|
||||
{
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
/*
|
||||
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
@@ -33,6 +32,7 @@
|
||||
#include "aca_decode.h"
|
||||
#include "aca_tables.h"
|
||||
#include "error_map.h"
|
||||
#include "aca_constants.h"
|
||||
#include <string.h>
|
||||
|
||||
/**
|
||||
@@ -61,18 +61,18 @@ aca_decoder_get_bank(const aca_decoder_t *decoder, const char **bank_name)
|
||||
static const char *get_error_severity(const aca_status_fields_t *status)
|
||||
{
|
||||
if (status->poison)
|
||||
return "Uncorrected, Non-fatal";
|
||||
return ACA_SEVERITY_UNCORRECTED_NON_FATAL;
|
||||
if (status->pcc)
|
||||
return "Fatal";
|
||||
return ACA_SEVERITY_FATAL;
|
||||
if (!status->pcc && status->uc && status->tcc)
|
||||
return "Fatal";
|
||||
return ACA_SEVERITY_FATAL;
|
||||
if (!status->pcc && status->uc && !status->tcc)
|
||||
return "Uncorrected, Non-fatal";
|
||||
return ACA_SEVERITY_UNCORRECTED_NON_FATAL;
|
||||
if (!status->pcc && !status->uc && !status->tcc && status->deferred)
|
||||
return "Uncorrected, Non-fatal";
|
||||
return ACA_SEVERITY_UNCORRECTED_NON_FATAL;
|
||||
if (!status->pcc && !status->uc && !status->tcc && !status->deferred)
|
||||
return "Corrected";
|
||||
return "UNKNOWN";
|
||||
return ACA_SEVERITY_CORRECTED;
|
||||
return ACA_SEVERITY_UNKNOWN;
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -85,31 +85,31 @@ static const char *get_error_category(const char *bank, const char *error_type)
|
||||
{
|
||||
if (!bank || !error_type)
|
||||
{
|
||||
return "UNKNOWN";
|
||||
return ACA_SEVERITY_UNKNOWN;
|
||||
}
|
||||
|
||||
if (strcmp(bank, "umc") == 0)
|
||||
if (strcmp(bank, ACA_BANK_UMC) == 0)
|
||||
{
|
||||
if (strcmp(error_type, "On-die ECC") == 0 ||
|
||||
if (strcmp(error_type, ACA_ERROR_TYPE_ON_DIE_ECC) == 0 ||
|
||||
strcmp(error_type, "WriteDataPoisonErr") == 0 ||
|
||||
strcmp(error_type, "AddressCommandParityErr") == 0 ||
|
||||
strcmp(error_type, "WriteDataCrcErr") == 0 ||
|
||||
strcmp(error_type, "EcsErr") == 0 ||
|
||||
strcmp(error_type, "RdCrcErr") == 0 ||
|
||||
strcmp(error_type, "End-to-end CRC") == 0)
|
||||
strcmp(error_type, ACA_ERROR_TYPE_END_TO_END_CRC) == 0)
|
||||
{
|
||||
return "HBM Errors";
|
||||
return ACA_CATEGORY_HBM_ERRORS;
|
||||
}
|
||||
}
|
||||
else if (strcmp(bank, "pcs_xgmi") == 0 ||
|
||||
strcmp(bank, "kpx_serdes") == 0 ||
|
||||
strcmp(bank, "kpx_wafl") == 0 ||
|
||||
(strcmp(bank, "psp") == 0 && strcmp(error_type, "WAFL") == 0))
|
||||
else if (strcmp(bank, ACA_BANK_PCS_XGMI) == 0 ||
|
||||
strcmp(bank, ACA_BANK_KPX_SERDES) == 0 ||
|
||||
strcmp(bank, ACA_BANK_KPX_WAFL) == 0 ||
|
||||
(strcmp(bank, ACA_BANK_PSP) == 0 && strcmp(error_type, ACA_ERROR_TYPE_WAFL) == 0))
|
||||
{
|
||||
return "Off-Package Link Errors";
|
||||
return ACA_CATEGORY_OFF_PACKAGE_LINK_ERRORS;
|
||||
}
|
||||
|
||||
return "Device Internal Errors";
|
||||
return ACA_CATEGORY_DEVICE_INTERNAL_ERRORS;
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -125,55 +125,55 @@ static int get_service_error_type(const char *error_category, const char *error_
|
||||
const char *error_severity, const char **service_error_type)
|
||||
{
|
||||
if (!error_category || !error_type || !error_severity || !service_error_type ||
|
||||
strcmp(error_category, "UNKNOWN") == 0 ||
|
||||
strcmp(error_type, "UNKNOWN") == 0 ||
|
||||
strcmp(error_severity, "UNKNOWN") == 0)
|
||||
strcmp(error_category, ACA_SEVERITY_UNKNOWN) == 0 ||
|
||||
strcmp(error_type, ACA_SEVERITY_UNKNOWN) == 0 ||
|
||||
strcmp(error_severity, ACA_SEVERITY_UNKNOWN) == 0)
|
||||
{
|
||||
return -1;
|
||||
}
|
||||
if (strcmp(error_type, "Bad Page Retirement Threshold") == 0)
|
||||
if (strcmp(error_type, ACA_ERROR_TYPE_BAD_PAGE_RETIREMENT_THRESHOLD) == 0)
|
||||
{
|
||||
*service_error_type = "Bad Page Retirement Threshold";
|
||||
*service_error_type = ACA_ERROR_TYPE_BAD_PAGE_RETIREMENT_THRESHOLD;
|
||||
return 0;
|
||||
}
|
||||
if ((strcmp(error_category, ACA_CATEGORY_HBM_ERRORS) == 0) && (strcmp(error_severity, ACA_SEVERITY_CORRECTED) == 0))
|
||||
{
|
||||
*service_error_type = ACA_ERROR_TYPE_ALL;
|
||||
return 0;
|
||||
}
|
||||
if (strcmp(error_type, "RdCrcErr") == 0)
|
||||
{
|
||||
*service_error_type = "End-to-end CRC";
|
||||
*service_error_type = ACA_ERROR_TYPE_END_TO_END_CRC;
|
||||
return 0;
|
||||
}
|
||||
if ((strcmp(error_category, "HBM Errors") == 0) && (strcmp(error_severity, "Corrected") == 0))
|
||||
if ((strcmp(error_category, ACA_CATEGORY_HBM_ERRORS) == 0) && (strcmp(error_severity, ACA_SEVERITY_FATAL) == 0) &&
|
||||
(strcmp(error_type, ACA_ERROR_TYPE_ON_DIE_ECC) != 0) && (strcmp(error_type, ACA_ERROR_TYPE_END_TO_END_CRC) != 0))
|
||||
{
|
||||
*service_error_type = "All";
|
||||
*service_error_type = ACA_ERROR_TYPE_ALL_OTHERS;
|
||||
return 0;
|
||||
}
|
||||
if ((strcmp(error_category, "HBM Errors") == 0) && (strcmp(error_severity, "Fatal") == 0) &&
|
||||
(strcmp(error_type, "On-die ECC") != 0) && (strcmp(error_type, "End-to-end CRC") != 0))
|
||||
if (strcmp(error_category, ACA_CATEGORY_DEVICE_INTERNAL_ERRORS) == 0)
|
||||
{
|
||||
*service_error_type = "All Others";
|
||||
return 0;
|
||||
}
|
||||
if (strcmp(error_category, "Device Internal Errors") == 0)
|
||||
{
|
||||
if ((strcmp(error_severity, "Uncorrected, Non-fatal") == 0 ||
|
||||
strcmp(error_severity, "Corrected") == 0 ||
|
||||
strcmp(error_severity, "Fatal") == 0) &&
|
||||
strcmp(error_type, "Hardware Assertion (HWA)") != 0 &&
|
||||
strcmp(error_type, "Watchdog Timeout (WDT)") != 0)
|
||||
if ((strcmp(error_severity, ACA_SEVERITY_UNCORRECTED_NON_FATAL) == 0 ||
|
||||
strcmp(error_severity, ACA_SEVERITY_CORRECTED) == 0 ||
|
||||
strcmp(error_severity, ACA_SEVERITY_FATAL) == 0) &&
|
||||
strcmp(error_type, ACA_ERROR_TYPE_HARDWARE_ASSERTION) != 0 &&
|
||||
strcmp(error_type, ACA_ERROR_TYPE_WATCHDOG_TIMEOUT) != 0)
|
||||
{
|
||||
*service_error_type = "All Others";
|
||||
*service_error_type = ACA_ERROR_TYPE_ALL_OTHERS;
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
if (strcmp(error_category, "Off-Package Link Errors") == 0)
|
||||
if (strcmp(error_category, ACA_CATEGORY_OFF_PACKAGE_LINK_ERRORS) == 0)
|
||||
{
|
||||
if (strcmp(error_bank, "pcs_xgmi") == 0)
|
||||
if (strcmp(error_bank, ACA_BANK_PCS_XGMI) == 0)
|
||||
{
|
||||
*service_error_type = "XGMI";
|
||||
*service_error_type = ACA_ERROR_TYPE_XGMI;
|
||||
return 0;
|
||||
}
|
||||
if (strcmp(error_bank, "kpx_wafl") == 0)
|
||||
if (strcmp(error_bank, ACA_BANK_KPX_WAFL) == 0)
|
||||
{
|
||||
*service_error_type = "WAFL";
|
||||
*service_error_type = ACA_ERROR_TYPE_WAFL;
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
@@ -205,7 +205,7 @@ static void aca_decoder_get_error_info(const aca_decoder_t *decoder, aca_error_i
|
||||
result = aca_decoder_get_bank(decoder, &bank);
|
||||
if (result < 0)
|
||||
{
|
||||
bank = "UNKNOWN";
|
||||
bank = ACA_SEVERITY_UNKNOWN;
|
||||
}
|
||||
info->bank_ref = bank;
|
||||
|
||||
@@ -215,13 +215,13 @@ static void aca_decoder_get_error_info(const aca_decoder_t *decoder, aca_error_i
|
||||
}
|
||||
else
|
||||
{
|
||||
info->instance_ref = "Decode Inapplicable";
|
||||
info->instance_ref = ACA_ERROR_TYPE_DECODE_INAPPLICABLE;
|
||||
}
|
||||
|
||||
// 0b1000 indicate error threshold has been exceeded, and is always fatal
|
||||
if (decoder->flags & 0x8)
|
||||
if (decoder->flags & ACA_FLAG_THRESHOLD_EXCEEDED)
|
||||
{
|
||||
info->severity_ref = "Fatal";
|
||||
info->severity_ref = ACA_SEVERITY_FATAL;
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -242,31 +242,31 @@ static void aca_decoder_get_error_info(const aca_decoder_t *decoder, aca_error_i
|
||||
info->aid = -1; // Invalid value
|
||||
}
|
||||
|
||||
if (decoder->status.error_code_ext >= 0x3A && decoder->status.error_code_ext <= 0x3E)
|
||||
if (decoder->status.error_code_ext >= ACA_ERROR_CODE_EXT_MIN && decoder->status.error_code_ext <= ACA_ERROR_CODE_EXT_MAX)
|
||||
{
|
||||
uint32_t instance_id = decoder->ipid.instance_id_lo;
|
||||
uint32_t error_info = decoder->synd.error_information & 0xFF;
|
||||
|
||||
if ((instance_id == 0x36430400 || instance_id == 0x38430400 ||
|
||||
instance_id == 0x36430401 || instance_id == 0x38430401) &&
|
||||
if ((instance_id == ACA_INSTANCE_ID_XCD0_400 || instance_id == ACA_INSTANCE_ID_XCD1_400 ||
|
||||
instance_id == ACA_INSTANCE_ID_XCD0_401 || instance_id == ACA_INSTANCE_ID_XCD1_401) &&
|
||||
find_error_in_table(xcd_error_table, NUM_XCD_ERRORS, error_info, &error_type) == 0)
|
||||
{
|
||||
info->error_type_ref = error_type;
|
||||
}
|
||||
else if ((instance_id == 0x3B30400 || instance_id == 0x3B30401) &&
|
||||
else if ((instance_id == ACA_INSTANCE_ID_AID_400 || instance_id == ACA_INSTANCE_ID_AID_401) &&
|
||||
find_error_in_table(aid_error_table, NUM_AID_ERRORS, error_info, &error_type) == 0)
|
||||
{
|
||||
info->error_type_ref = error_type;
|
||||
}
|
||||
else
|
||||
{
|
||||
info->error_type_ref = "UNKNOWN";
|
||||
info->error_type_ref = ACA_SEVERITY_UNKNOWN;
|
||||
}
|
||||
}
|
||||
// 0b1000 indicate error threshold has been exceeded
|
||||
else if (decoder->flags & 0x8)
|
||||
else if (decoder->flags & ACA_FLAG_THRESHOLD_EXCEEDED)
|
||||
{
|
||||
info->error_type_ref = "Bad Page Retirement Threshold";
|
||||
info->error_type_ref = ACA_ERROR_TYPE_BAD_PAGE_RETIREMENT_THRESHOLD;
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -276,14 +276,14 @@ static void aca_decoder_get_error_info(const aca_decoder_t *decoder, aca_error_i
|
||||
}
|
||||
else
|
||||
{
|
||||
info->error_type_ref = "UNKNOWN";
|
||||
info->error_type_ref = ACA_SEVERITY_UNKNOWN;
|
||||
}
|
||||
}
|
||||
|
||||
// 0b1000 indicate error threshold has been exceeded, and is always a HBM error
|
||||
if (decoder->flags & 0x8)
|
||||
if (decoder->flags & ACA_FLAG_THRESHOLD_EXCEEDED)
|
||||
{
|
||||
info->category_ref = "HBM Errors";
|
||||
info->category_ref = ACA_CATEGORY_HBM_ERRORS;
|
||||
}
|
||||
else
|
||||
{
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
/*
|
||||
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
@@ -34,6 +33,7 @@
|
||||
*/
|
||||
|
||||
#include "aca_tables.h"
|
||||
#include "aca_constants.h"
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
#include <string.h>
|
||||
@@ -80,19 +80,19 @@ const aca_error_type_t error_table[] = {
|
||||
{"cs", 0xe, "FTI_ND_ILL_REQ"},
|
||||
{"cs", 0xf, "FTI_ND_ADDR_VIOL"},
|
||||
{"cs", 0x10, "FTI_ND_SEC_VIOL"},
|
||||
{"cs", 0x11, "Hardware Assertion (HWA)"},
|
||||
{"cs", 0x11, ACA_ERROR_TYPE_HARDWARE_ASSERTION},
|
||||
{"cs", 0x12, "ST_PRT_ERR"},
|
||||
{"cs", 0x13, "ST_ECC_ERR"},
|
||||
{"cs", 0x14, "ST_TXN_ERR"},
|
||||
{"pie", 0x0, "Hardware Assertion (HWA)"},
|
||||
{"pie", 0x0, ACA_ERROR_TYPE_HARDWARE_ASSERTION},
|
||||
{"pie", 0x1, "CSW"},
|
||||
{"pie", 0x2, "GMI"},
|
||||
{"pie", 0x3, "FTI_DAT_STAT"},
|
||||
{"pie", 0x4, "DEF"},
|
||||
{"pie", 0x5, "Watchdog Timeout (WDT)"},
|
||||
{"pie", 0x5, ACA_ERROR_TYPE_WATCHDOG_TIMEOUT},
|
||||
{"pie", 0x6, "CNLI"},
|
||||
{"pie", 0x7, "RSLVFCI"},
|
||||
{"umc", 0x0, "On-die ECC"},
|
||||
{"umc", 0x0, ACA_ERROR_TYPE_ON_DIE_ECC},
|
||||
{"umc", 0x1, "WriteDataPoisonErr"},
|
||||
{"umc", 0x2, "SdpParityErr"},
|
||||
{"umc", 0x4, "AddressCommandParityErr"},
|
||||
@@ -103,7 +103,7 @@ const aca_error_type_t error_table[] = {
|
||||
{"umc", 0xb, "RdCrcErr"},
|
||||
{"umc", 0xd, "MpFwErr"},
|
||||
{"umc", 0xe, "MpParErr"},
|
||||
{"umc", 0xf, "End-to-end CRC"},
|
||||
{"umc", 0xf, ACA_ERROR_TYPE_END_TO_END_CRC},
|
||||
{"psp", 0x0, "Mp0HighSramError"},
|
||||
{"psp", 0x1, "Mp0LowSramError"},
|
||||
{"psp", 0x2, "Mp0IDataBank0Error"},
|
||||
@@ -127,7 +127,7 @@ const aca_error_type_t error_table[] = {
|
||||
{"psp", 0x3b, "SRAM_EDC"},
|
||||
{"psp", 0x3c, "SMN_Parity"},
|
||||
{"psp", 0x3d, "SMN_Timeout"},
|
||||
{"psp", 0x3f, "WAFL"},
|
||||
{"psp", 0x3f, ACA_ERROR_TYPE_WAFL},
|
||||
{"smu", 0x0, "Mp5HighSramError"},
|
||||
{"smu", 0x1, "Mp5LowSramError"},
|
||||
{"smu", 0x2, "Mp5DCacheAError"},
|
||||
@@ -478,7 +478,7 @@ int find_bank_name(uint16_t hw_id, uint16_t aca_type, const char **bank_name)
|
||||
}
|
||||
}
|
||||
|
||||
*bank_name = "UNKNOWN";
|
||||
*bank_name = ACA_SEVERITY_UNKNOWN;
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -499,7 +499,7 @@ int find_error_type_by_bank(const char *bank, uint32_t error_code, const char **
|
||||
}
|
||||
}
|
||||
|
||||
*error_type = "UNKNOWN";
|
||||
*error_type = ACA_SEVERITY_UNKNOWN;
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -520,7 +520,7 @@ int find_error_in_table(const aca_error_entry_t *table, size_t table_size,
|
||||
}
|
||||
}
|
||||
|
||||
*error_type = "UNKNOWN";
|
||||
*error_type = ACA_SEVERITY_UNKNOWN;
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -556,6 +556,6 @@ int find_instance_name(const char *bank, uint32_t instance_id_lo, const char **i
|
||||
}
|
||||
}
|
||||
|
||||
*instance_name = "UNKNOWN";
|
||||
*instance_name = ACA_SEVERITY_UNKNOWN;
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -0,0 +1,57 @@
|
||||
/*
|
||||
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
* THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "aca_version.h"
|
||||
|
||||
/* Implementation of version functions */
|
||||
|
||||
int aca_get_version_major(void)
|
||||
{
|
||||
return ACA_VERSION_MAJOR;
|
||||
}
|
||||
|
||||
int aca_get_version_minor(void)
|
||||
{
|
||||
return ACA_VERSION_MINOR;
|
||||
}
|
||||
|
||||
int aca_get_version_patch(void)
|
||||
{
|
||||
return ACA_VERSION_PATCH;
|
||||
}
|
||||
|
||||
const char *aca_get_version_string(void)
|
||||
{
|
||||
return ACA_VERSION_STRING;
|
||||
}
|
||||
|
||||
aca_version_info_t aca_get_version_info(void)
|
||||
{
|
||||
aca_version_info_t info;
|
||||
|
||||
info.major = ACA_VERSION_MAJOR;
|
||||
info.minor = ACA_VERSION_MINOR;
|
||||
info.patch = ACA_VERSION_PATCH;
|
||||
info.string = ACA_VERSION_STRING;
|
||||
|
||||
return info;
|
||||
}
|
||||
@@ -1,4 +1,3 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
/*
|
||||
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
@@ -22,56 +21,57 @@
|
||||
*/
|
||||
|
||||
#include "error_map.h"
|
||||
#include "aca_constants.h"
|
||||
#include <string.h>
|
||||
|
||||
#define AFID_VERSION "0.7"
|
||||
|
||||
static const error_map_entry_t error_map[] = {
|
||||
{1, "Boot-Time Errors", "FW Load", "CPER", "Fail-to-init"},
|
||||
{2, "Boot-Time Errors", "HBM BIST Test", "CPER", "Fail-to-init"},
|
||||
{3, "Boot-Time Errors", "HBM Memory Test", "CPER", "Fail-to-init"},
|
||||
{4, "Boot-Time Errors", "HBM Training", "CPER", "Fail-to-init"},
|
||||
{5, "Boot-Time Errors", "Unhandled", "CPER", "Fail-to-init"},
|
||||
{6, "Boot-Time Errors", "Unknown", "CPER", "Fail-to-init"},
|
||||
{7, "Boot-Time Errors", "USR CP Link Training", "CPER", "Fail-to-init"},
|
||||
{8, "Boot-Time Errors", "USR DP Link Training", "CPER", "Fail-to-init"},
|
||||
{9, "Boot-Time Errors", "WAFL Link Training", "CPER", "Fail-to-init"},
|
||||
{10, "Boot-Time Errors", "XGMI Link Training", "CPER", "Fail-to-init"},
|
||||
{11, "Boot-Time Errors", "Boot Controller Data Abort", "CPER", "Fail-to-init"},
|
||||
{12, "Boot-Time Errors", "Boot Controller Generic", "CPER ", "Fail-to-init"},
|
||||
{13, "Off-Package Link Errors", "PCIe AER", "CPER", "Corrected"},
|
||||
{14, "Off-Package Link Errors", "PCIe AER", "CPER", "Fatal"},
|
||||
{15, "Off-Package Link Errors", "WAFL", "CPER", "Corrected"},
|
||||
{16, "Off-Package Link Errors", "WAFL", "CPER", "Fatal"},
|
||||
{17, "Off-Package Link Errors", "XGMI", "CPER", "Corrected"},
|
||||
{18, "Off-Package Link Errors", "XGMI", "CPER", "Fatal"},
|
||||
{19, "HBM Errors", "Bad Page Retirement Threshold", "CPER", "Fatal"},
|
||||
{20, "HBM Errors", "On-die ECC", "CPER", "Fatal"},
|
||||
{21, "HBM Errors", "End-to-end CRC", "CPER", "Fatal"},
|
||||
{22, "HBM Errors", "On-die ECC", "CPER", "Uncorrected, Non-fatal"},
|
||||
{23, "HBM Errors", "End-to-end CRC", "CPER", "Uncorrected, Non-fatal"},
|
||||
{24, "HBM Errors", "All", "CPER", "Corrected"},
|
||||
{25, "HBM Errors", "All Others", "CPER", "Fatal"},
|
||||
{26, "Device Internal Errors", "Hardware Assertion (HWA)", "CPER", "Fatal"},
|
||||
{27, "Device Internal Errors", "Watchdog Timeout (WDT)", "CPER", "Fatal"},
|
||||
{28, "Device Internal Errors", "All Others", "CPER", "Uncorrected, Non-fatal"},
|
||||
{29, "Device Internal Errors", "All Others", "CPER", "Corrected"},
|
||||
{30, "Device Internal Errors", "All Others", "CPER", "Fatal"},
|
||||
{31, "CPER Format", "Malformed CPER", "CPER", "ALL"},
|
||||
{32, "CPER Format", "Incomplete ACA Data", "CPER", "ALL"},
|
||||
{33, "CPER Format", "Invalid ACA Data", "CPER", "ALL"},
|
||||
{34, "Unidentified Errors", "Unidentified Error", "CPER", "ALL"}};
|
||||
{1, ACA_CATEGORY_BOOT_TIME_ERRORS, ACA_ERROR_TYPE_FW_LOAD, ACA_PROTOCOL_CPER, ACA_SEVERITY_FAIL_TO_INIT},
|
||||
{2, ACA_CATEGORY_BOOT_TIME_ERRORS, ACA_ERROR_TYPE_HBM_BIST_TEST, ACA_PROTOCOL_CPER, ACA_SEVERITY_FAIL_TO_INIT},
|
||||
{3, ACA_CATEGORY_BOOT_TIME_ERRORS, ACA_ERROR_TYPE_HBM_MEMORY_TEST, ACA_PROTOCOL_CPER, ACA_SEVERITY_FAIL_TO_INIT},
|
||||
{4, ACA_CATEGORY_BOOT_TIME_ERRORS, ACA_ERROR_TYPE_HBM_TRAINING, ACA_PROTOCOL_CPER, ACA_SEVERITY_FAIL_TO_INIT},
|
||||
{5, ACA_CATEGORY_BOOT_TIME_ERRORS, ACA_ERROR_TYPE_UNHANDLED, ACA_PROTOCOL_CPER, ACA_SEVERITY_FAIL_TO_INIT},
|
||||
{6, ACA_CATEGORY_BOOT_TIME_ERRORS, ACA_ERROR_TYPE_UNKNOWN_ERROR, ACA_PROTOCOL_CPER, ACA_SEVERITY_FAIL_TO_INIT},
|
||||
{7, ACA_CATEGORY_BOOT_TIME_ERRORS, ACA_ERROR_TYPE_USR_CP_LINK_TRAINING, ACA_PROTOCOL_CPER, ACA_SEVERITY_FAIL_TO_INIT},
|
||||
{8, ACA_CATEGORY_BOOT_TIME_ERRORS, ACA_ERROR_TYPE_USR_DP_LINK_TRAINING, ACA_PROTOCOL_CPER, ACA_SEVERITY_FAIL_TO_INIT},
|
||||
{9, ACA_CATEGORY_BOOT_TIME_ERRORS, ACA_ERROR_TYPE_WAFL_LINK_TRAINING, ACA_PROTOCOL_CPER, ACA_SEVERITY_FAIL_TO_INIT},
|
||||
{10, ACA_CATEGORY_BOOT_TIME_ERRORS, ACA_ERROR_TYPE_XGMI_LINK_TRAINING, ACA_PROTOCOL_CPER, ACA_SEVERITY_FAIL_TO_INIT},
|
||||
{11, ACA_CATEGORY_BOOT_TIME_ERRORS, ACA_ERROR_TYPE_BOOT_CONTROLLER_DATA_ABORT, ACA_PROTOCOL_CPER, ACA_SEVERITY_FAIL_TO_INIT},
|
||||
{12, ACA_CATEGORY_BOOT_TIME_ERRORS, ACA_ERROR_TYPE_BOOT_CONTROLLER_GENERIC, ACA_PROTOCOL_CPER_WITH_SPACE, ACA_SEVERITY_FAIL_TO_INIT},
|
||||
{13, ACA_CATEGORY_OFF_PACKAGE_LINK_ERRORS, ACA_ERROR_TYPE_PCIE_AER, ACA_PROTOCOL_CPER, ACA_SEVERITY_CORRECTED},
|
||||
{14, ACA_CATEGORY_OFF_PACKAGE_LINK_ERRORS, ACA_ERROR_TYPE_PCIE_AER, ACA_PROTOCOL_CPER, ACA_SEVERITY_FATAL},
|
||||
{15, ACA_CATEGORY_OFF_PACKAGE_LINK_ERRORS, ACA_ERROR_TYPE_WAFL, ACA_PROTOCOL_CPER, ACA_SEVERITY_CORRECTED},
|
||||
{16, ACA_CATEGORY_OFF_PACKAGE_LINK_ERRORS, ACA_ERROR_TYPE_WAFL, ACA_PROTOCOL_CPER, ACA_SEVERITY_FATAL},
|
||||
{17, ACA_CATEGORY_OFF_PACKAGE_LINK_ERRORS, ACA_ERROR_TYPE_XGMI, ACA_PROTOCOL_CPER, ACA_SEVERITY_CORRECTED},
|
||||
{18, ACA_CATEGORY_OFF_PACKAGE_LINK_ERRORS, ACA_ERROR_TYPE_XGMI, ACA_PROTOCOL_CPER, ACA_SEVERITY_FATAL},
|
||||
{19, ACA_CATEGORY_HBM_ERRORS, ACA_ERROR_TYPE_BAD_PAGE_RETIREMENT_THRESHOLD, ACA_PROTOCOL_CPER, ACA_SEVERITY_FATAL},
|
||||
{20, ACA_CATEGORY_HBM_ERRORS, ACA_ERROR_TYPE_ON_DIE_ECC, ACA_PROTOCOL_CPER, ACA_SEVERITY_FATAL},
|
||||
{21, ACA_CATEGORY_HBM_ERRORS, ACA_ERROR_TYPE_END_TO_END_CRC, ACA_PROTOCOL_CPER, ACA_SEVERITY_FATAL},
|
||||
{22, ACA_CATEGORY_HBM_ERRORS, ACA_ERROR_TYPE_ON_DIE_ECC, ACA_PROTOCOL_CPER, ACA_SEVERITY_UNCORRECTED_NON_FATAL},
|
||||
{23, ACA_CATEGORY_HBM_ERRORS, ACA_ERROR_TYPE_END_TO_END_CRC, ACA_PROTOCOL_CPER, ACA_SEVERITY_UNCORRECTED_NON_FATAL},
|
||||
{24, ACA_CATEGORY_HBM_ERRORS, ACA_ERROR_TYPE_ALL, ACA_PROTOCOL_CPER, ACA_SEVERITY_CORRECTED},
|
||||
{25, ACA_CATEGORY_HBM_ERRORS, ACA_ERROR_TYPE_ALL_OTHERS, ACA_PROTOCOL_CPER, ACA_SEVERITY_FATAL},
|
||||
{26, ACA_CATEGORY_DEVICE_INTERNAL_ERRORS, ACA_ERROR_TYPE_HARDWARE_ASSERTION, ACA_PROTOCOL_CPER, ACA_SEVERITY_FATAL},
|
||||
{27, ACA_CATEGORY_DEVICE_INTERNAL_ERRORS, ACA_ERROR_TYPE_WATCHDOG_TIMEOUT, ACA_PROTOCOL_CPER, ACA_SEVERITY_FATAL},
|
||||
{28, ACA_CATEGORY_DEVICE_INTERNAL_ERRORS, ACA_ERROR_TYPE_ALL_OTHERS, ACA_PROTOCOL_CPER, ACA_SEVERITY_UNCORRECTED_NON_FATAL},
|
||||
{29, ACA_CATEGORY_DEVICE_INTERNAL_ERRORS, ACA_ERROR_TYPE_ALL_OTHERS, ACA_PROTOCOL_CPER, ACA_SEVERITY_CORRECTED},
|
||||
{30, ACA_CATEGORY_DEVICE_INTERNAL_ERRORS, ACA_ERROR_TYPE_ALL_OTHERS, ACA_PROTOCOL_CPER, ACA_SEVERITY_FATAL},
|
||||
{31, ACA_CATEGORY_CPER_FORMAT, ACA_ERROR_TYPE_MALFORMED_CPER, ACA_PROTOCOL_CPER, ACA_SEVERITY_ALL_CAPS},
|
||||
{32, ACA_CATEGORY_CPER_FORMAT, ACA_ERROR_TYPE_INCOMPLETE_ACA_DATA, ACA_PROTOCOL_CPER, ACA_SEVERITY_ALL_CAPS},
|
||||
{33, ACA_CATEGORY_CPER_FORMAT, ACA_ERROR_TYPE_INVALID_ACA_DATA, ACA_PROTOCOL_CPER, ACA_SEVERITY_ALL_CAPS},
|
||||
{34, ACA_CATEGORY_UNIDENTIFIED_ERRORS, ACA_ERROR_TYPE_UNIDENTIFIED_ERROR, ACA_PROTOCOL_CPER, ACA_SEVERITY_ALL_CAPS}};
|
||||
|
||||
/* Number of rows in error_map, computed from the array itself so it follows the table. */
static const size_t NUM_ERROR_ENTRIES = sizeof error_map / sizeof *error_map;
|
||||
|
||||
int get_error_id(const char *error_category, const char *error_type, const char *error_severity)
|
||||
{
|
||||
if (!error_category || !error_type || !error_severity ||
|
||||
strcmp(error_category, "UNKNOWN") == 0 ||
|
||||
strcmp(error_type, "UNKNOWN") == 0 ||
|
||||
strcmp(error_severity, "UNKNOWN") == 0)
|
||||
strcmp(error_category, ACA_SEVERITY_UNKNOWN) == 0 ||
|
||||
strcmp(error_type, ACA_SEVERITY_UNKNOWN) == 0 ||
|
||||
strcmp(error_severity, ACA_SEVERITY_UNKNOWN) == 0)
|
||||
{
|
||||
return 33; // Return ID for "Invalid Error" if any input is "UNKNOWN" or NULL
|
||||
return ACA_ERROR_INVALID_ACA_DATA_ID; // Return ID for "Invalid Error" if any input is "UNKNOWN" or NULL
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < NUM_ERROR_ENTRIES; i++)
|
||||
@@ -84,5 +84,5 @@ int get_error_id(const char *error_category, const char *error_type, const char
|
||||
}
|
||||
}
|
||||
|
||||
return 34; // Return ID for "Unidentified Errors" if no match found
|
||||
return ACA_ERROR_UNIDENTIFIED_ERROR_ID; // Return ID for "Unidentified Errors" if no match found
|
||||
}
|
||||
|
||||
@@ -0,0 +1,148 @@
|
||||
/*
|
||||
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
* THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
* @file main.c
|
||||
* @brief Demo program showing how to use the ACA decoder
|
||||
*
|
||||
* This is a demonstration program that shows how to use the ACA decoder
|
||||
* with sample raw data to decode ACA error information.
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <aca_api.h>
|
||||
#include <aca_version.h>
|
||||
#include <aca_constants.h>
|
||||
#include <stdint.h>
|
||||
#include <inttypes.h>
|
||||
|
||||
// Function prototypes
|
||||
void print_error_info(const aca_error_info_t *info);
|
||||
void print_version_info(void);
|
||||
|
||||
// Function to print error info in JSON format
|
||||
void print_error_info(const aca_error_info_t *info)
|
||||
{
|
||||
printf("{\n");
|
||||
printf(" \"bank\": \"%s\",\n", info->bank_ref);
|
||||
printf(" \"error_location\": {\n");
|
||||
printf(" \"oam\": \"%d\",\n", info->oam);
|
||||
printf(" \"aid\": \"%d\",\n", info->aid);
|
||||
printf(" \"instance\": \"%s\"\n", info->instance_ref);
|
||||
printf(" },\n");
|
||||
printf(" \"severity\": \"%s\",\n", info->severity_ref);
|
||||
printf(" \"afid\": \"%d\",\n", info->afid);
|
||||
printf(" \"scrub\": \"%u\",\n", info->scrub);
|
||||
printf(" \"err_ext\": \"%u\",\n", info->error_code_ext);
|
||||
printf(" \"error_category\": \"%s\",\n", info->category_ref);
|
||||
printf(" \"error_type\": \"%s\",\n", info->error_type_ref);
|
||||
printf(" \"address\": \"0x%" PRIx64 "\",\n", info->raw_addr);
|
||||
printf(" \"syndrome\": \"0x%" PRIx64 "\"\n", info->raw_synd);
|
||||
printf("}\n");
|
||||
}
|
||||
|
||||
// Function to print version information
|
||||
void print_version_info(void)
|
||||
{
|
||||
printf("=== ACA Decoder Library Version Information ===\n");
|
||||
printf("Version: %s\n", aca_get_version_string());
|
||||
printf("Major: %d\n", aca_get_version_major());
|
||||
printf("Minor: %d\n", aca_get_version_minor());
|
||||
printf("Patch: %d\n", aca_get_version_patch());
|
||||
|
||||
aca_version_info_t version_info = aca_get_version_info();
|
||||
printf("Complete version info:\n");
|
||||
printf(" Major: %d\n", version_info.major);
|
||||
printf(" Minor: %d\n", version_info.minor);
|
||||
printf(" Patch: %d\n", version_info.patch);
|
||||
printf(" String: %s\n", version_info.string);
|
||||
printf("===============================================\n\n");
|
||||
}
|
||||
|
||||
int main()
|
||||
{
|
||||
// Display version information
|
||||
print_version_info();
|
||||
|
||||
// Sample usage of decode_afid with 32-byte register array (HBM FATAL ERROR, expected output is 4)
|
||||
uint64_t register_array_32[ACA_REGISTER_ARRAY_SIZE_32_BYTES] = {0xbaa000000004081b, 0x0, 0x209600090f00, 0x5d000000};
|
||||
int afid_32 = decode_afid(register_array_32, ACA_REGISTER_ARRAY_SIZE_32_BYTES, 0, 1);
|
||||
printf("Decoded AFID (32-byte array): %d\n", afid_32);
|
||||
|
||||
// Sample usage of decode_afid with 32-byte register array (GC FATAL ERROR, expected output is 3)
|
||||
uint64_t register_array_test[ACA_REGISTER_ARRAY_SIZE_32_BYTES] = {0xbea00000003b0000, 0x100000029, 0x1200136430400, 0x20b};
|
||||
int afid_test = decode_afid(register_array_test, ACA_REGISTER_ARRAY_SIZE_32_BYTES, 0, 1);
|
||||
printf("Decoded AFID (test array): %d\n", afid_test);
|
||||
|
||||
// Sample usage of decode_afid with 128-byte register array (HBM CORRECTED ERROR, expected output is 1)
|
||||
uint64_t register_array_128[ACA_REGISTER_ARRAY_SIZE_128_BYTES] = {
|
||||
0xffff,
|
||||
0xdc2040000000011b,
|
||||
0x0,
|
||||
0xd008000801000000,
|
||||
0x25000001ff,
|
||||
0x209600191f00,
|
||||
0xa000000,
|
||||
0x0,
|
||||
0x0,
|
||||
0x0,
|
||||
0xd008000801000000,
|
||||
0x0,
|
||||
0x0,
|
||||
0x0,
|
||||
0x0,
|
||||
0x0};
|
||||
int afid_128 = decode_afid(register_array_128, ACA_REGISTER_ARRAY_SIZE_128_BYTES, 0, 1);
|
||||
printf("Decoded AFID (128-byte array): %d\n", afid_128);
|
||||
|
||||
// sample for bad page
|
||||
uint64_t register_array_bad_page[ACA_REGISTER_ARRAY_SIZE_128_BYTES] = {
|
||||
0x1,
|
||||
0xb000000000000137,
|
||||
0x0,
|
||||
0x0,
|
||||
0x1ff00000002,
|
||||
0x9600000000,
|
||||
0x0,
|
||||
0x0,
|
||||
0x0,
|
||||
0x0,
|
||||
0x0,
|
||||
0x0,
|
||||
0x0,
|
||||
0x0,
|
||||
0x0,
|
||||
0x0};
|
||||
|
||||
// when flag is 0b1000, it indicates that the error threshold has been exceeded
|
||||
// and is always a HBM error. The expected output is 19.
|
||||
int afid_bad_page = decode_afid(register_array_bad_page, ACA_REGISTER_ARRAY_SIZE_128_BYTES, ACA_FLAG_THRESHOLD_EXCEEDED, 1);
|
||||
printf("Decoded AFID (bad page): %d\n", afid_bad_page);
|
||||
|
||||
const aca_error_info_t error_info_32 = decode_error_info(register_array_32, ACA_REGISTER_ARRAY_SIZE_32_BYTES, 0, 1);
|
||||
print_error_info(&error_info_32);
|
||||
|
||||
const aca_error_info_t error_info_128 = decode_error_info(register_array_128, ACA_REGISTER_ARRAY_SIZE_128_BYTES, 0, 1);
|
||||
print_error_info(&error_info_128);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -29,7 +29,9 @@
|
||||
#include <sstream>
|
||||
#include <vector>
|
||||
|
||||
extern "C" {
|
||||
#include "aca-decode/aca_decode.h"
|
||||
}
|
||||
#include "amd_smi/impl/amd_smi_cper.h"
|
||||
#include "rocm_smi/rocm_smi_logger.h"
|
||||
|
||||
|
||||
Ссылка в новой задаче
Block a user