[SWDEV-536417] AFID & addc decode fixes (#449)

* fix endian problem
* use hw_revision and flags_mask from cper section instead of hardcoded values

---------

Signed-off-by: Maisam Arif <Maisam.Arif@amd.com>

[ROCm/amdsmi commit: 815e0252b1]
Этот коммит содержится в:
Saeed, Oosman
2025-06-06 13:41:16 -05:00
коммит произвёл GitHub
родитель 8c60c4ed94
Коммит cc2b4b4067
7 изменённых файлов: 126 добавлений и 110 удалений
+1 -3
Просмотреть файл
@@ -6498,9 +6498,7 @@ class AMDSMICommands():
if args.afid and args.cper_file:
afids = self.helpers.pvtDumpAfids(args.cper_file)
for afid in afids:
print(afid, end=" ")
print("")
print(' '.join(map(str, afids)))
return
if not self.group_check_printed:
+42 -39
Просмотреть файл
@@ -1082,7 +1082,28 @@ class AMDSMIHelpers():
print(msg)
logging.warning(msg)
def display_cper_files_generated(self, entries, device_handle, folder, follow):
def _severity_as_string(self, error_severity, notify_type, for_filename):
if error_severity == "non_fatal_uncorrected":
if(for_filename):
return "uncorrected"
return "NONFATAL-UNCORRECTED"
elif error_severity == "non_fatal_corrected":
if(for_filename):
return "corrected"
return "NONFATAL-CORRECTED"
elif error_severity == "fatal":
if notify_type == "BOOT":
if(for_filename):
return "boot"
return "BOOT"
if(for_filename):
return "fatal"
return "FATAL"
if(for_filename):
return "unknown"
return "UNKNOWN"
def display_cper_files_generated(self, entries, device_handle, folder):
# Onetime initialization: print warning & header only once
if not getattr(self, "_cper_display_initialized", False):
# Warning if no folder was specified elsewhere
@@ -1093,11 +1114,7 @@ class AMDSMIHelpers():
print(f"{YELLOW}WARNING:{RESET} {RED}No{RESET} cper files will be dumped unless --folder=<folder_name> is specified.")
self._cper_warning_printed = True
# Header
print(f"{'timestamp':<20} {'gpu_id':<7} {'severity':<12}", end="")
if folder:
print(f" {'file_name':<17} {'afid'}", end="")
print("")
self._print_header(folder)
self._cper_display_initialized = True
# Loop through all entries in the dictionary.
@@ -1106,35 +1123,29 @@ class AMDSMIHelpers():
error_severity = entry.get("error_severity", "Unknown")
notify_type = entry.get("notify_type", "Unknown")
if error_severity == "non_fatal_uncorrected":
prefix = "uncorrected"
elif error_severity == "non_fatal_corrected":
prefix = "corrected"
elif error_severity == "fatal":
prefix = "fatal"
if notify_type == "BOOT":
prefix = "boot"
prefix = self._severity_as_string(error_severity, notify_type, True)
cper_data_file = f"{prefix}_{self.get_cper_count()}.cper"
timestamp = entry.get("timestamp", "unknown")
gpu_id = self.get_gpu_id_from_device_handle(device_handle)
print(f"{timestamp:<20} {gpu_id:<7} {prefix:<12}", end="")
print(f"{timestamp:<20} {gpu_id:<7} {prefix:<20}", end="")
if folder:
print(f" {cper_data_file:<17}", end="")
afids = self.pvtDumpAfids(cper_data_file)
for afid in afids:
print(afid, end=" ")
print(' '.join(map(str, afids)), end=" ")
print("")
self.increment_cper_count()
def _print_header(self, folder):
print(f"{'timestamp':<20} {'gpu_id':<7} {'severity':<20}", end="")
if folder:
print(f" {'file_name':<17} {'list of afids'}", end="")
print("")
def dump_cper_entries(self, folder, entries, cper_data, device_handle, file_limit=None):
# Onetime header
if not getattr(self, "_cper_display_initialized", False):
print(f"{'timestamp':<20} {'gpu_id':<7} {'severity':<12} ", end="")
if folder:
print(f"{'file_name':<17} {'afid'}", end="")
print("")
self._print_header(folder)
self._cper_display_initialized = True
if folder:
@@ -1157,23 +1168,13 @@ class AMDSMIHelpers():
except OSError: pass
# --- determine prefix/severity ---
sev = entry.get("error_severity", "").lower()
nt = entry.get("notify_type", "")
if sev == "non_fatal_uncorrected":
prefix = "uncorrected"
elif sev == "non_fatal_corrected":
prefix = "corrected"
elif sev == "fatal" and nt == "BOOT":
prefix = "boot"
elif sev == "fatal":
prefix = "fatal"
else:
prefix = "unknown"
error_severity = entry.get("error_severity", "").lower()
notify_type = entry.get("notify_type", "")
prefix = self._severity_as_string(error_severity, notify_type, True)
# --- new filenames ---
count = self.get_cper_count()
cper_name = f"{prefix}_{count}.cper"
json_name = f"{prefix}_{count}.json"
cper_name = f"{prefix}-{count}.cper"
json_name = f"{prefix}-{count}.json"
cper_path = folder / cper_name
json_path = folder / json_name
@@ -1196,6 +1197,7 @@ class AMDSMIHelpers():
# --- collect for printing ---
ts = entry.get("timestamp", "unknown")
gid = self.get_gpu_id_from_device_handle(device_handle)
prefix = self._severity_as_string(error_severity, notify_type, False)
printed_rows.append((ts, gid, prefix, cper_name))
self.increment_cper_count()
@@ -1209,7 +1211,8 @@ class AMDSMIHelpers():
for ts, gid, prefix, fname in to_print:
cper_path = folder / cper_name
afids = self.pvtDumpAfids(cper_path)
print(f"{ts:<20} {gid:<7} {prefix:<12} {fname:<17} {afids}")
afids = ' '.join(map(str, afids))
print(f"{ts:<20} {gid:<7} {prefix:<20} {fname:<17} {afids}")
else:
print(json.dumps(
@@ -1358,4 +1361,4 @@ class AMDSMIHelpers():
self.dump_cper_entries(args.folder, entries, cper_data, device_handle, args.file_limit)
break
else:
self.display_cper_files_generated(entries, device_handle, args.folder, args.follow)
self.display_cper_files_generated(entries, device_handle, args.folder)
+37
Просмотреть файл
@@ -0,0 +1,37 @@
/**
* @file utils.h
* @brief Common utility functions
*/
#ifndef UTILS_H
#define UTILS_H
#include <stdint.h>
/**
* @brief Convert a 64-bit value from little endian to big endian
* @param[in] value Value to convert
* @return Converted value in big endian
*/
static inline uint64_t le64_to_be64(uint64_t value) {
return ((value & 0xFF00000000000000ULL) >> 56) |
((value & 0x00FF000000000000ULL) >> 40) |
((value & 0x0000FF0000000000ULL) >> 24) |
((value & 0x000000FF00000000ULL) >> 8) |
((value & 0x00000000FF000000ULL) << 8) |
((value & 0x0000000000FF0000ULL) << 24) |
((value & 0x000000000000FF00ULL) << 40) |
((value & 0x00000000000000FFULL) << 56);
}
/**
* @brief Convert an array of 64-bit values from little endian to big endian
* @param[in,out] array Array to convert
* @param[in] len Length of the array
*/
static inline void convert_array_le_to_be(uint64_t *array, size_t len) {
for (size_t i = 0; i < len; i++) {
array[i] = le64_to_be64(array[i]);
}
}
#endif /* UTILS_H */
+1 -1
Просмотреть файл
@@ -41,7 +41,7 @@ set(INC_LIST
"${PROJECT_SOURCE_DIR}/rocm_smi/include/rocm_smi/rocm_smi_utils.h")
set(ACA_SRC_DIR "aca-decode")
set(SRC_LIST ${SRC_LIST} ${ACA_SRC_DIR}/aca_decode.c ${ACA_SRC_DIR}/aca_fields.c ${ACA_SRC_DIR}/aca_tables.c
set(SRC_LIST ${SRC_LIST} ${ACA_SRC_DIR}/aca_api.c ${ACA_SRC_DIR}/aca_decode.c ${ACA_SRC_DIR}/aca_fields.c ${ACA_SRC_DIR}/aca_tables.c
${ACA_SRC_DIR}/error_map.c)
set(ACA_INC_DIR "${PROJECT_SOURCE_DIR}/include/aca-decode")
set(INC_LIST ${INC_LIST} ${ACA_INC_DIR}/aca_decode.h ${ACA_INC_DIR}/aca_fields.h ${ACA_INC_DIR}/aca_tables.h
+23 -10
Просмотреть файл
@@ -1,4 +1,5 @@
#include "aca_decode.h"
#include <utils.h>
int decode_afid(const uint64_t *register_array, size_t array_len, uint32_t flag, uint16_t hw_revision)
{
@@ -23,7 +24,6 @@ int decode_afid(const uint64_t *register_array, size_t array_len, uint32_t flag,
raw_data.aca_ipid = register_array[5];
raw_data.aca_synd = register_array[6];
}
else
{
return -1; // Unsupported size
@@ -44,19 +44,32 @@ aca_error_info_t decode_error_info(const uint64_t *register_array, size_t array_
if (!register_array)
{
return error_info;
} if (array_len == 4) // 32 bytes
}
// Create a copy of the register array to avoid modifying the original
uint64_t converted_array[16];
if (array_len > 16) {
return error_info;
}
// Copy and convert the array
for (size_t i = 0; i < array_len; i++) {
converted_array[i] = le64_to_be64(register_array[i]);
}
if (array_len == 4) // 32 bytes
{
raw_data.aca_status = register_array[0];
raw_data.aca_addr = register_array[1];
raw_data.aca_ipid = register_array[2];
raw_data.aca_synd = register_array[3];
raw_data.aca_status = converted_array[0];
raw_data.aca_addr = converted_array[1];
raw_data.aca_ipid = converted_array[2];
raw_data.aca_synd = converted_array[3];
}
else if (array_len == 16) // 128 bytes
{
raw_data.aca_status = register_array[1];
raw_data.aca_addr = register_array[2];
raw_data.aca_ipid = register_array[5];
raw_data.aca_synd = register_array[6];
raw_data.aca_status = converted_array[1];
raw_data.aca_addr = converted_array[2];
raw_data.aca_ipid = converted_array[5];
raw_data.aca_synd = converted_array[6];
}
else
{
-1
Просмотреть файл
@@ -4183,7 +4183,6 @@ amdsmi_status_t amdsmi_get_afids_from_cper(
LOG_ERROR(ss);
return AMDSMI_STATUS_INVAL;
}
uint32_t i = 0;
for(int afid: cper_decode(cper)) {
if(i < *num_afids) {
+22 -56
Просмотреть файл
@@ -255,59 +255,19 @@ static int cper_dump_sec_desc(const struct cper_sec_desc *desc)
return 0;
}
static int aca_decode_fatal(const cper_sec_crashdump_data &data)
static int aca_decode_fatal(const cper_sec_crashdump_data &data, uint32_t flag, uint16_t hw_revision)
{
std::ostringstream ss;
const uint64_t *register_array = reinterpret_cast<const uint64_t *>(&data.dump.fatal_err);
aca_raw_data_t raw_data;
raw_data.aca_status = register_array[0];
raw_data.aca_ipid = register_array[2];
raw_data.aca_synd = register_array[3];
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] aca_status: 0x" << std::hex << raw_data.aca_status << "\n";
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] aca_ipid: 0x" << std::hex << raw_data.aca_ipid << "\n";
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] aca_synd: 0x" << std::hex << raw_data.aca_synd << "\n";
raw_data.flags = 0;
raw_data.hw_revision = 1;
aca_error_info_t error_info = aca_decode(&raw_data);
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] fatal error_info.afid: " << std::dec << error_info.afid << "\n";
LOG_DEBUG(ss);
return error_info.afid;
return decode_afid(register_array, sizeof(data.dump.fatal_err)/sizeof(uint64_t), flag, hw_revision);
}
static int aca_decode_corrected_error(const uint32_t *reg_dump, size_t num_bytes) {
std::ostringstream ss;
if(num_bytes != CPER_ACA_REG_COUNT * sizeof(uint32_t)) {
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] Size of register array must be " << std::dec << (CPER_ACA_REG_COUNT * sizeof(uint32_t)) << " bytes\n";
LOG_ERROR(ss);
return AMDSMI_STATUS_INVAL;
}
static int aca_decode_corrected_error(const uint32_t *reg_dump, size_t num_bytes, uint32_t flag, uint16_t hw_revision)
{
const uint64_t *register_array = reinterpret_cast<const uint64_t *>(reg_dump);
aca_raw_data_t raw_data;
raw_data.aca_status = register_array[2];
raw_data.aca_ipid = register_array[5];
raw_data.aca_synd = register_array[6];
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] aca_status: 0x" << std::hex << raw_data.aca_status << "\n";
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] aca_ipid: 0x" << std::hex << raw_data.aca_ipid << "\n";
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] aca_synd: 0x" << std::hex << raw_data.aca_synd << "\n";
raw_data.flags = 0;
raw_data.hw_revision = 1;
aca_error_info_t error_info = aca_decode(&raw_data);
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] non-fatal error_info.afid: " << std::dec << error_info.afid << "\n";
LOG_DEBUG(ss);
return error_info.afid;
return decode_afid(register_array, num_bytes, flag, hw_revision);
}
static int cper_dump_nonstd_err(const struct cper_sec_nonstd_err *nonstd_err)
static int cper_dump_nonstd_err(const struct cper_sec_nonstd_err *nonstd_err, const cper_sec_desc *section)
{
std::ostringstream ss;
@@ -339,10 +299,11 @@ exit:
LOG_DEBUG(ss);
return aca_decode_corrected_error(body->err_ctx.reg_dump, sizeof(body->err_ctx.reg_dump));
return aca_decode_corrected_error(body->err_ctx.reg_dump, sizeof(body->err_ctx.reg_dump)/sizeof(uint64_t),
section->flags_mask, section->revision_major);
}
static int cper_dump_cr_fatal(const struct cper_sec_crashdump *crashdump)
static int cper_dump_cr_fatal(const struct cper_sec_crashdump *crashdump, const cper_sec_desc *section)
{
std::ostringstream ss;
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS]\n~~~~CRASH DUMP - FATAL~~~\n";
@@ -360,10 +321,10 @@ static int cper_dump_cr_fatal(const struct cper_sec_crashdump *crashdump)
LOG_DEBUG(ss);
return aca_decode_fatal(crashdump->data);
return aca_decode_fatal(crashdump->data, section->flags_mask, section->revision_major);
}
static int cper_dump_cr_boot(const struct cper_sec_crashdump *crashdump)
static int cper_dump_cr_boot(const struct cper_sec_crashdump *crashdump, const cper_sec_desc *section)
{
std::ostringstream ss;
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS]\n~~~~CRASH DUMP - BOOT TIME~~~\n";
@@ -375,7 +336,7 @@ static int cper_dump_cr_boot(const struct cper_sec_crashdump *crashdump)
ss << "~~~~CRASH DUMP - BOOT TIME~~~\n\n";
LOG_DEBUG(ss);
return aca_decode_fatal(crashdump->data);
return aca_decode_fatal(crashdump->data, section->flags_mask, section->revision_major);
}
} //namespace
@@ -526,29 +487,33 @@ std::vector<int> cper_decode(const amdsmi_cper_hdr_t *cper) {
const amdsmi_cper_guid_t *sec_guid = get_sec_desc_type(static_cast<struct cper_sec_desc *>(sec_desc_offset));
const amdsmi_cper_guid_t *cper_guid = get_cper_type(cper);
cper_dump_sec_desc(static_cast<struct cper_sec_desc *>(sec_desc_offset));
cper_sec_desc *section = static_cast<struct cper_sec_desc *>(sec_desc_offset);
cper_dump_sec_desc(section);
if (cper_is_cr(sec_guid)) {
struct cper_sec_crashdump *crashdump = static_cast<struct cper_sec_crashdump *>(sec_offset);
if (cper_is_bt(cper_guid)) {
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] decoding boot crash dump\n";
LOG_DEBUG(ss);
afids.emplace_back(cper_dump_cr_boot(static_cast<struct cper_sec_crashdump *>(sec_offset)));
afids.emplace_back(cper_dump_cr_boot(crashdump, section));
}
else {
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] decoding crash dump\n";
LOG_DEBUG(ss);
afids.emplace_back(cper_dump_cr_fatal(static_cast<struct cper_sec_crashdump *>(sec_offset)));
afids.emplace_back(cper_dump_cr_fatal(crashdump, section));
}
}
else if (cper_is_nonstd(sec_guid)) {
struct cper_sec_nonstd_err *crashdump = static_cast<struct cper_sec_nonstd_err *>(sec_offset);
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] decoding non-standard error\n";
LOG_DEBUG(ss);
afids.emplace_back(cper_dump_nonstd_err(static_cast<struct cper_sec_nonstd_err *>(sec_offset)));
afids.emplace_back(cper_dump_nonstd_err(crashdump, section));
}
else if (cper_is_proc_err(sec_guid)) {
struct cper_sec_nonstd_err *crashdump = static_cast<struct cper_sec_nonstd_err *>(sec_offset);
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] decoding proc error section type\n";
LOG_DEBUG(ss);
afids.emplace_back(cper_dump_nonstd_err(static_cast<struct cper_sec_nonstd_err *>(sec_offset)));
afids.emplace_back(cper_dump_nonstd_err(crashdump, section));
}
else {
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] Unknown error type!!\n";
@@ -560,6 +525,7 @@ std::vector<int> cper_decode(const amdsmi_cper_hdr_t *cper) {
}
}
return afids;
}