[SWDEV-536417] AFID & addc decode fixes (#449)
* fix endian problem
* use hw_revision and flags_mask from cper section instead of hardcoded values
---------
Signed-off-by: Maisam Arif <Maisam.Arif@amd.com>
[ROCm/amdsmi commit: 815e0252b1]
Этот коммит содержится в:
коммит произвёл
GitHub
родитель
8c60c4ed94
Коммит
cc2b4b4067
@@ -6498,9 +6498,7 @@ class AMDSMICommands():
|
||||
|
||||
if args.afid and args.cper_file:
|
||||
afids = self.helpers.pvtDumpAfids(args.cper_file)
|
||||
for afid in afids:
|
||||
print(afid, end=" ")
|
||||
print("")
|
||||
print(' '.join(map(str, afids)))
|
||||
return
|
||||
|
||||
if not self.group_check_printed:
|
||||
|
||||
@@ -1082,7 +1082,28 @@ class AMDSMIHelpers():
|
||||
print(msg)
|
||||
logging.warning(msg)
|
||||
|
||||
def display_cper_files_generated(self, entries, device_handle, folder, follow):
|
||||
def _severity_as_string(self, error_severity, notify_type, for_filename):
|
||||
if error_severity == "non_fatal_uncorrected":
|
||||
if(for_filename):
|
||||
return "uncorrected"
|
||||
return "NONFATAL-UNCORRECTED"
|
||||
elif error_severity == "non_fatal_corrected":
|
||||
if(for_filename):
|
||||
return "corrected"
|
||||
return "NONFATAL-CORRECTED"
|
||||
elif error_severity == "fatal":
|
||||
if notify_type == "BOOT":
|
||||
if(for_filename):
|
||||
return "boot"
|
||||
return "BOOT"
|
||||
if(for_filename):
|
||||
return "fatal"
|
||||
return "FATAL"
|
||||
if(for_filename):
|
||||
return "unknown"
|
||||
return "UNKNOWN"
|
||||
|
||||
def display_cper_files_generated(self, entries, device_handle, folder):
|
||||
# One‐time initialization: print warning & header only once
|
||||
if not getattr(self, "_cper_display_initialized", False):
|
||||
# Warning if no folder was specified elsewhere
|
||||
@@ -1093,11 +1114,7 @@ class AMDSMIHelpers():
|
||||
print(f"{YELLOW}WARNING:{RESET} {RED}No{RESET} cper files will be dumped unless --folder=<folder_name> is specified.")
|
||||
self._cper_warning_printed = True
|
||||
|
||||
# Header
|
||||
print(f"{'timestamp':<20} {'gpu_id':<7} {'severity':<12}", end="")
|
||||
if folder:
|
||||
print(f" {'file_name':<17} {'afid'}", end="")
|
||||
print("")
|
||||
self._print_header(folder)
|
||||
self._cper_display_initialized = True
|
||||
|
||||
# Loop through all entries in the dictionary.
|
||||
@@ -1106,35 +1123,29 @@ class AMDSMIHelpers():
|
||||
error_severity = entry.get("error_severity", "Unknown")
|
||||
notify_type = entry.get("notify_type", "Unknown")
|
||||
|
||||
if error_severity == "non_fatal_uncorrected":
|
||||
prefix = "uncorrected"
|
||||
elif error_severity == "non_fatal_corrected":
|
||||
prefix = "corrected"
|
||||
elif error_severity == "fatal":
|
||||
prefix = "fatal"
|
||||
if notify_type == "BOOT":
|
||||
prefix = "boot"
|
||||
|
||||
prefix = self._severity_as_string(error_severity, notify_type, True)
|
||||
cper_data_file = f"{prefix}_{self.get_cper_count()}.cper"
|
||||
|
||||
timestamp = entry.get("timestamp", "unknown")
|
||||
gpu_id = self.get_gpu_id_from_device_handle(device_handle)
|
||||
print(f"{timestamp:<20} {gpu_id:<7} {prefix:<12}", end="")
|
||||
print(f"{timestamp:<20} {gpu_id:<7} {prefix:<20}", end="")
|
||||
if folder:
|
||||
print(f" {cper_data_file:<17}", end="")
|
||||
afids = self.pvtDumpAfids(cper_data_file)
|
||||
for afid in afids:
|
||||
print(afid, end=" ")
|
||||
print(' '.join(map(str, afids)), end=" ")
|
||||
print("")
|
||||
self.increment_cper_count()
|
||||
|
||||
def _print_header(self, folder):
|
||||
print(f"{'timestamp':<20} {'gpu_id':<7} {'severity':<20}", end="")
|
||||
if folder:
|
||||
print(f" {'file_name':<17} {'list of afids'}", end="")
|
||||
print("")
|
||||
|
||||
def dump_cper_entries(self, folder, entries, cper_data, device_handle, file_limit=None):
|
||||
# One‐time header
|
||||
if not getattr(self, "_cper_display_initialized", False):
|
||||
print(f"{'timestamp':<20} {'gpu_id':<7} {'severity':<12} ", end="")
|
||||
if folder:
|
||||
print(f"{'file_name':<17} {'afid'}", end="")
|
||||
print("")
|
||||
self._print_header(folder)
|
||||
self._cper_display_initialized = True
|
||||
|
||||
if folder:
|
||||
@@ -1157,23 +1168,13 @@ class AMDSMIHelpers():
|
||||
except OSError: pass
|
||||
|
||||
# --- determine prefix/severity ---
|
||||
sev = entry.get("error_severity", "").lower()
|
||||
nt = entry.get("notify_type", "")
|
||||
if sev == "non_fatal_uncorrected":
|
||||
prefix = "uncorrected"
|
||||
elif sev == "non_fatal_corrected":
|
||||
prefix = "corrected"
|
||||
elif sev == "fatal" and nt == "BOOT":
|
||||
prefix = "boot"
|
||||
elif sev == "fatal":
|
||||
prefix = "fatal"
|
||||
else:
|
||||
prefix = "unknown"
|
||||
|
||||
error_severity = entry.get("error_severity", "").lower()
|
||||
notify_type = entry.get("notify_type", "")
|
||||
prefix = self._severity_as_string(error_severity, notify_type, True)
|
||||
# --- new filenames ---
|
||||
count = self.get_cper_count()
|
||||
cper_name = f"{prefix}_{count}.cper"
|
||||
json_name = f"{prefix}_{count}.json"
|
||||
cper_name = f"{prefix}-{count}.cper"
|
||||
json_name = f"{prefix}-{count}.json"
|
||||
cper_path = folder / cper_name
|
||||
json_path = folder / json_name
|
||||
|
||||
@@ -1196,6 +1197,7 @@ class AMDSMIHelpers():
|
||||
# --- collect for printing ---
|
||||
ts = entry.get("timestamp", "unknown")
|
||||
gid = self.get_gpu_id_from_device_handle(device_handle)
|
||||
prefix = self._severity_as_string(error_severity, notify_type, False)
|
||||
printed_rows.append((ts, gid, prefix, cper_name))
|
||||
|
||||
self.increment_cper_count()
|
||||
@@ -1209,7 +1211,8 @@ class AMDSMIHelpers():
|
||||
for ts, gid, prefix, fname in to_print:
|
||||
cper_path = folder / cper_name
|
||||
afids = self.pvtDumpAfids(cper_path)
|
||||
print(f"{ts:<20} {gid:<7} {prefix:<12} {fname:<17} {afids}")
|
||||
afids = ' '.join(map(str, afids))
|
||||
print(f"{ts:<20} {gid:<7} {prefix:<20} {fname:<17} {afids}")
|
||||
|
||||
else:
|
||||
print(json.dumps(
|
||||
@@ -1358,4 +1361,4 @@ class AMDSMIHelpers():
|
||||
self.dump_cper_entries(args.folder, entries, cper_data, device_handle, args.file_limit)
|
||||
break
|
||||
else:
|
||||
self.display_cper_files_generated(entries, device_handle, args.folder, args.follow)
|
||||
self.display_cper_files_generated(entries, device_handle, args.folder)
|
||||
|
||||
@@ -0,0 +1,37 @@
|
||||
/**
|
||||
* @file utils.h
|
||||
* @brief Common utility functions
|
||||
*/
|
||||
#ifndef UTILS_H
|
||||
#define UTILS_H
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
/**
|
||||
* @brief Convert a 64-bit value from little endian to big endian
|
||||
* @param[in] value Value to convert
|
||||
* @return Converted value in big endian
|
||||
*/
|
||||
static inline uint64_t le64_to_be64(uint64_t value) {
|
||||
return ((value & 0xFF00000000000000ULL) >> 56) |
|
||||
((value & 0x00FF000000000000ULL) >> 40) |
|
||||
((value & 0x0000FF0000000000ULL) >> 24) |
|
||||
((value & 0x000000FF00000000ULL) >> 8) |
|
||||
((value & 0x00000000FF000000ULL) << 8) |
|
||||
((value & 0x0000000000FF0000ULL) << 24) |
|
||||
((value & 0x000000000000FF00ULL) << 40) |
|
||||
((value & 0x00000000000000FFULL) << 56);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Convert an array of 64-bit values from little endian to big endian
|
||||
* @param[in,out] array Array to convert
|
||||
* @param[in] len Length of the array
|
||||
*/
|
||||
static inline void convert_array_le_to_be(uint64_t *array, size_t len) {
|
||||
for (size_t i = 0; i < len; i++) {
|
||||
array[i] = le64_to_be64(array[i]);
|
||||
}
|
||||
}
|
||||
|
||||
#endif /* UTILS_H */
|
||||
@@ -41,7 +41,7 @@ set(INC_LIST
|
||||
"${PROJECT_SOURCE_DIR}/rocm_smi/include/rocm_smi/rocm_smi_utils.h")
|
||||
|
||||
set(ACA_SRC_DIR "aca-decode")
|
||||
set(SRC_LIST ${SRC_LIST} ${ACA_SRC_DIR}/aca_decode.c ${ACA_SRC_DIR}/aca_fields.c ${ACA_SRC_DIR}/aca_tables.c
|
||||
set(SRC_LIST ${SRC_LIST} ${ACA_SRC_DIR}/aca_api.c ${ACA_SRC_DIR}/aca_decode.c ${ACA_SRC_DIR}/aca_fields.c ${ACA_SRC_DIR}/aca_tables.c
|
||||
${ACA_SRC_DIR}/error_map.c)
|
||||
set(ACA_INC_DIR "${PROJECT_SOURCE_DIR}/include/aca-decode")
|
||||
set(INC_LIST ${INC_LIST} ${ACA_INC_DIR}/aca_decode.h ${ACA_INC_DIR}/aca_fields.h ${ACA_INC_DIR}/aca_tables.h
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
#include "aca_decode.h"
|
||||
#include <utils.h>
|
||||
|
||||
int decode_afid(const uint64_t *register_array, size_t array_len, uint32_t flag, uint16_t hw_revision)
|
||||
{
|
||||
@@ -23,7 +24,6 @@ int decode_afid(const uint64_t *register_array, size_t array_len, uint32_t flag,
|
||||
raw_data.aca_ipid = register_array[5];
|
||||
raw_data.aca_synd = register_array[6];
|
||||
}
|
||||
|
||||
else
|
||||
{
|
||||
return -1; // Unsupported size
|
||||
@@ -44,19 +44,32 @@ aca_error_info_t decode_error_info(const uint64_t *register_array, size_t array_
|
||||
if (!register_array)
|
||||
{
|
||||
return error_info;
|
||||
} if (array_len == 4) // 32 bytes
|
||||
}
|
||||
|
||||
// Create a copy of the register array to avoid modifying the original
|
||||
uint64_t converted_array[16];
|
||||
if (array_len > 16) {
|
||||
return error_info;
|
||||
}
|
||||
|
||||
// Copy and convert the array
|
||||
for (size_t i = 0; i < array_len; i++) {
|
||||
converted_array[i] = le64_to_be64(register_array[i]);
|
||||
}
|
||||
|
||||
if (array_len == 4) // 32 bytes
|
||||
{
|
||||
raw_data.aca_status = register_array[0];
|
||||
raw_data.aca_addr = register_array[1];
|
||||
raw_data.aca_ipid = register_array[2];
|
||||
raw_data.aca_synd = register_array[3];
|
||||
raw_data.aca_status = converted_array[0];
|
||||
raw_data.aca_addr = converted_array[1];
|
||||
raw_data.aca_ipid = converted_array[2];
|
||||
raw_data.aca_synd = converted_array[3];
|
||||
}
|
||||
else if (array_len == 16) // 128 bytes
|
||||
{
|
||||
raw_data.aca_status = register_array[1];
|
||||
raw_data.aca_addr = register_array[2];
|
||||
raw_data.aca_ipid = register_array[5];
|
||||
raw_data.aca_synd = register_array[6];
|
||||
raw_data.aca_status = converted_array[1];
|
||||
raw_data.aca_addr = converted_array[2];
|
||||
raw_data.aca_ipid = converted_array[5];
|
||||
raw_data.aca_synd = converted_array[6];
|
||||
}
|
||||
else
|
||||
{
|
||||
|
||||
@@ -4183,7 +4183,6 @@ amdsmi_status_t amdsmi_get_afids_from_cper(
|
||||
LOG_ERROR(ss);
|
||||
return AMDSMI_STATUS_INVAL;
|
||||
}
|
||||
|
||||
uint32_t i = 0;
|
||||
for(int afid: cper_decode(cper)) {
|
||||
if(i < *num_afids) {
|
||||
|
||||
@@ -255,59 +255,19 @@ static int cper_dump_sec_desc(const struct cper_sec_desc *desc)
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int aca_decode_fatal(const cper_sec_crashdump_data &data)
|
||||
static int aca_decode_fatal(const cper_sec_crashdump_data &data, uint32_t flag, uint16_t hw_revision)
|
||||
{
|
||||
std::ostringstream ss;
|
||||
|
||||
const uint64_t *register_array = reinterpret_cast<const uint64_t *>(&data.dump.fatal_err);
|
||||
aca_raw_data_t raw_data;
|
||||
raw_data.aca_status = register_array[0];
|
||||
raw_data.aca_ipid = register_array[2];
|
||||
raw_data.aca_synd = register_array[3];
|
||||
|
||||
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] aca_status: 0x" << std::hex << raw_data.aca_status << "\n";
|
||||
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] aca_ipid: 0x" << std::hex << raw_data.aca_ipid << "\n";
|
||||
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] aca_synd: 0x" << std::hex << raw_data.aca_synd << "\n";
|
||||
|
||||
raw_data.flags = 0;
|
||||
raw_data.hw_revision = 1;
|
||||
|
||||
aca_error_info_t error_info = aca_decode(&raw_data);
|
||||
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] fatal error_info.afid: " << std::dec << error_info.afid << "\n";
|
||||
LOG_DEBUG(ss);
|
||||
|
||||
return error_info.afid;
|
||||
return decode_afid(register_array, sizeof(data.dump.fatal_err)/sizeof(uint64_t), flag, hw_revision);
|
||||
}
|
||||
|
||||
static int aca_decode_corrected_error(const uint32_t *reg_dump, size_t num_bytes) {
|
||||
|
||||
std::ostringstream ss;
|
||||
if(num_bytes != CPER_ACA_REG_COUNT * sizeof(uint32_t)) {
|
||||
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] Size of register array must be " << std::dec << (CPER_ACA_REG_COUNT * sizeof(uint32_t)) << " bytes\n";
|
||||
LOG_ERROR(ss);
|
||||
return AMDSMI_STATUS_INVAL;
|
||||
}
|
||||
static int aca_decode_corrected_error(const uint32_t *reg_dump, size_t num_bytes, uint32_t flag, uint16_t hw_revision)
|
||||
{
|
||||
const uint64_t *register_array = reinterpret_cast<const uint64_t *>(reg_dump);
|
||||
aca_raw_data_t raw_data;
|
||||
raw_data.aca_status = register_array[2];
|
||||
raw_data.aca_ipid = register_array[5];
|
||||
raw_data.aca_synd = register_array[6];
|
||||
|
||||
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] aca_status: 0x" << std::hex << raw_data.aca_status << "\n";
|
||||
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] aca_ipid: 0x" << std::hex << raw_data.aca_ipid << "\n";
|
||||
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] aca_synd: 0x" << std::hex << raw_data.aca_synd << "\n";
|
||||
|
||||
raw_data.flags = 0;
|
||||
raw_data.hw_revision = 1;
|
||||
|
||||
aca_error_info_t error_info = aca_decode(&raw_data);
|
||||
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] non-fatal error_info.afid: " << std::dec << error_info.afid << "\n";
|
||||
LOG_DEBUG(ss);
|
||||
|
||||
return error_info.afid;
|
||||
return decode_afid(register_array, num_bytes, flag, hw_revision);
|
||||
}
|
||||
|
||||
static int cper_dump_nonstd_err(const struct cper_sec_nonstd_err *nonstd_err)
|
||||
static int cper_dump_nonstd_err(const struct cper_sec_nonstd_err *nonstd_err, const cper_sec_desc *section)
|
||||
{
|
||||
std::ostringstream ss;
|
||||
|
||||
@@ -339,10 +299,11 @@ exit:
|
||||
|
||||
LOG_DEBUG(ss);
|
||||
|
||||
return aca_decode_corrected_error(body->err_ctx.reg_dump, sizeof(body->err_ctx.reg_dump));
|
||||
return aca_decode_corrected_error(body->err_ctx.reg_dump, sizeof(body->err_ctx.reg_dump)/sizeof(uint64_t),
|
||||
section->flags_mask, section->revision_major);
|
||||
}
|
||||
|
||||
static int cper_dump_cr_fatal(const struct cper_sec_crashdump *crashdump)
|
||||
static int cper_dump_cr_fatal(const struct cper_sec_crashdump *crashdump, const cper_sec_desc *section)
|
||||
{
|
||||
std::ostringstream ss;
|
||||
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS]\n~~~~CRASH DUMP - FATAL~~~\n";
|
||||
@@ -360,10 +321,10 @@ static int cper_dump_cr_fatal(const struct cper_sec_crashdump *crashdump)
|
||||
|
||||
LOG_DEBUG(ss);
|
||||
|
||||
return aca_decode_fatal(crashdump->data);
|
||||
return aca_decode_fatal(crashdump->data, section->flags_mask, section->revision_major);
|
||||
}
|
||||
|
||||
static int cper_dump_cr_boot(const struct cper_sec_crashdump *crashdump)
|
||||
static int cper_dump_cr_boot(const struct cper_sec_crashdump *crashdump, const cper_sec_desc *section)
|
||||
{
|
||||
std::ostringstream ss;
|
||||
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS]\n~~~~CRASH DUMP - BOOT TIME~~~\n";
|
||||
@@ -375,7 +336,7 @@ static int cper_dump_cr_boot(const struct cper_sec_crashdump *crashdump)
|
||||
ss << "~~~~CRASH DUMP - BOOT TIME~~~\n\n";
|
||||
LOG_DEBUG(ss);
|
||||
|
||||
return aca_decode_fatal(crashdump->data);
|
||||
return aca_decode_fatal(crashdump->data, section->flags_mask, section->revision_major);
|
||||
}
|
||||
|
||||
} //namespace
|
||||
@@ -526,29 +487,33 @@ std::vector<int> cper_decode(const amdsmi_cper_hdr_t *cper) {
|
||||
const amdsmi_cper_guid_t *sec_guid = get_sec_desc_type(static_cast<struct cper_sec_desc *>(sec_desc_offset));
|
||||
const amdsmi_cper_guid_t *cper_guid = get_cper_type(cper);
|
||||
|
||||
cper_dump_sec_desc(static_cast<struct cper_sec_desc *>(sec_desc_offset));
|
||||
cper_sec_desc *section = static_cast<struct cper_sec_desc *>(sec_desc_offset);
|
||||
cper_dump_sec_desc(section);
|
||||
|
||||
if (cper_is_cr(sec_guid)) {
|
||||
struct cper_sec_crashdump *crashdump = static_cast<struct cper_sec_crashdump *>(sec_offset);
|
||||
if (cper_is_bt(cper_guid)) {
|
||||
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] decoding boot crash dump\n";
|
||||
LOG_DEBUG(ss);
|
||||
afids.emplace_back(cper_dump_cr_boot(static_cast<struct cper_sec_crashdump *>(sec_offset)));
|
||||
afids.emplace_back(cper_dump_cr_boot(crashdump, section));
|
||||
}
|
||||
else {
|
||||
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] decoding crash dump\n";
|
||||
LOG_DEBUG(ss);
|
||||
afids.emplace_back(cper_dump_cr_fatal(static_cast<struct cper_sec_crashdump *>(sec_offset)));
|
||||
afids.emplace_back(cper_dump_cr_fatal(crashdump, section));
|
||||
}
|
||||
}
|
||||
else if (cper_is_nonstd(sec_guid)) {
|
||||
struct cper_sec_nonstd_err *crashdump = static_cast<struct cper_sec_nonstd_err *>(sec_offset);
|
||||
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] decoding non-standard error\n";
|
||||
LOG_DEBUG(ss);
|
||||
afids.emplace_back(cper_dump_nonstd_err(static_cast<struct cper_sec_nonstd_err *>(sec_offset)));
|
||||
afids.emplace_back(cper_dump_nonstd_err(crashdump, section));
|
||||
}
|
||||
else if (cper_is_proc_err(sec_guid)) {
|
||||
struct cper_sec_nonstd_err *crashdump = static_cast<struct cper_sec_nonstd_err *>(sec_offset);
|
||||
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] decoding proc error section type\n";
|
||||
LOG_DEBUG(ss);
|
||||
afids.emplace_back(cper_dump_nonstd_err(static_cast<struct cper_sec_nonstd_err *>(sec_offset)));
|
||||
afids.emplace_back(cper_dump_nonstd_err(crashdump, section));
|
||||
}
|
||||
else {
|
||||
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] Unknown error type!!\n";
|
||||
@@ -560,6 +525,7 @@ std::vector<int> cper_decode(const amdsmi_cper_hdr_t *cper) {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
return afids;
|
||||
}
|
||||
|
||||
|
||||
Ссылка в новой задаче
Block a user