From cc2b4b4067d9474b5c040ba3224f365ef51c7763 Mon Sep 17 00:00:00 2001 From: "Saeed, Oosman" Date: Fri, 6 Jun 2025 13:41:16 -0500 Subject: [PATCH] [SWDEV-536417] AFID & addc decode fixes (#449) * fix endian problem * use hw_revision and flags_mask from cper section instead of hardcoded values --------- Signed-off-by: Maisam Arif [ROCm/amdsmi commit: 815e0252b1f58fcaad094d953a51a762f7dac553] --- projects/amdsmi/amdsmi_cli/amdsmi_commands.py | 4 +- projects/amdsmi/amdsmi_cli/amdsmi_helpers.py | 81 ++++++++++--------- projects/amdsmi/include/aca-decode/utils.h | 37 +++++++++ projects/amdsmi/src/CMakeLists.txt | 2 +- projects/amdsmi/src/aca-decode/aca_api.c | 33 +++++--- projects/amdsmi/src/amd_smi/amd_smi.cc | 1 - projects/amdsmi/src/amd_smi/amd_smi_cper.cc | 78 +++++------------- 7 files changed, 126 insertions(+), 110 deletions(-) create mode 100644 projects/amdsmi/include/aca-decode/utils.h diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py index fb980794be..bfb89fd810 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py @@ -6498,9 +6498,7 @@ class AMDSMICommands(): if args.afid and args.cper_file: afids = self.helpers.pvtDumpAfids(args.cper_file) - for afid in afids: - print(afid, end=" ") - print("") + print(' '.join(map(str, afids))) return if not self.group_check_printed: diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py b/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py index f29f6eb26f..7f66095078 100755 --- a/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py @@ -1082,7 +1082,28 @@ class AMDSMIHelpers(): print(msg) logging.warning(msg) - def display_cper_files_generated(self, entries, device_handle, folder, follow): + def _severity_as_string(self, error_severity, notify_type, for_filename): + if error_severity == "non_fatal_uncorrected": + if(for_filename): + return "uncorrected" + return "NONFATAL-UNCORRECTED" + elif error_severity == "non_fatal_corrected": + if(for_filename): + return "corrected" + return "NONFATAL-CORRECTED" + elif error_severity == "fatal": + if notify_type == "BOOT": + if(for_filename): + return "boot" + return "BOOT" + if(for_filename): + return "fatal" + return "FATAL" + if(for_filename): + return "unknown" + return "UNKNOWN" + + def display_cper_files_generated(self, entries, device_handle, folder): # One‐time initialization: print warning & header only once if not getattr(self, "_cper_display_initialized", False): # Warning if no folder was specified elsewhere @@ -1093,11 +1114,7 @@ class AMDSMIHelpers(): print(f"{YELLOW}WARNING:{RESET} {RED}No{RESET} cper files will be dumped unless --folder= is specified.") self._cper_warning_printed = True - # Header - print(f"{'timestamp':<20} {'gpu_id':<7} {'severity':<12}", end="") - if folder: - print(f" {'file_name':<17} {'afid'}", end="") - print("") + self._print_header(folder) self._cper_display_initialized = True # Loop through all entries in the dictionary. @@ -1106,35 +1123,29 @@ class AMDSMIHelpers(): error_severity = entry.get("error_severity", "Unknown") notify_type = entry.get("notify_type", "Unknown") - if error_severity == "non_fatal_uncorrected": - prefix = "uncorrected" - elif error_severity == "non_fatal_corrected": - prefix = "corrected" - elif error_severity == "fatal": - prefix = "fatal" - if notify_type == "BOOT": - prefix = "boot" - + prefix = self._severity_as_string(error_severity, notify_type, True) cper_data_file = f"{prefix}_{self.get_cper_count()}.cper" timestamp = entry.get("timestamp", "unknown") gpu_id = self.get_gpu_id_from_device_handle(device_handle) - print(f"{timestamp:<20} {gpu_id:<7} {prefix:<12}", end="") + print(f"{timestamp:<20} {gpu_id:<7} {prefix:<20}", end="") if folder: print(f" {cper_data_file:<17}", end="") afids = self.pvtDumpAfids(cper_data_file) - for afid in afids: - print(afid, end=" ") + print(' '.join(map(str, afids)), end=" ") print("") self.increment_cper_count() + def _print_header(self, folder): + print(f"{'timestamp':<20} {'gpu_id':<7} {'severity':<20}", end="") + if folder: + print(f" {'file_name':<17} {'list of afids'}", end="") + print("") + def dump_cper_entries(self, folder, entries, cper_data, device_handle, file_limit=None): # One‐time header if not getattr(self, "_cper_display_initialized", False): - print(f"{'timestamp':<20} {'gpu_id':<7} {'severity':<12} ", end="") - if folder: - print(f"{'file_name':<17} {'afid'}", end="") - print("") + self._print_header(folder) self._cper_display_initialized = True if folder: @@ -1157,23 +1168,13 @@ class AMDSMIHelpers(): except OSError: pass # --- determine prefix/severity --- - sev = entry.get("error_severity", "").lower() - nt = entry.get("notify_type", "") - if sev == "non_fatal_uncorrected": - prefix = "uncorrected" - elif sev == "non_fatal_corrected": - prefix = "corrected" - elif sev == "fatal" and nt == "BOOT": - prefix = "boot" - elif sev == "fatal": - prefix = "fatal" - else: - prefix = "unknown" - + error_severity = entry.get("error_severity", "").lower() + notify_type = entry.get("notify_type", "") + prefix = self._severity_as_string(error_severity, notify_type, True) # --- new filenames --- count = self.get_cper_count() - cper_name = f"{prefix}_{count}.cper" - json_name = f"{prefix}_{count}.json" + cper_name = f"{prefix}-{count}.cper" + json_name = f"{prefix}-{count}.json" cper_path = folder / cper_name json_path = folder / json_name @@ -1196,6 +1197,7 @@ class AMDSMIHelpers(): # --- collect for printing --- ts = entry.get("timestamp", "unknown") gid = self.get_gpu_id_from_device_handle(device_handle) + prefix = self._severity_as_string(error_severity, notify_type, False) printed_rows.append((ts, gid, prefix, cper_name)) self.increment_cper_count() @@ -1209,7 +1211,8 @@ class AMDSMIHelpers(): for ts, gid, prefix, fname in to_print: cper_path = folder / cper_name afids = self.pvtDumpAfids(cper_path) - print(f"{ts:<20} {gid:<7} {prefix:<12} {fname:<17} {afids}") + afids = ' '.join(map(str, afids)) + print(f"{ts:<20} {gid:<7} {prefix:<20} {fname:<17} {afids}") else: print(json.dumps( @@ -1358,4 +1361,4 @@ class AMDSMIHelpers(): self.dump_cper_entries(args.folder, entries, cper_data, device_handle, args.file_limit) break else: - self.display_cper_files_generated(entries, device_handle, args.folder, args.follow) + self.display_cper_files_generated(entries, device_handle, args.folder) diff --git a/projects/amdsmi/include/aca-decode/utils.h b/projects/amdsmi/include/aca-decode/utils.h new file mode 100644 index 0000000000..60e7826ba2 --- /dev/null +++ b/projects/amdsmi/include/aca-decode/utils.h @@ -0,0 +1,37 @@ +/** + * @file utils.h + * @brief Common utility functions + */ +#ifndef UTILS_H +#define UTILS_H + +#include + +/** + * @brief Convert a 64-bit value from little endian to big endian + * @param[in] value Value to convert + * @return Converted value in big endian + */ +static inline uint64_t le64_to_be64(uint64_t value) { + return ((value & 0xFF00000000000000ULL) >> 56) | + ((value & 0x00FF000000000000ULL) >> 40) | + ((value & 0x0000FF0000000000ULL) >> 24) | + ((value & 0x000000FF00000000ULL) >> 8) | + ((value & 0x00000000FF000000ULL) << 8) | + ((value & 0x0000000000FF0000ULL) << 24) | + ((value & 0x000000000000FF00ULL) << 40) | + ((value & 0x00000000000000FFULL) << 56); +} + +/** + * @brief Convert an array of 64-bit values from little endian to big endian + * @param[in,out] array Array to convert + * @param[in] len Length of the array + */ +static inline void convert_array_le_to_be(uint64_t *array, size_t len) { + for (size_t i = 0; i < len; i++) { + array[i] = le64_to_be64(array[i]); + } +} + +#endif /* UTILS_H */ \ No newline at end of file diff --git a/projects/amdsmi/src/CMakeLists.txt b/projects/amdsmi/src/CMakeLists.txt index 10f9c832ff..01a67a97e1 100644 --- a/projects/amdsmi/src/CMakeLists.txt +++ b/projects/amdsmi/src/CMakeLists.txt @@ -41,7 +41,7 @@ set(INC_LIST "${PROJECT_SOURCE_DIR}/rocm_smi/include/rocm_smi/rocm_smi_utils.h") set(ACA_SRC_DIR "aca-decode") -set(SRC_LIST ${SRC_LIST} ${ACA_SRC_DIR}/aca_decode.c ${ACA_SRC_DIR}/aca_fields.c ${ACA_SRC_DIR}/aca_tables.c +set(SRC_LIST ${SRC_LIST} ${ACA_SRC_DIR}/aca_api.c ${ACA_SRC_DIR}/aca_decode.c ${ACA_SRC_DIR}/aca_fields.c ${ACA_SRC_DIR}/aca_tables.c ${ACA_SRC_DIR}/error_map.c) set(ACA_INC_DIR "${PROJECT_SOURCE_DIR}/include/aca-decode") set(INC_LIST ${INC_LIST} ${ACA_INC_DIR}/aca_decode.h ${ACA_INC_DIR}/aca_fields.h ${ACA_INC_DIR}/aca_tables.h diff --git a/projects/amdsmi/src/aca-decode/aca_api.c b/projects/amdsmi/src/aca-decode/aca_api.c index 143f9dc275..3c2efa0a87 100644 --- a/projects/amdsmi/src/aca-decode/aca_api.c +++ b/projects/amdsmi/src/aca-decode/aca_api.c @@ -1,4 +1,5 @@ #include "aca_decode.h" +#include int decode_afid(const uint64_t *register_array, size_t array_len, uint32_t flag, uint16_t hw_revision) { @@ -23,7 +24,6 @@ int decode_afid(const uint64_t *register_array, size_t array_len, uint32_t flag, raw_data.aca_ipid = register_array[5]; raw_data.aca_synd = register_array[6]; } - else { return -1; // Unsupported size @@ -44,19 +44,32 @@ aca_error_info_t decode_error_info(const uint64_t *register_array, size_t array_ if (!register_array) { return error_info; - } if (array_len == 4) // 32 bytes + } + + // Create a copy of the register array to avoid modifying the original + uint64_t converted_array[16]; + if (array_len > 16) { + return error_info; + } + + // Copy and convert the array + for (size_t i = 0; i < array_len; i++) { + converted_array[i] = le64_to_be64(register_array[i]); + } + + if (array_len == 4) // 32 bytes { - raw_data.aca_status = register_array[0]; - raw_data.aca_addr = register_array[1]; - raw_data.aca_ipid = register_array[2]; - raw_data.aca_synd = register_array[3]; + raw_data.aca_status = converted_array[0]; + raw_data.aca_addr = converted_array[1]; + raw_data.aca_ipid = converted_array[2]; + raw_data.aca_synd = converted_array[3]; } else if (array_len == 16) // 128 bytes { - raw_data.aca_status = register_array[1]; - raw_data.aca_addr = register_array[2]; - raw_data.aca_ipid = register_array[5]; - raw_data.aca_synd = register_array[6]; + raw_data.aca_status = converted_array[1]; + raw_data.aca_addr = converted_array[2]; + raw_data.aca_ipid = converted_array[5]; + raw_data.aca_synd = converted_array[6]; } else { diff --git a/projects/amdsmi/src/amd_smi/amd_smi.cc b/projects/amdsmi/src/amd_smi/amd_smi.cc index 9e3acda49a..b92d9704e1 100644 --- a/projects/amdsmi/src/amd_smi/amd_smi.cc +++ b/projects/amdsmi/src/amd_smi/amd_smi.cc @@ -4183,7 +4183,6 @@ amdsmi_status_t amdsmi_get_afids_from_cper( LOG_ERROR(ss); return AMDSMI_STATUS_INVAL; } - uint32_t i = 0; for(int afid: cper_decode(cper)) { if(i < *num_afids) { diff --git a/projects/amdsmi/src/amd_smi/amd_smi_cper.cc b/projects/amdsmi/src/amd_smi/amd_smi_cper.cc index 91f5b3166b..d9504ceb52 100644 --- a/projects/amdsmi/src/amd_smi/amd_smi_cper.cc +++ b/projects/amdsmi/src/amd_smi/amd_smi_cper.cc @@ -255,59 +255,19 @@ static int cper_dump_sec_desc(const struct cper_sec_desc *desc) return 0; } -static int aca_decode_fatal(const cper_sec_crashdump_data &data) +static int aca_decode_fatal(const cper_sec_crashdump_data &data, uint32_t flag, uint16_t hw_revision) { - std::ostringstream ss; - const uint64_t *register_array = reinterpret_cast(&data.dump.fatal_err); - aca_raw_data_t raw_data; - raw_data.aca_status = register_array[0]; - raw_data.aca_ipid = register_array[2]; - raw_data.aca_synd = register_array[3]; - - ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] aca_status: 0x" << std::hex << raw_data.aca_status << "\n"; - ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] aca_ipid: 0x" << std::hex << raw_data.aca_ipid << "\n"; - ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] aca_synd: 0x" << std::hex << raw_data.aca_synd << "\n"; - - raw_data.flags = 0; - raw_data.hw_revision = 1; - - aca_error_info_t error_info = aca_decode(&raw_data); - ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] fatal error_info.afid: " << std::dec << error_info.afid << "\n"; - LOG_DEBUG(ss); - - return error_info.afid; + return decode_afid(register_array, sizeof(data.dump.fatal_err)/sizeof(uint64_t), flag, hw_revision); } -static int aca_decode_corrected_error(const uint32_t *reg_dump, size_t num_bytes) { - - std::ostringstream ss; - if(num_bytes != CPER_ACA_REG_COUNT * sizeof(uint32_t)) { - ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] Size of register array must be " << std::dec << (CPER_ACA_REG_COUNT * sizeof(uint32_t)) << " bytes\n"; - LOG_ERROR(ss); - return AMDSMI_STATUS_INVAL; - } +static int aca_decode_corrected_error(const uint32_t *reg_dump, size_t num_bytes, uint32_t flag, uint16_t hw_revision) +{ const uint64_t *register_array = reinterpret_cast(reg_dump); - aca_raw_data_t raw_data; - raw_data.aca_status = register_array[2]; - raw_data.aca_ipid = register_array[5]; - raw_data.aca_synd = register_array[6]; - - ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] aca_status: 0x" << std::hex << raw_data.aca_status << "\n"; - ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] aca_ipid: 0x" << std::hex << raw_data.aca_ipid << "\n"; - ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] aca_synd: 0x" << std::hex << raw_data.aca_synd << "\n"; - - raw_data.flags = 0; - raw_data.hw_revision = 1; - - aca_error_info_t error_info = aca_decode(&raw_data); - ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] non-fatal error_info.afid: " << std::dec << error_info.afid << "\n"; - LOG_DEBUG(ss); - - return error_info.afid; + return decode_afid(register_array, num_bytes, flag, hw_revision); } -static int cper_dump_nonstd_err(const struct cper_sec_nonstd_err *nonstd_err) +static int cper_dump_nonstd_err(const struct cper_sec_nonstd_err *nonstd_err, const cper_sec_desc *section) { std::ostringstream ss; @@ -339,10 +299,11 @@ exit: LOG_DEBUG(ss); - return aca_decode_corrected_error(body->err_ctx.reg_dump, sizeof(body->err_ctx.reg_dump)); + return aca_decode_corrected_error(body->err_ctx.reg_dump, sizeof(body->err_ctx.reg_dump)/sizeof(uint64_t), + section->flags_mask, section->revision_major); } -static int cper_dump_cr_fatal(const struct cper_sec_crashdump *crashdump) +static int cper_dump_cr_fatal(const struct cper_sec_crashdump *crashdump, const cper_sec_desc *section) { std::ostringstream ss; ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS]\n~~~~CRASH DUMP - FATAL~~~\n"; @@ -360,10 +321,10 @@ static int cper_dump_cr_fatal(const struct cper_sec_crashdump *crashdump) LOG_DEBUG(ss); - return aca_decode_fatal(crashdump->data); + return aca_decode_fatal(crashdump->data, section->flags_mask, section->revision_major); } -static int cper_dump_cr_boot(const struct cper_sec_crashdump *crashdump) +static int cper_dump_cr_boot(const struct cper_sec_crashdump *crashdump, const cper_sec_desc *section) { std::ostringstream ss; ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS]\n~~~~CRASH DUMP - BOOT TIME~~~\n"; @@ -375,7 +336,7 @@ static int cper_dump_cr_boot(const struct cper_sec_crashdump *crashdump) ss << "~~~~CRASH DUMP - BOOT TIME~~~\n\n"; LOG_DEBUG(ss); - return aca_decode_fatal(crashdump->data); + return aca_decode_fatal(crashdump->data, section->flags_mask, section->revision_major); } } //namespace @@ -526,29 +487,33 @@ std::vector cper_decode(const amdsmi_cper_hdr_t *cper) { const amdsmi_cper_guid_t *sec_guid = get_sec_desc_type(static_cast(sec_desc_offset)); const amdsmi_cper_guid_t *cper_guid = get_cper_type(cper); - cper_dump_sec_desc(static_cast(sec_desc_offset)); + cper_sec_desc *section = static_cast(sec_desc_offset); + cper_dump_sec_desc(section); if (cper_is_cr(sec_guid)) { + struct cper_sec_crashdump *crashdump = static_cast(sec_offset); if (cper_is_bt(cper_guid)) { ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] decoding boot crash dump\n"; LOG_DEBUG(ss); - afids.emplace_back(cper_dump_cr_boot(static_cast(sec_offset))); + afids.emplace_back(cper_dump_cr_boot(crashdump, section)); } else { ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] decoding crash dump\n"; LOG_DEBUG(ss); - afids.emplace_back(cper_dump_cr_fatal(static_cast(sec_offset))); + afids.emplace_back(cper_dump_cr_fatal(crashdump, section)); } } else if (cper_is_nonstd(sec_guid)) { + struct cper_sec_nonstd_err *crashdump = static_cast(sec_offset); ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] decoding non-standard error\n"; LOG_DEBUG(ss); - afids.emplace_back(cper_dump_nonstd_err(static_cast(sec_offset))); + afids.emplace_back(cper_dump_nonstd_err(crashdump, section)); } else if (cper_is_proc_err(sec_guid)) { + struct cper_sec_nonstd_err *crashdump = static_cast(sec_offset); ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] decoding proc error section type\n"; LOG_DEBUG(ss); - afids.emplace_back(cper_dump_nonstd_err(static_cast(sec_offset))); + afids.emplace_back(cper_dump_nonstd_err(crashdump, section)); } else { ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] Unknown error type!!\n"; @@ -560,6 +525,7 @@ std::vector cper_decode(const amdsmi_cper_hdr_t *cper) { } } + return afids; }