[SWDEV-536417] CPER Display fixes
Signed-off-by: Maisam Arif <Maisam.Arif@amd.com>
Change-Id: Ic2f3901d0f4c95bd9ed4beda8aa5fd3d596df8d2
[ROCm/amdsmi commit: fb592e003a]
Этот коммит содержится в:
коммит произвёл
Arif, Maisam
родитель
20e374663d
Коммит
7eea09e4d8
@@ -1108,10 +1108,7 @@ class AMDSMIHelpers():
|
||||
if not getattr(self, "_cper_display_initialized", False):
|
||||
# Warning if no folder was specified elsewhere
|
||||
if not getattr(self, "_cper_warning_printed", False):
|
||||
YELLOW = "\033[33m"
|
||||
RED = "\033[31m"
|
||||
RESET = "\033[0m"
|
||||
print(f"{YELLOW}WARNING:{RESET} {RED}No{RESET} cper files will be dumped unless --folder=<folder_name> is specified.")
|
||||
print(f"WARNING:No cper files will be dumped unless --folder=<folder_name> is specified.")
|
||||
self._cper_warning_printed = True
|
||||
|
||||
self._print_header(folder)
|
||||
@@ -1120,20 +1117,19 @@ class AMDSMIHelpers():
|
||||
# Loop through all entries in the dictionary.
|
||||
for entry_index, entry in enumerate(entries.values()):
|
||||
# Assume 'entry' is a dictionary with keys: "error_severity" and "notify_type".
|
||||
error_severity = entry.get("error_severity", "Unknown")
|
||||
notify_type = entry.get("notify_type", "Unknown")
|
||||
|
||||
prefix = self._severity_as_string(error_severity, notify_type, True)
|
||||
cper_data_file = f"{prefix}_{self.get_cper_count()}.cper"
|
||||
|
||||
timestamp = entry.get("timestamp", "unknown")
|
||||
gpu_id = self.get_gpu_id_from_device_handle(device_handle)
|
||||
print(f"{timestamp:<20} {gpu_id:<7} {prefix:<20}", end="")
|
||||
prefix = self._severity_as_string(entry.get("error_severity", "Unknown"),
|
||||
entry.get("notify_type", "Unknown"),
|
||||
True)
|
||||
output = f"{timestamp:<20} {gpu_id:<7} {prefix:<20}"
|
||||
if folder:
|
||||
print(f" {cper_data_file:<17}", end="")
|
||||
cper_data_file = f"{prefix}_{self.get_cper_count()}.cper"
|
||||
afids = self.pvtDumpAfids(cper_data_file)
|
||||
print(' '.join(map(str, afids)), end=" ")
|
||||
print("")
|
||||
afids_str = ' '.join(map(str, afids))
|
||||
output += f" {cper_data_file:<17} {afids_str}"
|
||||
|
||||
print(output)
|
||||
self.increment_cper_count()
|
||||
|
||||
def _print_header(self, folder):
|
||||
@@ -1143,7 +1139,17 @@ class AMDSMIHelpers():
|
||||
print("")
|
||||
|
||||
def dump_cper_entries(self, folder, entries, cper_data, device_handle, file_limit=None):
|
||||
# One‐time header
|
||||
"""
|
||||
Dump CPER entries to files in the specified folder. Handles batch deletion if file limit is exceeded.
|
||||
|
||||
Parameters:
|
||||
folder (str): Path to the folder where CPER files will be dumped.
|
||||
entries (dict): Dictionary containing CPER entry metadata.
|
||||
cper_data (list): List of CPER data objects with 'bytes' and 'size' keys.
|
||||
device_handle: Device handle for GPU identification.
|
||||
file_limit (int, optional): Maximum number of files to retain in the folder.
|
||||
"""
|
||||
# Initialize header display
|
||||
if not getattr(self, "_cper_display_initialized", False):
|
||||
self._print_header(folder)
|
||||
self._cper_display_initialized = True
|
||||
@@ -1152,74 +1158,84 @@ class AMDSMIHelpers():
|
||||
folder = Path(folder)
|
||||
folder.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
printed_rows = []
|
||||
output_rows = {}
|
||||
|
||||
for entry_index, entry in enumerate(entries.values()):
|
||||
# --- rotate out oldest if over limit ---
|
||||
# Batch deletion if file limit is exceeded
|
||||
if file_limit:
|
||||
files = sorted(folder.glob("*.cper"), key=lambda p: p.stat().st_mtime)
|
||||
while len(files) >= file_limit:
|
||||
old = files.pop(0)
|
||||
try: old.unlink()
|
||||
except OSError: pass
|
||||
j = old.with_suffix('.json')
|
||||
if j.exists():
|
||||
try: j.unlink()
|
||||
except OSError: pass
|
||||
folder_files = list(sorted(folder.glob("*.cper"), key=lambda p: p.stat().st_mtime))
|
||||
if file_limit < len(folder_files):
|
||||
for old_file in folder_files[:len(folder_files) - file_limit]:
|
||||
try:
|
||||
old_file.unlink()
|
||||
json_file = old_file.with_suffix('.json')
|
||||
if json_file.exists():
|
||||
json_file.unlink()
|
||||
except OSError as e:
|
||||
logging.debug(f"Failed to delete file {old_file}: {e}")
|
||||
|
||||
# --- determine prefix/severity ---
|
||||
# Determine prefix/severity
|
||||
error_severity = entry.get("error_severity", "").lower()
|
||||
notify_type = entry.get("notify_type", "")
|
||||
prefix = self._severity_as_string(error_severity, notify_type, True)
|
||||
# --- new filenames ---
|
||||
count = self.get_cper_count()
|
||||
cper_name = f"{prefix}-{count}.cper"
|
||||
json_name = f"{prefix}-{count}.json"
|
||||
cper_path = folder / cper_name
|
||||
json_path = folder / json_name
|
||||
|
||||
# --- write files ---
|
||||
self.write_binary(
|
||||
cper_data[entry_index]["bytes"],
|
||||
cper_data[entry_index]["size"],
|
||||
cper_path
|
||||
)
|
||||
# Generate filenames
|
||||
count = self.get_cper_count()
|
||||
cper_name = f"{prefix}-{count}.cper"
|
||||
json_name = f"{prefix}-{count}.json"
|
||||
cper_path = folder / cper_name
|
||||
json_path = folder / json_name
|
||||
|
||||
# Write CPER binary file
|
||||
try:
|
||||
with json_path.open("w") as f:
|
||||
f.write(json.dumps(
|
||||
entry,
|
||||
self.write_binary(
|
||||
cper_data[entry_index]["bytes"],
|
||||
cper_data[entry_index]["size"],
|
||||
cper_path
|
||||
)
|
||||
except Exception as e:
|
||||
logging.debug(f"Failed to write CPER file {cper_path}: {e}")
|
||||
|
||||
# Write JSON metadata file
|
||||
try:
|
||||
with json_path.open("w") as cper_json_file:
|
||||
json.dump(
|
||||
obj=entry,
|
||||
fp=cper_json_file,
|
||||
indent=2,
|
||||
default=lambda o: o.decode('utf-8') if isinstance(o, bytes) else o
|
||||
))
|
||||
)
|
||||
except Exception as e:
|
||||
logging.error(f"Failed to write JSON to {json_path}: {e}")
|
||||
|
||||
# --- collect for printing ---
|
||||
ts = entry.get("timestamp", "unknown")
|
||||
gid = self.get_gpu_id_from_device_handle(device_handle)
|
||||
prefix = self._severity_as_string(error_severity, notify_type, False)
|
||||
printed_rows.append((ts, gid, prefix, cper_name))
|
||||
logging.debug(f"Failed to write JSON file {json_path}: {e}")
|
||||
|
||||
# Collect data for printing
|
||||
timestamp = entry.get("timestamp", "unknown")
|
||||
gpu_id = self.get_gpu_id_from_device_handle(device_handle)
|
||||
severity = self._severity_as_string(error_severity, notify_type, False)
|
||||
output_rows[cper_path] = [timestamp, gpu_id, severity, cper_name]
|
||||
self.increment_cper_count()
|
||||
|
||||
# --- only now actually print: either all, or just last `file_limit` ---
|
||||
if file_limit:
|
||||
to_print = printed_rows[-file_limit:]
|
||||
else:
|
||||
to_print = printed_rows
|
||||
|
||||
for ts, gid, prefix, fname in to_print:
|
||||
cper_path = folder / cper_name
|
||||
afids = self.pvtDumpAfids(cper_path)
|
||||
afids = ' '.join(map(str, afids))
|
||||
print(f"{ts:<20} {gid:<7} {prefix:<20} {fname:<17} {afids}")
|
||||
# Print collected rows
|
||||
for cper_path, row in output_rows.items():
|
||||
timestamp, gpu_id, severity, fname = row
|
||||
try:
|
||||
afids = self.pvtDumpAfids(cper_path)
|
||||
afids_str = ' '.join(map(str, afids))
|
||||
except Exception as e:
|
||||
afids_str = "Error fetching AFIDs"
|
||||
logging.debug(f"Failed to fetch AFIDs for {cper_path}: {e}")
|
||||
print(f"{timestamp:<20} {gpu_id:<7} {severity:<20} {fname:<17} {afids_str}")
|
||||
|
||||
else:
|
||||
print(json.dumps(
|
||||
entries,
|
||||
indent=2,
|
||||
default=lambda o: o.decode('utf-8') if isinstance(o, bytes) else o
|
||||
))
|
||||
# Print entries as JSON if no folder is specified
|
||||
try:
|
||||
print(json.dumps(
|
||||
entries,
|
||||
indent=2,
|
||||
default=lambda o: o.decode('utf-8') if isinstance(o, bytes) else o
|
||||
))
|
||||
except Exception as e:
|
||||
logging.debug(f"Failed to dump entries as JSON: {e}")
|
||||
|
||||
def write_binary(self, data, size, filepath):
|
||||
"""
|
||||
@@ -1242,7 +1258,7 @@ class AMDSMIHelpers():
|
||||
data_bytes = data[:size]
|
||||
f.write(data_bytes)
|
||||
|
||||
def hexdump_to_string(self, data: Union[bytes, List[int]]) -> str:
|
||||
def binary_to_hexdump_string(self, data: Union[bytes, List[int]]) -> str:
|
||||
"""
|
||||
Convert binary data to a hexdump string.
|
||||
|
||||
@@ -1253,12 +1269,18 @@ class AMDSMIHelpers():
|
||||
A multiline string, each line showing:
|
||||
offset (in hex), hex bytes (16 per line), and printable ASCII.
|
||||
"""
|
||||
# Normalize to list of ints
|
||||
if isinstance(data, bytes):
|
||||
data_ints = list(data)
|
||||
else:
|
||||
# allow list of ints or single-character strings
|
||||
data_ints = [b if isinstance(b, int) else ord(b) for b in data]
|
||||
# Allow list of ints or single-character strings
|
||||
data_ints = []
|
||||
for b in data:
|
||||
if isinstance(b, int):
|
||||
data_ints.append(b)
|
||||
elif isinstance(b, str) and len(b) == 1:
|
||||
data_ints.append(ord(b))
|
||||
else:
|
||||
raise ValueError(f"Invalid type in data: {type(b)}")
|
||||
|
||||
lines: List[str] = []
|
||||
size = len(data_ints)
|
||||
@@ -1266,7 +1288,7 @@ class AMDSMIHelpers():
|
||||
for offset in range(0, size, 16):
|
||||
chunk = data_ints[offset : offset + 16]
|
||||
hex_values = " ".join(f"{b:02x}" for b in chunk)
|
||||
# pad hex_values to 16*3-1 = 47 chars (two hex digits + space)
|
||||
# Pad hex_values to 16*3-1 = 47 chars (two hex digits + space)
|
||||
hex_values = hex_values.ljust(16 * 3 - 1)
|
||||
ascii_values = "".join(chr(b) if 32 <= b <= 126 else "." for b in chunk)
|
||||
lines.append(f"{offset:08x} {hex_values} |{ascii_values}|")
|
||||
@@ -1289,7 +1311,7 @@ class AMDSMIHelpers():
|
||||
else:
|
||||
# assume it's already bytes
|
||||
raw = raw_data
|
||||
self.hexdump_to_string(raw)
|
||||
self.binary_to_hexdump_string(raw)
|
||||
afids, num_afids = amdsmi_interface.amdsmi_get_afids_from_cper(raw)
|
||||
return afids
|
||||
|
||||
|
||||
@@ -1430,7 +1430,7 @@ class AMDSMIParser(argparse.ArgumentParser):
|
||||
severity_choices_str = ", ".join(severity_choices)
|
||||
severity_help = f"Set the SEVERITY filters from the following:\n {severity_choices_str}"
|
||||
folder_help = "Folder to dump CPER report files"
|
||||
file_limit_help = "Maximum number of entries per output file"
|
||||
file_limit_help = "Maximum number of CPER files in target folder\n Older files beyond limit will be deleted"
|
||||
cper_file_help = "Full path of the cper record file to generate the AFID"
|
||||
follow_help = "Continuously monitor for new entries"
|
||||
|
||||
|
||||
@@ -33,7 +33,6 @@ from typing import Any, Dict, List, Tuple, Union
|
||||
from . import amdsmi_wrapper
|
||||
from .amdsmi_exception import *
|
||||
|
||||
|
||||
### Non Library Specific Constants ###
|
||||
class MaxUIntegerTypes(IntEnum):
|
||||
UINT8_T = 0xFF
|
||||
@@ -857,7 +856,7 @@ def amdsmi_get_cpucore_handles() -> List[amdsmi_wrapper.amdsmi_processor_handle]
|
||||
return core_handles
|
||||
|
||||
def amdsmi_get_cpu_hsmp_proto_ver(
|
||||
processor_handle: amdsmi_wrapper.amdsmi_processor_handle,
|
||||
processor_handle: "amdsmi_wrapper.amdsmi_processor_handle",
|
||||
) -> int:
|
||||
if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle):
|
||||
raise AmdSmiParameterException(
|
||||
@@ -2217,10 +2216,9 @@ def amdsmi_get_clock_info(
|
||||
# logging.debug("amdsmi_interface.py | amdsmi_get_clock_info | clk_type = " + clk_type_str + " | return_dictionary = \n" + str(json.dumps(dict_ret, indent=4)))
|
||||
return dict_ret
|
||||
|
||||
|
||||
def amdsmi_get_gpu_bad_page_info(
|
||||
processor_handle: amdsmi_wrapper.amdsmi_processor_handle,
|
||||
) -> Union[list, str]:
|
||||
) -> List[Dict[str, Any]]:
|
||||
if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle):
|
||||
raise AmdSmiParameterException(
|
||||
processor_handle, amdsmi_wrapper.amdsmi_processor_handle
|
||||
@@ -2427,7 +2425,7 @@ def amdsmi_get_gpu_cper_entries(
|
||||
severity_mask: int,
|
||||
buffer_size: int = 4 * 1048576,
|
||||
cursor: int = 0
|
||||
) -> Tuple[List[Dict[str, Any]], int]:
|
||||
) -> Tuple[Dict[str, Any], int, List[Dict[str, Any]]]:
|
||||
|
||||
if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle):
|
||||
raise AmdSmiParameterException(
|
||||
@@ -2541,9 +2539,12 @@ def amdsmi_get_afids_from_cper(
|
||||
all_afids: List[int] = []
|
||||
|
||||
for record in cper_records:
|
||||
raw_bytes = bytes(record["bytes"])
|
||||
record_size = record["size"]
|
||||
|
||||
if isinstance(record, dict) and "bytes" in record and "size" in record:
|
||||
raw_bytes = bytes(record["bytes"])
|
||||
record_size = record["size"]
|
||||
else:
|
||||
raise AmdSmiParameterException(record,
|
||||
"dict with keys 'bytes' and 'size' or bytes/bytearray")
|
||||
# Wrap as char*
|
||||
buf = ctypes.create_string_buffer(raw_bytes, record_size)
|
||||
buf_ptr = ctypes.cast(buf, ctypes.POINTER(ctypes.c_char))
|
||||
|
||||
Ссылка в новой задаче
Block a user