[SWDEV-560828][AMD-SMI] "Fix for amd-smi ras --cper component not being redirected to output file with --follow fix" (#1950)

* Fix amd-smi ras --cper component not redirecting output to file

Signed-off-by: yalmusaf_amdeng <Yazen.ALMusaffar@amd.com>

* Update amdsmi_helpers.py

* updated CHANGELOG

Signed-off-by: yalmusaf_amdeng <Yazen.ALMusaffar@amd.com>

* updated CHANGELOG

Signed-off-by: yalmusaf_amdeng <Yazen.ALMusaffar@amd.com>

---------

Signed-off-by: yalmusaf_amdeng <Yazen.ALMusaffar@amd.com>
Este commit está contenido en:
Yazen AL Musaffar
2025-12-08 12:07:19 -06:00
cometido por GitHub
padre 277072f241
commit 7bfccc009e
Se han modificado 2 ficheros con 132 adiciones y 23 borrados
+3
Ver fichero
@@ -398,6 +398,9 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/pr
### Resolved Issues
- **Fixed an issue where the CPER component output was not redirected to the output file when using `amd-smi ras --cper --folder <folder_name> --file <file_name> --follow`**.
- Utilized the AMDSMILogger to redirect output to the file when the `--file` option is used
- **Fixed a CPER record count mismatch issue when using `amd-smi ras --cper --file-limit`**.
- Fixed deletion calculation to use files_to_delete = len(folder_files) - file_limit for exact file count management
+129 -23
Ver fichero
@@ -1460,15 +1460,34 @@ class AMDSMIHelpers():
return "unknown"
return "UNKNOWN"
def display_cper_files_generated(self, entries, device_handle, folder):
# Onetime initialization: print warning & header only once
def display_cper_files_generated(self, entries, device_handle, folder, logger=None):
"""
Display CPER summary lines. If a logger is provided and its destination is
not stdout, append the output to that file instead of printing to stdout.
"""
use_file = (
logger is not None
and logger.is_human_readable_format()
and logger.destination != 'stdout'
)
# Onetime initialization: warning & header only once
if not getattr(self, "_cper_display_initialized", False):
# Warning if no folder was specified elsewhere
if not getattr(self, "_cper_warning_printed", False):
print(f"WARNING: No CPER files will be dumped unless --folder=<folder_name> is specified and cper entries exist.")
self._cper_warning_printed = True
warning = (
"WARNING: No CPER files will be dumped unless "
"--folder=<folder_name> is specified and cper entries exist."
)
if use_file:
with logger.destination.open('a', encoding="utf-8") as output_file:
output_file.write(warning + '\n')
else:
print(warning)
self._cper_warning_printed = True
self._print_header(folder)
# Print or log the header
self._print_header(folder, logger if use_file else None)
self._cper_display_initialized = True
# Loop through all entries in the dictionary.
@@ -1476,27 +1495,48 @@ class AMDSMIHelpers():
# Assume 'entry' is a dictionary with keys: "error_severity" and "notify_type".
timestamp = entry.get("timestamp", "unknown")
gpu_id = self.get_gpu_id_from_device_handle(device_handle)
prefix = self._severity_as_string(entry.get("error_severity", "Unknown"),
entry.get("notify_type", "Unknown"),
False)
prefix = self._severity_as_string(
entry.get("error_severity", "Unknown"),
entry.get("notify_type", "Unknown"),
False
)
output = f"{timestamp:<20} {gpu_id:<7} {prefix:<20}"
if folder:
prefix = self._severity_as_string(entry.get("error_severity", "Unknown"),
entry.get("notify_type", "Unknown"),
True)
cper_data_file = f"{prefix}_{self.get_cper_count() + 1}.cper"
prefix_for_filename = self._severity_as_string(
entry.get("error_severity", "Unknown"),
entry.get("notify_type", "Unknown"),
True
)
cper_data_file = f"{prefix_for_filename}_{self.get_cper_count() + 1}.cper"
afids = self.pvtDumpAfids(cper_data_file)
afids_str = ' '.join(map(str, afids))
output += f" {cper_data_file:<17} {afids_str}"
print(output)
if use_file:
with logger.destination.open('a', encoding="utf-8") as output_file:
output_file.write(output + '\n')
else:
print(output)
self.increment_cper_count()
def _print_header(self, folder):
print(f"{'timestamp':<20} {'gpu_id':<7} {'severity':<20}", end="")
def _print_header(self, folder, logger=None):
header = f"{'timestamp':<20} {'gpu_id':<7} {'severity':<20}"
if folder:
print(f" {'file_name':<17} {'list of afids'}", end="")
print("")
header += f" {'file_name':<17} {'list of afids'}"
use_file = (
logger is not None
and logger.is_human_readable_format()
and logger.destination != 'stdout'
)
if use_file:
with logger.destination.open('a', encoding="utf-8") as output_file:
output_file.write(header + '\n')
else:
print(header)
def dump_cper_entries(self, folder, entries, cper_data, device_handle, file_limit=None):
"""
@@ -1747,6 +1787,15 @@ class AMDSMIHelpers():
buffer_size = 1048576
# Decide where to send human-readable output
dest = getattr(logger, "destination", "stdout") if logger is not None else "stdout"
log_to_file = dest != 'stdout'
if log_to_file:
# destination is usually a Path; fall back to Path(string) if needed
log_path = dest if isinstance(dest, Path) else Path(dest)
else:
log_path = None
gpu_id = self.get_gpu_id_from_device_handle(device_handle)
if args.follow and not getattr(self, "_cper_follow_prompted", False):
print("Press CTRL + C to stop.")
@@ -1780,18 +1829,75 @@ class AMDSMIHelpers():
else:
logging.debug(f"Cannot retrieve CPER entries: {e}")
break
args.cursor[gpu_idx] = new_cursor
if len(entries) == 0:
break
if args.folder:
self.dump_cper_entries(args.folder, entries, cper_data, device_handle, args.file_limit)
# When a file destination is set, temporarily redirect stdout
# so that helper print() calls go into that file.
if log_to_file and log_path is not None:
orig_stdout = sys.stdout
try:
try:
log_path.parent.mkdir(parents=True, exist_ok=True)
except Exception:
pass
with log_path.open('a', encoding='utf-8') as f:
sys.stdout = f
if args.folder:
self.dump_cper_entries(
args.folder, entries, cper_data, device_handle, args.file_limit
)
else:
self.display_cper_files_generated(
entries, device_handle, args.folder
)
finally:
sys.stdout = orig_stdout
else:
self.display_cper_files_generated(entries, device_handle, args.folder)
if args.folder:
self.dump_cper_entries(
args.folder, entries, cper_data, device_handle, args.file_limit
)
else:
self.display_cper_files_generated(
entries, device_handle, args.folder
)
if num_entries == 0 and not args.follow:
if args.folder:
self.dump_cper_entries(args.folder, entries, cper_data, device_handle, args.file_limit)
# If nothing was found, still emit the warning/header logic
# using the same redirection logic.
if log_to_file and log_path is not None:
orig_stdout = sys.stdout
try:
try:
log_path.parent.mkdir(parents=True, exist_ok=True)
except Exception:
pass
with log_path.open('a', encoding='utf-8') as f:
sys.stdout = f
if args.folder:
self.dump_cper_entries(
args.folder, entries, cper_data, device_handle, args.file_limit
)
else:
self.display_cper_files_generated(
entries, device_handle, args.folder
)
finally:
sys.stdout = orig_stdout
else:
self.display_cper_files_generated(entries, device_handle, args.folder)
if args.folder:
self.dump_cper_entries(
args.folder, entries, cper_data, device_handle, args.file_limit
)
else:
self.display_cper_files_generated(
entries, device_handle, args.folder
)
def get_bitmask_ranges(self, bitmask_dict):
ranges = {}