[SWDEV-558349] Fix for cper record count mismatch with --file-limit

Change-Id: I4fdcc0fb1153e47c195062e7bdf71c0362723ef6


[ROCm/amdsmi commit: c4cad504be]
Bu işleme şunda yer alıyor:
yalmusaf_amdeng
2025-10-07 15:58:03 -05:00
işlemeyi yapan: Arif, Maisam
ebeveyn a93b9d473d
işleme 25a6ac3585
2 değiştirilmiş dosya ile 21 ekleme ve 17 silme
+3
Dosyayı Görüntüle
@@ -240,6 +240,9 @@ GPU: 0
### Resolved Issues
- **Fixed a CPER record count mismatch issue when using `amd-smi ras --cper --file-limit`**.
- Fixed the deletion calculation to use `files_to_delete = len(folder_files) - file_limit` for exact file count management
- **Fixed event monitoring segfaults causing RDC to crash**.
- Added mutex locking around access to the device event notification file pointer
+18 -17
Dosyayı Görüntüle
@@ -1244,31 +1244,18 @@ class AMDSMIHelpers():
output_rows = {}
for entry_index, entry in enumerate(entries.values()):
# Batch deletion if file limit is exceeded
if file_limit:
folder_files = list(sorted(folder.glob("*.cper"), key=lambda p: p.stat().st_mtime))
if file_limit < len(folder_files):
for old_file in folder_files[:len(folder_files) - file_limit]:
try:
old_file.unlink()
json_file = old_file.with_suffix('.json')
if json_file.exists():
json_file.unlink()
except OSError as e:
logging.debug(f"Failed to delete file {old_file}: {e}")
# Determine prefix/severity
error_severity = entry.get("error_severity", "").lower()
notify_type = entry.get("notify_type", "")
prefix = self._severity_as_string(error_severity, notify_type, True)
# Generate filenames
count = self.get_cper_count() + 1
cper_name = f"{prefix}-{count}.cper"
json_name = f"{prefix}-{count}.json"
cper_path = folder / cper_name
json_path = folder / json_name
# Write CPER binary file
try:
self.write_binary(
@@ -1278,7 +1265,7 @@ class AMDSMIHelpers():
)
except Exception as e:
logging.debug(f"Failed to write CPER file {cper_path}: {e}")
# Write JSON metadata file
try:
with json_path.open("w") as cper_json_file:
@@ -1290,7 +1277,7 @@ class AMDSMIHelpers():
)
except Exception as e:
logging.debug(f"Failed to write JSON file {json_path}: {e}")
# Collect data for printing
timestamp = entry.get("timestamp", "unknown")
gpu_id = self.get_gpu_id_from_device_handle(device_handle)
@@ -1298,6 +1285,20 @@ class AMDSMIHelpers():
output_rows[cper_path] = [timestamp, gpu_id, severity, cper_name]
self.increment_cper_count()
# Batch deletion if file limit is exceeded (AFTER writing ALL new files)
if file_limit:
folder_files = list(sorted(folder.glob("*.cper"), key=lambda p: p.stat().st_mtime))
if len(folder_files) > file_limit:
files_to_delete = len(folder_files) - file_limit
for old_file in folder_files[:files_to_delete]:
try:
old_file.unlink()
json_file = old_file.with_suffix('.json')
if json_file.exists():
json_file.unlink()
except OSError as e:
logging.debug(f"Failed to delete file {old_file}: {e}")
# Print collected rows
for cper_path, row in output_rows.items():
timestamp, gpu_id, severity, fname = row