[SWDEV-528364] CPER CLI --follow fix (#298)

Signed-off-by: Yazen ALMusaffar <yalmusaf@amd.com>

[ROCm/amdsmi commit: f1f782312d]
Этот коммит содержится в:
AL Musaffar, Yazen
2025-04-22 22:52:03 -05:00
коммит произвёл GitHub
родитель 36a8775ddd
Коммит 5c59f20f22
2 изменённых файлов: 287 добавлений и 21 удалений
+33 -18
Просмотреть файл
@@ -6320,26 +6320,28 @@ class AMDSMICommands():
elif sev in ("nonfatal-corrected", "corrected"):
# Set bit corresponding to AMDSMI_CPER_SEV_NON_FATAL_CORRECTED (which is 2)
severity_mask |= (1 << 2)
if args.cper:
# Start from cursor 0 (no timestamp argument provided).
cursor = 0
buffer_size = 1048576
file_limit = int(args.file_limit) if args.file_limit else 1000
# Print exit message only once and only when follow is set
if self.logger.cper_exit_message() and args.follow:
print('Press q and hit ENTER when you want to stop.')
self.logger.set_cper_exit_message(False)
# Main loop: continuously retrieve CPER entries if --follow is set.
gpu_id = self.helpers.get_gpu_id_from_device_handle(args.gpu)
if args.folder:
print(f'Dumping CPER file header entries for GPU {gpu_id} in folder {args.folder}\n')
else:
print(f'Dumping CPER file header entries for GPU {gpu_id}:\n')
# Print header only when dumping to a folder
if args.follow and not getattr(self, "_cper_follow_prompted", False):
print("Press CTRL + C to stop.")
self._cper_follow_prompted = True
if args.folder and args.gpu:
print(f"Dumping CPER file header entries for GPU {gpu_id} in folder {args.folder}")
elif args.folder:
print(f"Dumping CPER file header entries in folder {args.folder}")
self.logger.set_cper_exit_message(False)
self.stop = False
while True:
try:
entries, new_cursor, cper_data = amdsmi_interface.amdsmi_get_gpu_cper_entries(
@@ -6356,17 +6358,30 @@ class AMDSMICommands():
else:
logging.debug(f"Error retrieving CPER entries: {e}")
break
if entries:
self.helpers.dump_entries(args.folder, entries, cper_data)
# Dump or display
if args.folder:
if args.gpu and not args.follow:
self.helpers.dump_gpu_entries(args.folder, entries, cper_data, args.gpu)
break
elif not args.gpu and not args.follow:
self.helpers.dump_all_entries(args.folder, entries, cper_data, args.gpu)
break
elif args.follow and args.gpu:
self.helpers.dump_gpu_entries_follow(args.folder, entries, cper_data, args.gpu)
break
elif args.follow and not args.gpu:
self.helpers.dump_all_entries_follow(args.folder, entries, cper_data, args.gpu)
break
if args.follow:
self.helpers.display_cper_files_generated_follow(entries, args.gpu)
break
else:
self.helpers.display_cper_files_generated(entries, args.gpu)
break
if len(entries) == 0 or not args.follow:
break
cursor = new_cursor
time.sleep(5)
user_input = input()
if user_input == 'q':
print("Escape Sequence Detected; Exiting")
self.stop = True
break
def _event_thread(self, commands, i):
+254 -3
Просмотреть файл
@@ -1081,7 +1081,6 @@ class AMDSMIHelpers():
print(msg)
logging.warning(msg)
def write_binary(self, data, size, filepath):
"""
Writes binary data directly to a file.
@@ -1103,8 +1102,93 @@ class AMDSMIHelpers():
data_bytes = data[:size]
f.write(data_bytes)
def display_cper_files_generated_follow(self, entries, device_handle, delay=1):
    """Print one table row per CPER entry, pausing after each row.

    No files are written by this method; each row only lists a
    ``<prefix>_<count>.cper`` name built from the running CPER counter.
    The first call on an instance also prints a warning that nothing is
    dumped without ``--folder`` and the column header; both are guarded by
    flags stored on ``self`` so they are not repeated on later calls.

    Args:
        entries: Mapping whose values are dicts. The keys read from each
            value are ``error_severity``, ``notify_type`` and ``timestamp``.
        device_handle: Passed to ``self.get_gpu_id_from_device_handle`` to
            obtain the value shown in the ``gpu_id`` column.
        delay: Seconds to sleep after each printed row. Defaults to 1, the
            value that used to be hard-coded.
    """
    # One-time initialization: warning and header are printed at most once.
    if not getattr(self, "_cper_display_initialized", False):
        if not getattr(self, "_cper_warning_printed", False):
            YELLOW = "\033[33m"
            RED = "\033[31m"
            RESET = "\033[0m"
            print(f"{YELLOW}WARNING:{RESET} {RED}No{RESET} cper files will be dumped unless --folder=<folder_name> is specified.")
            self._cper_warning_printed = True
        print(f"{'timestamp':<20} {'gpu_id':<6} {'severity':<10} {'file_name'}")
        self._cper_display_initialized = True

    if not entries:
        return

    severity_prefixes = {
        "non_fatal_uncorrected": "uncorrected",
        "non_fatal_corrected": "corrected",
        "fatal": "fatal",
    }
    # The handle does not change between rows, so resolve the id once.
    gpu_id = self.get_gpu_id_from_device_handle(device_handle)

    for entry in entries.values():
        # An unrecognized severity falls back to "unknown" instead of leaving
        # ``prefix`` unbound (or silently reusing the previous row's value).
        prefix = severity_prefixes.get(entry.get("error_severity", "Unknown"), "unknown")
        # A BOOT notification overrides whatever the severity selected.
        if entry.get("notify_type", "Unknown") == "BOOT":
            prefix = "boot"

        cper_data_file = f"{prefix}_{self.get_cper_count()}.cper"
        timestamp = entry.get("timestamp", "unknown")
        print(f"{timestamp:<20} {gpu_id:<6} {prefix:<10} {cper_data_file}")
        self.increment_cper_count()
        time.sleep(delay)
def display_cper_files_generated(self, entries, device_handle):
    """Print one table row per CPER entry without writing any files.

    Each row lists a ``<prefix>_<count>.cper`` name built from the running
    CPER counter, which is advanced once per entry. The first call on an
    instance also prints a warning that nothing is dumped without
    ``--folder`` and the column header; both are guarded by flags stored on
    ``self`` so they are not repeated on later calls.

    Args:
        entries: Mapping whose values are dicts. The keys read from each
            value are ``error_severity``, ``notify_type`` and ``timestamp``.
        device_handle: Passed to ``self.get_gpu_id_from_device_handle`` to
            obtain the value shown in the ``gpu_id`` column.
    """
    # One-time initialization: warning and header are printed at most once.
    if not getattr(self, "_cper_display_initialized", False):
        if not getattr(self, "_cper_warning_printed", False):
            YELLOW = "\033[33m"
            RED = "\033[31m"
            RESET = "\033[0m"
            print(f"{YELLOW}WARNING:{RESET} {RED}No{RESET} cper files will be dumped unless --folder=<folder_name> is specified.")
            self._cper_warning_printed = True
        print(f"{'timestamp':<20} {'gpu_id':<6} {'severity':<10} {'file_name'}")
        self._cper_display_initialized = True

    if not entries:
        return

    severity_prefixes = {
        "non_fatal_uncorrected": "uncorrected",
        "non_fatal_corrected": "corrected",
        "fatal": "fatal",
    }
    # The handle does not change between rows, so resolve the id once.
    gpu_id = self.get_gpu_id_from_device_handle(device_handle)

    for entry in entries.values():
        # An unrecognized severity falls back to "unknown" instead of leaving
        # ``prefix`` unbound (or silently reusing the previous row's value).
        prefix = severity_prefixes.get(entry.get("error_severity", "Unknown"), "unknown")
        # A BOOT notification overrides whatever the severity selected.
        if entry.get("notify_type", "Unknown") == "BOOT":
            prefix = "boot"

        cper_data_file = f"{prefix}_{self.get_cper_count()}.cper"
        timestamp = entry.get("timestamp", "unknown")
        print(f"{timestamp:<20} {gpu_id:<6} {prefix:<10} {cper_data_file}")
        self.increment_cper_count()
def dump_gpu_entries(self, folder, entries, cper_data, device_handle):
# Header
print(f"{'timestamp':<20} {'gpu_id':<6} {'severity':<10} {'file_name'}")
self._cper_display_initialized = True
def dump_entries(self, folder, entries, cper_data):
if folder:
folder = Path(folder)
folder.mkdir(parents=True, exist_ok=True) # Ensure folder exists
@@ -1132,6 +1216,174 @@ class AMDSMIHelpers():
cper_data_file_path = folder / cper_data_file
self.write_binary(cper_data[entry_index]["bytes"], cper_data[entry_index]["size"], cper_data_file_path)
#print header
timestamp = entry.get("timestamp", "unknown")
gpu_id = self.get_gpu_id_from_device_handle(device_handle)
print(f"{timestamp:<20} {gpu_id:<6} {prefix:<10} {cper_data_file}")
self.increment_cper_count()
try:
with output_path.open("w") as f:
logging.debug(f"Writing entry {self.get_cper_count()}: {entry} to {output_path}")
# Dump the single entry as JSON, handling bytes via the lambda.
f.write(json.dumps(entry, indent=2,
default=lambda o: o.decode('utf-8') if isinstance(o, bytes) else o))
except Exception as e:
logging.error(f"Failed to write entry {self.get_cper_count()} to {output_path}: {e}")
else:
print(json.dumps(entries, indent=2,
default=lambda o: o.decode('utf-8') if isinstance(o, bytes) else o))
def dump_all_entries(self, folder, entries, cper_data, device_handle):
    """Write every CPER entry to ``folder`` and print one table row per entry.

    The column header is always printed first. When ``folder`` is falsy
    nothing is written and ``entries`` is printed as indented JSON instead.
    Otherwise the folder is created if needed and, for each entry, a
    ``<prefix>_<count>.cper`` binary file and a ``<prefix>_<count>.json``
    file are written, where ``<count>`` is the running CPER counter.

    Args:
        folder: Output directory path, or a falsy value to print JSON only.
        entries: Mapping whose values are dicts. The keys read from each
            value are ``error_severity``, ``notify_type`` and ``timestamp``.
        cper_data: Sequence indexed by entry position; each item provides
            ``"bytes"`` and ``"size"``, which are handed to
            ``self.write_binary``.
        device_handle: Passed to ``self.get_gpu_id_from_device_handle`` to
            obtain the value shown in the ``gpu_id`` column.
    """
    def _bytes_to_text(o):
        # json.dumps fallback: decode bytes, hand anything else back as-is.
        return o.decode('utf-8') if isinstance(o, bytes) else o

    # Header
    print(f"{'timestamp':<20} {'gpu_id':<6} {'severity':<10} {'file_name'}")
    self._cper_display_initialized = True

    if not folder:
        print(json.dumps(entries, indent=2, default=_bytes_to_text))
        return

    out_dir = Path(folder)
    out_dir.mkdir(parents=True, exist_ok=True)  # Ensure folder exists
    if not entries:
        return

    severity_prefixes = {
        "non_fatal_uncorrected": "uncorrected",
        "non_fatal_corrected": "corrected",
        "fatal": "fatal",
    }
    # The handle does not change between rows, so resolve the id once.
    gpu_id = self.get_gpu_id_from_device_handle(device_handle)

    for entry_index, entry in enumerate(entries.values()):
        # An unrecognized severity falls back to "unknown" instead of leaving
        # ``prefix`` unbound (or silently reusing the previous entry's value).
        prefix = severity_prefixes.get(entry.get("error_severity", "Unknown"), "unknown")
        # A BOOT notification overrides whatever the severity selected.
        if entry.get("notify_type", "Unknown") == "BOOT":
            prefix = "boot"

        # Capture the counter once so both file names and the log messages
        # refer to the same entry number even after the increment below.
        count = self.get_cper_count()
        output_path = out_dir / f"{prefix}_{count}.json"
        cper_data_file = f"{prefix}_{count}.cper"
        raw = cper_data[entry_index]
        self.write_binary(raw["bytes"], raw["size"], out_dir / cper_data_file)

        # Table row for this entry.
        timestamp = entry.get("timestamp", "unknown")
        print(f"{timestamp:<20} {gpu_id:<6} {prefix:<10} {cper_data_file}")
        self.increment_cper_count()

        # Best effort: a failed JSON write is logged and the loop continues.
        try:
            with output_path.open("w") as f:
                logging.debug("Writing entry %s: %s to %s", count, entry, output_path)
                f.write(json.dumps(entry, indent=2, default=_bytes_to_text))
        except Exception as e:
            logging.error("Failed to write entry %s to %s: %s", count, output_path, e)
def dump_all_entries_follow(self, folder, entries, cper_data, device_handle, delay=1):
    """Write every CPER entry to ``folder``, pausing after each printed row.

    The column header is always printed first. When ``folder`` is falsy
    nothing is written and ``entries`` is printed as indented JSON instead.
    Otherwise the folder is created if needed and, for each entry, a
    ``<prefix>_<count>.cper`` binary file and a ``<prefix>_<count>.json``
    file are written, where ``<count>`` is the running CPER counter.

    Args:
        folder: Output directory path, or a falsy value to print JSON only.
        entries: Mapping whose values are dicts. The keys read from each
            value are ``error_severity``, ``notify_type`` and ``timestamp``.
        cper_data: Sequence indexed by entry position; each item provides
            ``"bytes"`` and ``"size"``, which are handed to
            ``self.write_binary``.
        device_handle: Passed to ``self.get_gpu_id_from_device_handle`` to
            obtain the value shown in the ``gpu_id`` column.
        delay: Seconds to sleep after each printed row. Defaults to 1, the
            value that used to be hard-coded.
    """
    def _bytes_to_text(o):
        # json.dumps fallback: decode bytes, hand anything else back as-is.
        return o.decode('utf-8') if isinstance(o, bytes) else o

    # Header
    print(f"{'timestamp':<20} {'gpu_id':<6} {'severity':<10} {'file_name'}")
    self._cper_display_initialized = True

    if not folder:
        print(json.dumps(entries, indent=2, default=_bytes_to_text))
        return

    out_dir = Path(folder)
    out_dir.mkdir(parents=True, exist_ok=True)  # Ensure folder exists
    if not entries:
        return

    severity_prefixes = {
        "non_fatal_uncorrected": "uncorrected",
        "non_fatal_corrected": "corrected",
        "fatal": "fatal",
    }
    # The handle does not change between rows, so resolve the id once.
    gpu_id = self.get_gpu_id_from_device_handle(device_handle)

    for entry_index, entry in enumerate(entries.values()):
        # An unrecognized severity falls back to "unknown" instead of leaving
        # ``prefix`` unbound (or silently reusing the previous entry's value).
        prefix = severity_prefixes.get(entry.get("error_severity", "Unknown"), "unknown")
        # A BOOT notification overrides whatever the severity selected.
        if entry.get("notify_type", "Unknown") == "BOOT":
            prefix = "boot"

        # Capture the counter once so both file names and the log messages
        # refer to the same entry number even after the increment below.
        count = self.get_cper_count()
        output_path = out_dir / f"{prefix}_{count}.json"
        cper_data_file = f"{prefix}_{count}.cper"
        raw = cper_data[entry_index]
        self.write_binary(raw["bytes"], raw["size"], out_dir / cper_data_file)

        # Table row for this entry.
        timestamp = entry.get("timestamp", "unknown")
        print(f"{timestamp:<20} {gpu_id:<6} {prefix:<10} {cper_data_file}")
        self.increment_cper_count()
        time.sleep(delay)

        # Best effort: a failed JSON write is logged and the loop continues.
        try:
            with output_path.open("w") as f:
                logging.debug("Writing entry %s: %s to %s", count, entry, output_path)
                f.write(json.dumps(entry, indent=2, default=_bytes_to_text))
        except Exception as e:
            logging.error("Failed to write entry %s to %s: %s", count, output_path, e)
def dump_gpu_entries_follow(self, folder, entries, cper_data, device_handle):
# Header
print(f"{'timestamp':<20} {'gpu_id':<6} {'severity':<10} {'file_name'}")
self._cper_display_initialized = True
if folder:
folder = Path(folder)
folder.mkdir(parents=True, exist_ok=True) # Ensure folder exists
# Loop through all entries in the dictionary.
for entry_index, entry in enumerate(entries.values()):
# Assume 'entry' is a dictionary with keys: "error_severity" and "notify_type".
error_severity = entry.get("error_severity", "Unknown")
notify_type = entry.get("notify_type", "Unknown")
if error_severity == "non_fatal_uncorrected":
prefix = "uncorrected"
elif error_severity == "non_fatal_corrected":
prefix = "corrected"
elif error_severity == "fatal":
prefix = "fatal"
if notify_type == "BOOT":
prefix = "boot"
# Construct a unique file name using the key to avoid overwriting
entry_file = f"{prefix}_{self.get_cper_count()}.json"
output_path = folder / entry_file
cper_data_file = f"{prefix}_{self.get_cper_count()}.cper"
cper_data_file_path = folder / cper_data_file
self.write_binary(cper_data[entry_index]["bytes"], cper_data[entry_index]["size"], cper_data_file_path)
#print header
timestamp = entry.get("timestamp", "unknown")
gpu_id = self.get_gpu_id_from_device_handle(device_handle)
print(f"{timestamp:<20} {gpu_id:<6} {prefix:<10} {cper_data_file}")
self.increment_cper_count()
time.sleep(1)
try:
with output_path.open("w") as f:
logging.debug(f"Writing entry {self.get_cper_count()}: {entry} to {output_path}")
@@ -1143,4 +1395,3 @@ class AMDSMIHelpers():
else:
print(json.dumps(entries, indent=2,
default=lambda o: o.decode('utf-8') if isinstance(o, bytes) else o))
self.increment_cper_count()