diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py index 807d243a11..da234a8302 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py @@ -6320,26 +6320,28 @@ class AMDSMICommands(): elif sev in ("nonfatal-corrected", "corrected"): # Set bit corresponding to AMDSMI_CPER_SEV_NON_FATAL_CORRECTED (which is 2) severity_mask |= (1 << 2) - + if args.cper: # Start from cursor 0 (no timestamp argument provided). cursor = 0 buffer_size = 1048576 file_limit = int(args.file_limit) if args.file_limit else 1000 - - # Print exit message only once and only when follow is set - if self.logger.cper_exit_message() and args.follow: - print('Press q and hit ENTER when you want to stop.') - self.logger.set_cper_exit_message(False) - + # Main loop: continuously retrieve CPER entries if --follow is set. gpu_id = self.helpers.get_gpu_id_from_device_handle(args.gpu) - if args.folder: - print(f'Dumping CPER file header entries for GPU {gpu_id} in folder {args.folder}\n') - else: - print(f'Dumping CPER file header entries for GPU {gpu_id}:\n') + # Print header only when dumping to a folder + if args.follow and not getattr(self, "_cper_follow_prompted", False): + print("Press CTRL + C to stop.") + self._cper_follow_prompted = True + if args.folder and args.gpu: + print(f"Dumping CPER file header entries for GPU {gpu_id} in folder {args.folder}") + elif args.folder: + print(f"Dumping CPER file header entries in folder {args.folder}") + + self.logger.set_cper_exit_message(False) self.stop = False + while True: try: entries, new_cursor, cper_data = amdsmi_interface.amdsmi_get_gpu_cper_entries( @@ -6356,17 +6358,30 @@ class AMDSMICommands(): else: logging.debug(f"Error retrieving CPER entries: {e}") break - if entries: - self.helpers.dump_entries(args.folder, entries, cper_data) + # Dump or display + if args.folder: + if args.gpu and not args.follow: + self.helpers.dump_gpu_entries(args.folder, entries, cper_data, args.gpu) + break + elif not args.gpu and not args.follow: + self.helpers.dump_all_entries(args.folder, entries, cper_data, args.gpu) + break + elif args.follow and args.gpu: + self.helpers.dump_gpu_entries_follow(args.folder, entries, cper_data, args.gpu) + break + elif args.follow and not args.gpu: + self.helpers.dump_all_entries_follow(args.folder, entries, cper_data, args.gpu) + break + if args.follow: + self.helpers.display_cper_files_generated_follow(entries, args.gpu) + break + else: + self.helpers.display_cper_files_generated(entries, args.gpu) + break if len(entries) == 0 or not args.follow: break cursor = new_cursor time.sleep(5) - user_input = input() - if user_input == 'q': - print("Escape Sequence Detected; Exiting") - self.stop = True - break def _event_thread(self, commands, i): diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py b/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py index 94a5b9903d..bc2d909025 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py @@ -1081,7 +1081,6 @@ class AMDSMIHelpers(): print(msg) logging.warning(msg) - def write_binary(self, data, size, filepath): """ Writes binary data directly to a file. @@ -1103,8 +1102,93 @@ class AMDSMIHelpers(): data_bytes = data[:size] f.write(data_bytes) + def display_cper_files_generated_follow(self, entries, device_handle): + device_handles = amdsmi_interface.amdsmi_get_processor_handles() + # One‐time initialization: print warning & header only once + if not getattr(self, "_cper_display_initialized", False): + # Warning if no folder was specified elsewhere + if not getattr(self, "_cper_warning_printed", False): + YELLOW = "\033[33m" + RED = "\033[31m" + RESET = "\033[0m" + print(f"{YELLOW}WARNING:{RESET} {RED}No{RESET} cper files will be dumped unless --folder= is specified.") + self._cper_warning_printed = True + + # Header + print(f"{'timestamp':<20} {'gpu_id':<6} {'severity':<10} {'file_name'}") + self._cper_display_initialized = True + + for entry_index, entry in enumerate(entries.values()): + # Assume 'entry' is a dictionary with keys: "error_severity" and "notify_type". + error_severity = entry.get("error_severity", "Unknown") + notify_type = entry.get("notify_type", "Unknown") + + if error_severity == "non_fatal_uncorrected": + prefix = "uncorrected" + elif error_severity == "non_fatal_corrected": + prefix = "corrected" + elif error_severity == "fatal": + prefix = "fatal" + if notify_type == "BOOT": + prefix = "boot" + + entry_file = f"{prefix}_{self.get_cper_count()}.json" + cper_data_file = f"{prefix}_{self.get_cper_count()}.cper" + + timestamp = entry.get("timestamp", "unknown") + gpu_id = self.get_gpu_id_from_device_handle(device_handle) + print(f"{timestamp:<20} {gpu_id:<6} {prefix:<10} {cper_data_file}") + self.increment_cper_count() + time.sleep(1) + + + def display_cper_files_generated(self, entries, device_handle): + device_handles = amdsmi_interface.amdsmi_get_processor_handles() + # One‐time initialization: print warning & header only once + if not getattr(self, "_cper_display_initialized", False): + # Warning if no folder was specified elsewhere + if not getattr(self, "_cper_warning_printed", False): + YELLOW = "\033[33m" + RED = "\033[31m" + RESET = "\033[0m" + print(f"{YELLOW}WARNING:{RESET} {RED}No{RESET} cper files will be dumped unless --folder= is specified.") + self._cper_warning_printed = True + + # Header + print(f"{'timestamp':<20} {'gpu_id':<6} {'severity':<10} {'file_name'}") + self._cper_display_initialized = True + + # Loop through all entries in the dictionary. + for entry_index, entry in enumerate(entries.values()): + + # Assume 'entry' is a dictionary with keys: "error_severity" and "notify_type". + error_severity = entry.get("error_severity", "Unknown") + notify_type = entry.get("notify_type", "Unknown") + + if error_severity == "non_fatal_uncorrected": + prefix = "uncorrected" + elif error_severity == "non_fatal_corrected": + prefix = "corrected" + elif error_severity == "fatal": + prefix = "fatal" + if notify_type == "BOOT": + prefix = "boot" + + entry_file = f"{prefix}_{self.get_cper_count()}.json" + cper_data_file = f"{prefix}_{self.get_cper_count()}.cper" + + timestamp = entry.get("timestamp", "unknown") + gpu_id = self.get_gpu_id_from_device_handle(device_handle) + print(f"{timestamp:<20} {gpu_id:<6} {prefix:<10} {cper_data_file}") + self.increment_cper_count() + + + def dump_gpu_entries(self, folder, entries, cper_data, device_handle): + # Header + print(f"{'timestamp':<20} {'gpu_id':<6} {'severity':<10} {'file_name'}") + self._cper_display_initialized = True + - def dump_entries(self, folder, entries, cper_data): if folder: folder = Path(folder) folder.mkdir(parents=True, exist_ok=True) # Ensure folder exists @@ -1132,6 +1216,174 @@ class AMDSMIHelpers(): cper_data_file_path = folder / cper_data_file self.write_binary(cper_data[entry_index]["bytes"], cper_data[entry_index]["size"], cper_data_file_path) + #print header + timestamp = entry.get("timestamp", "unknown") + gpu_id = self.get_gpu_id_from_device_handle(device_handle) + print(f"{timestamp:<20} {gpu_id:<6} {prefix:<10} {cper_data_file}") + self.increment_cper_count() + + + try: + with output_path.open("w") as f: + logging.debug(f"Writing entry {self.get_cper_count()}: {entry} to {output_path}") + # Dump the single entry as JSON, handling bytes via the lambda. + f.write(json.dumps(entry, indent=2, + default=lambda o: o.decode('utf-8') if isinstance(o, bytes) else o)) + + + + except Exception as e: + logging.error(f"Failed to write entry {self.get_cper_count()} to {output_path}: {e}") + else: + print(json.dumps(entries, indent=2, + default=lambda o: o.decode('utf-8') if isinstance(o, bytes) else o)) + + + def dump_all_entries(self, folder, entries, cper_data, device_handle): + # Header + print(f"{'timestamp':<20} {'gpu_id':<6} {'severity':<10} {'file_name'}") + self._cper_display_initialized = True + + + if folder: + folder = Path(folder) + folder.mkdir(parents=True, exist_ok=True) # Ensure folder exists + + # Loop through all entries in the dictionary. + for entry_index, entry in enumerate(entries.values()): + # Assume 'entry' is a dictionary with keys: "error_severity" and "notify_type". + error_severity = entry.get("error_severity", "Unknown") + notify_type = entry.get("notify_type", "Unknown") + + if error_severity == "non_fatal_uncorrected": + prefix = "uncorrected" + elif error_severity == "non_fatal_corrected": + prefix = "corrected" + elif error_severity == "fatal": + prefix = "fatal" + if notify_type == "BOOT": + prefix = "boot" + + # Construct a unique file name using the key to avoid overwriting + entry_file = f"{prefix}_{self.get_cper_count()}.json" + output_path = folder / entry_file + + cper_data_file = f"{prefix}_{self.get_cper_count()}.cper" + cper_data_file_path = folder / cper_data_file + self.write_binary(cper_data[entry_index]["bytes"], cper_data[entry_index]["size"], cper_data_file_path) + + #print header + timestamp = entry.get("timestamp", "unknown") + gpu_id = self.get_gpu_id_from_device_handle(device_handle) + print(f"{timestamp:<20} {gpu_id:<6} {prefix:<10} {cper_data_file}") + self.increment_cper_count() + + try: + with output_path.open("w") as f: + logging.debug(f"Writing entry {self.get_cper_count()}: {entry} to {output_path}") + # Dump the single entry as JSON, handling bytes via the lambda. + f.write(json.dumps(entry, indent=2, + default=lambda o: o.decode('utf-8') if isinstance(o, bytes) else o)) + except Exception as e: + logging.error(f"Failed to write entry {self.get_cper_count()} to {output_path}: {e}") + else: + print(json.dumps(entries, indent=2, + default=lambda o: o.decode('utf-8') if isinstance(o, bytes) else o)) + + + def dump_all_entries_follow(self, folder, entries, cper_data, device_handle): + # Header + print(f"{'timestamp':<20} {'gpu_id':<6} {'severity':<10} {'file_name'}") + self._cper_display_initialized = True + + + if folder: + folder = Path(folder) + folder.mkdir(parents=True, exist_ok=True) # Ensure folder exists + + # Loop through all entries in the dictionary. + for entry_index, entry in enumerate(entries.values()): + # Assume 'entry' is a dictionary with keys: "error_severity" and "notify_type". + error_severity = entry.get("error_severity", "Unknown") + notify_type = entry.get("notify_type", "Unknown") + + if error_severity == "non_fatal_uncorrected": + prefix = "uncorrected" + elif error_severity == "non_fatal_corrected": + prefix = "corrected" + elif error_severity == "fatal": + prefix = "fatal" + if notify_type == "BOOT": + prefix = "boot" + + # Construct a unique file name using the key to avoid overwriting + entry_file = f"{prefix}_{self.get_cper_count()}.json" + output_path = folder / entry_file + + cper_data_file = f"{prefix}_{self.get_cper_count()}.cper" + cper_data_file_path = folder / cper_data_file + self.write_binary(cper_data[entry_index]["bytes"], cper_data[entry_index]["size"], cper_data_file_path) + + #print header + timestamp = entry.get("timestamp", "unknown") + gpu_id = self.get_gpu_id_from_device_handle(device_handle) + print(f"{timestamp:<20} {gpu_id:<6} {prefix:<10} {cper_data_file}") + self.increment_cper_count() + time.sleep(1) + + try: + with output_path.open("w") as f: + logging.debug(f"Writing entry {self.get_cper_count()}: {entry} to {output_path}") + # Dump the single entry as JSON, handling bytes via the lambda. + f.write(json.dumps(entry, indent=2, + default=lambda o: o.decode('utf-8') if isinstance(o, bytes) else o)) + except Exception as e: + logging.error(f"Failed to write entry {self.get_cper_count()} to {output_path}: {e}") + else: + print(json.dumps(entries, indent=2, + default=lambda o: o.decode('utf-8') if isinstance(o, bytes) else o)) + + + def dump_gpu_entries_follow(self, folder, entries, cper_data, device_handle): + # Header + print(f"{'timestamp':<20} {'gpu_id':<6} {'severity':<10} {'file_name'}") + self._cper_display_initialized = True + + + if folder: + folder = Path(folder) + folder.mkdir(parents=True, exist_ok=True) # Ensure folder exists + + # Loop through all entries in the dictionary. + for entry_index, entry in enumerate(entries.values()): + # Assume 'entry' is a dictionary with keys: "error_severity" and "notify_type". + error_severity = entry.get("error_severity", "Unknown") + notify_type = entry.get("notify_type", "Unknown") + + if error_severity == "non_fatal_uncorrected": + prefix = "uncorrected" + elif error_severity == "non_fatal_corrected": + prefix = "corrected" + elif error_severity == "fatal": + prefix = "fatal" + if notify_type == "BOOT": + prefix = "boot" + + # Construct a unique file name using the key to avoid overwriting + entry_file = f"{prefix}_{self.get_cper_count()}.json" + output_path = folder / entry_file + + cper_data_file = f"{prefix}_{self.get_cper_count()}.cper" + cper_data_file_path = folder / cper_data_file + self.write_binary(cper_data[entry_index]["bytes"], cper_data[entry_index]["size"], cper_data_file_path) + + #print header + timestamp = entry.get("timestamp", "unknown") + gpu_id = self.get_gpu_id_from_device_handle(device_handle) + print(f"{timestamp:<20} {gpu_id:<6} {prefix:<10} {cper_data_file}") + self.increment_cper_count() + time.sleep(1) + try: with output_path.open("w") as f: logging.debug(f"Writing entry {self.get_cper_count()}: {entry} to {output_path}") @@ -1143,4 +1395,3 @@ class AMDSMIHelpers(): else: print(json.dumps(entries, indent=2, default=lambda o: o.decode('utf-8') if isinstance(o, bytes) else o)) - self.increment_cper_count()