From b793acaa719d99981d31d2d4b6f8589bb726b246 Mon Sep 17 00:00:00 2001 From: "Saeed, Oosman" Date: Thu, 29 May 2025 11:59:55 -0500 Subject: [PATCH] [SWDEV-530385] Fix CPER "--follow" & "--file-limit" (#380) * --follow option fix & --file_limit option added * change --file_limit and --cper_file to --file-limit and --cper-file --------- Signed-off-by: Maisam Arif [ROCm/amdsmi commit: 91c9969b722d72476411deb35e6aa2a4cd1677d5] --- projects/amdsmi/amdsmi_cli/amdsmi_commands.py | 136 +---- projects/amdsmi/amdsmi_cli/amdsmi_helpers.py | 537 ++++++++---------- projects/amdsmi/amdsmi_cli/amdsmi_parser.py | 6 +- .../amdsmi/docs/how-to/amdsmi-cli-tool.md | 4 +- projects/amdsmi/example/CMakeLists.txt | 5 +- 5 files changed, 257 insertions(+), 431 deletions(-) mode change 100644 => 100755 projects/amdsmi/amdsmi_cli/amdsmi_helpers.py diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py index a3f13721ce..a8a300e113 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py @@ -6331,33 +6331,7 @@ class AMDSMICommands(): else: with self.logger.destination.open('a', encoding="utf-8") as output_file: output_file.write(legend_output + '\n') - - def __pvtDumpAfids(self, cper_file): - # 1) Fetch the CPER “file” and ensure we have raw bytes - raw_data = cper_file - if hasattr(raw_data, "read"): - # fetch_cper_file returned a file‐object - raw = raw_data.read() - elif isinstance(raw_data, Path): - # Path: read the bytes directly - raw = raw_data.read_bytes() - elif isinstance(raw_data, str): - # fetch_cper_file returned a filename - with open(raw_data, "rb") as f: - raw = f.read() - else: - # assume it's already bytes - raw = raw_data - size = len(raw) - self.helpers.hexdump_to_string(raw) - afids, num_afids = amdsmi_interface.amdsmi_get_afids_from_cper(raw) - print(f"AFIDS: ", end="") - for afid in afids: - print(afid, end=" ") - print("") - - - + def ras(self, args, multiple_devices=False, gpu=None, cper=None, afid=None, severity=None, folder=None, file_limit=None, cper_file=None, follow=None): @@ -6365,7 +6339,7 @@ class AMDSMICommands(): Retrieve and process CPER (RAS) entries for a target GPU. Expected command (all options only): - amd-smi ras --cper --severity=nonfatal-uncorrected,fatal --folder --file_limit=1000 --follow + amd-smi ras --cper --severity=nonfatal-uncorrected,fatal --folder --file-limit=1000 --follow Since no timestamp is provided on the command line, the function starts from a default cursor of 0. The output file name is auto-generated using the timestamp from the CPER header data (converted from @@ -6392,108 +6366,30 @@ class AMDSMICommands(): if args.gpu == None: args.gpu = self.device_handles - #Fetching AFID if args.afid and args.cper_file: - self.__pvtDumpAfids(args.cper_file) + self.helpers.pvtDumpAfids(args.cper_file) return if not self.group_check_printed: self.helpers.check_required_groups() self.group_check_printed = True - handled_multiple_gpus, device_handle = self.helpers.handle_gpus(args, self.logger, self.ras) - if handled_multiple_gpus: + if not args.cper: return - args.gpu = device_handle - # Parse severity mask dynamically from the --severity option. - severity_mask = 0 - # drop duplicates of args - logging.debug(args) - for sev in list(set(args.severity)): - if sev == "all": - # Set bits for NON_FATAL_UNCORRECTED (0), FATAL (1), and NON_FATAL_CORRECTED (2) - severity_mask |= ((1 << 0) | (1 << 1) | (1 << 2)) - elif sev == "fatal": - # Set bit corresponding to AMDSMI_CPER_SEV_FATAL (which is 1) - severity_mask |= (1 << 1) - elif sev in ("nonfatal", "nonfatal-uncorrected"): - # Set bit corresponding to AMDSMI_CPER_SEV_NON_FATAL_UNCORRECTED (which is 0) - severity_mask |= (1 << 0) - elif sev in ("nonfatal-corrected", "corrected"): - # Set bit corresponding to AMDSMI_CPER_SEV_NON_FATAL_CORRECTED (which is 2) - severity_mask |= (1 << 2) - - cursor = 0 - buffer_size = 1048576 - if args.cper: - # Start from cursor 0 (no timestamp argument provided). - file_limit = int(args.file_limit) if args.file_limit else 1000 - # Main loop: continuously retrieve CPER entries if --follow is set. - gpu_id = self.helpers.get_gpu_id_from_device_handle(args.gpu) - # Print header only when dumping to a folder - if args.follow and not getattr(self, "_cper_follow_prompted", False): - print("Press CTRL + C to stop.") - self._cper_follow_prompted = True - - partition_id = -1 - try: - kfd_info = amdsmi_interface.amdsmi_get_gpu_kfd_info(args.gpu) - kfd_id = kfd_info['kfd_id'] - node_id = kfd_info['node_id'] - partition_id = kfd_info['current_partition_id'] - except amdsmi_exception.AmdSmiLibraryException as e: - logging.debug("Failed to get kfd info for gpu %s | %s", gpu_id, e.get_error_info()) - - if partition_id != 0: - logging.debug(f"Skipping gpu {gpu_id} on non zero partition {partition_id}") - return - - if args.folder and not getattr(self, "_cper_folder_prompted", False): - print(f"Dumping CPER file header entries in folder {args.folder}") - self._cper_folder_prompted = True - - self.logger.set_cper_exit_message(False) - self.stop = False - - while True: - try: - entries, new_cursor, cper_data = amdsmi_interface.amdsmi_get_gpu_cper_entries( - args.gpu, severity_mask, buffer_size, cursor) - logging.debug(f"cper_entries | entries: {entries}") - except amdsmi_exception.AmdSmiLibraryException as e: - if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: - raise PermissionError('Error opening CPER file. This command requires elevation') from e - if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED or \ - e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_FILE_NOT_FOUND: - raise FileNotFoundError('Error accessing CPER files. This command requires CPER to be enabled.') from e - if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_FILE_ERROR: - raise FileExistsError('Error opening CPER file. Unable to read CPER File') from e - else: - logging.debug(f"Error retrieving CPER entries: {e}") - break - # Dump or display - if args.folder: - if args.gpu and not args.follow: - self.helpers.dump_gpu_entries(args.folder, entries, cper_data, args.gpu) - break - elif not args.gpu and not args.follow: - self.helpers.dump_all_entries(args.folder, entries, cper_data, args.gpu) - break - elif args.follow and args.gpu: - self.helpers.dump_gpu_entries_follow(args.folder, entries, cper_data, args.gpu) - elif args.follow and not args.gpu: - self.helpers.dump_all_entries_follow(args.folder, entries, cper_data, args.gpu) - if args.follow: - self.helpers.display_cper_files_generated_follow(entries, args.gpu) - else: - self.helpers.display_cper_files_generated(entries, args.gpu) - break - if len(entries) == 0 and not args.follow: - break - cursor = new_cursor - time.sleep(1) + if not args.gpu: + return + + if not isinstance(args.gpu, list): + args.gpu = [args.gpu] + args.cursor = [0] * len(args.gpu) + while True: + for idx, device_handle in enumerate(args.gpu): + self.helpers.ras_cper(args, device_handle, self.logger, idx) + if not args.follow: + break + time.sleep(1) def _event_thread(self, commands, i): devices = commands.device_handles diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py b/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py old mode 100644 new mode 100755 index 7136b977e8..38c321879e --- a/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py @@ -1082,6 +1082,135 @@ class AMDSMIHelpers(): print(msg) logging.warning(msg) + def display_cper_files_generated(self, entries, device_handle, folder, follow): + # One‐time initialization: print warning & header only once + if not getattr(self, "_cper_display_initialized", False): + # Warning if no folder was specified elsewhere + if not getattr(self, "_cper_warning_printed", False): + YELLOW = "\033[33m" + RED = "\033[31m" + RESET = "\033[0m" + print(f"{YELLOW}WARNING:{RESET} {RED}No{RESET} cper files will be dumped unless --folder= is specified.") + self._cper_warning_printed = True + + # Header + print(f"{'timestamp':<20} {'gpu_id':<7} {'severity':<12}", end="") + if folder: + print(f" {'file_name':<17}", end="") + print("") + self._cper_display_initialized = True + + # Loop through all entries in the dictionary. + for entry_index, entry in enumerate(entries.values()): + + # Assume 'entry' is a dictionary with keys: "error_severity" and "notify_type". + error_severity = entry.get("error_severity", "Unknown") + notify_type = entry.get("notify_type", "Unknown") + + if error_severity == "non_fatal_uncorrected": + prefix = "uncorrected" + elif error_severity == "non_fatal_corrected": + prefix = "corrected" + elif error_severity == "fatal": + prefix = "fatal" + if notify_type == "BOOT": + prefix = "boot" + + cper_data_file = f"{prefix}_{self.get_cper_count()}.cper" + + timestamp = entry.get("timestamp", "unknown") + gpu_id = self.get_gpu_id_from_device_handle(device_handle) + print(f"{timestamp:<20} {gpu_id:<7} {prefix:<12}", end="") + if folder: + print(f" {cper_data_file:<17}", end="") + print("") + self.increment_cper_count() + + def dump_cper_entries(self, folder, entries, cper_data, device_handle, file_limit=None): + # One‐time header + if not getattr(self, "_cper_display_initialized", False): + print(f"{'timestamp':<20} {'gpu_id':<7} {'severity':<12} {'file_name':<17}") + self._cper_display_initialized = True + + if folder: + folder = Path(folder) + folder.mkdir(parents=True, exist_ok=True) + + printed_rows = [] + + for entry_index, entry in enumerate(entries.values()): + # --- rotate out oldest if over limit --- + if file_limit: + files = sorted(folder.glob("*.cper"), key=lambda p: p.stat().st_mtime) + while len(files) >= file_limit: + old = files.pop(0) + try: old.unlink() + except OSError: pass + j = old.with_suffix('.json') + if j.exists(): + try: j.unlink() + except OSError: pass + + # --- determine prefix/severity --- + sev = entry.get("error_severity", "").lower() + nt = entry.get("notify_type", "") + if sev == "non_fatal_uncorrected": + prefix = "uncorrected" + elif sev == "non_fatal_corrected": + prefix = "corrected" + elif sev == "fatal" and nt == "BOOT": + prefix = "boot" + elif sev == "fatal": + prefix = "fatal" + else: + prefix = "unknown" + + # --- new filenames --- + count = self.get_cper_count() + cper_name = f"{prefix}_{count}.cper" + json_name = f"{prefix}_{count}.json" + cper_path = folder / cper_name + json_path = folder / json_name + + # --- write files --- + self.write_binary( + cper_data[entry_index]["bytes"], + cper_data[entry_index]["size"], + cper_path + ) + try: + with json_path.open("w") as f: + f.write(json.dumps( + entry, + indent=2, + default=lambda o: o.decode('utf-8') if isinstance(o, bytes) else o + )) + except Exception as e: + logging.error(f"Failed to write JSON to {json_path}: {e}") + + # --- collect for printing --- + ts = entry.get("timestamp", "unknown") + gid = self.get_gpu_id_from_device_handle(device_handle) + printed_rows.append((ts, gid, prefix, cper_name)) + + self.increment_cper_count() + + # --- only now actually print: either all, or just last `file_limit` --- + if file_limit: + to_print = printed_rows[-file_limit:] + else: + to_print = printed_rows + + for ts, gid, prefix, fname in to_print: + print(f"{ts:<20} {gid:<7} {prefix:<12} {fname:<17}") + + else: + print(json.dumps( + entries, + indent=2, + default=lambda o: o.decode('utf-8') if isinstance(o, bytes) else o + )) + def write_binary(self, data, size, filepath): """ Writes binary data directly to a file. @@ -1103,309 +1232,6 @@ class AMDSMIHelpers(): data_bytes = data[:size] f.write(data_bytes) - def display_cper_files_generated_follow(self, entries, device_handle): - device_handles = amdsmi_interface.amdsmi_get_processor_handles() - # One‐time initialization: print warning & header only once - if not getattr(self, "_cper_display_initialized", False): - # Warning if no folder was specified elsewhere - if not getattr(self, "_cper_warning_printed", False): - YELLOW = "\033[33m" - RED = "\033[31m" - RESET = "\033[0m" - print(f"{YELLOW}WARNING:{RESET} {RED}No{RESET} cper files will be dumped unless --folder= is specified.") - self._cper_warning_printed = True - - # Header - print(f"{'timestamp':<20} {'gpu_id':<7} {'severity':<12} {'file_name':<17}") - self._cper_display_initialized = True - - for entry_index, entry in enumerate(entries.values()): - # Assume 'entry' is a dictionary with keys: "error_severity" and "notify_type". - error_severity = entry.get("error_severity", "Unknown") - notify_type = entry.get("notify_type", "Unknown") - - if error_severity == "non_fatal_uncorrected": - prefix = "uncorrected" - elif error_severity == "non_fatal_corrected": - prefix = "corrected" - elif error_severity == "fatal": - prefix = "fatal" - if notify_type == "BOOT": - prefix = "boot" - - entry_file = f"{prefix}_{self.get_cper_count()}.json" - cper_data_file = f"{prefix}_{self.get_cper_count()}.cper" - - timestamp = entry.get("timestamp", "unknown") - gpu_id = self.get_gpu_id_from_device_handle(device_handle) - print(f"{timestamp:<20} {gpu_id:<7} {prefix:<12} {cper_data_file:<17}") - self.increment_cper_count() - time.sleep(1) - - - def display_cper_files_generated(self, entries, device_handle): - device_handles = amdsmi_interface.amdsmi_get_processor_handles() - # One‐time initialization: print warning & header only once - if not getattr(self, "_cper_display_initialized", False): - # Warning if no folder was specified elsewhere - if not getattr(self, "_cper_warning_printed", False): - YELLOW = "\033[33m" - RED = "\033[31m" - RESET = "\033[0m" - print(f"{YELLOW}WARNING:{RESET} {RED}No{RESET} cper files will be dumped unless --folder= is specified.") - self._cper_warning_printed = True - - # Header - print(f"{'timestamp':<20} {'gpu_id':<7} {'severity':<12} {'file_name':<17}") - self._cper_display_initialized = True - - # Loop through all entries in the dictionary. - for entry_index, entry in enumerate(entries.values()): - - # Assume 'entry' is a dictionary with keys: "error_severity" and "notify_type". - error_severity = entry.get("error_severity", "Unknown") - notify_type = entry.get("notify_type", "Unknown") - - if error_severity == "non_fatal_uncorrected": - prefix = "uncorrected" - elif error_severity == "non_fatal_corrected": - prefix = "corrected" - elif error_severity == "fatal": - prefix = "fatal" - if notify_type == "BOOT": - prefix = "boot" - - entry_file = f"{prefix}_{self.get_cper_count()}.json" - cper_data_file = f"{prefix}_{self.get_cper_count()}.cper" - - timestamp = entry.get("timestamp", "unknown") - gpu_id = self.get_gpu_id_from_device_handle(device_handle) - print(f"{timestamp:<20} {gpu_id:<7} {prefix:<12} {cper_data_file:<17}") - self.increment_cper_count() - - - def dump_gpu_entries(self, folder, entries, cper_data, device_handle): - # One‐time initialization: print warning & header only once - if not getattr(self, "_cper_display_initialized", False): - # Warning if no folder was specified elsewhere - print(f"{'timestamp':<20} {'gpu_id':<7} {'severity':<12} {'file_name':<17}") - self._cper_display_initialized = True - - - if folder: - folder = Path(folder) - folder.mkdir(parents=True, exist_ok=True) # Ensure folder exists - - # Loop through all entries in the dictionary. - for entry_index, entry in enumerate(entries.values()): - # Assume 'entry' is a dictionary with keys: "error_severity" and "notify_type". - error_severity = entry.get("error_severity", "Unknown") - notify_type = entry.get("notify_type", "Unknown") - - if error_severity == "non_fatal_uncorrected": - prefix = "uncorrected" - elif error_severity == "non_fatal_corrected": - prefix = "corrected" - elif error_severity == "fatal": - prefix = "fatal" - if notify_type == "BOOT": - prefix = "boot" - - # Construct a unique file name using the key to avoid overwriting - entry_file = f"{prefix}_{self.get_cper_count()}.json" - output_path = folder / entry_file - - cper_data_file = f"{prefix}_{self.get_cper_count()}.cper" - cper_data_file_path = folder / cper_data_file - self.write_binary(cper_data[entry_index]["bytes"], cper_data[entry_index]["size"], cper_data_file_path) - - #print header - timestamp = entry.get("timestamp", "unknown") - gpu_id = self.get_gpu_id_from_device_handle(device_handle) - print(f"{timestamp:<20} {gpu_id:<7} {prefix:<12} {cper_data_file:<17}") - self.increment_cper_count() - - - try: - with output_path.open("w") as f: - logging.debug(f"Writing entry {self.get_cper_count()}: {entry} to {output_path}") - # Dump the single entry as JSON, handling bytes via the lambda. - f.write(json.dumps(entry, indent=2, - default=lambda o: o.decode('utf-8') if isinstance(o, bytes) else o)) - - - - except Exception as e: - logging.error(f"Failed to write entry {self.get_cper_count()} to {output_path}: {e}") - else: - print(json.dumps(entries, indent=2, - default=lambda o: o.decode('utf-8') if isinstance(o, bytes) else o)) - - - def dump_all_entries(self, folder, entries, cper_data, device_handle): - # One‐time initialization: print warning & header only once - if not getattr(self, "_cper_display_initialized", False): - # Warning if no folder was specified elsewhere - print(f"{'timestamp':<20} {'gpu_id':<7} {'severity':<12} {'file_name':<17}") - self._cper_display_initialized = True - - - if folder: - folder = Path(folder) - folder.mkdir(parents=True, exist_ok=True) # Ensure folder exists - - # Loop through all entries in the dictionary. - for entry_index, entry in enumerate(entries.values()): - # Assume 'entry' is a dictionary with keys: "error_severity" and "notify_type". - error_severity = entry.get("error_severity", "Unknown") - notify_type = entry.get("notify_type", "Unknown") - - if error_severity == "non_fatal_uncorrected": - prefix = "uncorrected" - elif error_severity == "non_fatal_corrected": - prefix = "corrected" - elif error_severity == "fatal": - prefix = "fatal" - if notify_type == "BOOT": - prefix = "boot" - - # Construct a unique file name using the key to avoid overwriting - entry_file = f"{prefix}_{self.get_cper_count()}.json" - output_path = folder / entry_file - - cper_data_file = f"{prefix}_{self.get_cper_count()}.cper" - cper_data_file_path = folder / cper_data_file - self.write_binary(cper_data[entry_index]["bytes"], cper_data[entry_index]["size"], cper_data_file_path) - - #print header - timestamp = entry.get("timestamp", "unknown") - gpu_id = self.get_gpu_id_from_device_handle(device_handle) - print(f"{timestamp:<20} {gpu_id:<7} {prefix:<12} {cper_data_file:<17}") - self.increment_cper_count() - - try: - with output_path.open("w") as f: - logging.debug(f"Writing entry {self.get_cper_count()}: {entry} to {output_path}") - # Dump the single entry as JSON, handling bytes via the lambda. - f.write(json.dumps(entry, indent=2, - default=lambda o: o.decode('utf-8') if isinstance(o, bytes) else o)) - except Exception as e: - logging.error(f"Failed to write entry {self.get_cper_count()} to {output_path}: {e}") - else: - print(json.dumps(entries, indent=2, - default=lambda o: o.decode('utf-8') if isinstance(o, bytes) else o)) - - - def dump_all_entries_follow(self, folder, entries, cper_data, device_handle): - # One‐time initialization: print warning & header only once - if not getattr(self, "_cper_display_initialized", False): - # Warning if no folder was specified elsewhere - print(f"{'timestamp':<20} {'gpu_id':<7} {'severity':<12} {'file_name':<17}") - self._cper_display_initialized = True - - - if folder: - folder = Path(folder) - folder.mkdir(parents=True, exist_ok=True) # Ensure folder exists - - # Loop through all entries in the dictionary. - for entry_index, entry in enumerate(entries.values()): - # Assume 'entry' is a dictionary with keys: "error_severity" and "notify_type". - error_severity = entry.get("error_severity", "Unknown") - notify_type = entry.get("notify_type", "Unknown") - - if error_severity == "non_fatal_uncorrected": - prefix = "uncorrected" - elif error_severity == "non_fatal_corrected": - prefix = "corrected" - elif error_severity == "fatal": - prefix = "fatal" - if notify_type == "BOOT": - prefix = "boot" - - # Construct a unique file name using the key to avoid overwriting - entry_file = f"{prefix}_{self.get_cper_count()}.json" - output_path = folder / entry_file - - cper_data_file = f"{prefix}_{self.get_cper_count()}.cper" - cper_data_file_path = folder / cper_data_file - self.write_binary(cper_data[entry_index]["bytes"], cper_data[entry_index]["size"], cper_data_file_path) - - #print header - timestamp = entry.get("timestamp", "unknown") - gpu_id = self.get_gpu_id_from_device_handle(device_handle) - print(f"{timestamp:<20} {gpu_id:<7} {prefix:<12} {cper_data_file:<17}") - self.increment_cper_count() - time.sleep(1) - - try: - with output_path.open("w") as f: - logging.debug(f"Writing entry {self.get_cper_count()}: {entry} to {output_path}") - # Dump the single entry as JSON, handling bytes via the lambda. - f.write(json.dumps(entry, indent=2, - default=lambda o: o.decode('utf-8') if isinstance(o, bytes) else o)) - except Exception as e: - logging.error(f"Failed to write entry {self.get_cper_count()} to {output_path}: {e}") - else: - print(json.dumps(entries, indent=2, - default=lambda o: o.decode('utf-8') if isinstance(o, bytes) else o)) - - - def dump_gpu_entries_follow(self, folder, entries, cper_data, device_handle): - # One‐time initialization: print warning & header only once - if not getattr(self, "_cper_display_initialized", False): - # Warning if no folder was specified elsewhere - print(f"{'timestamp':<20} {'gpu_id':<7} {'severity':<12} {'file_name':<17}") - self._cper_display_initialized = True - - - if folder: - folder = Path(folder) - folder.mkdir(parents=True, exist_ok=True) # Ensure folder exists - - # Loop through all entries in the dictionary. - for entry_index, entry in enumerate(entries.values()): - # Assume 'entry' is a dictionary with keys: "error_severity" and "notify_type". - error_severity = entry.get("error_severity", "Unknown") - notify_type = entry.get("notify_type", "Unknown") - - if error_severity == "non_fatal_uncorrected": - prefix = "uncorrected" - elif error_severity == "non_fatal_corrected": - prefix = "corrected" - elif error_severity == "fatal": - prefix = "fatal" - if notify_type == "BOOT": - prefix = "boot" - - # Construct a unique file name using the key to avoid overwriting - entry_file = f"{prefix}_{self.get_cper_count()}.json" - output_path = folder / entry_file - - cper_data_file = f"{prefix}_{self.get_cper_count()}.cper" - cper_data_file_path = folder / cper_data_file - self.write_binary(cper_data[entry_index]["bytes"], cper_data[entry_index]["size"], cper_data_file_path) - - #print header - timestamp = entry.get("timestamp", "unknown") - gpu_id = self.get_gpu_id_from_device_handle(device_handle) - print(f"{timestamp:<20} {gpu_id:<7} {prefix:<12} {cper_data_file:<17}") - self.increment_cper_count() - time.sleep(1) - - try: - with output_path.open("w") as f: - logging.debug(f"Writing entry {self.get_cper_count()}: {entry} to {output_path}") - # Dump the single entry as JSON, handling bytes via the lambda. - f.write(json.dumps(entry, indent=2, - default=lambda o: o.decode('utf-8') if isinstance(o, bytes) else o)) - except Exception as e: - logging.error(f"Failed to write entry {self.get_cper_count()} to {output_path}: {e}") - else: - print(json.dumps(entries, indent=2, - default=lambda o: o.decode('utf-8') if isinstance(o, bytes) else o)) - - def hexdump_to_string(self, data: Union[bytes, List[int]]) -> str: """ Convert binary data to a hexdump string. @@ -1435,4 +1261,107 @@ class AMDSMIHelpers(): ascii_values = "".join(chr(b) if 32 <= b <= 126 else "." for b in chunk) lines.append(f"{offset:08x} {hex_values} |{ascii_values}|") - return "\n".join(lines) \ No newline at end of file + return "\n".join(lines) + + def pvtDumpAfids(self, cper_file): + # 1) Fetch the CPER “file” and ensure we have raw bytes + raw_data = cper_file + if hasattr(raw_data, "read"): + # fetch_cper_file returned a file‐object + raw = raw_data.read() + elif isinstance(raw_data, Path): + # Path: read the bytes directly + raw = raw_data.read_bytes() + elif isinstance(raw_data, str): + # fetch_cper_file returned a filename + with open(raw_data, "rb") as f: + raw = f.read() + else: + # assume it's already bytes + raw = raw_data + size = len(raw) + self.hexdump_to_string(raw) + afids, num_afids = amdsmi_interface.amdsmi_get_afids_from_cper(raw) + print(f"AFIDS: ", end="") + for afid in afids: + print(afid, end=" ") + print("") + + def ras_cper(self, args, device_handle, logger, gpu_idx): + # Parse severity mask dynamically from the --severity option. + severity_mask = 0 + # drop duplicates of args + logging.debug(args) + + for sev in list(set(args.severity)): + if sev == "all": + # Set bits for NON_FATAL_UNCORRECTED (0), FATAL (1), and NON_FATAL_CORRECTED (2) + severity_mask |= ((1 << 0) | (1 << 1) | (1 << 2)) + elif sev == "fatal": + # Set bit corresponding to AMDSMI_CPER_SEV_FATAL (which is 1) + severity_mask |= (1 << 1) + elif sev in ("nonfatal", "nonfatal-uncorrected"): + # Set bit corresponding to AMDSMI_CPER_SEV_NON_FATAL_UNCORRECTED (which is 0) + severity_mask |= (1 << 0) + elif sev in ("nonfatal-corrected", "corrected"): + # Set bit corresponding to AMDSMI_CPER_SEV_NON_FATAL_CORRECTED (which is 2) + severity_mask |= (1 << 2) + + buffer_size = 1048576 + + gpu_id = self.get_gpu_id_from_device_handle(device_handle) + if args.follow and not getattr(self, "_cper_follow_prompted", False): + print("Press CTRL + C to stop.") + self._cper_follow_prompted = True + + partition_id = -1 + try: + kfd_info = amdsmi_interface.amdsmi_get_gpu_kfd_info(device_handle) + partition_id = kfd_info['current_partition_id'] + except amdsmi_exception.AmdSmiLibraryException as e: + logging.debug("Failed to get kfd info for gpu %s | %s", gpu_id, e.get_error_info()) + + if partition_id != 0: + logging.debug(f"Skipping gpu {gpu_id} on non zero partition {partition_id}") + return + + if args.folder and not getattr(self, "_cper_folder_prompted", False): + print(f"Dumping CPER file header entries in folder {args.folder}") + self._cper_folder_prompted = True + + logger.set_cper_exit_message(False) + self.stop = False + + while True: + try: + entries, new_cursor, cper_data = amdsmi_interface.amdsmi_get_gpu_cper_entries( + device_handle, severity_mask, buffer_size, args.cursor[gpu_idx]) + logging.debug(f"cper_entries | entries: {entries}") + except amdsmi_exception.AmdSmiLibraryException as e: + if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: + raise PermissionError('Error opening CPER file. This command requires elevation') from e + if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED or \ + e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_FILE_NOT_FOUND: + raise FileNotFoundError('Error accessing CPER files. This command requires CPER to be enabled.') from e + if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_FILE_ERROR: + raise FileExistsError('Error opening CPER file. Unable to read CPER File') from e + else: + logging.debug(f"Error retrieving CPER entries: {e}") + break + args.cursor[gpu_idx] = new_cursor + if len(entries) == 0: + break + if args.folder: + if args.follow: + if device_handle: + self.dump_cper_entries(args.folder, entries, cper_data, device_handle, args.file_limit) + else: + self.dump_cper_entries(args.folder, entries, cper_data, device_handle, args.file_limit) + else: + if device_handle: + self.dump_cper_entries(args.folder, entries, cper_data, device_handle, args.file_limit) + else: + self.dump_cper_entries(args.folder, entries, cper_data, device_handle, args.file_limit) + break + else: + self.display_cper_files_generated(entries, device_handle, args.folder, args.follow) diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_parser.py b/projects/amdsmi/amdsmi_cli/amdsmi_parser.py index fd4b66f312..ba509d15df 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_parser.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_parser.py @@ -1397,7 +1397,7 @@ class AMDSMIParser(argparse.ArgumentParser): Adds the 'ras' subcommand. Expected command: - amd-smi ras --cper --severity=nonfatal-uncorrected,fatal --folder --file_limit=1000 --follow + amd-smi ras --cper --severity=nonfatal-uncorrected,fatal --folder --file-limit=1000 --follow All parameters are provided via options; no positional arguments or optional --file/--gpu are used. """ @@ -1431,8 +1431,8 @@ class AMDSMIParser(argparse.ArgumentParser): ras_parser.add_argument("--afid", action="store_true", required=False, help=afid_help) ras_parser.add_argument("--severity", type=str.lower, nargs='+', default=['all'], help=severity_help, choices=severity_choices, metavar='SEVERITY') ras_parser.add_argument("--folder", type=str, action=self._check_folder_path(), default=False, help=folder_help) - ras_parser.add_argument("--file_limit", type=self._positive_int, action='store', default=1000, help=file_limit_help) - ras_parser.add_argument("--cper_file", action=self._check_cper_file_path(), metavar="CPER_FILE", help=cper_file_help) + ras_parser.add_argument("--file-limit", type=self._positive_int, action='store', default=1000, help=file_limit_help) + ras_parser.add_argument("--cper-file", action=self._check_cper_file_path(), metavar="CPER_FILE", help=cper_file_help) ras_parser.add_argument("--follow", action="store_true", default=False, help=follow_help) # Add common modifiers and device selection arguments. diff --git a/projects/amdsmi/docs/how-to/amdsmi-cli-tool.md b/projects/amdsmi/docs/how-to/amdsmi-cli-tool.md index 08ee144012..9f5d7e519e 100644 --- a/projects/amdsmi/docs/how-to/amdsmi-cli-tool.md +++ b/projects/amdsmi/docs/how-to/amdsmi-cli-tool.md @@ -767,7 +767,7 @@ Displays RAS information of specified devices. ```shell-session ~$ amd-smi ras --help usage: amd-smi ras [-h] --cper [--severity SEVERITY [SEVERITY ...]] [--folder FOLDER] - [--file_limit FILE_LIMIT] [--follow] + [--file-limit FILE_LIMIT] [--follow] [-g GPU [GPU ...] | -U CPU [CPU ...] | -O CORE [CORE ...]] [--json | --csv] [--file FILE] [--loglevel LEVEL] @@ -781,7 +781,7 @@ RAS arguments: --severity SEVERITY [SEVERITY ...] Set the SEVERITY filters from the following: nonfatal-uncorrected, fatal, nonfatal-corrected, all --folder FOLDER Folder to dump CPER report files - --file_limit FILE_LIMIT Maximum number of entries per output file + --file-limit FILE_LIMIT Maximum number of entries per output file --follow Continuously monitor for new entries Device arguments: diff --git a/projects/amdsmi/example/CMakeLists.txt b/projects/amdsmi/example/CMakeLists.txt index 124745b2a3..5ce174bd1a 100644 --- a/projects/amdsmi/example/CMakeLists.txt +++ b/projects/amdsmi/example/CMakeLists.txt @@ -34,9 +34,10 @@ endif() # add package search paths set(CMAKE_PREFIX_PATH ${CMAKE_PREFIX_PATH} ${ROCM_DIR} ../../../) set(CMAKE_LIBRARY_PATH ${CMAKE_LIBRARY_PATH} ${ROCM_DIR}/lib) -if(EXISTS ${ROCM_DIR}/lib64) - set(CMAKE_LIBRARY_PATH ${CMAKE_LIBRARY_PATH} ${ROCM_DIR}/lib64) +if(NOT EXISTS ${ROCM_DIR}/lib64) + file(MAKE_DIRECTORY ${ROCM_DIR}/lib64) endif() +set(CMAKE_LIBRARY_PATH ${CMAKE_LIBRARY_PATH} ${ROCM_DIR}/lib64) find_package(amd_smi CONFIG REQUIRED) message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&")