[SWDEV-530385] Fix CPER "--follow" & "--file-limit" (#380)
* --follow option fix & --file_limit option added * change --file_limit and --cper_file to --file-limit and --cper-file --------- Signed-off-by: Maisam Arif <Maisam.Arif@amd.com>
Этот коммит содержится в:
коммит произвёл
GitHub
родитель
d0a89393df
Коммит
91c9969b72
@@ -6331,33 +6331,7 @@ class AMDSMICommands():
|
||||
else:
|
||||
with self.logger.destination.open('a', encoding="utf-8") as output_file:
|
||||
output_file.write(legend_output + '\n')
|
||||
|
||||
def __pvtDumpAfids(self, cper_file):
    """Print the AFIDs decoded from a CPER record on a single line.

    Args:
        cper_file: The CPER payload, given as an object with a ``read()``
            method, a ``pathlib.Path``, a filename string, or the raw
            bytes themselves.
    """
    # Normalise every accepted input form to the raw CPER contents.
    if hasattr(cper_file, "read"):
        raw = cper_file.read()
    elif isinstance(cper_file, Path):
        raw = cper_file.read_bytes()
    elif isinstance(cper_file, str):
        with open(cper_file, "rb") as f:
            raw = f.read()
    else:
        # Anything else is assumed to already be the raw bytes.
        raw = cper_file

    # Only the AFID list is printed; the returned count is not needed.
    afids, _ = amdsmi_interface.amdsmi_get_afids_from_cper(raw)
    # Output format is unchanged: every AFID is followed by one space.
    print("AFIDS: " + "".join(f"{afid} " for afid in afids))
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def ras(self, args, multiple_devices=False, gpu=None, cper=None, afid=None,
|
||||
severity=None, folder=None, file_limit=None, cper_file=None, follow=None):
|
||||
@@ -6365,7 +6339,7 @@ class AMDSMICommands():
|
||||
Retrieve and process CPER (RAS) entries for a target GPU.
|
||||
|
||||
Expected command (all options only):
|
||||
amd-smi ras --cper --severity=nonfatal-uncorrected,fatal --folder <folder_name> --file_limit=1000 --follow
|
||||
amd-smi ras --cper --severity=nonfatal-uncorrected,fatal --folder <folder_name> --file-limit=1000 --follow
|
||||
|
||||
Since no timestamp is provided on the command line, the function starts from a default cursor of 0.
|
||||
The output file name is auto-generated using the timestamp from the CPER header data (converted from
|
||||
@@ -6392,108 +6366,30 @@ class AMDSMICommands():
|
||||
if args.gpu == None:
|
||||
args.gpu = self.device_handles
|
||||
|
||||
#Fetching AFID
|
||||
if args.afid and args.cper_file:
|
||||
self.__pvtDumpAfids(args.cper_file)
|
||||
self.helpers.pvtDumpAfids(args.cper_file)
|
||||
return
|
||||
|
||||
if not self.group_check_printed:
|
||||
self.helpers.check_required_groups()
|
||||
self.group_check_printed = True
|
||||
|
||||
handled_multiple_gpus, device_handle = self.helpers.handle_gpus(args, self.logger, self.ras)
|
||||
if handled_multiple_gpus:
|
||||
if not args.cper:
|
||||
return
|
||||
args.gpu = device_handle
|
||||
|
||||
# Parse severity mask dynamically from the --severity option.
|
||||
severity_mask = 0
|
||||
# drop duplicates of args
|
||||
logging.debug(args)
|
||||
for sev in list(set(args.severity)):
|
||||
if sev == "all":
|
||||
# Set bits for NON_FATAL_UNCORRECTED (0), FATAL (1), and NON_FATAL_CORRECTED (2)
|
||||
severity_mask |= ((1 << 0) | (1 << 1) | (1 << 2))
|
||||
elif sev == "fatal":
|
||||
# Set bit corresponding to AMDSMI_CPER_SEV_FATAL (which is 1)
|
||||
severity_mask |= (1 << 1)
|
||||
elif sev in ("nonfatal", "nonfatal-uncorrected"):
|
||||
# Set bit corresponding to AMDSMI_CPER_SEV_NON_FATAL_UNCORRECTED (which is 0)
|
||||
severity_mask |= (1 << 0)
|
||||
elif sev in ("nonfatal-corrected", "corrected"):
|
||||
# Set bit corresponding to AMDSMI_CPER_SEV_NON_FATAL_CORRECTED (which is 2)
|
||||
severity_mask |= (1 << 2)
|
||||
|
||||
cursor = 0
|
||||
buffer_size = 1048576
|
||||
if args.cper:
|
||||
# Start from cursor 0 (no timestamp argument provided).
|
||||
file_limit = int(args.file_limit) if args.file_limit else 1000
|
||||
# Main loop: continuously retrieve CPER entries if --follow is set.
|
||||
gpu_id = self.helpers.get_gpu_id_from_device_handle(args.gpu)
|
||||
# Print header only when dumping to a folder
|
||||
if args.follow and not getattr(self, "_cper_follow_prompted", False):
|
||||
print("Press CTRL + C to stop.")
|
||||
self._cper_follow_prompted = True
|
||||
|
||||
partition_id = -1
|
||||
try:
|
||||
kfd_info = amdsmi_interface.amdsmi_get_gpu_kfd_info(args.gpu)
|
||||
kfd_id = kfd_info['kfd_id']
|
||||
node_id = kfd_info['node_id']
|
||||
partition_id = kfd_info['current_partition_id']
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
logging.debug("Failed to get kfd info for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
if partition_id != 0:
|
||||
logging.debug(f"Skipping gpu {gpu_id} on non zero partition {partition_id}")
|
||||
return
|
||||
|
||||
if args.folder and not getattr(self, "_cper_folder_prompted", False):
|
||||
print(f"Dumping CPER file header entries in folder {args.folder}")
|
||||
self._cper_folder_prompted = True
|
||||
|
||||
self.logger.set_cper_exit_message(False)
|
||||
self.stop = False
|
||||
|
||||
while True:
|
||||
try:
|
||||
entries, new_cursor, cper_data = amdsmi_interface.amdsmi_get_gpu_cper_entries(
|
||||
args.gpu, severity_mask, buffer_size, cursor)
|
||||
logging.debug(f"cper_entries | entries: {entries}")
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM:
|
||||
raise PermissionError('Error opening CPER file. This command requires elevation') from e
|
||||
if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED or \
|
||||
e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_FILE_NOT_FOUND:
|
||||
raise FileNotFoundError('Error accessing CPER files. This command requires CPER to be enabled.') from e
|
||||
if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_FILE_ERROR:
|
||||
raise FileExistsError('Error opening CPER file. Unable to read CPER File') from e
|
||||
else:
|
||||
logging.debug(f"Error retrieving CPER entries: {e}")
|
||||
break
|
||||
# Dump or display
|
||||
if args.folder:
|
||||
if args.gpu and not args.follow:
|
||||
self.helpers.dump_gpu_entries(args.folder, entries, cper_data, args.gpu)
|
||||
break
|
||||
elif not args.gpu and not args.follow:
|
||||
self.helpers.dump_all_entries(args.folder, entries, cper_data, args.gpu)
|
||||
break
|
||||
elif args.follow and args.gpu:
|
||||
self.helpers.dump_gpu_entries_follow(args.folder, entries, cper_data, args.gpu)
|
||||
elif args.follow and not args.gpu:
|
||||
self.helpers.dump_all_entries_follow(args.folder, entries, cper_data, args.gpu)
|
||||
if args.follow:
|
||||
self.helpers.display_cper_files_generated_follow(entries, args.gpu)
|
||||
else:
|
||||
self.helpers.display_cper_files_generated(entries, args.gpu)
|
||||
break
|
||||
if len(entries) == 0 and not args.follow:
|
||||
break
|
||||
cursor = new_cursor
|
||||
time.sleep(1)
|
||||
if not args.gpu:
|
||||
return
|
||||
|
||||
if not isinstance(args.gpu, list):
|
||||
args.gpu = [args.gpu]
|
||||
|
||||
args.cursor = [0] * len(args.gpu)
|
||||
while True:
|
||||
for idx, device_handle in enumerate(args.gpu):
|
||||
self.helpers.ras_cper(args, device_handle, self.logger, idx)
|
||||
if not args.follow:
|
||||
break
|
||||
time.sleep(1)
|
||||
|
||||
def _event_thread(self, commands, i):
|
||||
devices = commands.device_handles
|
||||
|
||||
Обычный файл → Исполняемый файл
+233
-304
@@ -1082,6 +1082,135 @@ class AMDSMIHelpers():
|
||||
print(msg)
|
||||
logging.warning(msg)
|
||||
|
||||
def display_cper_files_generated(self, entries, device_handle, folder, follow):
    """Print one table row per CPER entry without writing any files.

    The table header is printed only once per helper instance (tracked with
    ``_cper_display_initialized``). When no folder is given, a one-time
    warning that nothing is written to disk precedes it.

    Args:
        entries: Mapping of CPER entries; each value is a dict that is read
            for ``error_severity``, ``notify_type`` and ``timestamp``.
        device_handle: Handle used to look up the GPU id shown in each row.
        folder: When truthy, a ``file_name`` column is added to the table.
        follow: Accepted for interface compatibility; not used here.
    """
    if not getattr(self, "_cper_display_initialized", False):
        # The warning is only true when no dump folder was requested.
        if not folder and not getattr(self, "_cper_warning_printed", False):
            YELLOW = "\033[33m"
            RED = "\033[31m"
            RESET = "\033[0m"
            print(f"{YELLOW}WARNING:{RESET} {RED}No{RESET} cper files will be dumped unless --folder=<folder_name> is specified.")
            self._cper_warning_printed = True

        header = f"{'timestamp':<20} {'gpu_id':<7} {'severity':<12}"
        if folder:
            header += f" {'file_name':<17}"
        print(header)
        self._cper_display_initialized = True

    for entry in entries.values():
        error_severity = entry.get("error_severity", "Unknown")
        notify_type = entry.get("notify_type", "Unknown")

        if error_severity == "non_fatal_uncorrected":
            prefix = "uncorrected"
        elif error_severity == "non_fatal_corrected":
            prefix = "corrected"
        elif error_severity == "fatal":
            # Fatal records raised at boot get their own label.
            prefix = "boot" if notify_type == "BOOT" else "fatal"
        else:
            # Fixed fallback so an unrecognised severity cannot leave
            # `prefix` unbound or carried over from the previous row.
            prefix = "unknown"

        cper_data_file = f"{prefix}_{self.get_cper_count()}.cper"
        timestamp = entry.get("timestamp", "unknown")
        gpu_id = self.get_gpu_id_from_device_handle(device_handle)

        row = f"{timestamp:<20} {gpu_id:<7} {prefix:<12}"
        if folder:
            row += f" {cper_data_file:<17}"
        print(row)
        # The running counter advances for every displayed entry.
        self.increment_cper_count()
|
||||
|
||||
def dump_cper_entries(self, folder, entries, cper_data, device_handle, file_limit=None):
    """Write every CPER entry to ``folder`` as a ``.cper``/``.json`` pair.

    Without a folder the entries are printed as one JSON document and
    nothing is written. With a folder, each entry is stored as
    ``<severity>_<count>.cper`` (raw record) and ``<severity>_<count>.json``
    (decoded header), and a table row is printed for it.

    Args:
        folder: Destination directory (created if missing), or a falsy
            value to print JSON instead.
        entries: Mapping of CPER entries; each value is a dict read for
            ``error_severity``, ``notify_type`` and ``timestamp``.
        cper_data: Sequence aligned with ``entries``; each item provides
            ``"bytes"`` and ``"size"`` for the raw record.
        device_handle: Handle used to look up the GPU id shown in each row.
        file_limit: Optional cap on the number of ``.cper`` files kept in
            ``folder``; the oldest files (by mtime) are removed first,
            together with their ``.json`` siblings.
    """
    def _to_jsonable(obj):
        # Called by json.dumps for values it cannot encode natively.
        # Returning the object unchanged would trigger a circular-reference
        # error, so everything is reduced to text.
        if isinstance(obj, bytes):
            return obj.decode("utf-8", errors="replace")
        return str(obj)

    def _mtime(path):
        # A file can vanish between glob() and stat(); treat it as oldest.
        try:
            return path.stat().st_mtime
        except OSError:
            return 0.0

    if not folder:
        # Pure JSON output: no table header in front of it.
        print(json.dumps(entries, indent=2, default=_to_jsonable))
        return

    # One-time table header.
    if not getattr(self, "_cper_display_initialized", False):
        print(f"{'timestamp':<20} {'gpu_id':<7} {'severity':<12} {'file_name':<17}")
        self._cper_display_initialized = True

    folder = Path(folder)
    folder.mkdir(parents=True, exist_ok=True)

    printed_rows = []

    for entry_index, entry in enumerate(entries.values()):
        # --- rotate out the oldest files so the new one fits the limit ---
        if file_limit:
            files = sorted(folder.glob("*.cper"), key=_mtime)
            excess = max(0, len(files) - file_limit + 1)
            for old in files[:excess]:
                for stale in (old, old.with_suffix('.json')):
                    try:
                        stale.unlink()
                    except OSError:
                        # Best effort: a missing or locked file must not
                        # abort the dump.
                        pass

        # --- determine prefix/severity ---
        sev = entry.get("error_severity", "").lower()
        nt = entry.get("notify_type", "")
        if sev == "non_fatal_uncorrected":
            prefix = "uncorrected"
        elif sev == "non_fatal_corrected":
            prefix = "corrected"
        elif sev == "fatal":
            prefix = "boot" if nt == "BOOT" else "fatal"
        else:
            prefix = "unknown"

        # --- new filenames ---
        count = self.get_cper_count()
        cper_name = f"{prefix}_{count}.cper"
        json_path = folder / f"{prefix}_{count}.json"

        # --- write files ---
        raw = cper_data[entry_index]
        self.write_binary(raw["bytes"], raw["size"], folder / cper_name)
        try:
            # Serialise first so a failure does not leave an empty file.
            payload = json.dumps(entry, indent=2, default=_to_jsonable)
            json_path.write_text(payload, encoding="utf-8")
        except (OSError, TypeError, ValueError) as e:
            logging.error("Failed to write JSON to %s: %s", json_path, e)

        # --- collect for printing ---
        ts = entry.get("timestamp", "unknown")
        gid = self.get_gpu_id_from_device_handle(device_handle)
        printed_rows.append((ts, gid, prefix, cper_name))

        self.increment_cper_count()

    # Only rows whose files survived rotation are shown.
    to_print = printed_rows[-file_limit:] if file_limit else printed_rows
    for ts, gid, prefix, fname in to_print:
        print(f"{ts:<20} {gid:<7} {prefix:<12} {fname:<17}")
|
||||
|
||||
def write_binary(self, data, size, filepath):
|
||||
"""
|
||||
Writes binary data directly to a file.
|
||||
@@ -1103,309 +1232,6 @@ class AMDSMIHelpers():
|
||||
data_bytes = data[:size]
|
||||
f.write(data_bytes)
|
||||
|
||||
def display_cper_files_generated_follow(self, entries, device_handle):
    """Print one table row per CPER entry while following, writing no files.

    A warning that nothing is written to disk and the table header are
    printed once per helper instance. Rows are printed immediately; this
    method does not sleep between rows.

    Args:
        entries: Mapping of CPER entries; each value is a dict read for
            ``error_severity``, ``notify_type`` and ``timestamp``.
        device_handle: Handle used to look up the GPU id shown in each row.
    """
    if not getattr(self, "_cper_display_initialized", False):
        if not getattr(self, "_cper_warning_printed", False):
            YELLOW = "\033[33m"
            RED = "\033[31m"
            RESET = "\033[0m"
            print(f"{YELLOW}WARNING:{RESET} {RED}No{RESET} cper files will be dumped unless --folder=<folder_name> is specified.")
            self._cper_warning_printed = True

        print(f"{'timestamp':<20} {'gpu_id':<7} {'severity':<12} {'file_name':<17}")
        self._cper_display_initialized = True

    for entry in entries.values():
        error_severity = entry.get("error_severity", "Unknown")
        notify_type = entry.get("notify_type", "Unknown")

        if error_severity == "non_fatal_uncorrected":
            prefix = "uncorrected"
        elif error_severity == "non_fatal_corrected":
            prefix = "corrected"
        elif error_severity == "fatal":
            # Fatal records raised at boot get their own label.
            prefix = "boot" if notify_type == "BOOT" else "fatal"
        else:
            # Fixed fallback so an unrecognised severity cannot leave
            # `prefix` unbound or carried over from the previous row.
            prefix = "unknown"

        cper_data_file = f"{prefix}_{self.get_cper_count()}.cper"
        timestamp = entry.get("timestamp", "unknown")
        gpu_id = self.get_gpu_id_from_device_handle(device_handle)
        print(f"{timestamp:<20} {gpu_id:<7} {prefix:<12} {cper_data_file:<17}")
        self.increment_cper_count()
|
||||
|
||||
|
||||
def display_cper_files_generated(self, entries, device_handle):
    """Print one table row per CPER entry without writing any files.

    A warning that nothing is written to disk and the table header are
    printed once per helper instance (tracked with
    ``_cper_warning_printed`` and ``_cper_display_initialized``).

    Args:
        entries: Mapping of CPER entries; each value is a dict read for
            ``error_severity``, ``notify_type`` and ``timestamp``.
        device_handle: Handle used to look up the GPU id shown in each row.
    """
    if not getattr(self, "_cper_display_initialized", False):
        if not getattr(self, "_cper_warning_printed", False):
            YELLOW = "\033[33m"
            RED = "\033[31m"
            RESET = "\033[0m"
            print(f"{YELLOW}WARNING:{RESET} {RED}No{RESET} cper files will be dumped unless --folder=<folder_name> is specified.")
            self._cper_warning_printed = True

        print(f"{'timestamp':<20} {'gpu_id':<7} {'severity':<12} {'file_name':<17}")
        self._cper_display_initialized = True

    for entry in entries.values():
        error_severity = entry.get("error_severity", "Unknown")
        notify_type = entry.get("notify_type", "Unknown")

        if error_severity == "non_fatal_uncorrected":
            prefix = "uncorrected"
        elif error_severity == "non_fatal_corrected":
            prefix = "corrected"
        elif error_severity == "fatal":
            # Fatal records raised at boot get their own label.
            prefix = "boot" if notify_type == "BOOT" else "fatal"
        else:
            # Fixed fallback so an unrecognised severity cannot leave
            # `prefix` unbound or carried over from the previous row.
            prefix = "unknown"

        cper_data_file = f"{prefix}_{self.get_cper_count()}.cper"
        timestamp = entry.get("timestamp", "unknown")
        gpu_id = self.get_gpu_id_from_device_handle(device_handle)
        print(f"{timestamp:<20} {gpu_id:<7} {prefix:<12} {cper_data_file:<17}")
        self.increment_cper_count()
|
||||
|
||||
|
||||
def dump_gpu_entries(self, folder, entries, cper_data, device_handle):
    """Write each CPER entry for one GPU to ``folder`` and print a row for it.

    Without a folder the entries are printed as one JSON document and
    nothing is written. With a folder, each entry is stored as
    ``<severity>_<count>.cper`` (raw record) and ``<severity>_<count>.json``
    (decoded header).

    Args:
        folder: Destination directory (created if missing), or a falsy
            value to print JSON instead.
        entries: Mapping of CPER entries; each value is a dict read for
            ``error_severity``, ``notify_type`` and ``timestamp``.
        cper_data: Sequence aligned with ``entries``; each item provides
            ``"bytes"`` and ``"size"`` for the raw record.
        device_handle: Handle used to look up the GPU id shown in each row.
    """
    def _to_jsonable(obj):
        # Called by json.dumps for values it cannot encode natively.
        if isinstance(obj, bytes):
            return obj.decode("utf-8", errors="replace")
        return str(obj)

    if not folder:
        # Pure JSON output: no table header in front of it.
        print(json.dumps(entries, indent=2, default=_to_jsonable))
        return

    # One-time table header.
    if not getattr(self, "_cper_display_initialized", False):
        print(f"{'timestamp':<20} {'gpu_id':<7} {'severity':<12} {'file_name':<17}")
        self._cper_display_initialized = True

    folder = Path(folder)
    folder.mkdir(parents=True, exist_ok=True)  # Ensure folder exists

    for entry_index, entry in enumerate(entries.values()):
        error_severity = entry.get("error_severity", "Unknown")
        notify_type = entry.get("notify_type", "Unknown")

        if error_severity == "non_fatal_uncorrected":
            prefix = "uncorrected"
        elif error_severity == "non_fatal_corrected":
            prefix = "corrected"
        elif error_severity == "fatal":
            prefix = "boot" if notify_type == "BOOT" else "fatal"
        else:
            # Fixed fallback so an unrecognised severity cannot leave
            # `prefix` unbound or carried over from the previous entry.
            prefix = "unknown"

        # Both files share the running counter so they stay paired.
        count = self.get_cper_count()
        cper_data_file = f"{prefix}_{count}.cper"
        output_path = folder / f"{prefix}_{count}.json"

        raw = cper_data[entry_index]
        self.write_binary(raw["bytes"], raw["size"], folder / cper_data_file)

        timestamp = entry.get("timestamp", "unknown")
        gpu_id = self.get_gpu_id_from_device_handle(device_handle)
        print(f"{timestamp:<20} {gpu_id:<7} {prefix:<12} {cper_data_file:<17}")

        try:
            logging.debug("Writing entry %s: %s to %s", count, entry, output_path)
            # Serialise first so a failure does not leave an empty file.
            payload = json.dumps(entry, indent=2, default=_to_jsonable)
            output_path.write_text(payload, encoding="utf-8")
        except (OSError, TypeError, ValueError) as e:
            logging.error("Failed to write entry %s to %s: %s", count, output_path, e)

        self.increment_cper_count()
|
||||
|
||||
|
||||
def dump_all_entries(self, folder, entries, cper_data, device_handle):
    """Write each CPER entry to ``folder`` and print a table row for it.

    Without a folder the entries are printed as one JSON document and
    nothing is written. With a folder, each entry is stored as
    ``<severity>_<count>.cper`` (raw record) and ``<severity>_<count>.json``
    (decoded header).

    Args:
        folder: Destination directory (created if missing), or a falsy
            value to print JSON instead.
        entries: Mapping of CPER entries; each value is a dict read for
            ``error_severity``, ``notify_type`` and ``timestamp``.
        cper_data: Sequence aligned with ``entries``; each item provides
            ``"bytes"`` and ``"size"`` for the raw record.
        device_handle: Handle used to look up the GPU id shown in each row.
    """
    def _to_jsonable(obj):
        # Called by json.dumps for values it cannot encode natively.
        if isinstance(obj, bytes):
            return obj.decode("utf-8", errors="replace")
        return str(obj)

    if not folder:
        # Pure JSON output: no table header in front of it.
        print(json.dumps(entries, indent=2, default=_to_jsonable))
        return

    # One-time table header.
    if not getattr(self, "_cper_display_initialized", False):
        print(f"{'timestamp':<20} {'gpu_id':<7} {'severity':<12} {'file_name':<17}")
        self._cper_display_initialized = True

    folder = Path(folder)
    folder.mkdir(parents=True, exist_ok=True)  # Ensure folder exists

    for entry_index, entry in enumerate(entries.values()):
        error_severity = entry.get("error_severity", "Unknown")
        notify_type = entry.get("notify_type", "Unknown")

        if error_severity == "non_fatal_uncorrected":
            prefix = "uncorrected"
        elif error_severity == "non_fatal_corrected":
            prefix = "corrected"
        elif error_severity == "fatal":
            prefix = "boot" if notify_type == "BOOT" else "fatal"
        else:
            # Fixed fallback so an unrecognised severity cannot leave
            # `prefix` unbound or carried over from the previous entry.
            prefix = "unknown"

        # Both files share the running counter so they stay paired.
        count = self.get_cper_count()
        cper_data_file = f"{prefix}_{count}.cper"
        output_path = folder / f"{prefix}_{count}.json"

        raw = cper_data[entry_index]
        self.write_binary(raw["bytes"], raw["size"], folder / cper_data_file)

        timestamp = entry.get("timestamp", "unknown")
        gpu_id = self.get_gpu_id_from_device_handle(device_handle)
        print(f"{timestamp:<20} {gpu_id:<7} {prefix:<12} {cper_data_file:<17}")

        try:
            logging.debug("Writing entry %s: %s to %s", count, entry, output_path)
            # Serialise first so a failure does not leave an empty file.
            payload = json.dumps(entry, indent=2, default=_to_jsonable)
            output_path.write_text(payload, encoding="utf-8")
        except (OSError, TypeError, ValueError) as e:
            logging.error("Failed to write entry %s to %s: %s", count, output_path, e)

        self.increment_cper_count()
|
||||
|
||||
|
||||
def dump_all_entries_follow(self, folder, entries, cper_data, device_handle):
    """Write each CPER entry to ``folder`` while following, printing a row each.

    Without a folder the entries are printed as one JSON document and
    nothing is written. With a folder, each entry is stored as
    ``<severity>_<count>.cper`` (raw record) and ``<severity>_<count>.json``
    (decoded header). Entries are written immediately; this method does
    not sleep between entries.

    Args:
        folder: Destination directory (created if missing), or a falsy
            value to print JSON instead.
        entries: Mapping of CPER entries; each value is a dict read for
            ``error_severity``, ``notify_type`` and ``timestamp``.
        cper_data: Sequence aligned with ``entries``; each item provides
            ``"bytes"`` and ``"size"`` for the raw record.
        device_handle: Handle used to look up the GPU id shown in each row.
    """
    def _to_jsonable(obj):
        # Called by json.dumps for values it cannot encode natively.
        if isinstance(obj, bytes):
            return obj.decode("utf-8", errors="replace")
        return str(obj)

    if not folder:
        # Pure JSON output: no table header in front of it.
        print(json.dumps(entries, indent=2, default=_to_jsonable))
        return

    # One-time table header.
    if not getattr(self, "_cper_display_initialized", False):
        print(f"{'timestamp':<20} {'gpu_id':<7} {'severity':<12} {'file_name':<17}")
        self._cper_display_initialized = True

    folder = Path(folder)
    folder.mkdir(parents=True, exist_ok=True)  # Ensure folder exists

    for entry_index, entry in enumerate(entries.values()):
        error_severity = entry.get("error_severity", "Unknown")
        notify_type = entry.get("notify_type", "Unknown")

        if error_severity == "non_fatal_uncorrected":
            prefix = "uncorrected"
        elif error_severity == "non_fatal_corrected":
            prefix = "corrected"
        elif error_severity == "fatal":
            prefix = "boot" if notify_type == "BOOT" else "fatal"
        else:
            # Fixed fallback so an unrecognised severity cannot leave
            # `prefix` unbound or carried over from the previous entry.
            prefix = "unknown"

        # Both files share the running counter so they stay paired.
        count = self.get_cper_count()
        cper_data_file = f"{prefix}_{count}.cper"
        output_path = folder / f"{prefix}_{count}.json"

        raw = cper_data[entry_index]
        self.write_binary(raw["bytes"], raw["size"], folder / cper_data_file)

        timestamp = entry.get("timestamp", "unknown")
        gpu_id = self.get_gpu_id_from_device_handle(device_handle)
        print(f"{timestamp:<20} {gpu_id:<7} {prefix:<12} {cper_data_file:<17}")

        try:
            logging.debug("Writing entry %s: %s to %s", count, entry, output_path)
            # Serialise first so a failure does not leave an empty file.
            payload = json.dumps(entry, indent=2, default=_to_jsonable)
            output_path.write_text(payload, encoding="utf-8")
        except (OSError, TypeError, ValueError) as e:
            logging.error("Failed to write entry %s to %s: %s", count, output_path, e)

        self.increment_cper_count()
|
||||
|
||||
|
||||
def dump_gpu_entries_follow(self, folder, entries, cper_data, device_handle):
    """Write each CPER entry for one GPU to ``folder`` while following.

    Without a folder the entries are printed as one JSON document and
    nothing is written. With a folder, each entry is stored as
    ``<severity>_<count>.cper`` (raw record) and ``<severity>_<count>.json``
    (decoded header), and a table row is printed for it. Entries are
    written immediately; this method does not sleep between entries.

    Args:
        folder: Destination directory (created if missing), or a falsy
            value to print JSON instead.
        entries: Mapping of CPER entries; each value is a dict read for
            ``error_severity``, ``notify_type`` and ``timestamp``.
        cper_data: Sequence aligned with ``entries``; each item provides
            ``"bytes"`` and ``"size"`` for the raw record.
        device_handle: Handle used to look up the GPU id shown in each row.
    """
    def _to_jsonable(obj):
        # Called by json.dumps for values it cannot encode natively.
        if isinstance(obj, bytes):
            return obj.decode("utf-8", errors="replace")
        return str(obj)

    if not folder:
        # Pure JSON output: no table header in front of it.
        print(json.dumps(entries, indent=2, default=_to_jsonable))
        return

    # One-time table header.
    if not getattr(self, "_cper_display_initialized", False):
        print(f"{'timestamp':<20} {'gpu_id':<7} {'severity':<12} {'file_name':<17}")
        self._cper_display_initialized = True

    folder = Path(folder)
    folder.mkdir(parents=True, exist_ok=True)  # Ensure folder exists

    for entry_index, entry in enumerate(entries.values()):
        error_severity = entry.get("error_severity", "Unknown")
        notify_type = entry.get("notify_type", "Unknown")

        if error_severity == "non_fatal_uncorrected":
            prefix = "uncorrected"
        elif error_severity == "non_fatal_corrected":
            prefix = "corrected"
        elif error_severity == "fatal":
            prefix = "boot" if notify_type == "BOOT" else "fatal"
        else:
            # Fixed fallback so an unrecognised severity cannot leave
            # `prefix` unbound or carried over from the previous entry.
            prefix = "unknown"

        # Both files share the running counter so they stay paired.
        count = self.get_cper_count()
        cper_data_file = f"{prefix}_{count}.cper"
        output_path = folder / f"{prefix}_{count}.json"

        raw = cper_data[entry_index]
        self.write_binary(raw["bytes"], raw["size"], folder / cper_data_file)

        timestamp = entry.get("timestamp", "unknown")
        gpu_id = self.get_gpu_id_from_device_handle(device_handle)
        print(f"{timestamp:<20} {gpu_id:<7} {prefix:<12} {cper_data_file:<17}")

        try:
            logging.debug("Writing entry %s: %s to %s", count, entry, output_path)
            # Serialise first so a failure does not leave an empty file.
            payload = json.dumps(entry, indent=2, default=_to_jsonable)
            output_path.write_text(payload, encoding="utf-8")
        except (OSError, TypeError, ValueError) as e:
            logging.error("Failed to write entry %s to %s: %s", count, output_path, e)

        self.increment_cper_count()
|
||||
|
||||
|
||||
def hexdump_to_string(self, data: Union[bytes, List[int]]) -> str:
|
||||
"""
|
||||
Convert binary data to a hexdump string.
|
||||
@@ -1435,4 +1261,107 @@ class AMDSMIHelpers():
|
||||
ascii_values = "".join(chr(b) if 32 <= b <= 126 else "." for b in chunk)
|
||||
lines.append(f"{offset:08x} {hex_values} |{ascii_values}|")
|
||||
|
||||
return "\n".join(lines)
|
||||
return "\n".join(lines)
|
||||
|
||||
def pvtDumpAfids(self, cper_file):
    """Print the AFIDs decoded from a CPER record on a single line.

    Args:
        cper_file: The CPER payload, given as an object with a ``read()``
            method, a ``pathlib.Path``, a filename string, or the raw
            bytes themselves.
    """
    # Normalise every accepted input form to the raw CPER contents.
    if hasattr(cper_file, "read"):
        raw = cper_file.read()
    elif isinstance(cper_file, Path):
        raw = cper_file.read_bytes()
    elif isinstance(cper_file, str):
        with open(cper_file, "rb") as f:
            raw = f.read()
    else:
        # Anything else is assumed to already be the raw bytes.
        raw = cper_file

    # Only the AFID list is printed; the returned count is not needed.
    afids, _ = amdsmi_interface.amdsmi_get_afids_from_cper(raw)
    # Output format is unchanged: every AFID is followed by one space.
    print("AFIDS: " + "".join(f"{afid} " for afid in afids))
|
||||
|
||||
def ras_cper(self, args, device_handle, logger, gpu_idx):
    """Retrieve pending CPER entries for one GPU and dump or display them.

    Args:
        args: Parsed CLI arguments. Reads ``severity``, ``follow``,
            ``folder`` and ``file_limit``, and reads and updates the
            per-GPU cursor in ``args.cursor[gpu_idx]``.
        device_handle: Handle of the GPU to query.
        logger: Object providing ``set_cper_exit_message``.
        gpu_idx: Index of this GPU's cursor inside ``args.cursor``.

    Raises:
        PermissionError: The library reported ``AMDSMI_STATUS_NO_PERM``.
        FileNotFoundError: The library reported
            ``AMDSMI_STATUS_NOT_SUPPORTED`` or
            ``AMDSMI_STATUS_FILE_NOT_FOUND``.
        FileExistsError: The library reported ``AMDSMI_STATUS_FILE_ERROR``.
    """
    # Bit positions: NON_FATAL_UNCORRECTED is 0, FATAL is 1 and
    # NON_FATAL_CORRECTED is 2.
    severity_bits = {
        "all": (1 << 0) | (1 << 1) | (1 << 2),
        "fatal": 1 << 1,
        "nonfatal": 1 << 0,
        "nonfatal-uncorrected": 1 << 0,
        "nonfatal-corrected": 1 << 2,
        "corrected": 1 << 2,
    }
    severity_mask = 0
    logging.debug(args)
    # set() drops duplicate severities; unknown names add no bits.
    for sev in set(args.severity):
        severity_mask |= severity_bits.get(sev, 0)

    buffer_size = 1048576

    gpu_id = self.get_gpu_id_from_device_handle(device_handle)
    if args.follow and not getattr(self, "_cper_follow_prompted", False):
        print("Press CTRL + C to stop.")
        self._cper_follow_prompted = True

    # Stays -1 when the lookup fails, so such a GPU is skipped below too.
    partition_id = -1
    try:
        kfd_info = amdsmi_interface.amdsmi_get_gpu_kfd_info(device_handle)
        partition_id = kfd_info['current_partition_id']
    except amdsmi_exception.AmdSmiLibraryException as e:
        logging.debug("Failed to get kfd info for gpu %s | %s", gpu_id, e.get_error_info())

    if partition_id != 0:
        logging.debug(f"Skipping gpu {gpu_id} on non zero partition {partition_id}")
        return

    if args.folder and not getattr(self, "_cper_folder_prompted", False):
        print(f"Dumping CPER file header entries in folder {args.folder}")
        self._cper_folder_prompted = True

    logger.set_cper_exit_message(False)
    self.stop = False

    status = amdsmi_interface.amdsmi_wrapper
    while True:
        try:
            entries, new_cursor, cper_data = amdsmi_interface.amdsmi_get_gpu_cper_entries(
                device_handle, severity_mask, buffer_size, args.cursor[gpu_idx])
            logging.debug(f"cper_entries | entries: {entries}")
        except amdsmi_exception.AmdSmiLibraryException as e:
            code = e.get_error_code()
            if code == status.AMDSMI_STATUS_NO_PERM:
                raise PermissionError('Error opening CPER file. This command requires elevation') from e
            if code in (status.AMDSMI_STATUS_NOT_SUPPORTED, status.AMDSMI_STATUS_FILE_NOT_FOUND):
                raise FileNotFoundError('Error accessing CPER files. This command requires CPER to be enabled.') from e
            if code == status.AMDSMI_STATUS_FILE_ERROR:
                raise FileExistsError('Error opening CPER file. Unable to read CPER File') from e
            # Any other library error ends polling for this GPU quietly.
            logging.debug(f"Error retrieving CPER entries: {e}")
            break

        # Remember the position so the next call resumes after this batch.
        args.cursor[gpu_idx] = new_cursor
        if len(entries) == 0:
            break

        if args.folder:
            self.dump_cper_entries(args.folder, entries, cper_data, device_handle, args.file_limit)
            # NOTE(review): the flattened source does not show how deeply
            # the original `break` was nested; it is read here as "stop
            # after the first batch when not following" - confirm against
            # the repository.
            if not args.follow:
                break
        else:
            self.display_cper_files_generated(entries, device_handle, args.folder, args.follow)
|
||||
|
||||
@@ -1397,7 +1397,7 @@ class AMDSMIParser(argparse.ArgumentParser):
|
||||
Adds the 'ras' subcommand.
|
||||
|
||||
Expected command:
|
||||
amd-smi ras --cper --severity=nonfatal-uncorrected,fatal --folder <folder_name> --file_limit=1000 --follow
|
||||
amd-smi ras --cper --severity=nonfatal-uncorrected,fatal --folder <folder_name> --file-limit=1000 --follow
|
||||
|
||||
All parameters are provided via options; no positional arguments or optional --file/--gpu are used.
|
||||
"""
|
||||
@@ -1431,8 +1431,8 @@ class AMDSMIParser(argparse.ArgumentParser):
|
||||
ras_parser.add_argument("--afid", action="store_true", required=False, help=afid_help)
|
||||
ras_parser.add_argument("--severity", type=str.lower, nargs='+', default=['all'], help=severity_help, choices=severity_choices, metavar='SEVERITY')
|
||||
ras_parser.add_argument("--folder", type=str, action=self._check_folder_path(), default=False, help=folder_help)
|
||||
ras_parser.add_argument("--file_limit", type=self._positive_int, action='store', default=1000, help=file_limit_help)
|
||||
ras_parser.add_argument("--cper_file", action=self._check_cper_file_path(), metavar="CPER_FILE", help=cper_file_help)
|
||||
ras_parser.add_argument("--file-limit", type=self._positive_int, action='store', default=1000, help=file_limit_help)
|
||||
ras_parser.add_argument("--cper-file", action=self._check_cper_file_path(), metavar="CPER_FILE", help=cper_file_help)
|
||||
ras_parser.add_argument("--follow", action="store_true", default=False, help=follow_help)
|
||||
|
||||
# Add common modifiers and device selection arguments.
|
||||
|
||||
@@ -767,7 +767,7 @@ Displays RAS information of specified devices.
|
||||
```shell-session
|
||||
~$ amd-smi ras --help
|
||||
usage: amd-smi ras [-h] --cper [--severity SEVERITY [SEVERITY ...]] [--folder FOLDER]
|
||||
[--file_limit FILE_LIMIT] [--follow]
|
||||
[--file-limit FILE_LIMIT] [--follow]
|
||||
[-g GPU [GPU ...] | -U CPU [CPU ...] | -O CORE [CORE ...]]
|
||||
[--json | --csv] [--file FILE] [--loglevel LEVEL]
|
||||
|
||||
@@ -781,7 +781,7 @@ RAS arguments:
|
||||
--severity SEVERITY [SEVERITY ...] Set the SEVERITY filters from the following:
|
||||
nonfatal-uncorrected, fatal, nonfatal-corrected, all
|
||||
--folder FOLDER Folder to dump CPER report files
|
||||
--file_limit FILE_LIMIT Maximum number of entries per output file
|
||||
--file-limit FILE_LIMIT Maximum number of CPER files kept in the dump folder (oldest files are removed first)
|
||||
--follow Continuously monitor for new entries
|
||||
|
||||
Device arguments:
|
||||
|
||||
@@ -34,9 +34,10 @@ endif()
|
||||
# add package search paths
|
||||
set(CMAKE_PREFIX_PATH ${CMAKE_PREFIX_PATH} ${ROCM_DIR} ../../../)
|
||||
set(CMAKE_LIBRARY_PATH ${CMAKE_LIBRARY_PATH} ${ROCM_DIR}/lib)
|
||||
if(EXISTS ${ROCM_DIR}/lib64)
|
||||
set(CMAKE_LIBRARY_PATH ${CMAKE_LIBRARY_PATH} ${ROCM_DIR}/lib64)
|
||||
if(NOT EXISTS ${ROCM_DIR}/lib64)
|
||||
file(MAKE_DIRECTORY ${ROCM_DIR}/lib64)
|
||||
endif()
|
||||
set(CMAKE_LIBRARY_PATH ${CMAKE_LIBRARY_PATH} ${ROCM_DIR}/lib64)
|
||||
find_package(amd_smi CONFIG REQUIRED)
|
||||
|
||||
message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&")
|
||||
|
||||
Ссылка в новой задаче
Block a user