[SWDEV-528364] CPER CLI --follow fix (#298)
Signed-off-by: Yazen ALMusaffar <yalmusaf@amd.com>
[ROCm/amdsmi commit: f1f782312d]
Этот коммит содержится в:
коммит произвёл
GitHub
родитель
36a8775ddd
Коммит
5c59f20f22
@@ -6320,26 +6320,28 @@ class AMDSMICommands():
|
||||
elif sev in ("nonfatal-corrected", "corrected"):
|
||||
# Set bit corresponding to AMDSMI_CPER_SEV_NON_FATAL_CORRECTED (which is 2)
|
||||
severity_mask |= (1 << 2)
|
||||
|
||||
|
||||
if args.cper:
|
||||
# Start from cursor 0 (no timestamp argument provided).
|
||||
cursor = 0
|
||||
buffer_size = 1048576
|
||||
file_limit = int(args.file_limit) if args.file_limit else 1000
|
||||
|
||||
# Print exit message only once and only when follow is set
|
||||
if self.logger.cper_exit_message() and args.follow:
|
||||
print('Press q and hit ENTER when you want to stop.')
|
||||
self.logger.set_cper_exit_message(False)
|
||||
|
||||
|
||||
# Main loop: continuously retrieve CPER entries if --follow is set.
|
||||
gpu_id = self.helpers.get_gpu_id_from_device_handle(args.gpu)
|
||||
if args.folder:
|
||||
print(f'Dumping CPER file header entries for GPU {gpu_id} in folder {args.folder}\n')
|
||||
else:
|
||||
print(f'Dumping CPER file header entries for GPU {gpu_id}:\n')
|
||||
|
||||
# Print header only when dumping to a folder
|
||||
if args.follow and not getattr(self, "_cper_follow_prompted", False):
|
||||
print("Press CTRL + C to stop.")
|
||||
self._cper_follow_prompted = True
|
||||
if args.folder and args.gpu:
|
||||
print(f"Dumping CPER file header entries for GPU {gpu_id} in folder {args.folder}")
|
||||
elif args.folder:
|
||||
print(f"Dumping CPER file header entries in folder {args.folder}")
|
||||
|
||||
self.logger.set_cper_exit_message(False)
|
||||
self.stop = False
|
||||
|
||||
while True:
|
||||
try:
|
||||
entries, new_cursor, cper_data = amdsmi_interface.amdsmi_get_gpu_cper_entries(
|
||||
@@ -6356,17 +6358,30 @@ class AMDSMICommands():
|
||||
else:
|
||||
logging.debug(f"Error retrieving CPER entries: {e}")
|
||||
break
|
||||
if entries:
|
||||
self.helpers.dump_entries(args.folder, entries, cper_data)
|
||||
# Dump or display
|
||||
if args.folder:
|
||||
if args.gpu and not args.follow:
|
||||
self.helpers.dump_gpu_entries(args.folder, entries, cper_data, args.gpu)
|
||||
break
|
||||
elif not args.gpu and not args.follow:
|
||||
self.helpers.dump_all_entries(args.folder, entries, cper_data, args.gpu)
|
||||
break
|
||||
elif args.follow and args.gpu:
|
||||
self.helpers.dump_gpu_entries_follow(args.folder, entries, cper_data, args.gpu)
|
||||
break
|
||||
elif args.follow and not args.gpu:
|
||||
self.helpers.dump_all_entries_follow(args.folder, entries, cper_data, args.gpu)
|
||||
break
|
||||
if args.follow:
|
||||
self.helpers.display_cper_files_generated_follow(entries, args.gpu)
|
||||
break
|
||||
else:
|
||||
self.helpers.display_cper_files_generated(entries, args.gpu)
|
||||
break
|
||||
if len(entries) == 0 or not args.follow:
|
||||
break
|
||||
cursor = new_cursor
|
||||
time.sleep(5)
|
||||
user_input = input()
|
||||
if user_input == 'q':
|
||||
print("Escape Sequence Detected; Exiting")
|
||||
self.stop = True
|
||||
break
|
||||
|
||||
|
||||
def _event_thread(self, commands, i):
|
||||
|
||||
@@ -1081,7 +1081,6 @@ class AMDSMIHelpers():
|
||||
print(msg)
|
||||
logging.warning(msg)
|
||||
|
||||
|
||||
def write_binary(self, data, size, filepath):
|
||||
"""
|
||||
Writes binary data directly to a file.
|
||||
@@ -1103,8 +1102,93 @@ class AMDSMIHelpers():
|
||||
data_bytes = data[:size]
|
||||
f.write(data_bytes)
|
||||
|
||||
def display_cper_files_generated_follow(self, entries, device_handle, delay=1):
    """Print one table row per CPER entry, pausing after each row.

    No file is written here; each row only shows the ``.cper`` file name the
    entry would receive, built from the severity prefix and the running CPER
    counter.

    Args:
        entries: Mapping whose values are per-entry dicts; the keys
            ``error_severity``, ``notify_type`` and ``timestamp`` are read
            with ``.get`` and may be absent.
        device_handle: Handle passed to ``get_gpu_id_from_device_handle`` to
            obtain the value printed in the ``gpu_id`` column.
        delay: Seconds to sleep after each printed row. Defaults to 1, the
            value that used to be hard-coded.
    """
    # One-time initialization: the warning and the column header are emitted
    # only on the first call for this helper instance.
    if not getattr(self, "_cper_display_initialized", False):
        # Warning if no folder was specified elsewhere
        if not getattr(self, "_cper_warning_printed", False):
            YELLOW = "\033[33m"
            RED = "\033[31m"
            RESET = "\033[0m"
            print(f"{YELLOW}WARNING:{RESET} {RED}No{RESET} cper files will be dumped unless --folder=<folder_name> is specified.")
            self._cper_warning_printed = True

        # Header
        print(f"{'timestamp':<20} {'gpu_id':<6} {'severity':<10} {'file_name'}")
        self._cper_display_initialized = True

    severity_prefixes = {
        "non_fatal_uncorrected": "uncorrected",
        "non_fatal_corrected": "corrected",
        "fatal": "fatal",
    }

    for entry in entries.values():
        # BOOT notifications override the severity-derived prefix. An
        # unrecognised severity falls back to "unknown" so the prefix is
        # always bound and never leaks over from the previous entry.
        if entry.get("notify_type", "Unknown") == "BOOT":
            prefix = "boot"
        else:
            prefix = severity_prefixes.get(entry.get("error_severity", "Unknown"), "unknown")

        cper_data_file = f"{prefix}_{self.get_cper_count()}.cper"

        timestamp = entry.get("timestamp", "unknown")
        gpu_id = self.get_gpu_id_from_device_handle(device_handle)
        print(f"{timestamp:<20} {gpu_id:<6} {prefix:<10} {cper_data_file}")
        self.increment_cper_count()
        time.sleep(delay)
|
||||
|
||||
|
||||
def display_cper_files_generated(self, entries, device_handle):
    """Print one table row per CPER entry without writing any files.

    Each row shows the timestamp, GPU id, severity prefix and the ``.cper``
    file name the entry would receive, built from the severity prefix and the
    running CPER counter. The counter is advanced once per entry.

    Args:
        entries: Mapping whose values are per-entry dicts; the keys
            ``error_severity``, ``notify_type`` and ``timestamp`` are read
            with ``.get`` and may be absent.
        device_handle: Handle passed to ``get_gpu_id_from_device_handle`` to
            obtain the value printed in the ``gpu_id`` column.
    """
    # One-time initialization: the warning and the column header are emitted
    # only on the first call for this helper instance.
    if not getattr(self, "_cper_display_initialized", False):
        # Warning if no folder was specified elsewhere
        if not getattr(self, "_cper_warning_printed", False):
            YELLOW = "\033[33m"
            RED = "\033[31m"
            RESET = "\033[0m"
            print(f"{YELLOW}WARNING:{RESET} {RED}No{RESET} cper files will be dumped unless --folder=<folder_name> is specified.")
            self._cper_warning_printed = True

        # Header
        print(f"{'timestamp':<20} {'gpu_id':<6} {'severity':<10} {'file_name'}")
        self._cper_display_initialized = True

    severity_prefixes = {
        "non_fatal_uncorrected": "uncorrected",
        "non_fatal_corrected": "corrected",
        "fatal": "fatal",
    }

    for entry in entries.values():
        # BOOT notifications override the severity-derived prefix. An
        # unrecognised severity falls back to "unknown" so the prefix is
        # always bound and never leaks over from the previous entry.
        if entry.get("notify_type", "Unknown") == "BOOT":
            prefix = "boot"
        else:
            prefix = severity_prefixes.get(entry.get("error_severity", "Unknown"), "unknown")

        cper_data_file = f"{prefix}_{self.get_cper_count()}.cper"

        timestamp = entry.get("timestamp", "unknown")
        gpu_id = self.get_gpu_id_from_device_handle(device_handle)
        print(f"{timestamp:<20} {gpu_id:<6} {prefix:<10} {cper_data_file}")
        self.increment_cper_count()
|
||||
|
||||
|
||||
def dump_gpu_entries(self, folder, entries, cper_data, device_handle):
|
||||
# Header
|
||||
print(f"{'timestamp':<20} {'gpu_id':<6} {'severity':<10} {'file_name'}")
|
||||
self._cper_display_initialized = True
|
||||
|
||||
|
||||
def dump_entries(self, folder, entries, cper_data):
|
||||
if folder:
|
||||
folder = Path(folder)
|
||||
folder.mkdir(parents=True, exist_ok=True) # Ensure folder exists
|
||||
@@ -1132,6 +1216,174 @@ class AMDSMIHelpers():
|
||||
cper_data_file_path = folder / cper_data_file
|
||||
self.write_binary(cper_data[entry_index]["bytes"], cper_data[entry_index]["size"], cper_data_file_path)
|
||||
|
||||
#print header
|
||||
timestamp = entry.get("timestamp", "unknown")
|
||||
gpu_id = self.get_gpu_id_from_device_handle(device_handle)
|
||||
print(f"{timestamp:<20} {gpu_id:<6} {prefix:<10} {cper_data_file}")
|
||||
self.increment_cper_count()
|
||||
|
||||
|
||||
try:
|
||||
with output_path.open("w") as f:
|
||||
logging.debug(f"Writing entry {self.get_cper_count()}: {entry} to {output_path}")
|
||||
# Dump the single entry as JSON, handling bytes via the lambda.
|
||||
f.write(json.dumps(entry, indent=2,
|
||||
default=lambda o: o.decode('utf-8') if isinstance(o, bytes) else o))
|
||||
|
||||
|
||||
|
||||
except Exception as e:
|
||||
logging.error(f"Failed to write entry {self.get_cper_count()} to {output_path}: {e}")
|
||||
else:
|
||||
print(json.dumps(entries, indent=2,
|
||||
default=lambda o: o.decode('utf-8') if isinstance(o, bytes) else o))
|
||||
|
||||
|
||||
def dump_all_entries(self, folder, entries, cper_data, device_handle):
    """Write every CPER entry to ``folder`` and print one table row per entry.

    For each entry two files are produced, sharing the stem
    ``<prefix>_<count>``: a ``.cper`` file written through ``write_binary``
    and a ``.json`` file holding the entry itself. When ``folder`` is falsy
    nothing is written and ``entries`` is printed as JSON instead.

    Args:
        folder: Destination directory (created if missing), or a falsy value
            to print the entries as JSON.
        entries: Mapping whose values are per-entry dicts; the keys
            ``error_severity``, ``notify_type`` and ``timestamp`` are read
            with ``.get`` and may be absent.
        cper_data: Sequence indexed by the entry's position in ``entries``;
            each item provides ``"bytes"`` and ``"size"``.
        device_handle: Handle passed to ``get_gpu_id_from_device_handle`` to
            obtain the value printed in the ``gpu_id`` column.
    """
    # Header
    print(f"{'timestamp':<20} {'gpu_id':<6} {'severity':<10} {'file_name'}")
    self._cper_display_initialized = True

    if not folder:
        print(json.dumps(entries, indent=2,
                         default=lambda o: o.decode('utf-8') if isinstance(o, bytes) else o))
        return

    folder = Path(folder)
    folder.mkdir(parents=True, exist_ok=True)  # Ensure folder exists

    severity_prefixes = {
        "non_fatal_uncorrected": "uncorrected",
        "non_fatal_corrected": "corrected",
        "fatal": "fatal",
    }

    for entry_index, entry in enumerate(entries.values()):
        # BOOT notifications override the severity-derived prefix. An
        # unrecognised severity falls back to "unknown" so the prefix is
        # always bound and never leaks over from the previous entry.
        if entry.get("notify_type", "Unknown") == "BOOT":
            prefix = "boot"
        else:
            prefix = severity_prefixes.get(entry.get("error_severity", "Unknown"), "unknown")

        # Capture the counter once so both file names and the log messages
        # refer to the same entry number even after the increment below.
        count = self.get_cper_count()
        output_path = folder / f"{prefix}_{count}.json"
        cper_data_file = f"{prefix}_{count}.cper"
        raw = cper_data[entry_index]
        self.write_binary(raw["bytes"], raw["size"], folder / cper_data_file)

        timestamp = entry.get("timestamp", "unknown")
        gpu_id = self.get_gpu_id_from_device_handle(device_handle)
        print(f"{timestamp:<20} {gpu_id:<6} {prefix:<10} {cper_data_file}")
        self.increment_cper_count()

        # Best effort: a failed JSON write is logged and the loop continues.
        try:
            with output_path.open("w") as f:
                logging.debug(f"Writing entry {count}: {entry} to {output_path}")
                # Dump the single entry as JSON, handling bytes via the lambda.
                f.write(json.dumps(entry, indent=2,
                                   default=lambda o: o.decode('utf-8') if isinstance(o, bytes) else o))
        except Exception as e:
            logging.error(f"Failed to write entry {count} to {output_path}: {e}")
|
||||
|
||||
|
||||
def dump_all_entries_follow(self, folder, entries, cper_data, device_handle, delay=1):
    """Write every CPER entry to ``folder``, pausing after each printed row.

    For each entry two files are produced, sharing the stem
    ``<prefix>_<count>``: a ``.cper`` file written through ``write_binary``
    and a ``.json`` file holding the entry itself. When ``folder`` is falsy
    nothing is written and ``entries`` is printed as JSON instead.

    Args:
        folder: Destination directory (created if missing), or a falsy value
            to print the entries as JSON.
        entries: Mapping whose values are per-entry dicts; the keys
            ``error_severity``, ``notify_type`` and ``timestamp`` are read
            with ``.get`` and may be absent.
        cper_data: Sequence indexed by the entry's position in ``entries``;
            each item provides ``"bytes"`` and ``"size"``.
        device_handle: Handle passed to ``get_gpu_id_from_device_handle`` to
            obtain the value printed in the ``gpu_id`` column.
        delay: Seconds to sleep after each printed row. Defaults to 1, the
            value that used to be hard-coded.
    """
    # Header
    print(f"{'timestamp':<20} {'gpu_id':<6} {'severity':<10} {'file_name'}")
    self._cper_display_initialized = True

    if not folder:
        print(json.dumps(entries, indent=2,
                         default=lambda o: o.decode('utf-8') if isinstance(o, bytes) else o))
        return

    folder = Path(folder)
    folder.mkdir(parents=True, exist_ok=True)  # Ensure folder exists

    severity_prefixes = {
        "non_fatal_uncorrected": "uncorrected",
        "non_fatal_corrected": "corrected",
        "fatal": "fatal",
    }

    for entry_index, entry in enumerate(entries.values()):
        # BOOT notifications override the severity-derived prefix. An
        # unrecognised severity falls back to "unknown" so the prefix is
        # always bound and never leaks over from the previous entry.
        if entry.get("notify_type", "Unknown") == "BOOT":
            prefix = "boot"
        else:
            prefix = severity_prefixes.get(entry.get("error_severity", "Unknown"), "unknown")

        # Capture the counter once so both file names and the log messages
        # refer to the same entry number even after the increment below.
        count = self.get_cper_count()
        output_path = folder / f"{prefix}_{count}.json"
        cper_data_file = f"{prefix}_{count}.cper"
        raw = cper_data[entry_index]
        self.write_binary(raw["bytes"], raw["size"], folder / cper_data_file)

        timestamp = entry.get("timestamp", "unknown")
        gpu_id = self.get_gpu_id_from_device_handle(device_handle)
        print(f"{timestamp:<20} {gpu_id:<6} {prefix:<10} {cper_data_file}")
        self.increment_cper_count()
        time.sleep(delay)

        # Best effort: a failed JSON write is logged and the loop continues.
        try:
            with output_path.open("w") as f:
                logging.debug(f"Writing entry {count}: {entry} to {output_path}")
                # Dump the single entry as JSON, handling bytes via the lambda.
                f.write(json.dumps(entry, indent=2,
                                   default=lambda o: o.decode('utf-8') if isinstance(o, bytes) else o))
        except Exception as e:
            logging.error(f"Failed to write entry {count} to {output_path}: {e}")
|
||||
|
||||
|
||||
def dump_gpu_entries_follow(self, folder, entries, cper_data, device_handle):
|
||||
# Header
|
||||
print(f"{'timestamp':<20} {'gpu_id':<6} {'severity':<10} {'file_name'}")
|
||||
self._cper_display_initialized = True
|
||||
|
||||
|
||||
if folder:
|
||||
folder = Path(folder)
|
||||
folder.mkdir(parents=True, exist_ok=True) # Ensure folder exists
|
||||
|
||||
# Loop through all entries in the dictionary.
|
||||
for entry_index, entry in enumerate(entries.values()):
|
||||
# Assume 'entry' is a dictionary with keys: "error_severity" and "notify_type".
|
||||
error_severity = entry.get("error_severity", "Unknown")
|
||||
notify_type = entry.get("notify_type", "Unknown")
|
||||
|
||||
if error_severity == "non_fatal_uncorrected":
|
||||
prefix = "uncorrected"
|
||||
elif error_severity == "non_fatal_corrected":
|
||||
prefix = "corrected"
|
||||
elif error_severity == "fatal":
|
||||
prefix = "fatal"
|
||||
if notify_type == "BOOT":
|
||||
prefix = "boot"
|
||||
|
||||
# Construct a unique file name using the key to avoid overwriting
|
||||
entry_file = f"{prefix}_{self.get_cper_count()}.json"
|
||||
output_path = folder / entry_file
|
||||
|
||||
cper_data_file = f"{prefix}_{self.get_cper_count()}.cper"
|
||||
cper_data_file_path = folder / cper_data_file
|
||||
self.write_binary(cper_data[entry_index]["bytes"], cper_data[entry_index]["size"], cper_data_file_path)
|
||||
|
||||
#print header
|
||||
timestamp = entry.get("timestamp", "unknown")
|
||||
gpu_id = self.get_gpu_id_from_device_handle(device_handle)
|
||||
print(f"{timestamp:<20} {gpu_id:<6} {prefix:<10} {cper_data_file}")
|
||||
self.increment_cper_count()
|
||||
time.sleep(1)
|
||||
|
||||
try:
|
||||
with output_path.open("w") as f:
|
||||
logging.debug(f"Writing entry {self.get_cper_count()}: {entry} to {output_path}")
|
||||
@@ -1143,4 +1395,3 @@ class AMDSMIHelpers():
|
||||
else:
|
||||
print(json.dumps(entries, indent=2,
|
||||
default=lambda o: o.decode('utf-8') if isinstance(o, bytes) else o))
|
||||
self.increment_cper_count()
|
||||
|
||||
Ссылка в новой задаче
Block a user