[SWDEV-530385] Fix CPER "--follow" & "--file-limit" (#380)

* --follow option fix & --file_limit option added
* change --file_limit and --cper_file to --file-limit and --cper-file

---------

Signed-off-by: Maisam Arif <Maisam.Arif@amd.com>

[ROCm/amdsmi commit: 91c9969b72]
Этот коммит содержится в:
Saeed, Oosman
2025-05-29 11:59:55 -05:00
коммит произвёл GitHub
родитель 69fde31369
Коммит b793acaa71
5 изменённых файлов: 257 добавлений и 431 удалений
+16 -120
Просмотреть файл
@@ -6331,33 +6331,7 @@ class AMDSMICommands():
else:
with self.logger.destination.open('a', encoding="utf-8") as output_file:
output_file.write(legend_output + '\n')
def __pvtDumpAfids(self, cper_file):
    """Print the AFIDs decoded from a CPER record.

    Args:
        cper_file: The CPER record, given as an object with a ``read()``
            method, a ``Path``, a file name string, or the raw payload itself.
    """
    # Normalise every accepted input form to the raw payload.
    if hasattr(cper_file, "read"):
        raw = cper_file.read()
    elif isinstance(cper_file, Path):
        raw = cper_file.read_bytes()
    elif isinstance(cper_file, str):
        with open(cper_file, "rb") as f:
            raw = f.read()
    else:
        # Anything else is passed through unchanged (assumed to be bytes).
        raw = cper_file

    # The hexdump was previously built and thrown away; surface it in the
    # debug log and skip the work entirely when debug logging is off.
    if logging.getLogger().isEnabledFor(logging.DEBUG):
        logging.debug("CPER hexdump:\n%s", self.helpers.hexdump_to_string(raw))

    # The second return value (the AFID count) is not needed for printing.
    afids, _ = amdsmi_interface.amdsmi_get_afids_from_cper(raw)
    # Same output as before: every AFID is followed by one space.
    print("AFIDS: " + "".join(f"{afid} " for afid in afids))
def ras(self, args, multiple_devices=False, gpu=None, cper=None, afid=None,
severity=None, folder=None, file_limit=None, cper_file=None, follow=None):
@@ -6365,7 +6339,7 @@ class AMDSMICommands():
Retrieve and process CPER (RAS) entries for a target GPU.
Expected command (all options only):
amd-smi ras --cper --severity=nonfatal-uncorrected,fatal --folder <folder_name> --file_limit=1000 --follow
amd-smi ras --cper --severity=nonfatal-uncorrected,fatal --folder <folder_name> --file-limit=1000 --follow
Since no timestamp is provided on the command line, the function starts from a default cursor of 0.
The output file name is auto-generated using the timestamp from the CPER header data (converted from
@@ -6392,108 +6366,30 @@ class AMDSMICommands():
if args.gpu == None:
args.gpu = self.device_handles
#Fetching AFID
if args.afid and args.cper_file:
self.__pvtDumpAfids(args.cper_file)
self.helpers.pvtDumpAfids(args.cper_file)
return
if not self.group_check_printed:
self.helpers.check_required_groups()
self.group_check_printed = True
handled_multiple_gpus, device_handle = self.helpers.handle_gpus(args, self.logger, self.ras)
if handled_multiple_gpus:
if not args.cper:
return
args.gpu = device_handle
# Parse severity mask dynamically from the --severity option.
severity_mask = 0
# drop duplicates of args
logging.debug(args)
for sev in list(set(args.severity)):
if sev == "all":
# Set bits for NON_FATAL_UNCORRECTED (0), FATAL (1), and NON_FATAL_CORRECTED (2)
severity_mask |= ((1 << 0) | (1 << 1) | (1 << 2))
elif sev == "fatal":
# Set bit corresponding to AMDSMI_CPER_SEV_FATAL (which is 1)
severity_mask |= (1 << 1)
elif sev in ("nonfatal", "nonfatal-uncorrected"):
# Set bit corresponding to AMDSMI_CPER_SEV_NON_FATAL_UNCORRECTED (which is 0)
severity_mask |= (1 << 0)
elif sev in ("nonfatal-corrected", "corrected"):
# Set bit corresponding to AMDSMI_CPER_SEV_NON_FATAL_CORRECTED (which is 2)
severity_mask |= (1 << 2)
cursor = 0
buffer_size = 1048576
if args.cper:
# Start from cursor 0 (no timestamp argument provided).
file_limit = int(args.file_limit) if args.file_limit else 1000
# Main loop: continuously retrieve CPER entries if --follow is set.
gpu_id = self.helpers.get_gpu_id_from_device_handle(args.gpu)
# Print header only when dumping to a folder
if args.follow and not getattr(self, "_cper_follow_prompted", False):
print("Press CTRL + C to stop.")
self._cper_follow_prompted = True
partition_id = -1
try:
kfd_info = amdsmi_interface.amdsmi_get_gpu_kfd_info(args.gpu)
kfd_id = kfd_info['kfd_id']
node_id = kfd_info['node_id']
partition_id = kfd_info['current_partition_id']
except amdsmi_exception.AmdSmiLibraryException as e:
logging.debug("Failed to get kfd info for gpu %s | %s", gpu_id, e.get_error_info())
if partition_id != 0:
logging.debug(f"Skipping gpu {gpu_id} on non zero partition {partition_id}")
return
if args.folder and not getattr(self, "_cper_folder_prompted", False):
print(f"Dumping CPER file header entries in folder {args.folder}")
self._cper_folder_prompted = True
self.logger.set_cper_exit_message(False)
self.stop = False
while True:
try:
entries, new_cursor, cper_data = amdsmi_interface.amdsmi_get_gpu_cper_entries(
args.gpu, severity_mask, buffer_size, cursor)
logging.debug(f"cper_entries | entries: {entries}")
except amdsmi_exception.AmdSmiLibraryException as e:
if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM:
raise PermissionError('Error opening CPER file. This command requires elevation') from e
if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED or \
e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_FILE_NOT_FOUND:
raise FileNotFoundError('Error accessing CPER files. This command requires CPER to be enabled.') from e
if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_FILE_ERROR:
raise FileExistsError('Error opening CPER file. Unable to read CPER File') from e
else:
logging.debug(f"Error retrieving CPER entries: {e}")
break
# Dump or display
if args.folder:
if args.gpu and not args.follow:
self.helpers.dump_gpu_entries(args.folder, entries, cper_data, args.gpu)
break
elif not args.gpu and not args.follow:
self.helpers.dump_all_entries(args.folder, entries, cper_data, args.gpu)
break
elif args.follow and args.gpu:
self.helpers.dump_gpu_entries_follow(args.folder, entries, cper_data, args.gpu)
elif args.follow and not args.gpu:
self.helpers.dump_all_entries_follow(args.folder, entries, cper_data, args.gpu)
if args.follow:
self.helpers.display_cper_files_generated_follow(entries, args.gpu)
else:
self.helpers.display_cper_files_generated(entries, args.gpu)
break
if len(entries) == 0 and not args.follow:
break
cursor = new_cursor
time.sleep(1)
if not args.gpu:
return
if not isinstance(args.gpu, list):
args.gpu = [args.gpu]
args.cursor = [0] * len(args.gpu)
while True:
for idx, device_handle in enumerate(args.gpu):
self.helpers.ras_cper(args, device_handle, self.logger, idx)
if not args.follow:
break
time.sleep(1)
def _event_thread(self, commands, i):
devices = commands.device_handles
Обычный файл → Исполняемый файл
+233 -304
Просмотреть файл
@@ -1082,6 +1082,135 @@ class AMDSMIHelpers():
print(msg)
logging.warning(msg)
def display_cper_files_generated(self, entries, device_handle, folder, follow):
    """Print one table row per CPER entry without writing any file.

    On the first call (tracked through ``self._cper_display_initialized``) a
    warning that nothing is dumped without ``--folder`` and the column header
    are printed.  Every row advances the counter via
    ``self.increment_cper_count()``.

    Args:
        entries: Mapping whose values are dicts read for ``error_severity``,
            ``notify_type`` and ``timestamp``.
        device_handle: Passed to ``self.get_gpu_id_from_device_handle``.
        folder: When truthy, a ``file_name`` column is added to header and rows.
        follow: Not used by this method; kept so existing callers keep working.
    """
    if not getattr(self, "_cper_display_initialized", False):
        if not getattr(self, "_cper_warning_printed", False):
            YELLOW = "\033[33m"
            RED = "\033[31m"
            RESET = "\033[0m"
            print(f"{YELLOW}WARNING:{RESET} {RED}No{RESET} cper files will be dumped unless --folder=<folder_name> is specified.")
            self._cper_warning_printed = True
        header = f"{'timestamp':<20} {'gpu_id':<7} {'severity':<12}"
        if folder:
            header += f" {'file_name':<17}"
        print(header)
        self._cper_display_initialized = True

    for entry in entries.values():
        error_severity = entry.get("error_severity", "Unknown")
        notify_type = entry.get("notify_type", "Unknown")
        if error_severity == "non_fatal_uncorrected":
            prefix = "uncorrected"
        elif error_severity == "non_fatal_corrected":
            prefix = "corrected"
        elif error_severity == "fatal":
            # A fatal record reported at boot gets its own label.
            prefix = "boot" if notify_type == "BOOT" else "fatal"
        else:
            # Unrecognised severities used to leave ``prefix`` unbound.
            prefix = "unknown"

        timestamp = entry.get("timestamp", "unknown")
        gpu_id = self.get_gpu_id_from_device_handle(device_handle)
        row = f"{timestamp:<20} {gpu_id:<7} {prefix:<12}"
        if folder:
            cper_data_file = f"{prefix}_{self.get_cper_count()}.cper"
            row += f" {cper_data_file:<17}"
        print(row)
        self.increment_cper_count()
def dump_cper_entries(self, folder, entries, cper_data, device_handle, file_limit=None):
    """Write CPER entries to ``folder`` and print a summary row for each.

    Every entry produces ``<prefix>_<count>.cper`` (raw record, written with
    ``self.write_binary``) and ``<prefix>_<count>.json`` (the decoded entry),
    where ``count`` is ``self.get_cper_count()``.  When ``file_limit`` is set,
    the oldest ``*.cper`` files (by modification time) and their ``.json``
    companions are removed before each write so that at most ``file_limit``
    ``*.cper`` files remain.  When ``folder`` is falsy nothing is written and
    ``entries`` is printed as JSON instead.

    Args:
        folder: Output directory; created if missing.
        entries: Mapping whose values are dicts read for ``error_severity``,
            ``notify_type`` and ``timestamp``.
        cper_data: Sequence indexed by entry position; each item provides
            ``"bytes"`` and ``"size"``.
        device_handle: Passed to ``self.get_gpu_id_from_device_handle``.
        file_limit: Maximum number of ``*.cper`` files kept; also caps the
            number of rows printed for this call.
    """
    def json_default(obj):
        # Undecodable bytes are replaced instead of aborting the whole dump.
        if isinstance(obj, bytes):
            return obj.decode('utf-8', errors='replace')
        raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")

    def remove_quietly(path):
        # Best effort: a missing file is fine, anything else is worth a trace.
        try:
            path.unlink()
        except FileNotFoundError:
            pass
        except OSError as err:
            logging.debug("Could not remove %s: %s", path, err)

    # One-time column header, shared with the other CPER printers via the flag.
    if not getattr(self, "_cper_display_initialized", False):
        print(f"{'timestamp':<20} {'gpu_id':<7} {'severity':<12} {'file_name':<17}")
        self._cper_display_initialized = True

    if not folder:
        print(json.dumps(entries, indent=2, default=json_default))
        return

    folder = Path(folder)
    folder.mkdir(parents=True, exist_ok=True)

    printed_rows = []
    for entry_index, entry in enumerate(entries.values()):
        # Rotate out the oldest files so this write stays within the limit.
        if file_limit:
            files = sorted(folder.glob("*.cper"), key=lambda p: p.stat().st_mtime)
            excess = len(files) - file_limit + 1
            for old in files[:max(0, excess)]:
                remove_quietly(old)
                remove_quietly(old.with_suffix('.json'))

        sev = entry.get("error_severity", "").lower()
        nt = entry.get("notify_type", "")
        if sev == "non_fatal_uncorrected":
            prefix = "uncorrected"
        elif sev == "non_fatal_corrected":
            prefix = "corrected"
        elif sev == "fatal":
            prefix = "boot" if nt == "BOOT" else "fatal"
        else:
            prefix = "unknown"

        count = self.get_cper_count()
        cper_name = f"{prefix}_{count}.cper"
        json_path = folder / f"{prefix}_{count}.json"

        self.write_binary(
            cper_data[entry_index]["bytes"],
            cper_data[entry_index]["size"],
            folder / cper_name,
        )
        try:
            # Serialise first so a failure does not leave an empty file behind.
            payload = json.dumps(entry, indent=2, default=json_default)
            with json_path.open("w", encoding="utf-8") as f:
                f.write(payload)
        except (OSError, TypeError, ValueError) as e:
            logging.error(f"Failed to write JSON to {json_path}: {e}")

        ts = entry.get("timestamp", "unknown")
        gid = self.get_gpu_id_from_device_handle(device_handle)
        printed_rows.append((ts, gid, prefix, cper_name))
        self.increment_cper_count()

    # Rows whose files were already rotated out again are not reported.
    to_print = printed_rows[-file_limit:] if file_limit else printed_rows
    for ts, gid, prefix, fname in to_print:
        print(f"{ts:<20} {gid:<7} {prefix:<12} {fname:<17}")
def write_binary(self, data, size, filepath):
"""
Writes binary data directly to a file.
@@ -1103,309 +1232,6 @@ class AMDSMIHelpers():
data_bytes = data[:size]
f.write(data_bytes)
def display_cper_files_generated_follow(self, entries, device_handle):
    """Print one table row per CPER entry without writing any file.

    On the first call (tracked through ``self._cper_display_initialized``) a
    warning that nothing is dumped without ``--folder`` and the column header
    are printed.  Every row advances the counter via
    ``self.increment_cper_count()``.  Rows are printed back to back; this
    method does not sleep between them, so pacing is left to the caller.

    Args:
        entries: Mapping whose values are dicts read for ``error_severity``,
            ``notify_type`` and ``timestamp``.
        device_handle: Passed to ``self.get_gpu_id_from_device_handle``.
    """
    if not getattr(self, "_cper_display_initialized", False):
        if not getattr(self, "_cper_warning_printed", False):
            YELLOW = "\033[33m"
            RED = "\033[31m"
            RESET = "\033[0m"
            print(f"{YELLOW}WARNING:{RESET} {RED}No{RESET} cper files will be dumped unless --folder=<folder_name> is specified.")
            self._cper_warning_printed = True
        print(f"{'timestamp':<20} {'gpu_id':<7} {'severity':<12} {'file_name':<17}")
        self._cper_display_initialized = True

    for entry in entries.values():
        error_severity = entry.get("error_severity", "Unknown")
        notify_type = entry.get("notify_type", "Unknown")
        if error_severity == "non_fatal_uncorrected":
            prefix = "uncorrected"
        elif error_severity == "non_fatal_corrected":
            prefix = "corrected"
        elif error_severity == "fatal":
            # A fatal record reported at boot gets its own label.
            prefix = "boot" if notify_type == "BOOT" else "fatal"
        else:
            # Unrecognised severities used to leave ``prefix`` unbound.
            prefix = "unknown"

        cper_data_file = f"{prefix}_{self.get_cper_count()}.cper"
        timestamp = entry.get("timestamp", "unknown")
        gpu_id = self.get_gpu_id_from_device_handle(device_handle)
        print(f"{timestamp:<20} {gpu_id:<7} {prefix:<12} {cper_data_file:<17}")
        self.increment_cper_count()
def display_cper_files_generated(self, entries, device_handle):
    """Print one table row per CPER entry without writing any file.

    On the first call (tracked through ``self._cper_display_initialized``) a
    warning that nothing is dumped without ``--folder`` and the column header
    are printed.  Every row advances the counter via
    ``self.increment_cper_count()``.

    Args:
        entries: Mapping whose values are dicts read for ``error_severity``,
            ``notify_type`` and ``timestamp``.
        device_handle: Passed to ``self.get_gpu_id_from_device_handle``.
    """
    if not getattr(self, "_cper_display_initialized", False):
        if not getattr(self, "_cper_warning_printed", False):
            YELLOW = "\033[33m"
            RED = "\033[31m"
            RESET = "\033[0m"
            print(f"{YELLOW}WARNING:{RESET} {RED}No{RESET} cper files will be dumped unless --folder=<folder_name> is specified.")
            self._cper_warning_printed = True
        print(f"{'timestamp':<20} {'gpu_id':<7} {'severity':<12} {'file_name':<17}")
        self._cper_display_initialized = True

    for entry in entries.values():
        error_severity = entry.get("error_severity", "Unknown")
        notify_type = entry.get("notify_type", "Unknown")
        if error_severity == "non_fatal_uncorrected":
            prefix = "uncorrected"
        elif error_severity == "non_fatal_corrected":
            prefix = "corrected"
        elif error_severity == "fatal":
            # A fatal record reported at boot gets its own label.
            prefix = "boot" if notify_type == "BOOT" else "fatal"
        else:
            # Unrecognised severities used to leave ``prefix`` unbound.
            prefix = "unknown"

        cper_data_file = f"{prefix}_{self.get_cper_count()}.cper"
        timestamp = entry.get("timestamp", "unknown")
        gpu_id = self.get_gpu_id_from_device_handle(device_handle)
        print(f"{timestamp:<20} {gpu_id:<7} {prefix:<12} {cper_data_file:<17}")
        self.increment_cper_count()
def dump_gpu_entries(self, folder, entries, cper_data, device_handle):
    """Write each CPER entry to ``folder`` and print one table row per entry.

    Every entry produces ``<prefix>_<count>.cper`` (raw record, written with
    ``self.write_binary``) and ``<prefix>_<count>.json`` (the decoded entry),
    where ``count`` is ``self.get_cper_count()``.  When ``folder`` is falsy
    nothing is written and ``entries`` is printed as JSON instead.

    Args:
        folder: Output directory; created if missing.
        entries: Mapping whose values are dicts read for ``error_severity``,
            ``notify_type`` and ``timestamp``.
        cper_data: Sequence indexed by entry position; each item provides
            ``"bytes"`` and ``"size"``.
        device_handle: Passed to ``self.get_gpu_id_from_device_handle``.
    """
    def json_default(obj):
        # Undecodable bytes are replaced instead of aborting the whole dump.
        if isinstance(obj, bytes):
            return obj.decode('utf-8', errors='replace')
        raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")

    # One-time column header, shared with the other CPER printers via the flag.
    if not getattr(self, "_cper_display_initialized", False):
        print(f"{'timestamp':<20} {'gpu_id':<7} {'severity':<12} {'file_name':<17}")
        self._cper_display_initialized = True

    if not folder:
        print(json.dumps(entries, indent=2, default=json_default))
        return

    folder = Path(folder)
    folder.mkdir(parents=True, exist_ok=True)  # Ensure folder exists
    for entry_index, entry in enumerate(entries.values()):
        error_severity = entry.get("error_severity", "Unknown")
        notify_type = entry.get("notify_type", "Unknown")
        if error_severity == "non_fatal_uncorrected":
            prefix = "uncorrected"
        elif error_severity == "non_fatal_corrected":
            prefix = "corrected"
        elif error_severity == "fatal":
            prefix = "boot" if notify_type == "BOOT" else "fatal"
        else:
            # Unrecognised severities used to leave ``prefix`` unbound.
            prefix = "unknown"

        # Capture the counter once so file names and log lines agree.
        count = self.get_cper_count()
        cper_data_file = f"{prefix}_{count}.cper"
        output_path = folder / f"{prefix}_{count}.json"
        self.write_binary(cper_data[entry_index]["bytes"], cper_data[entry_index]["size"],
                          folder / cper_data_file)

        timestamp = entry.get("timestamp", "unknown")
        gpu_id = self.get_gpu_id_from_device_handle(device_handle)
        print(f"{timestamp:<20} {gpu_id:<7} {prefix:<12} {cper_data_file:<17}")
        self.increment_cper_count()

        try:
            # Serialise first so a failure does not leave an empty file behind.
            payload = json.dumps(entry, indent=2, default=json_default)
            logging.debug("Writing entry %s: %s to %s", count, entry, output_path)
            with output_path.open("w") as f:
                f.write(payload)
        except (OSError, TypeError, ValueError) as e:
            logging.error(f"Failed to write entry {count} to {output_path}: {e}")
def dump_all_entries(self, folder, entries, cper_data, device_handle):
    """Write each CPER entry to ``folder`` and print one table row per entry.

    Every entry produces ``<prefix>_<count>.cper`` (raw record, written with
    ``self.write_binary``) and ``<prefix>_<count>.json`` (the decoded entry),
    where ``count`` is ``self.get_cper_count()``.  When ``folder`` is falsy
    nothing is written and ``entries`` is printed as JSON instead.

    Args:
        folder: Output directory; created if missing.
        entries: Mapping whose values are dicts read for ``error_severity``,
            ``notify_type`` and ``timestamp``.
        cper_data: Sequence indexed by entry position; each item provides
            ``"bytes"`` and ``"size"``.
        device_handle: Passed to ``self.get_gpu_id_from_device_handle``.
    """
    def json_default(obj):
        # Undecodable bytes are replaced instead of aborting the whole dump.
        if isinstance(obj, bytes):
            return obj.decode('utf-8', errors='replace')
        raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")

    # One-time column header, shared with the other CPER printers via the flag.
    if not getattr(self, "_cper_display_initialized", False):
        print(f"{'timestamp':<20} {'gpu_id':<7} {'severity':<12} {'file_name':<17}")
        self._cper_display_initialized = True

    if not folder:
        print(json.dumps(entries, indent=2, default=json_default))
        return

    folder = Path(folder)
    folder.mkdir(parents=True, exist_ok=True)  # Ensure folder exists
    for entry_index, entry in enumerate(entries.values()):
        error_severity = entry.get("error_severity", "Unknown")
        notify_type = entry.get("notify_type", "Unknown")
        if error_severity == "non_fatal_uncorrected":
            prefix = "uncorrected"
        elif error_severity == "non_fatal_corrected":
            prefix = "corrected"
        elif error_severity == "fatal":
            prefix = "boot" if notify_type == "BOOT" else "fatal"
        else:
            # Unrecognised severities used to leave ``prefix`` unbound.
            prefix = "unknown"

        # Capture the counter once so file names and log lines agree.
        count = self.get_cper_count()
        cper_data_file = f"{prefix}_{count}.cper"
        output_path = folder / f"{prefix}_{count}.json"
        self.write_binary(cper_data[entry_index]["bytes"], cper_data[entry_index]["size"],
                          folder / cper_data_file)

        timestamp = entry.get("timestamp", "unknown")
        gpu_id = self.get_gpu_id_from_device_handle(device_handle)
        print(f"{timestamp:<20} {gpu_id:<7} {prefix:<12} {cper_data_file:<17}")
        self.increment_cper_count()

        try:
            # Serialise first so a failure does not leave an empty file behind.
            payload = json.dumps(entry, indent=2, default=json_default)
            logging.debug("Writing entry %s: %s to %s", count, entry, output_path)
            with output_path.open("w") as f:
                f.write(payload)
        except (OSError, TypeError, ValueError) as e:
            logging.error(f"Failed to write entry {count} to {output_path}: {e}")
def dump_all_entries_follow(self, folder, entries, cper_data, device_handle):
    """Write each CPER entry to ``folder`` and print one table row per entry.

    Every entry produces ``<prefix>_<count>.cper`` (raw record, written with
    ``self.write_binary``) and ``<prefix>_<count>.json`` (the decoded entry),
    where ``count`` is ``self.get_cper_count()``.  When ``folder`` is falsy
    nothing is written and ``entries`` is printed as JSON instead.  Entries
    are processed back to back; this method does not sleep between them, so
    pacing is left to the caller.

    Args:
        folder: Output directory; created if missing.
        entries: Mapping whose values are dicts read for ``error_severity``,
            ``notify_type`` and ``timestamp``.
        cper_data: Sequence indexed by entry position; each item provides
            ``"bytes"`` and ``"size"``.
        device_handle: Passed to ``self.get_gpu_id_from_device_handle``.
    """
    def json_default(obj):
        # Undecodable bytes are replaced instead of aborting the whole dump.
        if isinstance(obj, bytes):
            return obj.decode('utf-8', errors='replace')
        raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")

    # One-time column header, shared with the other CPER printers via the flag.
    if not getattr(self, "_cper_display_initialized", False):
        print(f"{'timestamp':<20} {'gpu_id':<7} {'severity':<12} {'file_name':<17}")
        self._cper_display_initialized = True

    if not folder:
        print(json.dumps(entries, indent=2, default=json_default))
        return

    folder = Path(folder)
    folder.mkdir(parents=True, exist_ok=True)  # Ensure folder exists
    for entry_index, entry in enumerate(entries.values()):
        error_severity = entry.get("error_severity", "Unknown")
        notify_type = entry.get("notify_type", "Unknown")
        if error_severity == "non_fatal_uncorrected":
            prefix = "uncorrected"
        elif error_severity == "non_fatal_corrected":
            prefix = "corrected"
        elif error_severity == "fatal":
            prefix = "boot" if notify_type == "BOOT" else "fatal"
        else:
            # Unrecognised severities used to leave ``prefix`` unbound.
            prefix = "unknown"

        # Capture the counter once so file names and log lines agree.
        count = self.get_cper_count()
        cper_data_file = f"{prefix}_{count}.cper"
        output_path = folder / f"{prefix}_{count}.json"
        self.write_binary(cper_data[entry_index]["bytes"], cper_data[entry_index]["size"],
                          folder / cper_data_file)

        timestamp = entry.get("timestamp", "unknown")
        gpu_id = self.get_gpu_id_from_device_handle(device_handle)
        print(f"{timestamp:<20} {gpu_id:<7} {prefix:<12} {cper_data_file:<17}")
        self.increment_cper_count()

        try:
            # Serialise first so a failure does not leave an empty file behind.
            payload = json.dumps(entry, indent=2, default=json_default)
            logging.debug("Writing entry %s: %s to %s", count, entry, output_path)
            with output_path.open("w") as f:
                f.write(payload)
        except (OSError, TypeError, ValueError) as e:
            logging.error(f"Failed to write entry {count} to {output_path}: {e}")
def dump_gpu_entries_follow(self, folder, entries, cper_data, device_handle):
    """Write each CPER entry to ``folder`` and print one table row per entry.

    Every entry produces ``<prefix>_<count>.cper`` (raw record, written with
    ``self.write_binary``) and ``<prefix>_<count>.json`` (the decoded entry),
    where ``count`` is ``self.get_cper_count()``.  When ``folder`` is falsy
    nothing is written and ``entries`` is printed as JSON instead.  Entries
    are processed back to back; this method does not sleep between them, so
    pacing is left to the caller.

    Args:
        folder: Output directory; created if missing.
        entries: Mapping whose values are dicts read for ``error_severity``,
            ``notify_type`` and ``timestamp``.
        cper_data: Sequence indexed by entry position; each item provides
            ``"bytes"`` and ``"size"``.
        device_handle: Passed to ``self.get_gpu_id_from_device_handle``.
    """
    def json_default(obj):
        # Undecodable bytes are replaced instead of aborting the whole dump.
        if isinstance(obj, bytes):
            return obj.decode('utf-8', errors='replace')
        raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")

    # One-time column header, shared with the other CPER printers via the flag.
    if not getattr(self, "_cper_display_initialized", False):
        print(f"{'timestamp':<20} {'gpu_id':<7} {'severity':<12} {'file_name':<17}")
        self._cper_display_initialized = True

    if not folder:
        print(json.dumps(entries, indent=2, default=json_default))
        return

    folder = Path(folder)
    folder.mkdir(parents=True, exist_ok=True)  # Ensure folder exists
    for entry_index, entry in enumerate(entries.values()):
        error_severity = entry.get("error_severity", "Unknown")
        notify_type = entry.get("notify_type", "Unknown")
        if error_severity == "non_fatal_uncorrected":
            prefix = "uncorrected"
        elif error_severity == "non_fatal_corrected":
            prefix = "corrected"
        elif error_severity == "fatal":
            prefix = "boot" if notify_type == "BOOT" else "fatal"
        else:
            # Unrecognised severities used to leave ``prefix`` unbound.
            prefix = "unknown"

        # Capture the counter once so file names and log lines agree.
        count = self.get_cper_count()
        cper_data_file = f"{prefix}_{count}.cper"
        output_path = folder / f"{prefix}_{count}.json"
        self.write_binary(cper_data[entry_index]["bytes"], cper_data[entry_index]["size"],
                          folder / cper_data_file)

        timestamp = entry.get("timestamp", "unknown")
        gpu_id = self.get_gpu_id_from_device_handle(device_handle)
        print(f"{timestamp:<20} {gpu_id:<7} {prefix:<12} {cper_data_file:<17}")
        self.increment_cper_count()

        try:
            # Serialise first so a failure does not leave an empty file behind.
            payload = json.dumps(entry, indent=2, default=json_default)
            logging.debug("Writing entry %s: %s to %s", count, entry, output_path)
            with output_path.open("w") as f:
                f.write(payload)
        except (OSError, TypeError, ValueError) as e:
            logging.error(f"Failed to write entry {count} to {output_path}: {e}")
def hexdump_to_string(self, data: Union[bytes, List[int]]) -> str:
"""
Convert binary data to a hexdump string.
@@ -1435,4 +1261,107 @@ class AMDSMIHelpers():
ascii_values = "".join(chr(b) if 32 <= b <= 126 else "." for b in chunk)
lines.append(f"{offset:08x} {hex_values} |{ascii_values}|")
return "\n".join(lines)
return "\n".join(lines)
def pvtDumpAfids(self, cper_file):
    """Print the AFIDs decoded from a CPER record.

    Args:
        cper_file: The CPER record, given as an object with a ``read()``
            method, a ``Path``, a file name string, or the raw payload itself.
    """
    # Normalise every accepted input form to the raw payload.
    if hasattr(cper_file, "read"):
        raw = cper_file.read()
    elif isinstance(cper_file, Path):
        raw = cper_file.read_bytes()
    elif isinstance(cper_file, str):
        with open(cper_file, "rb") as f:
            raw = f.read()
    else:
        # Anything else is passed through unchanged (assumed to be bytes).
        raw = cper_file

    # The hexdump was previously built and thrown away; surface it in the
    # debug log and skip the work entirely when debug logging is off.
    if logging.getLogger().isEnabledFor(logging.DEBUG):
        logging.debug("CPER hexdump:\n%s", self.hexdump_to_string(raw))

    # The second return value (the AFID count) is not needed for printing.
    afids, _ = amdsmi_interface.amdsmi_get_afids_from_cper(raw)
    # Same output as before: every AFID is followed by one space.
    print("AFIDS: " + "".join(f"{afid} " for afid in afids))
def ras_cper(self, args, device_handle, logger, gpu_idx):
# Fetch CPER (RAS) entries for one GPU and either dump them into args.folder
# or print a summary table.
# Reads args.severity, args.follow, args.folder, args.file_limit and
# args.cursor[gpu_idx]; the cursor slot is overwritten after every successful
# fetch so a later call resumes where this one stopped.
# Raises PermissionError, FileNotFoundError or FileExistsError for the AMDSMI
# status codes checked in the except block below.
# Parse severity mask dynamically from the --severity option.
severity_mask = 0
# drop duplicates of args
logging.debug(args)
for sev in list(set(args.severity)):
if sev == "all":
# Set bits for NON_FATAL_UNCORRECTED (0), FATAL (1), and NON_FATAL_CORRECTED (2)
severity_mask |= ((1 << 0) | (1 << 1) | (1 << 2))
elif sev == "fatal":
# Set bit corresponding to AMDSMI_CPER_SEV_FATAL (which is 1)
severity_mask |= (1 << 1)
elif sev in ("nonfatal", "nonfatal-uncorrected"):
# Set bit corresponding to AMDSMI_CPER_SEV_NON_FATAL_UNCORRECTED (which is 0)
severity_mask |= (1 << 0)
elif sev in ("nonfatal-corrected", "corrected"):
# Set bit corresponding to AMDSMI_CPER_SEV_NON_FATAL_CORRECTED (which is 2)
severity_mask |= (1 << 2)
# Size passed to the library for each fetch (1048576 = 1 MiB).
buffer_size = 1048576
gpu_id = self.get_gpu_id_from_device_handle(device_handle)
# The CTRL+C hint is printed once per helper instance, guarded by the flag.
if args.follow and not getattr(self, "_cper_follow_prompted", False):
print("Press CTRL + C to stop.")
self._cper_follow_prompted = True
# -1 stays in place when the KFD lookup fails, so such GPUs are skipped by
# the "partition_id != 0" check below as well.
partition_id = -1
try:
kfd_info = amdsmi_interface.amdsmi_get_gpu_kfd_info(device_handle)
partition_id = kfd_info['current_partition_id']
except amdsmi_exception.AmdSmiLibraryException as e:
logging.debug("Failed to get kfd info for gpu %s | %s", gpu_id, e.get_error_info())
if partition_id != 0:
logging.debug(f"Skipping gpu {gpu_id} on non zero partition {partition_id}")
return
# The folder banner is printed once per helper instance, guarded by the flag.
if args.folder and not getattr(self, "_cper_folder_prompted", False):
print(f"Dumping CPER file header entries in folder {args.folder}")
self._cper_folder_prompted = True
logger.set_cper_exit_message(False)
# NOTE(review): self.stop is only assigned here; it is not read in this method.
self.stop = False
while True:
try:
entries, new_cursor, cper_data = amdsmi_interface.amdsmi_get_gpu_cper_entries(
device_handle, severity_mask, buffer_size, args.cursor[gpu_idx])
logging.debug(f"cper_entries | entries: {entries}")
except amdsmi_exception.AmdSmiLibraryException as e:
# Translate known library status codes into built-in exceptions; any
# other code is logged at debug level and ends the loop.
if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM:
raise PermissionError('Error opening CPER file. This command requires elevation') from e
if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED or \
e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_FILE_NOT_FOUND:
raise FileNotFoundError('Error accessing CPER files. This command requires CPER to be enabled.') from e
if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_FILE_ERROR:
raise FileExistsError('Error opening CPER file. Unable to read CPER File') from e
else:
logging.debug(f"Error retrieving CPER entries: {e}")
break
# Persist the cursor for this GPU before deciding whether to continue.
args.cursor[gpu_idx] = new_cursor
# An empty batch ends the loop.
if len(entries) == 0:
break
# NOTE(review): all four dump_cper_entries calls below pass identical
# arguments, so the follow / device_handle tests do not change the call.
# NOTE(review): the original indentation is not visible here; confirm which
# branch the trailing "break" belongs to before restructuring.
if args.folder:
if args.follow:
if device_handle:
self.dump_cper_entries(args.folder, entries, cper_data, device_handle, args.file_limit)
else:
self.dump_cper_entries(args.folder, entries, cper_data, device_handle, args.file_limit)
else:
if device_handle:
self.dump_cper_entries(args.folder, entries, cper_data, device_handle, args.file_limit)
else:
self.dump_cper_entries(args.folder, entries, cper_data, device_handle, args.file_limit)
break
else:
self.display_cper_files_generated(entries, device_handle, args.folder, args.follow)
+3 -3
Просмотреть файл
@@ -1397,7 +1397,7 @@ class AMDSMIParser(argparse.ArgumentParser):
Adds the 'ras' subcommand.
Expected command:
amd-smi ras --cper --severity=nonfatal-uncorrected,fatal --folder <folder_name> --file_limit=1000 --follow
amd-smi ras --cper --severity=nonfatal-uncorrected,fatal --folder <folder_name> --file-limit=1000 --follow
All parameters are provided via options; no positional arguments or optional --file/--gpu are used.
"""
@@ -1431,8 +1431,8 @@ class AMDSMIParser(argparse.ArgumentParser):
ras_parser.add_argument("--afid", action="store_true", required=False, help=afid_help)
ras_parser.add_argument("--severity", type=str.lower, nargs='+', default=['all'], help=severity_help, choices=severity_choices, metavar='SEVERITY')
ras_parser.add_argument("--folder", type=str, action=self._check_folder_path(), default=False, help=folder_help)
ras_parser.add_argument("--file_limit", type=self._positive_int, action='store', default=1000, help=file_limit_help)
ras_parser.add_argument("--cper_file", action=self._check_cper_file_path(), metavar="CPER_FILE", help=cper_file_help)
ras_parser.add_argument("--file-limit", type=self._positive_int, action='store', default=1000, help=file_limit_help)
ras_parser.add_argument("--cper-file", action=self._check_cper_file_path(), metavar="CPER_FILE", help=cper_file_help)
ras_parser.add_argument("--follow", action="store_true", default=False, help=follow_help)
# Add common modifiers and device selection arguments.
+2 -2
Просмотреть файл
@@ -767,7 +767,7 @@ Displays RAS information of specified devices.
```shell-session
~$ amd-smi ras --help
usage: amd-smi ras [-h] --cper [--severity SEVERITY [SEVERITY ...]] [--folder FOLDER]
[--file_limit FILE_LIMIT] [--follow]
[--file-limit FILE_LIMIT] [--follow]
[-g GPU [GPU ...] | -U CPU [CPU ...] | -O CORE [CORE ...]]
[--json | --csv] [--file FILE] [--loglevel LEVEL]
@@ -781,7 +781,7 @@ RAS arguments:
--severity SEVERITY [SEVERITY ...] Set the SEVERITY filters from the following:
nonfatal-uncorrected, fatal, nonfatal-corrected, all
--folder FOLDER Folder to dump CPER report files
--file_limit FILE_LIMIT Maximum number of entries per output file
--file-limit FILE_LIMIT Maximum number of entries per output file
--follow Continuously monitor for new entries
Device arguments:
+3 -2
Просмотреть файл
@@ -34,9 +34,10 @@ endif()
# add package search paths
set(CMAKE_PREFIX_PATH ${CMAKE_PREFIX_PATH} ${ROCM_DIR} ../../../)
set(CMAKE_LIBRARY_PATH ${CMAKE_LIBRARY_PATH} ${ROCM_DIR}/lib)
if(EXISTS ${ROCM_DIR}/lib64)
set(CMAKE_LIBRARY_PATH ${CMAKE_LIBRARY_PATH} ${ROCM_DIR}/lib64)
if(NOT EXISTS ${ROCM_DIR}/lib64)
file(MAKE_DIRECTORY ${ROCM_DIR}/lib64)
endif()
set(CMAKE_LIBRARY_PATH ${CMAKE_LIBRARY_PATH} ${ROCM_DIR}/lib64)
find_package(amd_smi CONFIG REQUIRED)
message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&")