From 2391516cedc2ff7faab7afdaaff37fad1a2f2858 Mon Sep 17 00:00:00 2001 From: "Saeed, Oosman" Date: Tue, 29 Apr 2025 13:16:01 -0500 Subject: [PATCH] [SWDEV-529266] [MI308][AMDSMI][RAS CPER] CPER dump not working on CPX mode (#319) * Do not raise exception for cper status not found, but keep iterating to next gpu * Do not raise exception for cper status not found, but keep iterating to next gpu * use partition id and skip if non-zero * reverting un-needed change * Do not raise exception for cper status not found, but keep iterating to next gpu * use partition id and skip if non-zero --------- Co-authored-by: Oosman Saeed [ROCm/amdsmi commit: 9c297639f3ef44b44a08a2f1706cc86abc80116b] --- projects/amdsmi/amdsmi_cli/amdsmi_commands.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py index 15a36bb351..737c87d215 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py @@ -6381,6 +6381,20 @@ class AMDSMICommands(): if args.follow and not getattr(self, "_cper_follow_prompted", False): print("Press CTRL + C to stop.") self._cper_follow_prompted = True + + partition_id = -1 + try: + kfd_info = amdsmi_interface.amdsmi_get_gpu_kfd_info(args.gpu) + kfd_id = kfd_info['kfd_id'] + node_id = kfd_info['node_id'] + partition_id = kfd_info['current_partition_id'] + except amdsmi_exception.AmdSmiLibraryException as e: + logging.debug("Failed to get kfd info for gpu %s | %s", gpu_id, e.get_error_info()) + + if partition_id != 0: + logging.debug(f"Skipping gpu {gpu_id} on non zero partition {partition_id}") + return + if args.folder and args.gpu: print(f"Dumping CPER file header entries for GPU {gpu_id} in folder {args.folder}") elif args.folder: