From 9aa39b0979c324fed8db751c0ce80fa68a630b5c Mon Sep 17 00:00:00 2001 From: Alex Sierra Date: Mon, 24 Oct 2022 21:45:33 +0000 Subject: [PATCH] core dump: Generates a core dump from a fault event Extracts and creates a core dump ELF file from a fault event, using core dump front end. Signed-off-by: Alex Sierra Change-Id: Ibbbe41b3d13dd3fcb90161e927d48c329cf513a9 [ROCm/ROCR-Runtime commit: 803e37ded5f29788cde8db3f721b456e35e6b37d] --- .../hsa-runtime/core/runtime/amd_aql_queue.cpp | 12 ++++++++++++ .../runtime/hsa-runtime/core/runtime/runtime.cpp | 8 ++++++++ 2 files changed, 20 insertions(+) diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp index 211535ab10..fa9fc19df0 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp @@ -67,6 +67,7 @@ #include "core/inc/default_signal.h" #include "core/inc/hsa_ext_amd_impl.h" #include "core/inc/amd_gpu_pm4.h" +#include "core/inc/amd_core_dump.hpp" namespace rocr { namespace AMD { @@ -1231,6 +1232,17 @@ bool AqlQueue::ExceptionHandler(hsa_signal_value_t error_code, void* arg) { return false; } + // Fallback if KFD does not support GPU core dump. In this case, the core dump is + // generated by hsa-runtime. + if (!core::Runtime::runtime_singleton_->KfdVersion().supports_core_dump) { + if (amd::coredump::dump_gpu_core()) + debug_print("GPU core dump failed\n"); + // supports_core_dump flag is overwritten to avoid generating the core dump file again + // if the fault is caught by a different exception handler, such as VMFaultHandler. 
+ core::Runtime::runtime_singleton_->KfdVersion(
core::Runtime::runtime_singleton_->KfdVersion().supports_exception_debugging, true); + } + for (auto& error : QueueErrors) { if (error_code & (1 << (error.code - 1))) { errorCode = error.status; diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/runtime.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/runtime.cpp index 264590ab68..bb06606288 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/runtime.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/runtime.cpp @@ -70,6 +70,7 @@ #include "core/util/os.h" #include "core/inc/exceptions.h" #include "inc/hsa_ven_amd_aqlprofile.h" +#include "core/inc/amd_core_dump.hpp" #ifndef HSA_VERSION_MAJOR #define HSA_VERSION_MAJOR 1 @@ -1358,6 +1359,13 @@ bool Runtime::VMFaultHandler(hsa_signal_value_t val, void* arg) { HsaMemoryAccessFault& fault = vm_fault_event->EventData.EventData.MemoryAccessFault; + // Fallback if KFD does not support GPU core dump. In this case, the core dump is + // generated by hsa-runtime. + if (!runtime_singleton_->KfdVersion().supports_core_dump) { + if (amd::coredump::dump_gpu_core()) + debug_print("GPU core dump failed\n"); + } + hsa_status_t custom_handler_status = HSA_STATUS_ERROR; auto system_event_handlers = runtime_singleton_->GetSystemEventHandlers(); // If custom handler is registered, pack the fault info and call the handler