diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp index 211535ab10..fa9fc19df0 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp @@ -67,6 +67,7 @@ #include "core/inc/default_signal.h" #include "core/inc/hsa_ext_amd_impl.h" #include "core/inc/amd_gpu_pm4.h" +#include "core/inc/amd_core_dump.hpp" namespace rocr { namespace AMD { @@ -1231,6 +1232,17 @@ bool AqlQueue::ExceptionHandler(hsa_signal_value_t error_code, void* arg) { return false; } + // Fallback if KFD does not support GPU core dump. In this case, the core dump is + // generated by hsa-runtime. + if (!core::Runtime::runtime_singleton_->KfdVersion().supports_core_dump) { + if (amd::coredump::dump_gpu_core()) + debug_print("GPU core dump failed\n"); + // supports_core_dump flag is overwritten to avoid generating the core dump file again + // when caught by a different exception handler, such as VMFaultHandler. 
+ core::Runtime::runtime_singleton_->KfdVersion( + core::Runtime::runtime_singleton_->KfdVersion().supports_exception_debugging, true); + } + for (auto& error : QueueErrors) { if (error_code & (1 << (error.code - 1))) { errorCode = error.status; diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/runtime.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/runtime.cpp index 264590ab68..bb06606288 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/runtime.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/runtime.cpp @@ -70,6 +70,7 @@ #include "core/util/os.h" #include "core/inc/exceptions.h" #include "inc/hsa_ven_amd_aqlprofile.h" +#include "core/inc/amd_core_dump.hpp" #ifndef HSA_VERSION_MAJOR #define HSA_VERSION_MAJOR 1 @@ -1358,6 +1359,13 @@ bool Runtime::VMFaultHandler(hsa_signal_value_t val, void* arg) { HsaMemoryAccessFault& fault = vm_fault_event->EventData.EventData.MemoryAccessFault; + // Fallback if KFD does not support GPU core dump. In this case, the core dump is + // generated by hsa-runtime. + if (!runtime_singleton_->KfdVersion().supports_core_dump) { + if (amd::coredump::dump_gpu_core()) + debug_print("GPU core dump failed\n"); + } + hsa_status_t custom_handler_status = HSA_STATUS_ERROR; auto system_event_handlers = runtime_singleton_->GetSystemEventHandlers(); // If custom handler is registered, pack the fault info and call the handler