From cbeddf9eb673bc099a96733eca10290b024356a3 Mon Sep 17 00:00:00 2001 From: Alex Sierra Date: Mon, 24 Oct 2022 21:45:33 +0000 Subject: [PATCH] core dump: Generates a core dump from a fault event Extracts and creates a core dump ELF file from a fault event, using the core dump front end. GFX11 is not supported. Signed-off-by: Alex Sierra Change-Id: I5ae154e886f39ab3ce7bbae5803efb27a96c7e2e --- .../hsa-runtime/core/runtime/amd_aql_queue.cpp | 13 +++++++++++++ runtime/hsa-runtime/core/runtime/runtime.cpp | 15 ++++++++++++--- 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp b/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp index da1cf091b1..21f0b7d926 100644 --- a/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp +++ b/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp @@ -68,6 +68,7 @@ #include "core/inc/hsa_ext_amd_impl.h" #include "core/inc/amd_gpu_pm4.h" #include "core/inc/hsa_amd_tool_int.hpp" +#include "core/inc/amd_core_dump.hpp" namespace rocr { namespace AMD { @@ -1261,6 +1262,18 @@ bool AqlQueue::ExceptionHandler(hsa_signal_value_t error_code, void* arg) { return false; } + // Fallback if KFD does not support GPU core dump. In this case, the core dump is + // generated by hsa-runtime. + if (!core::Runtime::runtime_singleton_->KfdVersion().supports_core_dump && + queue->agent_->isa()->GetMajorVersion() != 11) { + if (amd::coredump::dump_gpu_core()) + debug_print("GPU core dump failed\n"); + // supports_core_dump flag is overwritten to avoid generating the core dump file again + // from a different exception handler, such as VMFaultHandler. 
+ core::Runtime::runtime_singleton_->KfdVersion( + core::Runtime::runtime_singleton_->KfdVersion().supports_exception_debugging, true); + } + for (auto& error : QueueErrors) { if (error_code & (1 << (error.code - 1))) { errorCode = error.status; diff --git a/runtime/hsa-runtime/core/runtime/runtime.cpp b/runtime/hsa-runtime/core/runtime/runtime.cpp index d155d5097f..78c5c1f6aa 100644 --- a/runtime/hsa-runtime/core/runtime/runtime.cpp +++ b/runtime/hsa-runtime/core/runtime/runtime.cpp @@ -73,6 +73,7 @@ #include "core/util/os.h" #include "core/inc/exceptions.h" #include "inc/hsa_ven_amd_aqlprofile.h" +#include "core/inc/amd_core_dump.hpp" #ifndef HSA_VERSION_MAJOR #define HSA_VERSION_MAJOR 1 @@ -1696,6 +1697,7 @@ bool Runtime::VMFaultHandler(hsa_signal_value_t val, void* arg) { hsa_status_t custom_handler_status = HSA_STATUS_ERROR; auto system_event_handlers = runtime_singleton_->GetSystemEventHandlers(); + Agent* faulty_agent = nullptr; // If custom handler is registered, pack the fault info and call the handler if (!system_event_handlers.empty()) { hsa_amd_event_t memory_fault_event; @@ -1705,7 +1707,7 @@ bool Runtime::VMFaultHandler(hsa_signal_value_t val, void* arg) { // Find the faulty agent auto it = runtime_singleton_->agents_by_node_.find(fault.NodeId); assert(it != runtime_singleton_->agents_by_node_.end() && "Can't find faulty agent."); - Agent* faulty_agent = it->second.front(); + faulty_agent = it->second.front(); fault_info.agent = Agent::Convert(faulty_agent); fault_info.virtual_address = fault.VirtualAddress; @@ -1767,12 +1769,12 @@ bool Runtime::VMFaultHandler(hsa_signal_value_t val, void* arg) { reason += "Unknown"; } - core::Agent* faultingAgent = runtime_singleton_->agents_by_node_[fault.NodeId][0]; + faulty_agent = runtime_singleton_->agents_by_node_[fault.NodeId][0]; fprintf( stderr, "Memory access fault by GPU node-%u (Agent handle: %p) on address %p%s. 
Reason: %s.\n", - fault.NodeId, reinterpret_cast(faultingAgent->public_handle().handle), + fault.NodeId, reinterpret_cast(faulty_agent->public_handle().handle), reinterpret_cast(fault.VirtualAddress), (fault.Failure.Imprecise == 1) ? "(may not be exact address)" : "", reason.c_str()); @@ -1780,6 +1782,13 @@ bool Runtime::VMFaultHandler(hsa_signal_value_t val, void* arg) { PrintMemoryMapNear(reinterpret_cast(fault.VirtualAddress)); #endif } + // Fallback if KFD does not support GPU core dump. In this case, the core dump is + // generated by hsa-runtime. + if (faulty_agent && faulty_agent->isa()->GetMajorVersion() != 11 && + !runtime_singleton_->KfdVersion().supports_core_dump) { + if (amd::coredump::dump_gpu_core()) + debug_print("GPU core dump failed\n"); + } assert(false && "GPU memory access fault."); std::abort(); }