core dump: Generates a core dump from a fault event

Extracts and creates a core dump ELF file from a fault event, using
the core dump front end. GFX11 is not supported.

Signed-off-by: Alex Sierra <Alex.Sierra@amd.com>
Change-Id: I5ae154e886f39ab3ce7bbae5803efb27a96c7e2e
Este commit está contenido en:
Alex Sierra
2022-10-24 21:45:33 +00:00
cometido por Alejandro Sierra Guiza
padre 5d3f6a63f1
commit cbeddf9eb6
Se han modificado 2 ficheros con 25 adiciones y 3 borrados
@@ -68,6 +68,7 @@
#include "core/inc/hsa_ext_amd_impl.h"
#include "core/inc/amd_gpu_pm4.h"
#include "core/inc/hsa_amd_tool_int.hpp"
#include "core/inc/amd_core_dump.hpp"
namespace rocr {
namespace AMD {
@@ -1261,6 +1262,18 @@ bool AqlQueue::ExceptionHandler(hsa_signal_value_t error_code, void* arg) {
return false;
}
// Fallback if KFD does not support GPU core dump. In this case, the core dump is
// generated by hsa-runtime.
if (!core::Runtime::runtime_singleton_->KfdVersion().supports_core_dump &&
queue->agent_->isa()->GetMajorVersion() != 11) {
if (amd::coredump::dump_gpu_core())
debug_print("GPU core dump failed\n");
// The supports_core_dump flag is overwritten to avoid generating the core dump file again
// if the fault is also caught by a different exception handler, such as VMFaultHandler.
core::Runtime::runtime_singleton_->KfdVersion(
core::Runtime::runtime_singleton_->KfdVersion().supports_exception_debugging, true);
}
for (auto& error : QueueErrors) {
if (error_code & (1 << (error.code - 1))) {
errorCode = error.status;
+12 -3
Ver fichero
@@ -73,6 +73,7 @@
#include "core/util/os.h"
#include "core/inc/exceptions.h"
#include "inc/hsa_ven_amd_aqlprofile.h"
#include "core/inc/amd_core_dump.hpp"
#ifndef HSA_VERSION_MAJOR
#define HSA_VERSION_MAJOR 1
@@ -1696,6 +1697,7 @@ bool Runtime::VMFaultHandler(hsa_signal_value_t val, void* arg) {
hsa_status_t custom_handler_status = HSA_STATUS_ERROR;
auto system_event_handlers = runtime_singleton_->GetSystemEventHandlers();
Agent* faulty_agent = nullptr;
// If custom handler is registered, pack the fault info and call the handler
if (!system_event_handlers.empty()) {
hsa_amd_event_t memory_fault_event;
@@ -1705,7 +1707,7 @@ bool Runtime::VMFaultHandler(hsa_signal_value_t val, void* arg) {
// Find the faulty agent
auto it = runtime_singleton_->agents_by_node_.find(fault.NodeId);
assert(it != runtime_singleton_->agents_by_node_.end() && "Can't find faulty agent.");
Agent* faulty_agent = it->second.front();
faulty_agent = it->second.front();
fault_info.agent = Agent::Convert(faulty_agent);
fault_info.virtual_address = fault.VirtualAddress;
@@ -1767,12 +1769,12 @@ bool Runtime::VMFaultHandler(hsa_signal_value_t val, void* arg) {
reason += "Unknown";
}
core::Agent* faultingAgent = runtime_singleton_->agents_by_node_[fault.NodeId][0];
faulty_agent = runtime_singleton_->agents_by_node_[fault.NodeId][0];
fprintf(
stderr,
"Memory access fault by GPU node-%u (Agent handle: %p) on address %p%s. Reason: %s.\n",
fault.NodeId, reinterpret_cast<void*>(faultingAgent->public_handle().handle),
fault.NodeId, reinterpret_cast<void*>(faulty_agent->public_handle().handle),
reinterpret_cast<const void*>(fault.VirtualAddress),
(fault.Failure.Imprecise == 1) ? "(may not be exact address)" : "", reason.c_str());
@@ -1780,6 +1782,13 @@ bool Runtime::VMFaultHandler(hsa_signal_value_t val, void* arg) {
PrintMemoryMapNear(reinterpret_cast<void*>(fault.VirtualAddress));
#endif
}
// Fallback if KFD does not support GPU core dump. In this case, the core dump is
// generated by hsa-runtime.
if (faulty_agent && faulty_agent->isa()->GetMajorVersion() != 11 &&
!runtime_singleton_->KfdVersion().supports_core_dump) {
if (amd::coredump::dump_gpu_core())
debug_print("GPU core dump failed\n");
}
assert(false && "GPU memory access fault.");
std::abort();
}