core dump: Generates a core dump from a fault event
Extracts and creates a core dump ELF file from a fault event, using the core dump front end. GFX11 is not supported. Signed-off-by: Alex Sierra <Alex.Sierra@amd.com> Change-Id: I5ae154e886f39ab3ce7bbae5803efb27a96c7e2e
Este commit está contenido en:
cometido por
Alejandro Sierra Guiza
padre
5d3f6a63f1
commit
cbeddf9eb6
@@ -68,6 +68,7 @@
|
||||
#include "core/inc/hsa_ext_amd_impl.h"
|
||||
#include "core/inc/amd_gpu_pm4.h"
|
||||
#include "core/inc/hsa_amd_tool_int.hpp"
|
||||
#include "core/inc/amd_core_dump.hpp"
|
||||
|
||||
namespace rocr {
|
||||
namespace AMD {
|
||||
@@ -1261,6 +1262,18 @@ bool AqlQueue::ExceptionHandler(hsa_signal_value_t error_code, void* arg) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Fallback if KFD does not support GPU core dump. In this case, the core dump is
|
||||
// generated by hsa-runtime.
|
||||
if (!core::Runtime::runtime_singleton_->KfdVersion().supports_core_dump &&
|
||||
queue->agent_->isa()->GetMajorVersion() != 11) {
|
||||
if (amd::coredump::dump_gpu_core())
|
||||
debug_print("GPU core dump failed\n");
|
||||
// The supports_core_dump flag is overwritten to avoid generating the core dump file again
|
||||
// if the fault is also caught by a different exception handler, such as VMFaultHandler.
|
||||
core::Runtime::runtime_singleton_->KfdVersion(
|
||||
core::Runtime::runtime_singleton_->KfdVersion().supports_exception_debugging, true);
|
||||
}
|
||||
|
||||
for (auto& error : QueueErrors) {
|
||||
if (error_code & (1 << (error.code - 1))) {
|
||||
errorCode = error.status;
|
||||
|
||||
@@ -73,6 +73,7 @@
|
||||
#include "core/util/os.h"
|
||||
#include "core/inc/exceptions.h"
|
||||
#include "inc/hsa_ven_amd_aqlprofile.h"
|
||||
#include "core/inc/amd_core_dump.hpp"
|
||||
|
||||
#ifndef HSA_VERSION_MAJOR
|
||||
#define HSA_VERSION_MAJOR 1
|
||||
@@ -1696,6 +1697,7 @@ bool Runtime::VMFaultHandler(hsa_signal_value_t val, void* arg) {
|
||||
|
||||
hsa_status_t custom_handler_status = HSA_STATUS_ERROR;
|
||||
auto system_event_handlers = runtime_singleton_->GetSystemEventHandlers();
|
||||
Agent* faulty_agent = nullptr;
|
||||
// If custom handler is registered, pack the fault info and call the handler
|
||||
if (!system_event_handlers.empty()) {
|
||||
hsa_amd_event_t memory_fault_event;
|
||||
@@ -1705,7 +1707,7 @@ bool Runtime::VMFaultHandler(hsa_signal_value_t val, void* arg) {
|
||||
// Find the faulty agent
|
||||
auto it = runtime_singleton_->agents_by_node_.find(fault.NodeId);
|
||||
assert(it != runtime_singleton_->agents_by_node_.end() && "Can't find faulty agent.");
|
||||
Agent* faulty_agent = it->second.front();
|
||||
faulty_agent = it->second.front();
|
||||
fault_info.agent = Agent::Convert(faulty_agent);
|
||||
|
||||
fault_info.virtual_address = fault.VirtualAddress;
|
||||
@@ -1767,12 +1769,12 @@ bool Runtime::VMFaultHandler(hsa_signal_value_t val, void* arg) {
|
||||
reason += "Unknown";
|
||||
}
|
||||
|
||||
core::Agent* faultingAgent = runtime_singleton_->agents_by_node_[fault.NodeId][0];
|
||||
faulty_agent = runtime_singleton_->agents_by_node_[fault.NodeId][0];
|
||||
|
||||
fprintf(
|
||||
stderr,
|
||||
"Memory access fault by GPU node-%u (Agent handle: %p) on address %p%s. Reason: %s.\n",
|
||||
fault.NodeId, reinterpret_cast<void*>(faultingAgent->public_handle().handle),
|
||||
fault.NodeId, reinterpret_cast<void*>(faulty_agent->public_handle().handle),
|
||||
reinterpret_cast<const void*>(fault.VirtualAddress),
|
||||
(fault.Failure.Imprecise == 1) ? "(may not be exact address)" : "", reason.c_str());
|
||||
|
||||
@@ -1780,6 +1782,13 @@ bool Runtime::VMFaultHandler(hsa_signal_value_t val, void* arg) {
|
||||
PrintMemoryMapNear(reinterpret_cast<void*>(fault.VirtualAddress));
|
||||
#endif
|
||||
}
|
||||
// Fallback if KFD does not support GPU core dump. In this case, the core dump is
|
||||
// generated by hsa-runtime.
|
||||
if (faulty_agent && faulty_agent->isa()->GetMajorVersion() != 11 &&
|
||||
!runtime_singleton_->KfdVersion().supports_core_dump) {
|
||||
if (amd::coredump::dump_gpu_core())
|
||||
debug_print("GPU core dump failed\n");
|
||||
}
|
||||
assert(false && "GPU memory access fault.");
|
||||
std::abort();
|
||||
}
|
||||
|
||||
Referencia en una nueva incidencia
Block a user