Report SRAM ECC errors through the system event handler.
Modify the system event handler to support multiple users. Name memory fault reason codes. Change-Id: I1b5979b36ab15637eb2be59a61e2d57e76d0a70e
This commit is contained in:
@@ -47,6 +47,7 @@
|
||||
|
||||
#include <vector>
|
||||
#include <map>
|
||||
#include <utility>
|
||||
|
||||
#include "core/inc/hsa_ext_interface.h"
|
||||
#include "core/inc/hsa_internal.h"
|
||||
@@ -323,12 +324,6 @@ class Runtime {
|
||||
hsa_status_t SetCustomSystemEventHandler(hsa_amd_system_event_callback_t callback,
|
||||
void* data);
|
||||
|
||||
void* GetCustomSystemEventData() { return system_event_handler_user_data_; }
|
||||
|
||||
AMD::callback_t<hsa_amd_system_event_callback_t> GetCustomSystemEventHandler() {
|
||||
return system_event_handler_;
|
||||
}
|
||||
|
||||
hsa_status_t SetInternalQueueCreateNotifier(hsa_amd_runtime_queue_notifier callback,
|
||||
void* user_data);
|
||||
|
||||
@@ -419,6 +414,11 @@ class Runtime {
|
||||
// @brief Binds virtual memory access fault handler to this node.
|
||||
void BindVmFaultHandler();
|
||||
|
||||
// @brief Acquire snapshot of system event handlers.
|
||||
// Returns a copy to avoid holding a lock during callbacks.
|
||||
std::vector<std::pair<AMD::callback_t<hsa_amd_system_event_callback_t>, void*>>
|
||||
GetSystemEventHandlers();
|
||||
|
||||
/// @brief Get the index of ::link_matrix_.
|
||||
/// @param [in] node_id_from Node id of the source node.
|
||||
/// @param [in] node_id_to Node id of the destination node.
|
||||
@@ -494,10 +494,12 @@ class Runtime {
|
||||
// @brief HSA signal to contain the VM fault event.
|
||||
Signal* vm_fault_signal_;
|
||||
|
||||
// Custom system event handler.
|
||||
AMD::callback_t<hsa_amd_system_event_callback_t> system_event_handler_;
|
||||
// Custom system event handlers.
|
||||
std::vector<std::pair<AMD::callback_t<hsa_amd_system_event_callback_t>, void*>>
|
||||
system_event_handlers_;
|
||||
|
||||
void* system_event_handler_user_data_;
|
||||
// System event handler lock
|
||||
KernelMutex system_event_lock_;
|
||||
|
||||
// Internal queue creation notifier
|
||||
AMD::callback_t<hsa_amd_runtime_queue_notifier> internal_queue_create_notifier_;
|
||||
|
||||
@@ -1037,9 +1037,12 @@ bool Runtime::VMFaultHandler(hsa_signal_value_t val, void* arg) {
|
||||
vm_fault_event->EventData.EventData.MemoryAccessFault;
|
||||
|
||||
hsa_status_t custom_handler_status = HSA_STATUS_ERROR;
|
||||
auto system_event_handlers = runtime_singleton_->GetSystemEventHandlers();
|
||||
// If custom handler is registered, pack the fault info and call the handler
|
||||
if (runtime_singleton_->GetCustomSystemEventHandler()) {
|
||||
hsa_amd_gpu_memory_fault_info_t fault_info;
|
||||
if (!system_event_handlers.empty()) {
|
||||
hsa_amd_event_t memory_fault_event;
|
||||
memory_fault_event.event_type = HSA_AMD_GPU_MEMORY_FAULT_EVENT;
|
||||
hsa_amd_gpu_memory_fault_info_t& fault_info = memory_fault_event.memory_fault;
|
||||
|
||||
// Find the faulty agent
|
||||
auto it = runtime_singleton_->agents_by_node_.find(fault.NodeId);
|
||||
@@ -1048,30 +1051,39 @@ bool Runtime::VMFaultHandler(hsa_signal_value_t val, void* arg) {
|
||||
fault_info.agent = Agent::Convert(faulty_agent);
|
||||
|
||||
fault_info.virtual_address = fault.VirtualAddress;
|
||||
fault_info.fault_reason_mask = 0x00000000;
|
||||
fault_info.fault_reason_mask = 0;
|
||||
if (fault.Failure.NotPresent == 1) {
|
||||
fault_info.fault_reason_mask = fault_info.fault_reason_mask | 0x00000001;
|
||||
fault_info.fault_reason_mask |= HSA_AMD_MEMORY_FAULT_PAGE_NOT_PRESENT;
|
||||
}
|
||||
if (fault.Failure.ReadOnly == 1) {
|
||||
fault_info.fault_reason_mask = fault_info.fault_reason_mask | 0x00000010;
|
||||
fault_info.fault_reason_mask |= HSA_AMD_MEMORY_FAULT_READ_ONLY;
|
||||
}
|
||||
if (fault.Failure.NoExecute == 1) {
|
||||
fault_info.fault_reason_mask = fault_info.fault_reason_mask | 0x00000100;
|
||||
fault_info.fault_reason_mask |= HSA_AMD_MEMORY_FAULT_NX;
|
||||
}
|
||||
if (fault.Failure.GpuAccess == 1) {
|
||||
fault_info.fault_reason_mask = fault_info.fault_reason_mask | 0x00001000;
|
||||
}
|
||||
if (fault.Failure.ECC == 1) {
|
||||
fault_info.fault_reason_mask = fault_info.fault_reason_mask | 0x00010000;
|
||||
fault_info.fault_reason_mask |= HSA_AMD_MEMORY_FAULT_HOST_ONLY;
|
||||
}
|
||||
if (fault.Failure.Imprecise == 1) {
|
||||
fault_info.fault_reason_mask = fault_info.fault_reason_mask | 0x00100000;
|
||||
fault_info.fault_reason_mask |= HSA_AMD_MEMORY_FAULT_IMPRECISE;
|
||||
}
|
||||
if (fault.Failure.ECC == 1 && fault.Failure.ErrorType == 0) {
|
||||
fault_info.fault_reason_mask |= HSA_AMD_MEMORY_FAULT_DRAM_ECC;
|
||||
}
|
||||
if (fault.Failure.ErrorType == 1) {
|
||||
fault_info.fault_reason_mask |= HSA_AMD_MEMORY_FAULT_SRAM_ECC;
|
||||
}
|
||||
if (fault.Failure.ErrorType == 2) {
|
||||
fault_info.fault_reason_mask |= HSA_AMD_MEMORY_FAULT_DRAM_ECC;
|
||||
}
|
||||
if (fault.Failure.ErrorType == 3) {
|
||||
fault_info.fault_reason_mask |= HSA_AMD_MEMORY_FAULT_HANG;
|
||||
}
|
||||
|
||||
for (auto& callback : system_event_handlers) {
|
||||
hsa_status_t err = callback.first(&memory_fault_event, callback.second);
|
||||
if (err == HSA_STATUS_SUCCESS) custom_handler_status = HSA_STATUS_SUCCESS;
|
||||
}
|
||||
hsa_amd_event_t memory_fault_event;
|
||||
memory_fault_event.event_type = GPU_MEMORY_FAULT_EVENT;
|
||||
memory_fault_event.memory_fault = fault_info;
|
||||
custom_handler_status = runtime_singleton_->GetCustomSystemEventHandler()(
|
||||
&memory_fault_event, runtime_singleton_->GetCustomSystemEventData());
|
||||
}
|
||||
|
||||
// No custom VM fault handler registered or it failed.
|
||||
@@ -1086,8 +1098,13 @@ bool Runtime::VMFaultHandler(hsa_signal_value_t val, void* arg) {
|
||||
reason += "Execute access to a page marked NX";
|
||||
} else if (fault.Failure.GpuAccess == 1) {
|
||||
reason += "Host access only";
|
||||
} else if (fault.Failure.ECC == 1) {
|
||||
reason += "ECC failure (if supported by HW)";
|
||||
} else if ((fault.Failure.ECC == 1 && fault.Failure.ErrorType == 0) ||
|
||||
fault.Failure.ErrorType == 2) {
|
||||
reason += "DRAM ECC failure";
|
||||
} else if (fault.Failure.ErrorType == 1) {
|
||||
reason += "SRAM ECC failure";
|
||||
} else if (fault.Failure.ErrorType == 3) {
|
||||
reason += "Generic hang recovery";
|
||||
} else {
|
||||
reason += "Unknown";
|
||||
}
|
||||
@@ -1166,7 +1183,6 @@ Runtime::Runtime()
|
||||
sys_clock_freq_(0),
|
||||
vm_fault_event_(nullptr),
|
||||
vm_fault_signal_(nullptr),
|
||||
system_event_handler_user_data_(nullptr),
|
||||
ref_count_(0) {}
|
||||
|
||||
hsa_status_t Runtime::Load() {
|
||||
@@ -1437,13 +1453,16 @@ void Runtime::AsyncEvents::Clear() {
|
||||
|
||||
/// @brief Registers an additional system event handler.
///
/// Appends the (callback, data) pair to system_event_handlers_ while holding
/// system_event_lock_. Earlier registrations are kept, so several users can
/// be registered at the same time.
///
/// @param [in] callback Handler to append to the handler list.
/// @param [in] data Opaque pointer stored next to @p callback.
///
/// @retval HSA_STATUS_SUCCESS The handler was appended.
hsa_status_t Runtime::SetCustomSystemEventHandler(hsa_amd_system_event_callback_t callback,
                                                  void* data) {
  // The former single-handler check (system_event_handler_ /
  // system_event_handler_user_data_) returned on both of its branches, which
  // left the registration below unreachable; it is intentionally gone.
  ScopedAcquire<KernelMutex> lock(&system_event_lock_);
  system_event_handlers_.push_back(
      std::make_pair(AMD::callback_t<hsa_amd_system_event_callback_t>(callback), data));
  return HSA_STATUS_SUCCESS;
}
|
||||
|
||||
std::vector<std::pair<AMD::callback_t<hsa_amd_system_event_callback_t>, void*>>
|
||||
Runtime::GetSystemEventHandlers() {
|
||||
ScopedAcquire<KernelMutex> lock(&system_event_lock_);
|
||||
return system_event_handlers_;
|
||||
}
|
||||
|
||||
hsa_status_t Runtime::SetInternalQueueCreateNotifier(hsa_amd_runtime_queue_notifier callback,
|
||||
|
||||
@@ -1675,9 +1675,32 @@ typedef enum hsa_amd_event_type_s {
|
||||
/*
|
||||
AMD GPU memory fault.
|
||||
*/
|
||||
GPU_MEMORY_FAULT_EVENT = 0,
|
||||
HSA_AMD_GPU_MEMORY_FAULT_EVENT = 0,
|
||||
GPU_MEMORY_FAULT_EVENT = 0
|
||||
} hsa_amd_event_type_t;
|
||||
|
||||
/**
|
||||
* @brief Flags denoting the cause of a memory fault.
|
||||
*/
|
||||
typedef enum {
  // Page not present or supervisor privilege.
  HSA_AMD_MEMORY_FAULT_PAGE_NOT_PRESENT = 1 << 0,
  // Write access to a read-only page.
  HSA_AMD_MEMORY_FAULT_READ_ONLY = 1 << 1,
  // Execute access to a page marked NX.
  HSA_AMD_MEMORY_FAULT_NX = 1 << 2,
  // GPU attempted access to a host only page.
  HSA_AMD_MEMORY_FAULT_HOST_ONLY = 1 << 3,
  // DRAM ECC failure.
  HSA_AMD_MEMORY_FAULT_DRAM_ECC = 1 << 4,
  // Can't determine the exact fault address.
  HSA_AMD_MEMORY_FAULT_IMPRECISE = 1 << 5,
  // SRAM ECC failure (i.e. registers, no fault address).
  HSA_AMD_MEMORY_FAULT_SRAM_ECC = 1 << 6,
  // GPU reset following unspecified hang.
  // Unsigned literal: shifting a signed 1 into bit 31 overflows int and would
  // produce a negative enumerator that sign-extends when widened.
  HSA_AMD_MEMORY_FAULT_HANG = 1U << 31
} hsa_amd_memory_fault_reason_t;
|
||||
|
||||
/**
|
||||
* @brief AMD GPU memory fault event data.
|
||||
*/
|
||||
@@ -1692,13 +1715,7 @@ typedef struct hsa_amd_gpu_memory_fault_info_s {
|
||||
uint64_t virtual_address;
|
||||
/*
|
||||
Bit field encoding the memory access failure reasons. There could be multiple bits set
|
||||
for one fault.
|
||||
0x00000001 Page not present or supervisor privilege.
|
||||
0x00000010 Write access to a read-only page.
|
||||
0x00000100 Execute access to a page marked NX.
|
||||
0x00001000 Host access only.
|
||||
0x00010000 ECC failure (if supported by HW).
|
||||
0x00100000 Can't determine the exact fault address.
|
||||
for one fault. Bits are defined in hsa_amd_memory_fault_reason_t.
|
||||
*/
|
||||
uint32_t fault_reason_mask;
|
||||
} hsa_amd_gpu_memory_fault_info_t;
|
||||
@@ -1713,7 +1730,7 @@ typedef struct hsa_amd_event_s {
|
||||
hsa_amd_event_type_t event_type;
|
||||
union {
|
||||
/*
|
||||
The memory fault info, only valid when @p event_type is GPU_MEMORY_FAULT_EVENT.
|
||||
The memory fault info, only valid when @p event_type is HSA_AMD_GPU_MEMORY_FAULT_EVENT.
|
||||
*/
|
||||
hsa_amd_gpu_memory_fault_info_t memory_fault;
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user