From 67376e06abcc403fbc836eccfe560c1f3f78869e Mon Sep 17 00:00:00 2001 From: Sean Keely Date: Tue, 19 Feb 2019 22:06:27 -0600 Subject: [PATCH] Report SRAM ECC errors through the system event handler. Modify the system event handler to support multiple users. Name memory fault reason codes. Change-Id: I1b5979b36ab15637eb2be59a61e2d57e76d0a70e --- runtime/hsa-runtime/core/inc/runtime.h | 20 +++--- runtime/hsa-runtime/core/runtime/runtime.cpp | 71 +++++++++++++------- runtime/hsa-runtime/inc/hsa_ext_amd.h | 35 +++++++--- 3 files changed, 82 insertions(+), 44 deletions(-) diff --git a/runtime/hsa-runtime/core/inc/runtime.h b/runtime/hsa-runtime/core/inc/runtime.h index 7f88cc6705..924ee93070 100644 --- a/runtime/hsa-runtime/core/inc/runtime.h +++ b/runtime/hsa-runtime/core/inc/runtime.h @@ -47,6 +47,7 @@ #include #include +#include #include "core/inc/hsa_ext_interface.h" #include "core/inc/hsa_internal.h" @@ -323,12 +324,6 @@ class Runtime { hsa_status_t SetCustomSystemEventHandler(hsa_amd_system_event_callback_t callback, void* data); - void* GetCustomSystemEventData() { return system_event_handler_user_data_; } - - AMD::callback_t GetCustomSystemEventHandler() { - return system_event_handler_; - } - hsa_status_t SetInternalQueueCreateNotifier(hsa_amd_runtime_queue_notifier callback, void* user_data); @@ -419,6 +414,11 @@ class Runtime { // @brief Binds virtual memory access fault handler to this node. void BindVmFaultHandler(); + // @brief Acquire snapshot of system event handlers. + // Returns a copy to avoid holding a lock during callbacks. + std::vector, void*>> + GetSystemEventHandlers(); + /// @brief Get the index of ::link_matrix_. /// @param [in] node_id_from Node id of the source node. /// @param [in] node_id_to Node id of the destination node. @@ -494,10 +494,12 @@ class Runtime { // @brief HSA signal to contain the VM fault event. Signal* vm_fault_signal_; - // Custom system event handler. - AMD::callback_t system_event_handler_; + // Custom system event handlers. + std::vector, void*>> + system_event_handlers_; - void* system_event_handler_user_data_; + // System event handler lock + KernelMutex system_event_lock_; // Internal queue creation notifier AMD::callback_t internal_queue_create_notifier_; diff --git a/runtime/hsa-runtime/core/runtime/runtime.cpp b/runtime/hsa-runtime/core/runtime/runtime.cpp index ffc39afcc5..d1d4a5f9f3 100644 --- a/runtime/hsa-runtime/core/runtime/runtime.cpp +++ b/runtime/hsa-runtime/core/runtime/runtime.cpp @@ -1037,9 +1037,12 @@ bool Runtime::VMFaultHandler(hsa_signal_value_t val, void* arg) { vm_fault_event->EventData.EventData.MemoryAccessFault; hsa_status_t custom_handler_status = HSA_STATUS_ERROR; + auto system_event_handlers = runtime_singleton_->GetSystemEventHandlers(); // If custom handler is registered, pack the fault info and call the handler - if (runtime_singleton_->GetCustomSystemEventHandler()) { - hsa_amd_gpu_memory_fault_info_t fault_info; + if (!system_event_handlers.empty()) { + hsa_amd_event_t memory_fault_event; + memory_fault_event.event_type = HSA_AMD_GPU_MEMORY_FAULT_EVENT; + hsa_amd_gpu_memory_fault_info_t& fault_info = memory_fault_event.memory_fault; // Find the faulty agent auto it = runtime_singleton_->agents_by_node_.find(fault.NodeId); @@ -1048,30 +1051,39 @@ bool Runtime::VMFaultHandler(hsa_signal_value_t val, void* arg) { fault_info.agent = Agent::Convert(faulty_agent); fault_info.virtual_address = fault.VirtualAddress; - fault_info.fault_reason_mask = 0x00000000; + fault_info.fault_reason_mask = 0; if (fault.Failure.NotPresent == 1) { - fault_info.fault_reason_mask = fault_info.fault_reason_mask | 0x00000001; + fault_info.fault_reason_mask |= HSA_AMD_MEMORY_FAULT_PAGE_NOT_PRESENT; } if (fault.Failure.ReadOnly == 1) { - fault_info.fault_reason_mask = fault_info.fault_reason_mask | 0x00000010; + fault_info.fault_reason_mask |= HSA_AMD_MEMORY_FAULT_READ_ONLY; } if (fault.Failure.NoExecute == 1) { - fault_info.fault_reason_mask = fault_info.fault_reason_mask | 0x00000100; + fault_info.fault_reason_mask |= HSA_AMD_MEMORY_FAULT_NX; } if (fault.Failure.GpuAccess == 1) { - fault_info.fault_reason_mask = fault_info.fault_reason_mask | 0x00001000; - } - if (fault.Failure.ECC == 1) { - fault_info.fault_reason_mask = fault_info.fault_reason_mask | 0x00010000; + fault_info.fault_reason_mask |= HSA_AMD_MEMORY_FAULT_HOST_ONLY; } if (fault.Failure.Imprecise == 1) { - fault_info.fault_reason_mask = fault_info.fault_reason_mask | 0x00100000; + fault_info.fault_reason_mask |= HSA_AMD_MEMORY_FAULT_IMPRECISE; + } + if (fault.Failure.ECC == 1 && fault.Failure.ErrorType == 0) { + fault_info.fault_reason_mask |= HSA_AMD_MEMORY_FAULT_DRAM_ECC; + } + if (fault.Failure.ErrorType == 1) { + fault_info.fault_reason_mask |= HSA_AMD_MEMORY_FAULT_SRAM_ECC; + } + if (fault.Failure.ErrorType == 2) { + fault_info.fault_reason_mask |= HSA_AMD_MEMORY_FAULT_DRAM_ECC; + } + if (fault.Failure.ErrorType == 3) { + fault_info.fault_reason_mask |= HSA_AMD_MEMORY_FAULT_HANG; + } + + for (auto& callback : system_event_handlers) { + hsa_status_t err = callback.first(&memory_fault_event, callback.second); + if (err == HSA_STATUS_SUCCESS) custom_handler_status = HSA_STATUS_SUCCESS; } - hsa_amd_event_t memory_fault_event; - memory_fault_event.event_type = GPU_MEMORY_FAULT_EVENT; - memory_fault_event.memory_fault = fault_info; - custom_handler_status = runtime_singleton_->GetCustomSystemEventHandler()( - &memory_fault_event, runtime_singleton_->GetCustomSystemEventData()); } // No custom VM fault handler registered or it failed. @@ -1086,8 +1098,13 @@ bool Runtime::VMFaultHandler(hsa_signal_value_t val, void* arg) { reason += "Execute access to a page marked NX"; } else if (fault.Failure.GpuAccess == 1) { reason += "Host access only"; - } else if (fault.Failure.ECC == 1) { - reason += "ECC failure (if supported by HW)"; + } else if ((fault.Failure.ECC == 1 && fault.Failure.ErrorType == 0) || + fault.Failure.ErrorType == 2) { + reason += "DRAM ECC failure"; + } else if (fault.Failure.ErrorType == 1) { + reason += "SRAM ECC failure"; + } else if (fault.Failure.ErrorType == 3) { + reason += "Generic hang recovery"; } else { reason += "Unknown"; } @@ -1166,7 +1183,6 @@ Runtime::Runtime() sys_clock_freq_(0), vm_fault_event_(nullptr), vm_fault_signal_(nullptr), - system_event_handler_user_data_(nullptr), ref_count_(0) {} hsa_status_t Runtime::Load() { @@ -1437,13 +1453,16 @@ void Runtime::AsyncEvents::Clear() { hsa_status_t Runtime::SetCustomSystemEventHandler(hsa_amd_system_event_callback_t callback, void* data) { - if (system_event_handler_) { - return HSA_STATUS_ERROR; - } else { - system_event_handler_ = callback; - system_event_handler_user_data_ = data; - return HSA_STATUS_SUCCESS; - } + ScopedAcquire lock(&system_event_lock_); + system_event_handlers_.push_back( + std::make_pair(AMD::callback_t(callback), data)); + return HSA_STATUS_SUCCESS; +} + +std::vector, void*>> +Runtime::GetSystemEventHandlers() { + ScopedAcquire lock(&system_event_lock_); + return system_event_handlers_; } hsa_status_t Runtime::SetInternalQueueCreateNotifier(hsa_amd_runtime_queue_notifier callback, diff --git a/runtime/hsa-runtime/inc/hsa_ext_amd.h b/runtime/hsa-runtime/inc/hsa_ext_amd.h index 2e5f75cda2..50ca816821 100644 --- a/runtime/hsa-runtime/inc/hsa_ext_amd.h +++ b/runtime/hsa-runtime/inc/hsa_ext_amd.h @@ -1675,9 +1675,32 @@ typedef enum hsa_amd_event_type_s { /* AMD GPU memory fault. */ - GPU_MEMORY_FAULT_EVENT = 0, + HSA_AMD_GPU_MEMORY_FAULT_EVENT = 0, + GPU_MEMORY_FAULT_EVENT = 0 } hsa_amd_event_type_t; +/** + * @brief Flags denoting the cause of a memory fault. + */ +typedef enum { + // Page not present or supervisor privilege. + HSA_AMD_MEMORY_FAULT_PAGE_NOT_PRESENT = 1 << 0, + // Write access to a read-only page. + HSA_AMD_MEMORY_FAULT_READ_ONLY = 1 << 1, + // Execute access to a page marked NX. + HSA_AMD_MEMORY_FAULT_NX = 1 << 2, + // GPU attempted access to a host only page. + HSA_AMD_MEMORY_FAULT_HOST_ONLY = 1 << 3, + // DRAM ECC failure. + HSA_AMD_MEMORY_FAULT_DRAM_ECC = 1 << 4, + // Can't determine the exact fault address. + HSA_AMD_MEMORY_FAULT_IMPRECISE = 1 << 5, + // SRAM ECC failure (ie registers, no fault address). + HSA_AMD_MEMORY_FAULT_SRAM_ECC = 1 << 6, + // GPU reset following unspecified hang. + HSA_AMD_MEMORY_FAULT_HANG = 1 << 31 +} hsa_amd_memory_fault_reason_t; + /** * @brief AMD GPU memory fault event data. */ @@ -1692,13 +1715,7 @@ typedef struct hsa_amd_gpu_memory_fault_info_s { uint64_t virtual_address; /* Bit field encoding the memory access failure reasons. There could be multiple bits set - for one fault. - 0x00000001 Page not present or supervisor privilege. - 0x00000010 Write access to a read-only page. - 0x00000100 Execute access to a page marked NX. - 0x00001000 Host access only. - 0x00010000 ECC failure (if supported by HW). - 0x00100000 Can't determine the exact fault address. + for one fault. Bits are defined in hsa_amd_memory_fault_reason_t. */ uint32_t fault_reason_mask; } hsa_amd_gpu_memory_fault_info_t; @@ -1713,7 +1730,7 @@ typedef struct hsa_amd_event_s { hsa_amd_event_type_t event_type; union { /* - The memory fault info, only valid when @p event_type is GPU_MEMORY_FAULT_EVENT. + The memory fault info, only valid when @p event_type is HSA_AMD_GPU_MEMORY_FAULT_EVENT. */ hsa_amd_gpu_memory_fault_info_t memory_fault; };