From 49a527c82699e7171ca2d68395d0ebb429df7103 Mon Sep 17 00:00:00 2001 From: "Jayaprakash, Karthik" Date: Tue, 29 Apr 2025 01:49:02 -0400 Subject: [PATCH] SWDEV-506467 - Skip Abort in case of crash from the device. (#60) Change-Id: I964b2f2647d068202e9c38fcddb1337da754df8d [ROCm/clr commit: b2388dfb88443fe4773c789740368051662f0e68] --- projects/clr/hipamd/src/hip_internal.hpp | 10 +++- projects/clr/rocclr/device/device.cpp | 2 + projects/clr/rocclr/device/device.hpp | 5 ++ projects/clr/rocclr/device/rocm/rocdevice.cpp | 46 ++++++++++++++++++- projects/clr/rocclr/device/rocm/rocdevice.hpp | 7 +++ .../clr/rocclr/device/rocm/rocvirtual.hpp | 46 ++++++++++++------- projects/clr/rocclr/utils/flags.hpp | 2 + 7 files changed, 99 insertions(+), 19 deletions(-) diff --git a/projects/clr/hipamd/src/hip_internal.hpp b/projects/clr/hipamd/src/hip_internal.hpp index 250ddec44d..3352feacec 100644 --- a/projects/clr/hipamd/src/hip_internal.hpp +++ b/projects/clr/hipamd/src/hip_internal.hpp @@ -169,7 +169,10 @@ const char* ihipGetErrorName(hipError_t hip_error); #define HIP_RETURN_DURATION(ret, ...) \ hip::tls.last_command_error_ = ret; \ - if (DEBUG_HIP_7_PREVIEW & amd::CHANGE_HIP_GET_LAST_ERROR) { \ + if (amd::Device::IsDeviceNotUsable()) { \ + hip::tls.last_error_ = hipErrorNoDevice; \ + hip::tls.last_command_error_ = hipErrorNoDevice; \ + } else if (DEBUG_HIP_7_PREVIEW & amd::CHANGE_HIP_GET_LAST_ERROR) { \ if (hip::tls.last_command_error_ != hipSuccess && \ hip::tls.last_command_error_ != hipErrorNotReady) { \ hip::tls.last_error_ = hip::tls.last_command_error_; \ @@ -184,7 +187,10 @@ const char* ihipGetErrorName(hipError_t hip_error); #define HIP_RETURN(ret, ...) 
\ hip::tls.last_command_error_ = ret; \ - if (DEBUG_HIP_7_PREVIEW & amd::CHANGE_HIP_GET_LAST_ERROR) { \ + if (amd::Device::IsDeviceNotUsable()) { \ + hip::tls.last_error_ = hipErrorNoDevice; \ + hip::tls.last_command_error_ = hipErrorNoDevice; \ + } else if (DEBUG_HIP_7_PREVIEW & amd::CHANGE_HIP_GET_LAST_ERROR) { \ if (hip::tls.last_command_error_ != hipSuccess && \ hip::tls.last_command_error_ != hipErrorNotReady) { \ hip::tls.last_error_ = hip::tls.last_command_error_; \ diff --git a/projects/clr/rocclr/device/device.cpp b/projects/clr/rocclr/device/device.cpp index 2e9ef8155c..f02d5b7acb 100644 --- a/projects/clr/rocclr/device/device.cpp +++ b/projects/clr/rocclr/device/device.cpp @@ -336,6 +336,8 @@ Context* Device::glb_ctx_ = nullptr; Monitor Device::p2p_stage_ops_(true); Memory* Device::p2p_stage_ = nullptr; +std::atomic<bool> Device::device_not_usable_{false};  // Constant-initialized; set by the backend error callbacks + std::shared_mutex MemObjMap::AllocatedLock_ ROCCLR_INIT_PRIORITY(101); std::map MemObjMap::MemObjMap_ ROCCLR_INIT_PRIORITY(101); std::map MemObjMap::VirtualMemObjMap_ ROCCLR_INIT_PRIORITY(101); diff --git a/projects/clr/rocclr/device/device.hpp b/projects/clr/rocclr/device/device.hpp index 70714735d5..523eb56385 100644 --- a/projects/clr/rocclr/device/device.hpp +++ b/projects/clr/rocclr/device/device.hpp @@ -2162,6 +2162,9 @@ class Device : public RuntimeObject { virtual device::UriLocator* createUriLocator() const = 0; #endif #endif + + static bool IsDeviceNotUsable() { return device_not_usable_; }  //!< True once a backend error callback has set device_not_usable_ + protected: //! Enable the specified extension char* getExtensionString(); @@ -2196,6 +2199,8 @@ class Device : public RuntimeObject { uint64_t initial_heap_size_{HIP_INITIAL_DM_SIZE}; //!< Initial device heap size amd::Monitor activeQueuesLock_ {}; //!< Guards access to the activeQueues set std::unordered_set activeQueues; //!< The set of active queues + static std::atomic<bool> device_not_usable_; //!< If set, we should not launch any commands anymore. Atomic: written by the error callbacks, read via IsDeviceNotUsable(). 
+ private: const Isa *isa_; //!< Device isa bool IsTypeMatching(cl_device_type type, bool offlineDevices); diff --git a/projects/clr/rocclr/device/rocm/rocdevice.cpp b/projects/clr/rocclr/device/rocm/rocdevice.cpp index 282cb56c4c..82e1b00438 100644 --- a/projects/clr/rocclr/device/rocm/rocdevice.cpp +++ b/projects/clr/rocclr/device/rocm/rocdevice.cpp @@ -559,6 +559,10 @@ bool Device::init() { } } + if (amd::IS_HIP) { + RegisterBackendErrorCb(); + } + return true; } @@ -3392,6 +3396,41 @@ device::Signal* Device::createSignal() const { return new roc::Signal(); } +// ================================================================================================ +hsa_status_t Device::BackendErrorCallBackHandler(const hsa_amd_event_t* event, void* data) { + switch (event->event_type) { + case HSA_AMD_GPU_MEMORY_FAULT_EVENT: + LogError("Memory Fault Error"); + break; + case HSA_AMD_GPU_HW_EXCEPTION_EVENT: + LogError("HW Exception Error"); + break; + case HSA_AMD_GPU_MEMORY_ERROR_EVENT: + LogError("GPU Memory Error"); + break; + default: + LogError("Unknown Event Type "); + break; + } + + if (HIP_SKIP_ABORT_ON_GPU_ERROR) { + device_not_usable_ = true;  // Skip the abort; amd::Device::IsDeviceNotUsable() reports true from now on + } else { + abort(); + } + + return HSA_STATUS_SUCCESS; +} + +// ================================================================================================ +void Device::RegisterBackendErrorCb() { + // Register BackendErrorCallBackHandler as the HSA system event handler; a failure is only logged + hsa_status_t hsa_error = HSA_STATUS_SUCCESS; + hsa_error = hsa_amd_register_system_event_handler(BackendErrorCallBackHandler, nullptr); + if (hsa_error != HSA_STATUS_SUCCESS) { + LogError("Cannot Register Call back event handler"); + } +} // ================================================================================================ amd::Memory* Device::GetArenaMemObj(const void* ptr, size_t& offset, size_t size) { // Only create arena_mem_object if CPU memory is accessible from HMM @@ -3559,7 +3598,12 @@ void callbackQueue(hsa_status_t status, hsa_queue_t* queue, void* data) { 
"Callback: Queue %p aborting with error : %s code: 0x%x", queue->base_address, errorMsg, status); } - abort(); + + if (HIP_SKIP_ABORT_ON_GPU_ERROR) { + amd::Device::device_not_usable_ = true;  // Same flag the system event handler sets; replaces the unconditional abort + } else { + abort(); + } } } diff --git a/projects/clr/rocclr/device/rocm/rocdevice.hpp b/projects/clr/rocclr/device/rocm/rocdevice.hpp index 471cc5d8cc..967a3d8db4 100644 --- a/projects/clr/rocclr/device/rocm/rocdevice.hpp +++ b/projects/clr/rocclr/device/rocm/rocdevice.hpp @@ -541,6 +541,10 @@ class Device : public NullDevice { void getGlobalCUMask(std::string cuMaskStr); + static hsa_status_t BackendErrorCallBackHandler(const hsa_amd_event_t* event, void* data); + + static void RegisterBackendErrorCb(); + virtual amd::Memory* GetArenaMemObj(const void* ptr, size_t& offset, size_t size = 0); const uint32_t getPreferredNumaNode() const { return preferred_numa_node_; } @@ -646,6 +650,9 @@ class Device : public NullDevice { //! Code object to kernel info map (used in the crash dump analysis) mutable std::map kernel_map_; + //! callbackQueue is declared a friend so that it can set amd::Device::device_not_usable_. 
+ friend void callbackQueue(hsa_status_t status, hsa_queue_t* queue, void* data); + public: std::atomic numOfVgpus_; //!< Virtual gpu unique index diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.hpp b/projects/clr/rocclr/device/rocm/rocvirtual.hpp index 4e8cd7fad3..72027178f2 100644 --- a/projects/clr/rocclr/device/rocm/rocvirtual.hpp +++ b/projects/clr/rocclr/device/rocm/rocvirtual.hpp @@ -23,6 +23,7 @@ #include "platform/commandqueue.hpp" #include "rocdefs.hpp" #include "rocdevice.hpp" +#include "utils/flags.hpp" #include "utils/util.hpp" #include "hsa/hsa.h" #include "hsa/hsa_ext_image.h" @@ -46,26 +47,39 @@ constexpr static hsa_signal_value_t kInitSignalValueOne = 1; constexpr static uint64_t kTimeout100us = 100 * K; constexpr static uint64_t kUnlimitedWait = std::numeric_limits::max(); +constexpr static uint64_t kTimeout4Secs = 4 * M;  // NOTE(review): on the kTimeout100us (100 * K) scale, 4 * M looks like ~4 ms rather than 4 s - confirm K/M values and timeout units + inline bool WaitForSignal(hsa_signal_t signal, bool active_wait = false) { + + hsa_wait_state_t wait_state = HSA_WAIT_STATE_BLOCKED; + if (active_wait) { + wait_state = HSA_WAIT_STATE_ACTIVE; + } + if (hsa_signal_load_relaxed(signal) > 0) { - uint64_t timeout = kTimeout100us; - if (active_wait) { - timeout = kUnlimitedWait; + // When it is blocked wait, we wait in active state for 100 us before proceeding to wait in + // blocked state indefinitely. 
+ if (!active_wait) { + ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Host active wait for Signal = (0x%lx) for %lu ns", + signal.handle, kTimeout100us); + if (hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, kInitSignalValueOne, + kTimeout100us, HSA_WAIT_STATE_ACTIVE) != 0) { + if (HIP_SKIP_ABORT_ON_GPU_ERROR && amd::Device::IsDeviceNotUsable()) { + ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Device not Stable, while waiting for Signal =" + "(0x%lx) for %lu ns", signal.handle, kTimeout100us); + return true; + } + } } - ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Host active wait for Signal = (0x%lx) for %d ns", - signal.handle, timeout); - - // Active wait with a timeout - if (hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, kInitSignalValueOne, - timeout, HSA_WAIT_STATE_ACTIVE) != 0) { - ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Host blocked wait for Signal = (0x%lx)", - signal.handle); - - // Wait until the completion with CPU suspend - if (hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, kInitSignalValueOne, - kUnlimitedWait, HSA_WAIT_STATE_BLOCKED) != 0) { - return false; + // This is unlimited wait, but we wait for 4 secs and check if the device is + // unstable, if so we return, otherwise we continue to wait in the while loop. 
+ while (hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, kInitSignalValueOne, + kTimeout4Secs, wait_state) != 0) { + if (HIP_SKIP_ABORT_ON_GPU_ERROR && amd::Device::IsDeviceNotUsable()) { + ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Device not Stable, while waiting for Signal =" + "(0x%lx) for %lu ns", signal.handle, kTimeout4Secs); + return true; } } } diff --git a/projects/clr/rocclr/utils/flags.hpp b/projects/clr/rocclr/utils/flags.hpp index d3bb383b56..b607e73154 100644 --- a/projects/clr/rocclr/utils/flags.hpp +++ b/projects/clr/rocclr/utils/flags.hpp @@ -278,6 +278,8 @@ release(bool, DEBUG_HIP_DYNAMIC_QUEUES, true, \ release(uint, DEBUG_HIP_7_PREVIEW, 0, \ "Enables specific backward incompatible changes support before 7.0," \ "using the mask. By default the changes are disabled and is set to 0")\ +release(bool, HIP_SKIP_ABORT_ON_GPU_ERROR, false, \ + "Set this to true, to avoid host side abort for GPU errors") \ release(bool, HIP_FORCE_SPIRV_CODEOBJECT, false, \ "Force use of SPIRV instead of device specific code object.") \