SWDEV-506467 - Skip Abort in case of crash from the device. (#60)

Change-Id: I964b2f2647d068202e9c38fcddb1337da754df8d

[ROCm/clr commit: b2388dfb88]
Этот коммит содержится в:
Jayaprakash, Karthik
2025-04-29 01:49:02 -04:00
коммит произвёл GitHub
родитель ae0640131e
Коммит 49a527c826
7 изменённых файлов: 99 добавлений и 19 удалений
+8 -2
Просмотреть файл
@@ -169,7 +169,10 @@ const char* ihipGetErrorName(hipError_t hip_error);
#define HIP_RETURN_DURATION(ret, ...) \
hip::tls.last_command_error_ = ret; \
if (DEBUG_HIP_7_PREVIEW & amd::CHANGE_HIP_GET_LAST_ERROR) { \
if (amd::Device::IsDeviceNotUsable()) { \
hip::tls.last_error_ = hipErrorNoDevice; \
hip::tls.last_command_error_ = hipErrorNoDevice; \
} else if (DEBUG_HIP_7_PREVIEW & amd::CHANGE_HIP_GET_LAST_ERROR) { \
if (hip::tls.last_command_error_ != hipSuccess && \
hip::tls.last_command_error_ != hipErrorNotReady) { \
hip::tls.last_error_ = hip::tls.last_command_error_; \
@@ -184,7 +187,10 @@ const char* ihipGetErrorName(hipError_t hip_error);
#define HIP_RETURN(ret, ...) \
hip::tls.last_command_error_ = ret; \
if (DEBUG_HIP_7_PREVIEW & amd::CHANGE_HIP_GET_LAST_ERROR) { \
if (amd::Device::IsDeviceNotUsable()) { \
hip::tls.last_error_ = hipErrorNoDevice; \
hip::tls.last_command_error_ = hipErrorNoDevice; \
} else if (DEBUG_HIP_7_PREVIEW & amd::CHANGE_HIP_GET_LAST_ERROR) { \
if (hip::tls.last_command_error_ != hipSuccess && \
hip::tls.last_command_error_ != hipErrorNotReady) { \
hip::tls.last_error_ = hip::tls.last_command_error_; \
+2
Просмотреть файл
@@ -336,6 +336,8 @@ Context* Device::glb_ctx_ = nullptr;
Monitor Device::p2p_stage_ops_(true);
Memory* Device::p2p_stage_ = nullptr;
// Process-wide flag, initially clear. The error callbacks visible in this change set it when a
// GPU error is reported and HIP_SKIP_ABORT_ON_GPU_ERROR is enabled, instead of aborting the host.
bool Device::device_not_usable_ = false;
std::shared_mutex MemObjMap::AllocatedLock_ ROCCLR_INIT_PRIORITY(101);
std::map<uintptr_t, amd::Memory*> MemObjMap::MemObjMap_ ROCCLR_INIT_PRIORITY(101);
std::map<uintptr_t, amd::Memory*> MemObjMap::VirtualMemObjMap_ ROCCLR_INIT_PRIORITY(101);
+5
Просмотреть файл
@@ -2162,6 +2162,9 @@ class Device : public RuntimeObject {
virtual device::UriLocator* createUriLocator() const = 0;
#endif
#endif
//! Returns the current value of the device_not_usable_ flag.
//! NOTE(review): the flag is declared as a plain bool; if it is written from a runtime
//! callback thread while other threads poll it here, an atomic may be required - confirm.
static bool IsDeviceNotUsable() {
  return device_not_usable_;
}
protected:
//! Enable the specified extension
char* getExtensionString();
@@ -2196,6 +2199,8 @@ class Device : public RuntimeObject {
uint64_t initial_heap_size_{HIP_INITIAL_DM_SIZE}; //!< Initial device heap size
amd::Monitor activeQueuesLock_ {}; //!< Guards access to the activeQueues set
std::unordered_set<amd::CommandQueue*> activeQueues; //!< The set of active queues
static bool device_not_usable_; //!< If set, we should not launch any commands anymore.
private:
const Isa *isa_; //!< Device isa
bool IsTypeMatching(cl_device_type type, bool offlineDevices);
+45 -1
Просмотреть файл
@@ -559,6 +559,10 @@ bool Device::init() {
}
}
if (amd::IS_HIP) {
RegisterBackendErrorCb();
}
return true;
}
@@ -3392,6 +3396,41 @@ device::Signal* Device::createSignal() const {
return new roc::Signal();
}
// ================================================================================================
//! Handler for system events delivered by the HSA runtime.
//!
//! Logs which kind of event was received, then applies the abort policy: unless
//! HIP_SKIP_ABORT_ON_GPU_ERROR is set the host process is aborted; otherwise the
//! device is flagged as not usable and the handler returns normally.
//!
//! @param event  Event descriptor; only its event_type field is inspected here.
//! @param data   User data supplied at registration time (unused).
//! @return HSA_STATUS_SUCCESS whenever the function returns.
hsa_status_t Device::BackendErrorCallBackHandler(const hsa_amd_event_t* event, void* data) {
  const auto kind = event->event_type;
  if (kind == HSA_AMD_GPU_MEMORY_FAULT_EVENT) {
    LogError("Memory Fault Error");
  } else if (kind == HSA_AMD_GPU_HW_EXCEPTION_EVENT) {
    LogError("HW Exception Error");
  } else if (kind == HSA_AMD_GPU_MEMORY_ERROR_EVENT) {
    LogError("GPU Memory Error");
  } else {
    LogError("Unknown Event Type ");
  }

  // Default policy: any reported event terminates the host process.
  if (!HIP_SKIP_ABORT_ON_GPU_ERROR) {
    abort();
  }

  // Abort was suppressed: record that the device must not be used any further.
  device_not_usable_ = true;
  return HSA_STATUS_SUCCESS;
}
// ================================================================================================
void Device::RegisterBackendErrorCb() {
// Register ROCclr Error Callback
hsa_status_t hsa_error = HSA_STATUS_SUCCESS;
hsa_error = hsa_amd_register_system_event_handler(BackendErrorCallBackHandler, nullptr);
if (hsa_error != HSA_STATUS_SUCCESS) {
LogError("Cannot Register Call back event handler");
}
}
// ================================================================================================
amd::Memory* Device::GetArenaMemObj(const void* ptr, size_t& offset, size_t size) {
// Only create arena_mem_object if CPU memory is accessible from HMM
@@ -3559,7 +3598,12 @@ void callbackQueue(hsa_status_t status, hsa_queue_t* queue, void* data) {
"Callback: Queue %p aborting with error : %s code: 0x%x", queue->base_address,
errorMsg, status);
}
abort();
if (HIP_SKIP_ABORT_ON_GPU_ERROR) {
amd::Device::device_not_usable_ = true;
} else {
abort();
}
}
}
+7
Просмотреть файл
@@ -541,6 +541,10 @@ class Device : public NullDevice {
void getGlobalCUMask(std::string cuMaskStr);
//! System event handler handed to the HSA runtime: logs the event type, then either marks
//! the device as not usable or aborts, depending on HIP_SKIP_ABORT_ON_GPU_ERROR.
static hsa_status_t BackendErrorCallBackHandler(const hsa_amd_event_t* event, void* data);
//! Registers BackendErrorCallBackHandler through hsa_amd_register_system_event_handler.
static void RegisterBackendErrorCb();
virtual amd::Memory* GetArenaMemObj(const void* ptr, size_t& offset, size_t size = 0);
//! Returns the value stored in preferred_numa_node_.
const uint32_t getPreferredNumaNode() const {
  return preferred_numa_node_;
}
@@ -646,6 +650,9 @@ class Device : public NullDevice {
//! Code object to kernel info map (used in the crash dump analysis)
mutable std::map<uint64_t, Kernel&> kernel_map_;
//! Friend function callbackQueue can access and set device class variables.
friend void callbackQueue(hsa_status_t status, hsa_queue_t* queue, void* data);
public:
std::atomic<uint> numOfVgpus_; //!< Virtual gpu unique index
+30 -16
Просмотреть файл
@@ -23,6 +23,7 @@
#include "platform/commandqueue.hpp"
#include "rocdefs.hpp"
#include "rocdevice.hpp"
#include "utils/flags.hpp"
#include "utils/util.hpp"
#include "hsa/hsa.h"
#include "hsa/hsa_ext_image.h"
@@ -46,26 +47,39 @@ constexpr static hsa_signal_value_t kInitSignalValueOne = 1;
constexpr static uint64_t kTimeout100us = 100 * K;
constexpr static uint64_t kUnlimitedWait = std::numeric_limits<uint64_t>::max();
constexpr static uint64_t kTimeout4Secs = 4 * M;
inline bool WaitForSignal(hsa_signal_t signal, bool active_wait = false) {
hsa_wait_state_t wait_state = HSA_WAIT_STATE_BLOCKED;
if (active_wait) {
wait_state = HSA_WAIT_STATE_ACTIVE;
}
if (hsa_signal_load_relaxed(signal) > 0) {
uint64_t timeout = kTimeout100us;
if (active_wait) {
timeout = kUnlimitedWait;
// When it is blocked wait, we wait in active state for 100 us before proceeding to wait in
// blocked state indefinitely.
if (!active_wait) {
ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Host active wait for Signal = (0x%lx) for %d ns",
signal.handle, kTimeout100us);
if (hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, kInitSignalValueOne,
kTimeout100us, HSA_WAIT_STATE_ACTIVE) != 0) {
if (HIP_SKIP_ABORT_ON_GPU_ERROR && amd::Device::IsDeviceNotUsable()) {
ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Device not Stable, while waiting for Signal ="
"(0x%lx) for %d ns", signal.handle, kTimeout100us);
return true;
}
}
}
ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Host active wait for Signal = (0x%lx) for %d ns",
signal.handle, timeout);
// Active wait with a timeout
if (hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, kInitSignalValueOne,
timeout, HSA_WAIT_STATE_ACTIVE) != 0) {
ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Host blocked wait for Signal = (0x%lx)",
signal.handle);
// Wait until the completion with CPU suspend
if (hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, kInitSignalValueOne,
kUnlimitedWait, HSA_WAIT_STATE_BLOCKED) != 0) {
return false;
// This is unlimited wait, but we wait for 4 secs and check if the device is
// unstable, if so we return, otherwise we continue to wait in the while loop.
while (hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, kInitSignalValueOne,
kTimeout4Secs, wait_state) != 0) {
if (HIP_SKIP_ABORT_ON_GPU_ERROR && amd::Device::IsDeviceNotUsable()) {
ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Device not Stable, while waiting for Signal ="
"(0x%lx) for %d ns", signal.handle, kTimeout4Secs);
return true;
}
}
}
+2
Просмотреть файл
@@ -278,6 +278,8 @@ release(bool, DEBUG_HIP_DYNAMIC_QUEUES, true, \
release(uint, DEBUG_HIP_7_PREVIEW, 0, \
"Enables specific backward incompatible changes support before 7.0," \
"using the mask. By default the changes are disabled and is set to 0")\
release(uint, HIP_SKIP_ABORT_ON_GPU_ERROR, false, \
"Set this to true, to avoid host side abort for GPU errors") \
release(bool, HIP_FORCE_SPIRV_CODEOBJECT, false, \
"Force use of SPIRV instead of device specific code object.") \