diff --git a/hipamd/src/hip_internal.hpp b/hipamd/src/hip_internal.hpp
index 86cad9597c..25687fc7e7 100644
--- a/hipamd/src/hip_internal.hpp
+++ b/hipamd/src/hip_internal.hpp
@@ -140,19 +140,23 @@ const char* ihipGetErrorName(hipError_t hip_error);
 
 // This macro should be called at the beginning of every HIP API.
 #define HIP_INIT_API(cid, ...) \
+  if (amd::Device::IsGPUInError()) { \
+    HIP_RETURN(ConvertCLErrorIntoHIPError(amd::Device::GetGPUError())); \
+  } \
   HIP_INIT_API_INTERNAL(0, cid, __VA_ARGS__) \
   if (hip::g_devices.size() == 0) { \
     HIP_RETURN(hipErrorNoDevice); \
   }
 
 #define HIP_INIT_API_NO_RETURN(cid, ...) \
   HIP_INIT_API_INTERNAL(1, cid, __VA_ARGS__)
 
 #define HIP_RETURN_DURATION(ret, ...) \
   hip::tls.last_command_error_ = ret; \
-  if (amd::Device::IsDeviceNotUsable()) { \
-    hip::tls.last_error_ = hipErrorNoDevice; \
-    hip::tls.last_command_error_ = hipErrorNoDevice; \
+  if (amd::Device::IsGPUInError()) { \
+    hipError_t hip_error = ConvertCLErrorIntoHIPError(amd::Device::GetGPUError()); \
+    hip::tls.last_error_ = hip_error; \
+    hip::tls.last_command_error_ = hip_error; \
   } else if (DEBUG_HIP_7_PREVIEW & amd::CHANGE_HIP_GET_LAST_ERROR) { \
     if (hip::tls.last_command_error_ != hipSuccess && \
         hip::tls.last_command_error_ != hipErrorNotReady) { \
@@ -168,9 +172,10 @@ const char* ihipGetErrorName(hipError_t hip_error);
 
 #define HIP_RETURN(ret, ...) \
   hip::tls.last_command_error_ = ret; \
-  if (amd::Device::IsDeviceNotUsable()) { \
-    hip::tls.last_error_ = hipErrorNoDevice; \
-    hip::tls.last_command_error_ = hipErrorNoDevice; \
+  if (amd::Device::IsGPUInError()) { \
+    hipError_t hip_error = ConvertCLErrorIntoHIPError(amd::Device::GetGPUError()); \
+    hip::tls.last_error_ = hip_error; \
+    hip::tls.last_command_error_ = hip_error; \
   } else if (DEBUG_HIP_7_PREVIEW & amd::CHANGE_HIP_GET_LAST_ERROR) { \
     if (hip::tls.last_command_error_ != hipSuccess && \
         hip::tls.last_command_error_ != hipErrorNotReady) { \
diff --git a/hipamd/src/hip_platform.cpp b/hipamd/src/hip_platform.cpp
index ae50e4ce09..da290fd5ed 100644
--- a/hipamd/src/hip_platform.cpp
+++ b/hipamd/src/hip_platform.cpp
@@ -24,6 +24,7 @@
 #include "hip_internal.hpp"
 #include "platform/program.hpp"
 #include "platform/runtime.hpp"
+#include "utils/flags.hpp"
 
 #include
 #include
@@ -185,13 +186,16 @@ void __hipRegisterTexture(
 
 void __hipUnregisterFatBinary(hip::FatBinaryInfo** modules) {
   static std::once_flag unregister_device_sync;
-  std::call_once(unregister_device_sync, [](){
-    for (auto& hipDevice : g_devices) {
-      // By synchronizing devices ensure that all HSA signal handlers
-      // complete before removeFatBinary
-      hipDevice->SyncAllStreams(true);
-    }
-  });
+  // If SKIP ABORT is set and GPU is in error, don't need to sync streams.
+  if (!HIP_SKIP_ABORT_ON_GPU_ERROR || !amd::Device::IsGPUInError()) {
+    std::call_once(unregister_device_sync, [](){
+      for (auto& hipDevice : g_devices) {
+        // By synchronizing devices ensure that all HSA signal handlers
+        // complete before removeFatBinary
+        hipDevice->SyncAllStreams(true);
+      }
+    });
+  }
   hipError_t err = PlatformState::instance().removeFatBinary(modules);
   guarantee((err == hipSuccess), "Cannot Unregister Fat Binary, error:%d", err);
 }
diff --git a/hipamd/src/trace_helper.h b/hipamd/src/trace_helper.h
index 9ae9c4ed4b..98097b27c1 100644
--- a/hipamd/src/trace_helper.h
+++ b/hipamd/src/trace_helper.h
@@ -245,3 +245,38 @@ inline std::string ToString(T first, Args... args) {
   return ToString(first) + ", " + ToString(args...);
 }
 
+
+// Translates the CL error code recorded for a GPU fault (see amd::Device::GetGPUError())
+// into the hipError_t handed back to HIP API callers. Codes without an explicit case,
+// including CL_DEVICE_NOT_AVAILABLE, are reported as hipErrorUnknown.
+inline hipError_t ConvertCLErrorIntoHIPError(cl_int cl_error) {
+  hipError_t hip_error = hipSuccess;
+  switch (cl_error) {
+    case CL_INVALID_OPERATION:
+      hip_error = hipErrorLaunchFailure;
+      break;
+    case CL_MEM_OBJECT_ALLOCATION_FAILURE:
+      hip_error = hipErrorIllegalAddress;
+      break;
+    case CL_INVALID_PROGRAM:
+      hip_error = hipErrorInvalidSource;
+      break;
+    case CL_INVALID_ARG_VALUE:
+      hip_error = hipErrorInvalidValue;
+      break;
+    case CL_INVALID_KERNEL:
+      hip_error = hipErrorInvalidKernelFile;
+      break;
+    case CL_BUILD_PROGRAM_FAILURE:
+      hip_error = hipErrorLaunchFailure;
+      break;
+    case CL_INVALID_MEM_OBJECT:
+      hip_error = hipErrorIllegalAddress;
+      break;
+    case CL_DEVICE_NOT_AVAILABLE:
+    default:
+      hip_error = hipErrorUnknown;
+      break;
+  }
+  return hip_error;
+}
diff --git a/rocclr/device/device.cpp b/rocclr/device/device.cpp
index a791aa46f3..2154535c11 100644
--- a/rocclr/device/device.cpp
+++ b/rocclr/device/device.cpp
@@ -336,7 +336,7 @@ Context* Device::glb_ctx_ = nullptr;
 
 Monitor Device::p2p_stage_ops_(true);
 Memory* Device::p2p_stage_ = nullptr;
-bool Device::device_not_usable_ = false;
+cl_int Device::gpu_error_ = CL_SUCCESS;
 
 std::shared_mutex MemObjMap::AllocatedLock_ ROCCLR_INIT_PRIORITY(101);
 std::map MemObjMap::MemObjMap_ ROCCLR_INIT_PRIORITY(101);
diff --git a/rocclr/device/device.hpp b/rocclr/device/device.hpp
index 7906faffa7..6372d571ad 100644
--- a/rocclr/device/device.hpp
+++ b/rocclr/device/device.hpp
@@ -2180,7 +2180,8 @@ class Device : public RuntimeObject {
 #endif
 #endif
 
-  static bool IsDeviceNotUsable() { return device_not_usable_; }
+  static bool IsGPUInError() { return (gpu_error_ != CL_SUCCESS); }  //!< True if an error is set
+  static cl_int GetGPUError() { return gpu_error_; }  //!< Recorded CL error, CL_SUCCESS if none
 
  protected:
   //! Enable the specified extension
@@ -2216,6 +2217,8 @@ class Device : public RuntimeObject {
   uint64_t initial_heap_size_{HIP_INITIAL_DM_SIZE}; //!< Initial device heap size
   amd::Monitor activeQueuesLock_ {}; //!< Guards access to the activeQueues set
   std::unordered_set activeQueues; //!< The set of active queues
+  //! NOTE(review): written by the error callbacks without visible locking - confirm atomicity.
+  static cl_int gpu_error_; //!< CL error set by the GPU error callbacks, CL_SUCCESS if none
 
  private:
   const Isa *isa_; //!< Device isa
diff --git a/rocclr/device/rocm/rocdevice.cpp b/rocclr/device/rocm/rocdevice.cpp
index 6cbf80f743..6a4a2566f6 100644
--- a/rocclr/device/rocm/rocdevice.cpp
+++ b/rocclr/device/rocm/rocdevice.cpp
@@ -18,6 +18,7 @@
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE. */
 
+#include "cl.h"
 #ifndef WITHOUT_HSA_BACKEND
 
 #include "platform/program.hpp"
@@ -3395,23 +3396,28 @@ device::Signal* Device::createSignal() const {
 
 // ================================================================================================
 hsa_status_t Device::BackendErrorCallBackHandler(const hsa_amd_event_t* event, void* data) {
+  cl_int gpu_error = CL_SUCCESS;
   switch (event->event_type) {
     case HSA_AMD_GPU_MEMORY_FAULT_EVENT:
+      gpu_error = CL_INVALID_MEM_OBJECT;
       LogError("Memory Fault Error");
       break;
     case HSA_AMD_GPU_HW_EXCEPTION_EVENT:
+      gpu_error = CL_INVALID_OPERATION;
       LogError("HW Exception Error");
       break;
     case HSA_AMD_GPU_MEMORY_ERROR_EVENT:
+      gpu_error = CL_DEVICE_NOT_AVAILABLE;
       LogError("GPU Memory Error");
       break;
     default:
+      gpu_error = CL_DEVICE_NOT_AVAILABLE;
       LogError("Unknown Event Type ");
       break;
   }
 
   if (HIP_SKIP_ABORT_ON_GPU_ERROR) {
-    device_not_usable_ = true;
+    gpu_error_ = gpu_error;
   } else {
     abort();
   }
@@ -3567,6 +3573,50 @@ ProfilingSignal::~ProfilingSignal() {
   }
 }
 
+// ================================================================================================
+// Maps an HSA status onto the CL error code that callbackQueue() stores in
+// amd::Device::gpu_error_; statuses without an explicit case become CL_DEVICE_NOT_AVAILABLE.
+cl_int ConvertHSAErrorIntoCLError(hsa_status_t hsa_status) {
+  cl_int cl_error = CL_SUCCESS;
+  switch (hsa_status) {
+    case HSA_STATUS_ERROR_EXCEPTION:
+      cl_error = CL_INVALID_OPERATION;
+      break;
+    case HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS:
+      cl_error = CL_INVALID_ARG_VALUE;
+      break;
+    case HSA_STATUS_ERROR_INVALID_ALLOCATION:
+      cl_error = CL_MEM_OBJECT_ALLOCATION_FAILURE;
+      break;
+    case HSA_STATUS_ERROR_INVALID_CODE_OBJECT:
+      cl_error = CL_INVALID_PROGRAM;
+      break;
+    case HSA_STATUS_ERROR_INVALID_PACKET_FORMAT:
+      cl_error = CL_INVALID_OPERATION;
+      break;
+    case HSA_STATUS_ERROR_INVALID_ARGUMENT:
+      cl_error = CL_INVALID_ARG_VALUE;
+      break;
+    case HSA_STATUS_ERROR_INVALID_ISA:
+      cl_error = CL_INVALID_KERNEL;
+      break;
+    case static_cast<hsa_status_t>(HSA_STATUS_ERROR_ILLEGAL_INSTRUCTION):
+      cl_error = CL_BUILD_PROGRAM_FAILURE;
+      break;
+    case static_cast<hsa_status_t>(HSA_STATUS_ERROR_MEMORY_FAULT):
+      cl_error = CL_INVALID_MEM_OBJECT;
+      break;
+    case static_cast<hsa_status_t>(HSA_STATUS_ERROR_MEMORY_APERTURE_VIOLATION):
+      cl_error = CL_INVALID_MEM_OBJECT;
+      break;
+    case HSA_STATUS_ERROR:
+    default:
+      cl_error = CL_DEVICE_NOT_AVAILABLE;
+      break;
+  }
+  return cl_error;
+}
+
 // ================================================================================================
 void callbackQueue(hsa_status_t status, hsa_queue_t* queue, void* data) {
   if (status != HSA_STATUS_SUCCESS && status != HSA_STATUS_INFO_BREAK) {
@@ -3597,7 +3647,7 @@ void callbackQueue(hsa_status_t status, hsa_queue_t* queue, void* data) {
     }
 
     if (HIP_SKIP_ABORT_ON_GPU_ERROR) {
-      amd::Device::device_not_usable_ = true;
+      amd::Device::gpu_error_ = ConvertHSAErrorIntoCLError(status);
     } else {
       abort();
     }
diff --git a/rocclr/device/rocm/rocvirtual.hpp b/rocclr/device/rocm/rocvirtual.hpp
index 83f80824ba..8ad16d3304 100644
--- a/rocclr/device/rocm/rocvirtual.hpp
+++ b/rocclr/device/rocm/rocvirtual.hpp
@@ -64,7 +64,7 @@ inline bool WaitForSignal(hsa_signal_t signal, bool active_wait = false) {
             signal.handle, kTimeout100us);
     if (hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, kInitSignalValueOne,
                                   kTimeout100us, HSA_WAIT_STATE_ACTIVE) != 0) {
-      if (HIP_SKIP_ABORT_ON_GPU_ERROR && amd::Device::IsDeviceNotUsable()) {
+      if (HIP_SKIP_ABORT_ON_GPU_ERROR && amd::Device::IsGPUInError()) {
         ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Device not Stable, while waiting for Signal ="
                 "(0x%lx) for %d ns", signal.handle, kTimeout100us);
         return true;
@@ -76,7 +76,7 @@ inline bool WaitForSignal(hsa_signal_t signal, bool active_wait = false) {
   // unstable, if so we return, otherwise we continue to wait in the while loop.
   while (hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, kInitSignalValueOne,
                                    kTimeout4Secs, wait_state) != 0) {
-    if (HIP_SKIP_ABORT_ON_GPU_ERROR && amd::Device::IsDeviceNotUsable()) {
+    if (HIP_SKIP_ABORT_ON_GPU_ERROR && amd::Device::IsGPUInError()) {
       ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Device not Stable, while waiting for Signal ="
               "(0x%lx) for %d ns", signal.handle, kTimeout4Secs);
       return true;
diff --git a/rocclr/platform/commandqueue.cpp b/rocclr/platform/commandqueue.cpp
index fafc89f472..1e150f08c7 100644
--- a/rocclr/platform/commandqueue.cpp
+++ b/rocclr/platform/commandqueue.cpp
@@ -22,6 +22,7 @@
 #include "thread/monitor.hpp"
 #include "device/device.hpp"
 #include "platform/context.hpp"
+#include "utils/flags.hpp"
 
 /*!
  * \file commandQueue.cpp