SWDEV-531711 - Report correct error code based on device failure. (#286)

[ROCm/clr commit: f5b8db33f1]
This commit is contained in:
Jayaprakash, Karthik
2025-05-17 06:33:13 -04:00
committed by GitHub
parent dc39d67017
commit 4ea2d9a5ee
8 changed files with 112 additions and 20 deletions
+12 -7
View File
@@ -140,19 +140,23 @@ const char* ihipGetErrorName(hipError_t hip_error);
// This macro should be called at the beginning of every HIP API.
// Expansion order:
//   1. If amd::Device::IsGPUInError() is true, HIP_RETURN is invoked with the value of
//      amd::Device::GetGPUError() translated by ConvertCLErrorIntoHIPError().
//   2. HIP_INIT_API_INTERNAL(0, cid, ...) is expanded.
//   3. If hip::g_devices is empty, HIP_RETURN(hipErrorNoDevice) is invoked.
// NOTE(review): this listing shows two closing lines for the last `if` (`}` and `} \`);
// they look like the before/after of one edited line -- confirm only one is in the file.
#define HIP_INIT_API(cid, ...) \
if (amd::Device::IsGPUInError()) { \
HIP_RETURN(ConvertCLErrorIntoHIPError(amd::Device::GetGPUError())); \
} \
HIP_INIT_API_INTERNAL(0, cid, __VA_ARGS__) \
if (hip::g_devices.size() == 0) { \
HIP_RETURN(hipErrorNoDevice); \
}
} \
// Variant of HIP_INIT_API that only expands HIP_INIT_API_INTERNAL (first argument 1 instead
// of 0); it adds neither the GPU-error check nor the empty-device-list check.
#define HIP_INIT_API_NO_RETURN(cid, ...) \
HIP_INIT_API_INTERNAL(1, cid, __VA_ARGS__)
#define HIP_RETURN_DURATION(ret, ...) \
hip::tls.last_command_error_ = ret; \
if (amd::Device::IsDeviceNotUsable()) { \
hip::tls.last_error_ = hipErrorNoDevice; \
hip::tls.last_command_error_ = hipErrorNoDevice; \
if (amd::Device::IsGPUInError()) { \
hipError_t hip_error = ConvertCLErrorIntoHIPError(amd::Device::GetGPUError()); \
hip::tls.last_error_ = hip_error; \
hip::tls.last_command_error_ = hip_error; \
} else if (DEBUG_HIP_7_PREVIEW & amd::CHANGE_HIP_GET_LAST_ERROR) { \
if (hip::tls.last_command_error_ != hipSuccess && \
hip::tls.last_command_error_ != hipErrorNotReady) { \
@@ -168,9 +172,10 @@ const char* ihipGetErrorName(hipError_t hip_error);
#define HIP_RETURN(ret, ...) \
hip::tls.last_command_error_ = ret; \
if (amd::Device::IsDeviceNotUsable()) { \
hip::tls.last_error_ = hipErrorNoDevice; \
hip::tls.last_command_error_ = hipErrorNoDevice; \
if (amd::Device::IsGPUInError()) { \
hipError_t hip_error = ConvertCLErrorIntoHIPError(amd::Device::GetGPUError()); \
hip::tls.last_error_ = hip_error; \
hip::tls.last_command_error_ = hip_error; \
} else if (DEBUG_HIP_7_PREVIEW & amd::CHANGE_HIP_GET_LAST_ERROR) { \
if (hip::tls.last_command_error_ != hipSuccess && \
hip::tls.last_command_error_ != hipErrorNotReady) { \
+11 -7
View File
@@ -24,6 +24,7 @@
#include "hip_internal.hpp"
#include "platform/program.hpp"
#include "platform/runtime.hpp"
#include "utils/flags.hpp"
#include <unordered_map>
#include <mutex>
@@ -185,13 +186,16 @@ void __hipRegisterTexture(
// Unregisters the fat binary described by `modules`.
// Every device in g_devices gets SyncAllStreams(true) at most once (std::call_once on a
// function-local static flag); afterwards PlatformState::instance().removeFatBinary(modules)
// is called and guarantee() checks that it returned hipSuccess.
// NOTE(review): two std::call_once invocations on the same flag appear below -- an
// unconditional one and one guarded by HIP_SKIP_ABORT_ON_GPU_ERROR / IsGPUInError().
// As listed, the first always runs, so the guard can never skip the sync; this looks like
// the before/after of a single edit shown together -- confirm which one the file really has.
void __hipUnregisterFatBinary(hip::FatBinaryInfo** modules) {
static std::once_flag unregister_device_sync;
std::call_once(unregister_device_sync, [](){
for (auto& hipDevice : g_devices) {
// By synchronizing devices ensure that all HSA signal handlers
// complete before removeFatBinary
hipDevice->SyncAllStreams(true);
}
});
// If SKIP ABORT is set and the GPU is in error, we don't need to sync streams.
if (!HIP_SKIP_ABORT_ON_GPU_ERROR || !amd::Device::IsGPUInError()) {
std::call_once(unregister_device_sync, [](){
for (auto& hipDevice : g_devices) {
// By synchronizing devices ensure that all HSA signal handlers
// complete before removeFatBinary
hipDevice->SyncAllStreams(true);
}
});
}
hipError_t err = PlatformState::instance().removeFatBinary(modules);
guarantee((err == hipSuccess), "Cannot Unregister Fat Binary, error:%d", err);
}
+32
View File
@@ -245,3 +245,35 @@ inline std::string ToString(T first, Args... args) {
return ToString(first) + ", " + ToString(args...);
}
// Translates a CL error code (for example the value returned by
// amd::Device::GetGPUError()) into the HIP error reported to the application.
//
// cl_error: CL status code to translate.
// Returns:  hipSuccess for CL_SUCCESS, the dedicated hipError_t for each code listed
//           below, and hipErrorUnknown for CL_DEVICE_NOT_AVAILABLE or any unlisted code.
inline hipError_t ConvertCLErrorIntoHIPError(cl_int cl_error) {
  switch (cl_error) {
    case CL_SUCCESS:
      // "No error" must stay "no error"; without this case it fell into `default`
      // and was reported as hipErrorUnknown.
      return hipSuccess;
    case CL_INVALID_OPERATION:
    case CL_BUILD_PROGRAM_FAILURE:
      return hipErrorLaunchFailure;
    case CL_MEM_OBJECT_ALLOCATION_FAILURE:
    case CL_INVALID_MEM_OBJECT:
      return hipErrorIllegalAddress;
    case CL_INVALID_PROGRAM:
      return hipErrorInvalidSource;
    case CL_INVALID_ARG_VALUE:
      return hipErrorInvalidValue;
    case CL_INVALID_KERNEL:
      return hipErrorInvalidKernelFile;
    case CL_DEVICE_NOT_AVAILABLE:
    default:
      // Anything without a dedicated HIP equivalent.
      return hipErrorUnknown;
  }
}
+1 -1
View File
@@ -336,7 +336,7 @@ Context* Device::glb_ctx_ = nullptr;
Monitor Device::p2p_stage_ops_(true);
Memory* Device::p2p_stage_ = nullptr;
bool Device::device_not_usable_ = false;
// Starts out as CL_SUCCESS, i.e. no GPU error has been recorded yet.
cl_int Device::gpu_error_ = CL_SUCCESS;
std::shared_mutex MemObjMap::AllocatedLock_ ROCCLR_INIT_PRIORITY(101);
std::map<uintptr_t, amd::Memory*> MemObjMap::MemObjMap_ ROCCLR_INIT_PRIORITY(101);
+3 -1
View File
@@ -2180,7 +2180,8 @@ class Device : public RuntimeObject {
#endif
#endif
//! Returns the current value of the device_not_usable_ flag
static bool IsDeviceNotUsable() { return device_not_usable_; }
//! Returns true when gpu_error_ holds anything other than CL_SUCCESS
static bool IsGPUInError() { return (gpu_error_ != CL_SUCCESS); }
//! Returns the CL error code currently stored in gpu_error_
static cl_int GetGPUError() { return gpu_error_; }
protected:
//! Enable the specified extension
@@ -2216,6 +2217,7 @@ class Device : public RuntimeObject {
uint64_t initial_heap_size_{HIP_INITIAL_DM_SIZE}; //!< Initial device heap size
amd::Monitor activeQueuesLock_ {}; //!< Guards access to the activeQueues set
std::unordered_set<amd::CommandQueue*> activeQueues; //!< The set of active queues
// NOTE(review): plain static with no visible synchronization. It is assigned in
// BackendErrorCallBackHandler() and callbackQueue() and polled through IsGPUInError()
// inside WaitForSignal()'s wait loop. If writers and readers run on different threads this
// is a data race -- consider std::atomic<cl_int>. TODO confirm the threading model.
static cl_int gpu_error_; //!< Store the GPU error cause during kernel launch
private:
const Isa *isa_; //!< Device isa
@@ -18,6 +18,7 @@
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#include "cl.h"
#ifndef WITHOUT_HSA_BACKEND
#include "platform/program.hpp"
@@ -3395,23 +3396,28 @@ device::Signal* Device::createSignal() const {
// ================================================================================================
hsa_status_t Device::BackendErrorCallBackHandler(const hsa_amd_event_t* event, void* data) {
cl_int gpu_error = CL_SUCCESS;
switch (event->event_type) {
case HSA_AMD_GPU_MEMORY_FAULT_EVENT:
gpu_error = CL_INVALID_MEM_OBJECT;
LogError("Memory Fault Error");
break;
case HSA_AMD_GPU_HW_EXCEPTION_EVENT:
gpu_error = CL_INVALID_OPERATION;
LogError("HW Exception Error");
break;
case HSA_AMD_GPU_MEMORY_ERROR_EVENT:
gpu_error = CL_DEVICE_NOT_AVAILABLE;
LogError("GPU Memory Error");
break;
default:
gpu_error = CL_DEVICE_NOT_AVAILABLE;
LogError("Unknown Event Type ");
break;
}
if (HIP_SKIP_ABORT_ON_GPU_ERROR) {
device_not_usable_ = true;
gpu_error_ = gpu_error;
} else {
abort();
}
@@ -3567,6 +3573,48 @@ ProfilingSignal::~ProfilingSignal() {
}
}
// ================================================================================================
// Maps an HSA status code onto a CL error code.
// Statuses that share a CL code are grouped under one return. Every status without a
// dedicated case (HSA_STATUS_ERROR included) yields CL_DEVICE_NOT_AVAILABLE.
cl_int ConvertHSAErrorIntoCLError(hsa_status_t hsa_status) {
  switch (hsa_status) {
    case HSA_STATUS_ERROR_EXCEPTION:
    case HSA_STATUS_ERROR_INVALID_PACKET_FORMAT:
      return CL_INVALID_OPERATION;

    case HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS:
    case HSA_STATUS_ERROR_INVALID_ARGUMENT:
      return CL_INVALID_ARG_VALUE;

    case HSA_STATUS_ERROR_INVALID_ALLOCATION:
      return CL_MEM_OBJECT_ALLOCATION_FAILURE;

    case HSA_STATUS_ERROR_INVALID_CODE_OBJECT:
      return CL_INVALID_PROGRAM;

    case HSA_STATUS_ERROR_INVALID_ISA:
      return CL_INVALID_KERNEL;

    // The three statuses below are cast to hsa_status_t before being used as case labels.
    case static_cast<hsa_status_t>(HSA_STATUS_ERROR_ILLEGAL_INSTRUCTION):
      return CL_BUILD_PROGRAM_FAILURE;

    case static_cast<hsa_status_t>(HSA_STATUS_ERROR_MEMORY_FAULT):
    case static_cast<hsa_status_t>(HSA_STATUS_ERROR_MEMORY_APERTURE_VIOLATION):
      return CL_INVALID_MEM_OBJECT;

    case HSA_STATUS_ERROR:
    default:
      return CL_DEVICE_NOT_AVAILABLE;
  }
}
// ================================================================================================
void callbackQueue(hsa_status_t status, hsa_queue_t* queue, void* data) {
if (status != HSA_STATUS_SUCCESS && status != HSA_STATUS_INFO_BREAK) {
@@ -3597,7 +3645,7 @@ void callbackQueue(hsa_status_t status, hsa_queue_t* queue, void* data) {
}
if (HIP_SKIP_ABORT_ON_GPU_ERROR) {
amd::Device::device_not_usable_ = true;
amd::Device::gpu_error_ = ConvertHSAErrorIntoCLError(status);
} else {
abort();
}
@@ -64,7 +64,7 @@ inline bool WaitForSignal(hsa_signal_t signal, bool active_wait = false) {
signal.handle, kTimeout100us);
if (hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, kInitSignalValueOne,
kTimeout100us, HSA_WAIT_STATE_ACTIVE) != 0) {
if (HIP_SKIP_ABORT_ON_GPU_ERROR && amd::Device::IsDeviceNotUsable()) {
if (HIP_SKIP_ABORT_ON_GPU_ERROR && amd::Device::IsGPUInError()) {
ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Device not Stable, while waiting for Signal ="
"(0x%lx) for %d ns", signal.handle, kTimeout100us);
return true;
@@ -76,7 +76,7 @@ inline bool WaitForSignal(hsa_signal_t signal, bool active_wait = false) {
// unstable, if so we return, otherwise we continue to wait in the while loop.
while (hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, kInitSignalValueOne,
kTimeout4Secs, wait_state) != 0) {
if (HIP_SKIP_ABORT_ON_GPU_ERROR && amd::Device::IsDeviceNotUsable()) {
if (HIP_SKIP_ABORT_ON_GPU_ERROR && amd::Device::IsGPUInError()) {
ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Device not Stable, while waiting for Signal ="
"(0x%lx) for %d ns", signal.handle, kTimeout4Secs);
return true;
@@ -22,6 +22,7 @@
#include "thread/monitor.hpp"
#include "device/device.hpp"
#include "platform/context.hpp"
#include "utils/flags.hpp"
/*!
* \file commandQueue.cpp