From ce2e5eba6bb3be0a4edc21fc73f6d36ccf8a4c04 Mon Sep 17 00:00:00 2001 From: German Andryeyev Date: Thu, 21 Jan 2021 17:29:34 -0500 Subject: [PATCH] SWDEV-257787 - Reset active signal if ROCR call failed - If ROCR fails the call for some reason, the signal becomes invalid and a wait on it can hang. The logic resets the active signal in such cases. Change-Id: Ia131420200f1bbd7c9a162b8f1b06db8cecf41c6 --- rocclr/device/rocm/rocblit.cpp | 4 ++++ rocclr/device/rocm/rocvirtual.cpp | 10 ++++++++++ rocclr/device/rocm/rocvirtual.hpp | 4 ++++ 3 files changed, 18 insertions(+) diff --git a/rocclr/device/rocm/rocblit.cpp b/rocclr/device/rocm/rocblit.cpp index f0fabd64ce..e16b40ec37 100644 --- a/rocclr/device/rocm/rocblit.cpp +++ b/rocclr/device/rocm/rocblit.cpp @@ -450,6 +450,7 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d hsa_status_t status = hsa_amd_memory_async_copy_rect(&dstMem, &offset, &srcMem, &offset, &dim, agent, direction, num_wait_events, wait_event, active); if (status != HSA_STATUS_SUCCESS) { + gpu().Barriers().ResetCurrentSignal(); LogPrintfError("DMA buffer failed with code %d", status); return false; } @@ -473,6 +474,7 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d size[0], num_wait_events, wait_event, active); gpu().setLastCommandSDMA(true) ; if (status != HSA_STATUS_SUCCESS) { + gpu().Barriers().ResetCurrentSignal(); LogPrintfError("DMA buffer failed with code %d", status); return false; } @@ -656,6 +658,7 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory, if (status == HSA_STATUS_SUCCESS) { gpu().addSystemScope(); } else { + gpu().Barriers().ResetCurrentSignal(); LogPrintfError("Hsa copy from host to device failed with code %d", status); } @@ -702,6 +705,7 @@ bool DmaBlitManager::hsaCopyStaged(const_address hostSrc, address hostDst, size_ srcAgent, size, 0, nullptr, active); gpu().setLastCommandSDMA(true); if (status != HSA_STATUS_SUCCESS) { + 
gpu().Barriers().ResetCurrentSignal(); LogPrintfError("Hsa copy from host to device failed with code %d", status); return false; } diff --git a/rocclr/device/rocm/rocvirtual.cpp b/rocclr/device/rocm/rocvirtual.cpp index 65b5d83fb9..7340d99e70 100644 --- a/rocclr/device/rocm/rocvirtual.cpp +++ b/rocclr/device/rocm/rocvirtual.cpp @@ -205,6 +205,15 @@ void VirtualGPU::MemoryDependency::clear(bool all) { } } +// ================================================================================================ +void VirtualGPU::HwQueueTracker::ResetCurrentSignal() { + // Reset the signal and return + hsa_signal_silent_store_relaxed(signal_list_[current_id_]->signal_, 0); + // Fallback to the previous signal + current_id_ = (current_id_ == 0) ? (signal_list_.size() - 1) : (current_id_ - 1); +} + +// ================================================================================================ bool VirtualGPU::processMemObjects(const amd::Kernel& kernel, const_address params, size_t& ldsAddress, bool cooperativeGroups) { Kernel& hsaKernel = const_cast(static_cast(*(kernel.getDeviceKernel(dev())))); @@ -1200,6 +1209,7 @@ void VirtualGPU::submitSvmPrefetchAsync(amd::SvmPrefetchAsyncCommand& cmd) { // Wait for the prefetch. Should skip wait, but may require extra tracking for kernel execution. if ((status != HSA_STATUS_SUCCESS) || !Barriers().WaitCurrent()) { + Barriers().ResetCurrentSignal(); LogError("hsa_amd_svm_prefetch_async failed"); cmd.setStatus(CL_INVALID_OPERATION); } diff --git a/rocclr/device/rocm/rocvirtual.hpp b/rocclr/device/rocm/rocvirtual.hpp index 1c5146e93a..a989b389f7 100644 --- a/rocclr/device/rocm/rocvirtual.hpp +++ b/rocclr/device/rocm/rocvirtual.hpp @@ -258,6 +258,9 @@ class VirtualGPU : public device::VirtualDevice { return hsa_signal_t{}; } + //! Resets current signal back to the previous one. It's necessary in a case of ROCr failure. + void ResetCurrentSignal(); + private: //! 
Wait for the next active signal void WaitNext() { @@ -282,6 +285,7 @@ class VirtualGPU : public device::VirtualDevice { } return true; } + std::vector signal_list_; //!< The pool of all signals for processing size_t current_id_ = 0; //!< Last submitted signal hsa_agent_t agent_; //!< HSA device agent