From c04e2300c8c8dea2e84fe08407451c05cd7dff6c Mon Sep 17 00:00:00 2001 From: German Date: Thu, 27 Oct 2022 18:22:38 -0400 Subject: [PATCH] SWDEV-363074 - Clean-up sync between SDMA and compute HIP can't rely on the resource tracking, used in OCL and requires different explicit sync. Make sure ROCCLR syncs compute only when SDMA is used and vise versa. The new logic will allow to enable CPDMA without unnecessary waits. Change-Id: Ib9d1788cfd5afa5ea2fec4c96a37d8b9c4d0059d [ROCm/clr commit: ff6b4db70b8dd1984c60dbcaca54d87d58c4d6be] --- projects/clr/rocclr/device/pal/palblit.cpp | 41 +++++++------------ .../clr/rocclr/device/pal/palresource.cpp | 1 + projects/clr/rocclr/device/pal/palvirtual.cpp | 22 +++++----- projects/clr/rocclr/device/pal/palvirtual.hpp | 25 +++++++---- 4 files changed, 42 insertions(+), 47 deletions(-) diff --git a/projects/clr/rocclr/device/pal/palblit.cpp b/projects/clr/rocclr/device/pal/palblit.cpp index 52c85d7166..02516d9f0d 100644 --- a/projects/clr/rocclr/device/pal/palblit.cpp +++ b/projects/clr/rocclr/device/pal/palblit.cpp @@ -47,8 +47,6 @@ inline Memory& DmaBlitManager::gpuMem(device::Memory& mem) const { bool DmaBlitManager::readMemoryStaged(Memory& srcMemory, void* dstHost, Memory** xferBuf, size_t origin, size_t& offset, size_t& totalSize, size_t xferSize) const { - gpu().releaseGpuMemoryFence(); - amd::Coord3D dst(0, 0, 0); size_t tmpSize; uint idxWrite = 0; @@ -120,11 +118,10 @@ bool DmaBlitManager::readMemoryStaged(Memory& srcMemory, void* dstHost, Memory** bool DmaBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost, const amd::Coord3D& origin, const amd::Coord3D& size, bool entire) const { - gpu().releaseGpuMemoryFence(); - // Use host copy if memory has direct access if (setup_.disableReadBuffer_ || (gpuMem(srcMemory).isHostMemDirectAccess() && gpuMem(srcMemory).isCacheable())) { + gpu().releaseGpuMemoryFence(); return HostBlitManager::readBuffer(srcMemory, dstHost, origin, size, entire); } else { size_t srcSize = size[0]; @@ -210,11 +207,10 @@ bool DmaBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost, bool DmaBlitManager::readBufferRect(device::Memory& srcMemory, void* dstHost, const amd::BufferRect& bufRect, const amd::BufferRect& hostRect, const amd::Coord3D& size, bool entire) const { - gpu().releaseGpuMemoryFence(); - // Use host copy if memory has direct access if (setup_.disableReadBufferRect_ || (gpuMem(srcMemory).isHostMemDirectAccess() && gpuMem(srcMemory).isCacheable())) { + gpu().releaseGpuMemoryFence(); return HostBlitManager::readBufferRect(srcMemory, dstHost, bufRect, hostRect, size, entire); } else { Memory& xferBuf = dev().xferRead().acquire(); @@ -293,7 +289,7 @@ bool DmaBlitManager::writeMemoryStaged(const void* srcHost, Memory& dstMemory, M } else { chunkSize = std::min(amd::alignUp(xferSize / 4, 256), gpu().xferWrite().MaxSize()); chunkSize = std::max(chunkSize, 64 * Ki); - flushDMA = true; + flushDMA = (xferSize > chunkSize); } size_t srcOffset = 0; uint32_t flags = Resource::NoWait; @@ -332,13 +328,12 @@ bool DmaBlitManager::writeMemoryStaged(const void* srcHost, Memory& dstMemory, M bool DmaBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemory, const amd::Coord3D& origin, const amd::Coord3D& size, bool entire) const { - gpu().releaseGpuMemoryFence(); - // Use host copy if memory has direct access or it's persistent if (setup_.disableWriteBuffer_ || (gpuMem(dstMemory).isHostMemDirectAccess() && (gpuMem(dstMemory).memoryType() != Resource::ExternalPhysical)) || gpuMem(dstMemory).isPersistentDirectMap()) { + gpu().releaseGpuMemoryFence(); return HostBlitManager::writeBuffer(srcHost, dstMemory, origin, size, entire); } else { size_t dstSize = size[0]; @@ -401,12 +396,12 @@ bool DmaBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemory, } } - if (dstSize != 0) { - Memory& xferBuf = gpu().xferWrite().Acquire(dstSize); - + while (dstSize > 0) { + auto xfer_size = std::min(dstSize, gpu().xferWrite().MaxSize()); + Memory& xferBuf = gpu().xferWrite().Acquire(xfer_size); // Write memory using a staged resource if (!writeMemoryStaged(srcHost, gpuMem(dstMemory), xferBuf, origin[0], offset, dstSize, - dstSize)) { + xfer_size)) { LogError("DmaBlitManager::writeBuffer failed!"); return false; } @@ -422,13 +417,12 @@ bool DmaBlitManager::writeBufferRect(const void* srcHost, device::Memory& dstMem const amd::BufferRect& hostRect, const amd::BufferRect& bufRect, const amd::Coord3D& size, bool entire) const { - gpu().releaseGpuMemoryFence(); - // Use host copy if memory has direct access or it's persistent if (setup_.disableWriteBufferRect_ || (dstMemory.isHostMemDirectAccess() && (gpuMem(dstMemory).memoryType() != Resource::ExternalPhysical)) || gpuMem(dstMemory).isPersistentDirectMap()) { + gpu().releaseGpuMemoryFence(); return HostBlitManager::writeBufferRect(srcHost, dstMemory, hostRect, bufRect, size, entire); } else { Memory& xferBuf = gpu().xferWrite().Acquire(std::min(gpu().xferWrite().MaxSize(), size[0])); @@ -481,7 +475,6 @@ bool DmaBlitManager::writeImage(const void* srcHost, device::Memory& dstMemory, const amd::Coord3D& origin, const amd::Coord3D& size, size_t rowPitch, size_t slicePitch, bool entire) const { gpu().releaseGpuMemoryFence(); - if (setup_.disableWriteImage_) { return HostBlitManager::writeImage(srcHost, dstMemory, origin, size, rowPitch, slicePitch, entire); @@ -497,11 +490,10 @@ bool DmaBlitManager::writeImage(const void* srcHost, device::Memory& dstMemory, bool DmaBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& dstMemory, const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin, const amd::Coord3D& size, bool entire) const { - gpu().releaseGpuMemoryFence(); - if (setup_.disableCopyBuffer_ || (gpuMem(srcMemory).isHostMemDirectAccess() && gpuMem(srcMemory).isCacheable() && !dev().settings().apuSystem_ && gpuMem(dstMemory).isHostMemDirectAccess())) { + gpu().releaseGpuMemoryFence(); return HostBlitManager::copyBuffer(srcMemory, dstMemory, srcOrigin, dstOrigin, size); } else { return gpuMem(srcMemory).partialMemCopyTo(gpu(), srcOrigin, dstOrigin, size, gpuMem(dstMemory)); @@ -513,11 +505,10 @@ bool DmaBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& dstMe bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& dstMemory, const amd::BufferRect& srcRect, const amd::BufferRect& dstRect, const amd::Coord3D& size, bool entire) const { - gpu().releaseGpuMemoryFence(); - if (setup_.disableCopyBufferRect_ || (gpuMem(srcMemory).isHostMemDirectAccess() && gpuMem(srcMemory).isCacheable() && gpuMem(dstMemory).isHostMemDirectAccess())) { + gpu().releaseGpuMemoryFence(); return HostBlitManager::copyBufferRect(srcMemory, dstMemory, srcRect, dstRect, size, entire); } else { size_t srcOffset; @@ -591,10 +582,8 @@ bool DmaBlitManager::copyImageToBuffer(device::Memory& srcMemory, device::Memory const amd::Coord3D& size, bool entire, size_t rowPitch, size_t slicePitch) const { bool result = false; - gpu().releaseGpuMemoryFence(); - - if (setup_.disableCopyImageToBuffer_) { + gpu().releaseGpuMemoryFence(); result = HostBlitManager::copyImageToBuffer(srcMemory, dstMemory, srcOrigin, dstOrigin, size, entire, rowPitch, slicePitch); } else { @@ -604,6 +593,7 @@ bool DmaBlitManager::copyImageToBuffer(device::Memory& srcMemory, device::Memory // Check if a HostBlit transfer is required if (completeOperation_ && !result) { + gpu().releaseGpuMemoryFence(); result = HostBlitManager::copyImageToBuffer(srcMemory, dstMemory, srcOrigin, dstOrigin, size, entire, rowPitch, slicePitch); } @@ -617,10 +607,8 @@ bool DmaBlitManager::copyBufferToImage(device::Memory& srcMemory, device::Memory const amd::Coord3D& size, bool entire, size_t rowPitch, size_t slicePitch) const { bool result = false; - gpu().releaseGpuMemoryFence(); - - if (setup_.disableCopyBufferToImage_) { + gpu().releaseGpuMemoryFence(); result = HostBlitManager::copyBufferToImage(srcMemory, dstMemory, srcOrigin, dstOrigin, size, entire, rowPitch, slicePitch); } else { @@ -630,6 +618,7 @@ bool DmaBlitManager::copyBufferToImage(device::Memory& srcMemory, device::Memory // Check if a HostBlit transfer is required if (completeOperation_ && !result) { + gpu().releaseGpuMemoryFence(); result = HostBlitManager::copyBufferToImage(srcMemory, dstMemory, srcOrigin, dstOrigin, size, entire, rowPitch, slicePitch); } diff --git a/projects/clr/rocclr/device/pal/palresource.cpp b/projects/clr/rocclr/device/pal/palresource.cpp index 9f77ff70c1..f8b3d4c6f5 100644 --- a/projects/clr/rocclr/device/pal/palresource.cpp +++ b/projects/clr/rocclr/device/pal/palresource.cpp @@ -1430,6 +1430,7 @@ bool Resource::partialMemCopyTo(VirtualGPU& gpu, const amd::Coord3D& srcOrigin, // Make sure compute is done before CP DMA start gpu.addBarrier(RgpSqqtBarrierReason::MemDependency); } else { + gpu.releaseGpuMemoryFence(); gpu.engineID_ = SdmaEngine; } diff --git a/projects/clr/rocclr/device/pal/palvirtual.cpp b/projects/clr/rocclr/device/pal/palvirtual.cpp index d2827e40b8..dc68a82dca 100644 --- a/projects/clr/rocclr/device/pal/palvirtual.cpp +++ b/projects/clr/rocclr/device/pal/palvirtual.cpp @@ -61,7 +61,8 @@ uint32_t VirtualGPU::Queue::AllocedQueues(const VirtualGPU& gpu, Pal::EngineType return allocedQueues; } -VirtualGPU::Queue* VirtualGPU::Queue::Create(const VirtualGPU& gpu, Pal::QueueType queueType, +// ================================================================================================ +VirtualGPU::Queue* VirtualGPU::Queue::Create(VirtualGPU& gpu, Pal::QueueType queueType, uint engineIdx, Pal::ICmdAllocator* cmdAllocator, uint rtCU, amd::CommandQueue::Priority priority, uint64_t residency_limit, uint max_command_buffers) { @@ -341,6 +342,7 @@ void VirtualGPU::Queue::addCmdDoppRef(Pal::IGpuMemory* iMem, bool lastDoppCmd, b palDoppRefs_.push_back(doppRef); } +// ================================================================================================ bool VirtualGPU::Queue::flush() { if (!gpu_.dev().settings().alwaysResident_ && palMemRefs_.size() != 0) { if (Pal::Result::Success != @@ -381,6 +383,13 @@ bool VirtualGPU::Queue::flush() { submitInfo.fenceCount = 1; submitInfo.ppFences = &iCmdFences_[cmdBufIdSlot_]; + if (amd::IS_HIP) { + // HIP disables per resource tracking, because the app may embed SVM ptr into other buffers. + // Force CPU sync if there are pending operations on SDMA, until OS fences will be added + if (iQueue_->Type() == Pal::QueueTypeCompute) { + gpu_.WaitForIdleSdma(); + } + } // Submit command buffer to OS Pal::Result result; if (gpu_.rgpCaptureEna()) { @@ -2695,9 +2704,6 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const dev().rgpCaptureMgr()->PostDispatch(this); } - // Mark the flag indicating if a dispatch is outstanding. - state_.hasPendingDispatch_ = true; - return true; } @@ -3953,12 +3959,4 @@ void* VirtualGPU::getOrCreateHostcallBuffer() { return hostcallBuffer_; } -void VirtualGPU::releaseGpuMemoryFence() { - if (isPendingDispatch() && amd::IS_HIP) { - WaitForIdleCompute(); - // Reset the status. - state_.hasPendingDispatch_ = false; - } -} - } // namespace pal diff --git a/projects/clr/rocclr/device/pal/palvirtual.hpp b/projects/clr/rocclr/device/pal/palvirtual.hpp index e648869180..5bfad81e39 100644 --- a/projects/clr/rocclr/device/pal/palvirtual.hpp +++ b/projects/clr/rocclr/device/pal/palvirtual.hpp @@ -68,7 +68,7 @@ class VirtualGPU : public device::VirtualDevice { Queue(const Queue&) = delete; Queue& operator=(const Queue&) = delete; - static Queue* Create(const VirtualGPU& gpu, //!< OCL virtual GPU object + static Queue* Create(VirtualGPU& gpu, //!< ROCCLR virtual GPU object Pal::QueueType queueType, //!< PAL queue type uint engineIdx, //!< Select particular engine index Pal::ICmdAllocator* cmdAlloc, //!< PAL CMD buffer allocator @@ -78,7 +78,7 @@ class VirtualGPU : public device::VirtualDevice { uint max_command_buffers //!< Number of allocated command buffers ); - Queue(const VirtualGPU& gpu, Pal::IDevice* iDev, uint64_t residency_limit, + Queue(VirtualGPU& gpu, Pal::IDevice* iDev, uint64_t residency_limit, uint max_command_buffers) : lock_(nullptr), iQueue_(nullptr), @@ -177,7 +177,7 @@ class VirtualGPU : public device::VirtualDevice { private: void DumpMemoryReferences() const; - const VirtualGPU& gpu_; //!< OCL virtual GPU object + VirtualGPU& gpu_; //!< ROCCLR virtual GPU object Pal::IDevice* iDev_; //!< PAL device uint cmdBufIdSlot_; //!< Command buffer ID slot for submissions uint cmdBufIdCurrent_; //!< Current global command buffer ID @@ -227,7 +227,6 @@ class VirtualGPU : public device::VirtualDevice { uint perfCounterEnabled_ : 1; //!< PerfCounter is enabled uint rgpCaptureEnabled_ : 1; //!< RGP capture is enabled in the runtime uint imageBufferWrtBack_ : 1; //!< Enable image buffer write back - uint hasPendingDispatch_ : 1; //!< A kernel dispatch is outstanding }; uint value_; State() : value_(0) {} @@ -562,14 +561,22 @@ class VirtualGPU : public device::VirtualDevice { } } + //! Waits for idle on SDMA engine + void WaitForIdleSdma() { + if (events_[SdmaEngine].isValid()) { + queues_[events_[SdmaEngine].engineId_]->waitForEvent(events_[SdmaEngine].id_); + events_[SdmaEngine].invalidate(); + } + } + void* getOrCreateHostcallBuffer(); //! Waits on an outstanding kernel. - void releaseGpuMemoryFence(); - - //! Returns true if a dispatch is pending. - bool isPendingDispatch() const { return state_.hasPendingDispatch_; } - + void releaseGpuMemoryFence() { + if (amd::IS_HIP) { + WaitForIdleCompute(); + } + } protected: void profileEvent(EngineType engine, bool type) const;