From 6ec57b845c0dfd28ad3a7c44c03aa2645ae69e07 Mon Sep 17 00:00:00 2001 From: Sourabh Betigeri Date: Tue, 23 Mar 2021 11:12:37 -0700 Subject: [PATCH] SWDEV-273265 - Fix to wait on a pending dispatch in PAL. Change-Id: I431cedfef5d5cb727c35ba8e294528017bfe2088 [ROCm/clr commit: 21ea81d5b9fb2d5599e97470ee5c24a2bb64b694] --- projects/clr/rocclr/device/pal/palblit.cpp | 41 +++++++++++++++++++ projects/clr/rocclr/device/pal/palvirtual.cpp | 14 ++++++- projects/clr/rocclr/device/pal/palvirtual.hpp | 9 ++++ 3 files changed, 63 insertions(+), 1 deletion(-) diff --git a/projects/clr/rocclr/device/pal/palblit.cpp b/projects/clr/rocclr/device/pal/palblit.cpp index 55cac9adec..a89076e709 100644 --- a/projects/clr/rocclr/device/pal/palblit.cpp +++ b/projects/clr/rocclr/device/pal/palblit.cpp @@ -47,6 +47,8 @@ inline Memory& DmaBlitManager::gpuMem(device::Memory& mem) const { bool DmaBlitManager::readMemoryStaged(Memory& srcMemory, void* dstHost, Memory** xferBuf, size_t origin, size_t& offset, size_t& totalSize, size_t xferSize) const { + gpu().releaseGpuMemoryFence(); + amd::Coord3D dst(0, 0, 0); size_t tmpSize; uint idxWrite = 0; @@ -118,6 +120,8 @@ bool DmaBlitManager::readMemoryStaged(Memory& srcMemory, void* dstHost, Memory** bool DmaBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost, const amd::Coord3D& origin, const amd::Coord3D& size, bool entire) const { + gpu().releaseGpuMemoryFence(); + // Use host copy if memory has direct access if (setup_.disableReadBuffer_ || (gpuMem(srcMemory).isHostMemDirectAccess() && gpuMem(srcMemory).isCacheable())) { @@ -206,6 +210,8 @@ bool DmaBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost, bool DmaBlitManager::readBufferRect(device::Memory& srcMemory, void* dstHost, const amd::BufferRect& bufRect, const amd::BufferRect& hostRect, const amd::Coord3D& size, bool entire) const { + gpu().releaseGpuMemoryFence(); + // Use host copy if memory has direct access if (setup_.disableReadBufferRect_ || 
(gpuMem(srcMemory).isHostMemDirectAccess() && gpuMem(srcMemory).isCacheable())) { @@ -258,6 +264,8 @@ bool DmaBlitManager::readBufferRect(device::Memory& srcMemory, void* dstHost, bool DmaBlitManager::readImage(device::Memory& srcMemory, void* dstHost, const amd::Coord3D& origin, const amd::Coord3D& size, size_t rowPitch, size_t slicePitch, bool entire) const { + gpu().releaseGpuMemoryFence(); + if (setup_.disableReadImage_) { return HostBlitManager::readImage(srcMemory, dstHost, origin, size, rowPitch, slicePitch, entire); @@ -324,6 +332,8 @@ bool DmaBlitManager::writeMemoryStaged(const void* srcHost, Memory& dstMemory, M bool DmaBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemory, const amd::Coord3D& origin, const amd::Coord3D& size, bool entire) const { + gpu().releaseGpuMemoryFence(); + // Use host copy if memory has direct access or it's persistent if (setup_.disableWriteBuffer_ || (gpuMem(dstMemory).isHostMemDirectAccess() && @@ -412,6 +422,8 @@ bool DmaBlitManager::writeBufferRect(const void* srcHost, device::Memory& dstMem const amd::BufferRect& hostRect, const amd::BufferRect& bufRect, const amd::Coord3D& size, bool entire) const { + gpu().releaseGpuMemoryFence(); + // Use host copy if memory has direct access or it's persistent if (setup_.disableWriteBufferRect_ || (dstMemory.isHostMemDirectAccess() && @@ -468,6 +480,8 @@ bool DmaBlitManager::writeBufferRect(const void* srcHost, device::Memory& dstMem bool DmaBlitManager::writeImage(const void* srcHost, device::Memory& dstMemory, const amd::Coord3D& origin, const amd::Coord3D& size, size_t rowPitch, size_t slicePitch, bool entire) const { + gpu().releaseGpuMemoryFence(); + if (setup_.disableWriteImage_) { return HostBlitManager::writeImage(srcHost, dstMemory, origin, size, rowPitch, slicePitch, entire); @@ -483,6 +497,8 @@ bool DmaBlitManager::writeImage(const void* srcHost, device::Memory& dstMemory, bool DmaBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& dstMemory, 
const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin, const amd::Coord3D& size, bool entire) const { + gpu().releaseGpuMemoryFence(); + if (setup_.disableCopyBuffer_ || (gpuMem(srcMemory).isHostMemDirectAccess() && gpuMem(srcMemory).isCacheable() && !dev().settings().apuSystem_ && gpuMem(dstMemory).isHostMemDirectAccess())) { @@ -497,6 +513,8 @@ bool DmaBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& dstMe bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& dstMemory, const amd::BufferRect& srcRect, const amd::BufferRect& dstRect, const amd::Coord3D& size, bool entire) const { + gpu().releaseGpuMemoryFence(); + if (setup_.disableCopyBufferRect_ || (gpuMem(srcMemory).isHostMemDirectAccess() && gpuMem(srcMemory).isCacheable() && gpuMem(dstMemory).isHostMemDirectAccess())) { @@ -573,6 +591,8 @@ bool DmaBlitManager::copyImageToBuffer(device::Memory& srcMemory, device::Memory const amd::Coord3D& size, bool entire, size_t rowPitch, size_t slicePitch) const { bool result = false; + gpu().releaseGpuMemoryFence(); + if (setup_.disableCopyImageToBuffer_) { result = HostBlitManager::copyImageToBuffer(srcMemory, dstMemory, srcOrigin, dstOrigin, size, @@ -597,6 +617,8 @@ bool DmaBlitManager::copyBufferToImage(device::Memory& srcMemory, device::Memory const amd::Coord3D& size, bool entire, size_t rowPitch, size_t slicePitch) const { bool result = false; + gpu().releaseGpuMemoryFence(); + if (setup_.disableCopyBufferToImage_) { result = HostBlitManager::copyBufferToImage(srcMemory, dstMemory, srcOrigin, dstOrigin, size, @@ -620,6 +642,8 @@ bool DmaBlitManager::copyImage(device::Memory& srcMemory, device::Memory& dstMem const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin, const amd::Coord3D& size, bool entire) const { bool result = false; + gpu().releaseGpuMemoryFence(); + if (setup_.disableCopyImage_) { return HostBlitManager::copyImage(srcMemory, dstMemory, srcOrigin, dstOrigin, size, entire); @@ -1655,6 +1679,8 @@ 
bool KernelBlitManager::readImage(device::Memory& srcMemory, void* dstHost, // Use host copy if memory has direct access or it's persistent if (setup_.disableReadImage_ || (gpuMem(srcMemory).isHostMemDirectAccess() && gpuMem(srcMemory).isCacheable())) { + gpu().releaseGpuMemoryFence(); + result = HostBlitManager::readImage(srcMemory, dstHost, origin, size, rowPitch, slicePitch, entire); synchronize(); @@ -1702,6 +1728,8 @@ bool KernelBlitManager::writeImage(const void* srcHost, device::Memory& dstMemor // Use host copy if memory has direct access or it's persistent if (setup_.disableWriteImage_ || gpuMem(dstMemory).isHostMemDirectAccess() || gpuMem(dstMemory).isPersistentDirectMap()) { + gpu().releaseGpuMemoryFence(); + result = HostBlitManager::writeImage(srcHost, dstMemory, origin, size, rowPitch, slicePitch, entire); synchronize(); @@ -1872,9 +1900,12 @@ bool KernelBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost, bool entire) const { amd::ScopedLock k(lockXferOps_); bool result = false; + // Use host copy if memory has direct access if (setup_.disableReadBuffer_ || (gpuMem(srcMemory).isHostMemDirectAccess() && gpuMem(srcMemory).isCacheable())) { + gpu().releaseGpuMemoryFence(); + result = HostBlitManager::readBuffer(srcMemory, dstHost, origin, size, entire); synchronize(); return result; @@ -1923,6 +1954,8 @@ bool KernelBlitManager::readBufferRect(device::Memory& srcMemory, void* dstHost, // Use host copy if memory has direct access if (setup_.disableReadBufferRect_ || (gpuMem(srcMemory).isHostMemDirectAccess() && gpuMem(srcMemory).isCacheable())) { + gpu().releaseGpuMemoryFence(); + result = HostBlitManager::readBufferRect(srcMemory, dstHost, bufRect, hostRect, size, entire); synchronize(); return result; @@ -1971,6 +2004,8 @@ bool KernelBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemo (gpuMem(dstMemory).isHostMemDirectAccess() && (gpuMem(dstMemory).memoryType() != Resource::ExternalPhysical)) || 
(gpuMem(dstMemory).memoryType() == Resource::Persistent)) { + gpu().releaseGpuMemoryFence(); + result = HostBlitManager::writeBuffer(srcHost, dstMemory, origin, size, entire); synchronize(); return result; @@ -2022,6 +2057,8 @@ bool KernelBlitManager::writeBufferRect(const void* srcHost, device::Memory& dst (gpuMem(dstMemory).isHostMemDirectAccess() && (gpuMem(dstMemory).memoryType() != Resource::ExternalPhysical)) || gpuMem(dstMemory).isPersistentDirectMap()) { + gpu().releaseGpuMemoryFence(); + result = HostBlitManager::writeBufferRect(srcHost, dstMemory, hostRect, bufRect, size, entire); synchronize(); return result; @@ -2071,6 +2108,8 @@ bool KernelBlitManager::fillBuffer(device::Memory& memory, const void* pattern, // Use host fill if memory has direct access if (setup_.disableFillBuffer_ || (!forceBlit && gpuMem(memory).isHostMemDirectAccess())) { + gpu().releaseGpuMemoryFence(); + result = HostBlitManager::fillBuffer(memory, pattern, patternSize, origin, size, entire); synchronize(); return result; @@ -2212,6 +2251,8 @@ bool KernelBlitManager::fillImage(device::Memory& memory, const void* pattern, // Use host fill if memory has direct access if (setup_.disableFillImage_ || gpuMem(memory).isHostMemDirectAccess()) { + gpu().releaseGpuMemoryFence(); + result = HostBlitManager::fillImage(memory, pattern, origin, size, entire); synchronize(); return result; diff --git a/projects/clr/rocclr/device/pal/palvirtual.cpp b/projects/clr/rocclr/device/pal/palvirtual.cpp index f3d3980ac3..733766418c 100644 --- a/projects/clr/rocclr/device/pal/palvirtual.cpp +++ b/projects/clr/rocclr/device/pal/palvirtual.cpp @@ -1216,7 +1216,7 @@ void VirtualGPU::submitReadMemory(amd::ReadMemoryCommand& vcmd) { if (size[0] <= dev().settings().pinnedMinXferSize_) { partial = size[0]; } - // Make first step transfer + // Make first step transfer if (partial > 0) { result = blitMgr().readBuffer(*memory, vcmd.destination(), origin, partial); } @@ -2596,6 +2596,9 @@ bool 
VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const dev().rgpCaptureMgr()->PostDispatch(this); } + // Mark the flag indicating if a dispatch is outstanding. + state_.hasPendingDispatch_ = true; + return true; } @@ -3833,4 +3836,13 @@ void* VirtualGPU::getOrCreateHostcallBuffer() { } return hostcallBuffer_; } + +void VirtualGPU::releaseGpuMemoryFence() { + if (isPendingDispatch() && amd::IS_HIP) { + WaitForIdleCompute(); + // Reset the status. + state_.hasPendingDispatch_ = false; + } +} + } // namespace pal diff --git a/projects/clr/rocclr/device/pal/palvirtual.hpp b/projects/clr/rocclr/device/pal/palvirtual.hpp index b7ab2df22a..30120293f9 100644 --- a/projects/clr/rocclr/device/pal/palvirtual.hpp +++ b/projects/clr/rocclr/device/pal/palvirtual.hpp @@ -226,6 +226,8 @@ class VirtualGPU : public device::VirtualDevice { uint profileEnabled_ : 1; //!< Profiling is enabled for WaveLimiter uint perfCounterEnabled_ : 1; //!< PerfCounter is enabled uint rgpCaptureEnabled_ : 1; //!< RGP capture is enabled in the runtime + uint imageBufferWrtBack_ : 1; //!< Enable image buffer write back + uint hasPendingDispatch_ : 1; //!< A kernel dispatch is outstanding }; uint value_; State() : value_(0) {} @@ -553,6 +555,13 @@ class VirtualGPU : public device::VirtualDevice { void* getOrCreateHostcallBuffer(); + //! Waits for a pending kernel dispatch (HIP only) and clears the pending-dispatch flag. + void releaseGpuMemoryFence(); + + //! Returns true if a dispatch is pending. + bool isPendingDispatch() const { return state_.hasPendingDispatch_; } + + protected: void profileEvent(EngineType engine, bool type) const;