diff --git a/projects/clr/rocclr/device/pal/palblit.cpp b/projects/clr/rocclr/device/pal/palblit.cpp index 52c85d7166..02516d9f0d 100644 --- a/projects/clr/rocclr/device/pal/palblit.cpp +++ b/projects/clr/rocclr/device/pal/palblit.cpp @@ -47,8 +47,6 @@ inline Memory& DmaBlitManager::gpuMem(device::Memory& mem) const { bool DmaBlitManager::readMemoryStaged(Memory& srcMemory, void* dstHost, Memory** xferBuf, size_t origin, size_t& offset, size_t& totalSize, size_t xferSize) const { - gpu().releaseGpuMemoryFence(); - amd::Coord3D dst(0, 0, 0); size_t tmpSize; uint idxWrite = 0; @@ -120,11 +118,10 @@ bool DmaBlitManager::readMemoryStaged(Memory& srcMemory, void* dstHost, Memory** bool DmaBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost, const amd::Coord3D& origin, const amd::Coord3D& size, bool entire) const { - gpu().releaseGpuMemoryFence(); - // Use host copy if memory has direct access if (setup_.disableReadBuffer_ || (gpuMem(srcMemory).isHostMemDirectAccess() && gpuMem(srcMemory).isCacheable())) { + gpu().releaseGpuMemoryFence(); return HostBlitManager::readBuffer(srcMemory, dstHost, origin, size, entire); } else { size_t srcSize = size[0]; @@ -210,11 +207,10 @@ bool DmaBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost, bool DmaBlitManager::readBufferRect(device::Memory& srcMemory, void* dstHost, const amd::BufferRect& bufRect, const amd::BufferRect& hostRect, const amd::Coord3D& size, bool entire) const { - gpu().releaseGpuMemoryFence(); - // Use host copy if memory has direct access if (setup_.disableReadBufferRect_ || (gpuMem(srcMemory).isHostMemDirectAccess() && gpuMem(srcMemory).isCacheable())) { + gpu().releaseGpuMemoryFence(); return HostBlitManager::readBufferRect(srcMemory, dstHost, bufRect, hostRect, size, entire); } else { Memory& xferBuf = dev().xferRead().acquire(); @@ -293,7 +289,7 @@ bool DmaBlitManager::writeMemoryStaged(const void* srcHost, Memory& dstMemory, M } else { chunkSize = std::min(amd::alignUp(xferSize / 4, 256), gpu().xferWrite().MaxSize()); chunkSize = std::max(chunkSize, 64 * Ki); - flushDMA = true; + flushDMA = (xferSize > chunkSize); } size_t srcOffset = 0; uint32_t flags = Resource::NoWait; @@ -332,13 +328,12 @@ bool DmaBlitManager::writeMemoryStaged(const void* srcHost, Memory& dstMemory, M bool DmaBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemory, const amd::Coord3D& origin, const amd::Coord3D& size, bool entire) const { - gpu().releaseGpuMemoryFence(); - // Use host copy if memory has direct access or it's persistent if (setup_.disableWriteBuffer_ || (gpuMem(dstMemory).isHostMemDirectAccess() && (gpuMem(dstMemory).memoryType() != Resource::ExternalPhysical)) || gpuMem(dstMemory).isPersistentDirectMap()) { + gpu().releaseGpuMemoryFence(); return HostBlitManager::writeBuffer(srcHost, dstMemory, origin, size, entire); } else { size_t dstSize = size[0]; @@ -401,12 +396,12 @@ bool DmaBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemory, } } - if (dstSize != 0) { - Memory& xferBuf = gpu().xferWrite().Acquire(dstSize); - + while (dstSize > 0) { + auto xfer_size = std::min(dstSize, gpu().xferWrite().MaxSize()); + Memory& xferBuf = gpu().xferWrite().Acquire(xfer_size); // Write memory using a staged resource if (!writeMemoryStaged(srcHost, gpuMem(dstMemory), xferBuf, origin[0], offset, dstSize, - dstSize)) { + xfer_size)) { LogError("DmaBlitManager::writeBuffer failed!"); return false; } @@ -422,13 +417,12 @@ bool DmaBlitManager::writeBufferRect(const void* srcHost, device::Memory& dstMem const amd::BufferRect& hostRect, const amd::BufferRect& bufRect, const amd::Coord3D& size, bool entire) const { - gpu().releaseGpuMemoryFence(); - // Use host copy if memory has direct access or it's persistent if (setup_.disableWriteBufferRect_ || (dstMemory.isHostMemDirectAccess() && (gpuMem(dstMemory).memoryType() != Resource::ExternalPhysical)) || gpuMem(dstMemory).isPersistentDirectMap()) { + gpu().releaseGpuMemoryFence(); return HostBlitManager::writeBufferRect(srcHost, dstMemory, hostRect, bufRect, size, entire); } else { Memory& xferBuf = gpu().xferWrite().Acquire(std::min(gpu().xferWrite().MaxSize(), size[0])); @@ -481,7 +475,6 @@ bool DmaBlitManager::writeImage(const void* srcHost, device::Memory& dstMemory, const amd::Coord3D& origin, const amd::Coord3D& size, size_t rowPitch, size_t slicePitch, bool entire) const { gpu().releaseGpuMemoryFence(); - if (setup_.disableWriteImage_) { return HostBlitManager::writeImage(srcHost, dstMemory, origin, size, rowPitch, slicePitch, entire); @@ -497,11 +490,10 @@ bool DmaBlitManager::writeImage(const void* srcHost, device::Memory& dstMemory, bool DmaBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& dstMemory, const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin, const amd::Coord3D& size, bool entire) const { - gpu().releaseGpuMemoryFence(); - if (setup_.disableCopyBuffer_ || (gpuMem(srcMemory).isHostMemDirectAccess() && gpuMem(srcMemory).isCacheable() && !dev().settings().apuSystem_ && gpuMem(dstMemory).isHostMemDirectAccess())) { + gpu().releaseGpuMemoryFence(); return HostBlitManager::copyBuffer(srcMemory, dstMemory, srcOrigin, dstOrigin, size); } else { return gpuMem(srcMemory).partialMemCopyTo(gpu(), srcOrigin, dstOrigin, size, gpuMem(dstMemory)); @@ -513,11 +505,10 @@ bool DmaBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& dstMe bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& dstMemory, const amd::BufferRect& srcRect, const amd::BufferRect& dstRect, const amd::Coord3D& size, bool entire) const { - gpu().releaseGpuMemoryFence(); - if (setup_.disableCopyBufferRect_ || (gpuMem(srcMemory).isHostMemDirectAccess() && gpuMem(srcMemory).isCacheable() && gpuMem(dstMemory).isHostMemDirectAccess())) { + gpu().releaseGpuMemoryFence(); return HostBlitManager::copyBufferRect(srcMemory, dstMemory, srcRect, dstRect, size, entire); } else { size_t srcOffset; @@ -591,10 +582,8 @@ bool DmaBlitManager::copyImageToBuffer(device::Memory& srcMemory, device::Memory const amd::Coord3D& size, bool entire, size_t rowPitch, size_t slicePitch) const { bool result = false; - gpu().releaseGpuMemoryFence(); - - if (setup_.disableCopyImageToBuffer_) { + gpu().releaseGpuMemoryFence(); result = HostBlitManager::copyImageToBuffer(srcMemory, dstMemory, srcOrigin, dstOrigin, size, entire, rowPitch, slicePitch); } else { @@ -604,6 +593,7 @@ bool DmaBlitManager::copyImageToBuffer(device::Memory& srcMemory, device::Memory // Check if a HostBlit transfer is required if (completeOperation_ && !result) { + gpu().releaseGpuMemoryFence(); result = HostBlitManager::copyImageToBuffer(srcMemory, dstMemory, srcOrigin, dstOrigin, size, entire, rowPitch, slicePitch); } @@ -617,10 +607,8 @@ bool DmaBlitManager::copyBufferToImage(device::Memory& srcMemory, device::Memory const amd::Coord3D& size, bool entire, size_t rowPitch, size_t slicePitch) const { bool result = false; - gpu().releaseGpuMemoryFence(); - - if (setup_.disableCopyBufferToImage_) { + gpu().releaseGpuMemoryFence(); result = HostBlitManager::copyBufferToImage(srcMemory, dstMemory, srcOrigin, dstOrigin, size, entire, rowPitch, slicePitch); } else { @@ -630,6 +618,7 @@ bool DmaBlitManager::copyBufferToImage(device::Memory& srcMemory, device::Memory // Check if a HostBlit transfer is required if (completeOperation_ && !result) { + gpu().releaseGpuMemoryFence(); result = HostBlitManager::copyBufferToImage(srcMemory, dstMemory, srcOrigin, dstOrigin, size, entire, rowPitch, slicePitch); } diff --git a/projects/clr/rocclr/device/pal/palresource.cpp b/projects/clr/rocclr/device/pal/palresource.cpp index 9f77ff70c1..f8b3d4c6f5 100644 --- a/projects/clr/rocclr/device/pal/palresource.cpp +++ b/projects/clr/rocclr/device/pal/palresource.cpp @@ -1430,6 +1430,7 @@ bool Resource::partialMemCopyTo(VirtualGPU& gpu, const amd::Coord3D& srcOrigin, // Make sure compute is done before CP DMA start gpu.addBarrier(RgpSqqtBarrierReason::MemDependency); } else { + gpu.releaseGpuMemoryFence(); gpu.engineID_ = SdmaEngine; } diff --git a/projects/clr/rocclr/device/pal/palvirtual.cpp b/projects/clr/rocclr/device/pal/palvirtual.cpp index d2827e40b8..dc68a82dca 100644 --- a/projects/clr/rocclr/device/pal/palvirtual.cpp +++ b/projects/clr/rocclr/device/pal/palvirtual.cpp @@ -61,7 +61,8 @@ uint32_t VirtualGPU::Queue::AllocedQueues(const VirtualGPU& gpu, Pal::EngineType return allocedQueues; } -VirtualGPU::Queue* VirtualGPU::Queue::Create(const VirtualGPU& gpu, Pal::QueueType queueType, +// ================================================================================================ +VirtualGPU::Queue* VirtualGPU::Queue::Create(VirtualGPU& gpu, Pal::QueueType queueType, uint engineIdx, Pal::ICmdAllocator* cmdAllocator, uint rtCU, amd::CommandQueue::Priority priority, uint64_t residency_limit, uint max_command_buffers) { @@ -341,6 +342,7 @@ void VirtualGPU::Queue::addCmdDoppRef(Pal::IGpuMemory* iMem, bool lastDoppCmd, b palDoppRefs_.push_back(doppRef); } +// ================================================================================================ bool VirtualGPU::Queue::flush() { if (!gpu_.dev().settings().alwaysResident_ && palMemRefs_.size() != 0) { if (Pal::Result::Success != @@ -381,6 +383,13 @@ bool VirtualGPU::Queue::flush() { submitInfo.fenceCount = 1; submitInfo.ppFences = &iCmdFences_[cmdBufIdSlot_]; + if (amd::IS_HIP) { + // HIP disables per resource tracking, because the app may embed SVM ptr into other buffers. + // Force CPU sync if there are pending operations on SDMA, until OS fences will be added + if (iQueue_->Type() == Pal::QueueTypeCompute) { + gpu_.WaitForIdleSdma(); + } + } // Submit command buffer to OS Pal::Result result; if (gpu_.rgpCaptureEna()) { @@ -2695,9 +2704,6 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const dev().rgpCaptureMgr()->PostDispatch(this); } - // Mark the flag indicating if a dispatch is outstanding. - state_.hasPendingDispatch_ = true; - return true; } @@ -3953,12 +3959,4 @@ void* VirtualGPU::getOrCreateHostcallBuffer() { return hostcallBuffer_; } -void VirtualGPU::releaseGpuMemoryFence() { - if (isPendingDispatch() && amd::IS_HIP) { - WaitForIdleCompute(); - // Reset the status. - state_.hasPendingDispatch_ = false; - } -} - } // namespace pal diff --git a/projects/clr/rocclr/device/pal/palvirtual.hpp b/projects/clr/rocclr/device/pal/palvirtual.hpp index e648869180..5bfad81e39 100644 --- a/projects/clr/rocclr/device/pal/palvirtual.hpp +++ b/projects/clr/rocclr/device/pal/palvirtual.hpp @@ -68,7 +68,7 @@ class VirtualGPU : public device::VirtualDevice { Queue(const Queue&) = delete; Queue& operator=(const Queue&) = delete; - static Queue* Create(const VirtualGPU& gpu, //!< OCL virtual GPU object + static Queue* Create(VirtualGPU& gpu, //!< ROCCLR virtual GPU object Pal::QueueType queueType, //!< PAL queue type uint engineIdx, //!< Select particular engine index Pal::ICmdAllocator* cmdAlloc, //!< PAL CMD buffer allocator @@ -78,7 +78,7 @@ class VirtualGPU : public device::VirtualDevice { uint max_command_buffers //!< Number of allocated command buffers ); - Queue(const VirtualGPU& gpu, Pal::IDevice* iDev, uint64_t residency_limit, + Queue(VirtualGPU& gpu, Pal::IDevice* iDev, uint64_t residency_limit, uint max_command_buffers) : lock_(nullptr), iQueue_(nullptr), @@ -177,7 +177,7 @@ class VirtualGPU : public device::VirtualDevice { private: void DumpMemoryReferences() const; - const VirtualGPU& gpu_; //!< OCL virtual GPU object + VirtualGPU& gpu_; //!< ROCCLR virtual GPU object Pal::IDevice* iDev_; //!< PAL device uint cmdBufIdSlot_; //!< Command buffer ID slot for submissions uint cmdBufIdCurrent_; //!< Current global command buffer ID @@ -227,7 +227,6 @@ class VirtualGPU : public device::VirtualDevice { uint perfCounterEnabled_ : 1; //!< PerfCounter is enabled uint rgpCaptureEnabled_ : 1; //!< RGP capture is enabled in the runtime uint imageBufferWrtBack_ : 1; //!< Enable image buffer write back - uint hasPendingDispatch_ : 1; //!< A kernel dispatch is outstanding }; uint value_; State() : value_(0) {} @@ -562,14 +561,22 @@ class VirtualGPU : public device::VirtualDevice { } } + //! Waits for idle on SDMA engine + void WaitForIdleSdma() { + if (events_[SdmaEngine].isValid()) { + queues_[events_[SdmaEngine].engineId_]->waitForEvent(events_[SdmaEngine].id_); + events_[SdmaEngine].invalidate(); + } + } + void* getOrCreateHostcallBuffer(); //! Waits on an outstanding kernel. - void releaseGpuMemoryFence(); - - //! Returns true if a dispatch is pending. - bool isPendingDispatch() const { return state_.hasPendingDispatch_; } - + void releaseGpuMemoryFence() { + if (amd::IS_HIP) { + WaitForIdleCompute(); + } + } protected: void profileEvent(EngineType engine, bool type) const;