diff --git a/projects/clr/rocclr/device/rocm/rocblit.cpp b/projects/clr/rocclr/device/rocm/rocblit.cpp index 5ec63c2495..4f878e3ad3 100644 --- a/projects/clr/rocclr/device/rocm/rocblit.cpp +++ b/projects/clr/rocclr/device/rocm/rocblit.cpp @@ -561,7 +561,7 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory, address src = reinterpret_cast
(srcMemory.getDeviceMemory()); address dst = reinterpret_cast
(dstMemory.getDeviceMemory()); - gpu().releaseGpuMemoryFence(kSkipCpuWait); + bool skipCpuWait = true; src += srcOrigin[0]; dst += dstOrigin[0]; @@ -581,6 +581,15 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory, dstAgent = dstMemory.dev().getBackendDevice(); } + // Blocking D2H copies need a wait anyways so better wait here + // than having to wait on the device for dependent signals for SDMA which is slow + if (!copyMetadata.isAsync_ && !srcMemory.isHostMemDirectAccess() + && dstMemory.isHostMemDirectAccess()) { + skipCpuWait = false; + } + + gpu().releaseGpuMemoryFence(skipCpuWait); + return rocrCopyBuffer(dst, dstAgent, src, srcAgent, size[0], copyMetadata); } @@ -636,7 +645,8 @@ void DmaBlitManager::releaseBuffer(BufferState &buffer) const { bool DmaBlitManager::hsaCopyStagedOrPinned(const_address hostSrc, address hostDst, size_t size, bool hostToDev, amd::CopyMetadata& copyMetadata, bool enablePin) const { - gpu().releaseGpuMemoryFence(kSkipCpuWait); + // Do not skip wait here for D2H. Resolving dependent signals for SDMA engine is slow + gpu().releaseGpuMemoryFence(hostToDev); // If Pinning is enabled, Pin host Memory for copy size > MinSizeForPinnedTransfer // For 16KB < size <= MinSizeForPinnedTransfer Use staging buffer without pinning bool status = true; @@ -698,8 +708,6 @@ bool DmaBlitManager::hsaCopyStagedOrPinned(const_address hostSrc, address hostDs return false; } - gpu().addSystemScope(); - return true; } // ================================================================================================