From c94c02a2e61381f9fdf517d8b5bbd20567288b2e Mon Sep 17 00:00:00 2001
From: Saleel Kudchadker
Date: Thu, 6 Mar 2025 19:43:52 +0000
Subject: [PATCH] SWDEV-519596 - Avoid passing dep signal to SDMA

- For D2H cases avoid passing dependent signals to SDMA, the signals take a while to resolve on SDMA engine

Change-Id: I569635228af977847f201c82ca897002f8f2f4a8

[ROCm/clr commit: 78d0ff2dbca16a4f7e5691204fe7e5966e25bb11]
---
 projects/clr/rocclr/device/rocm/rocblit.cpp | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/projects/clr/rocclr/device/rocm/rocblit.cpp b/projects/clr/rocclr/device/rocm/rocblit.cpp
index 5ec63c2495..4f878e3ad3 100644
--- a/projects/clr/rocclr/device/rocm/rocblit.cpp
+++ b/projects/clr/rocclr/device/rocm/rocblit.cpp
@@ -561,7 +561,7 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory,
   address src = reinterpret_cast<address>(srcMemory.getDeviceMemory());
   address dst = reinterpret_cast<address>(dstMemory.getDeviceMemory());
 
-  gpu().releaseGpuMemoryFence(kSkipCpuWait);
+  bool skipCpuWait = true;
 
   src += srcOrigin[0];
   dst += dstOrigin[0];
@@ -581,6 +581,15 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory,
     dstAgent = dstMemory.dev().getBackendDevice();
   }
 
+  // Blocking D2H copies need a wait anyways so better wait here
+  // than having to wait on the device for dependent signals for SDMA which is slow
+  if (!copyMetadata.isAsync_ && !srcMemory.isHostMemDirectAccess()
+      && dstMemory.isHostMemDirectAccess()) {
+    skipCpuWait = false;
+  }
+
+  gpu().releaseGpuMemoryFence(skipCpuWait);
+
   return rocrCopyBuffer(dst, dstAgent, src, srcAgent, size[0], copyMetadata);
 }
 
@@ -636,7 +645,8 @@ void DmaBlitManager::releaseBuffer(BufferState &buffer) const {
 bool DmaBlitManager::hsaCopyStagedOrPinned(const_address hostSrc, address hostDst, size_t size,
                                            bool hostToDev, amd::CopyMetadata& copyMetadata,
                                            bool enablePin) const {
-  gpu().releaseGpuMemoryFence(kSkipCpuWait);
+  // Do not skip wait here for D2H. Resolving dependent signals for SDMA engine is slow
+  gpu().releaseGpuMemoryFence(hostToDev);
   // If Pinning is enabled, Pin host Memory for copy size > MinSizeForPinnedTransfer
   // For 16KB < size <= MinSizeForPinnedTransfer Use staging buffer without pinning
   bool status = true;
@@ -698,8 +708,6 @@ bool DmaBlitManager::hsaCopyStagedOrPinned(const_address hostSrc, address hostDs
     return false;
   }
 
-  gpu().addSystemScope();
-
   return true;
 }
 // ================================================================================================