From a18f2c549ce0c50ed5d2ca9fbbae4872d39bf606 Mon Sep 17 00:00:00 2001 From: Saleel Kudchadker Date: Thu, 19 Dec 2024 23:36:03 +0000 Subject: [PATCH] SWDEV-504494 - Flush to systemscope when copying non-coherent mem - When we use blit(compute) copies, two subsequent copies may read for the same source buffer, the buffer may get modified by the host in between and if the src buffer was allocated with non-coherent flag, the device may simply use stale value from previous cacheline fetch. This is a corner case. Change-Id: I2ce261c6f6fa4e5bb608f116548e5cc711ae6f3c [ROCm/clr commit: b63005d5509a062542b8d3aa72037ca98a51ee78] --- projects/clr/rocclr/device/rocm/rocblit.cpp | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/projects/clr/rocclr/device/rocm/rocblit.cpp b/projects/clr/rocclr/device/rocm/rocblit.cpp index a9ffb4d95a..5862f701f1 100644 --- a/projects/clr/rocclr/device/rocm/rocblit.cpp +++ b/projects/clr/rocclr/device/rocm/rocblit.cpp @@ -1707,7 +1707,7 @@ bool KernelBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost, copySize = std::min(totalSize, maxStagedXferSize); srcAddr += stagedCopyOffset; ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Blit staging D2H copy stg buf=%p, src=%p, " - "dstOrigin=%zu, size=%zu", xferBufAddr, srcAddr, dstOrigin[0], copySize); + "dstOrigin=0x%x, size=%zu", xferBufAddr, srcAddr, dstOrigin[0], copySize); // Flush caches for coherency after the copy as we need to std::memcpy // from staging buffer to unpinned dst. Also attach a signal to the dispatch packet // itself that we can wait on without extra barrier packet. @@ -1865,7 +1865,9 @@ bool KernelBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemo stagingBuffer, (void*)(srcAddr + stagedCopyOffset), copySize); memcpy(stagingBuffer, srcAddr + stagedCopyOffset, copySize); ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Blit staging H2D copy dst=%p, stg buf=%p, " - "dstOrigin=%zu, size=%zu", dstAddr, stagingBuffer, origin[0], copySize); + "dstOrigin=0x%x, size=%zu", dstAddr, stagingBuffer, origin[0], copySize); + // No cache flush is needed here as we use a staging buffer, and the acquire logic + // ensures that the cacheline is different and re-used only when L2 is flushed result = shaderCopyBuffer(dstAddr, stagingBuffer, origin, srcOrigin, copySize, entire, dev().settings().limit_blit_wg_, copyMetadata); @@ -2253,6 +2255,15 @@ bool KernelBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& ds } if (!result) { + // Flush caches for coherency as the MTYPE of the src buffer may be + // non-coherent which mean we need to read it again from memory. + // Also if its a device to device copy(intra device), we dont need flush + // Check CL_MEM_SVM_ATOMICS flag to see if we used system_coarse_segment_ + auto memFlags = srcMemory.owner()->getMemFlags(); + bool srcSvmAtomics = (memFlags & CL_MEM_SVM_ATOMICS) != 0; + if (!srcSvmAtomics && srcMemory.isHostMemDirectAccess()) { + gpu().addSystemScope(); + } result = shaderCopyBuffer(reinterpret_cast
(dstMemory.virtualAddress()), reinterpret_cast
(srcMemory.virtualAddress()), dstOrigin, srcOrigin, sizeIn,