diff --git a/projects/clr/rocclr/device/rocm/rocblit.cpp b/projects/clr/rocclr/device/rocm/rocblit.cpp index a9ffb4d95a..5862f701f1 100644 --- a/projects/clr/rocclr/device/rocm/rocblit.cpp +++ b/projects/clr/rocclr/device/rocm/rocblit.cpp @@ -1707,7 +1707,7 @@ bool KernelBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost, copySize = std::min(totalSize, maxStagedXferSize); srcAddr += stagedCopyOffset; ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Blit staging D2H copy stg buf=%p, src=%p, " - "dstOrigin=%zu, size=%zu", xferBufAddr, srcAddr, dstOrigin[0], copySize); + "dstOrigin=0x%x, size=%zu", xferBufAddr, srcAddr, dstOrigin[0], copySize); // Flush caches for coherency after the copy as we need to std::memcpy // from staging buffer to unpinned dst. Also attach a signal to the dispatch packet // itself that we can wait on without extra barrier packet. @@ -1865,7 +1865,9 @@ bool KernelBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemo stagingBuffer, (void*)(srcAddr + stagedCopyOffset), copySize); memcpy(stagingBuffer, srcAddr + stagedCopyOffset, copySize); ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Blit staging H2D copy dst=%p, stg buf=%p, " - "dstOrigin=%zu, size=%zu", dstAddr, stagingBuffer, origin[0], copySize); + "dstOrigin=0x%x, size=%zu", dstAddr, stagingBuffer, origin[0], copySize); + // No cache flush is needed here as we use a staging buffer, and the acquire logic + // ensures that the cacheline is different and re-used only when L2 is flushed result = shaderCopyBuffer(dstAddr, stagingBuffer, origin, srcOrigin, copySize, entire, dev().settings().limit_blit_wg_, copyMetadata); @@ -2253,6 +2255,15 @@ bool KernelBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& ds } if (!result) { + // Flush caches for coherency as the MTYPE of the src buffer may be + // non-coherent which mean we need to read it again from memory. + // Also if its a device to device copy(intra device), we dont need flush + // Check CL_MEM_SVM_ATOMICS flag to see if we used system_coarse_segment_ + auto memFlags = srcMemory.owner()->getMemFlags(); + bool srcSvmAtomics = (memFlags & CL_MEM_SVM_ATOMICS) != 0; + if (!srcSvmAtomics && srcMemory.isHostMemDirectAccess()) { + gpu().addSystemScope(); + } result = shaderCopyBuffer(reinterpret_cast
(dstMemory.virtualAddress()), reinterpret_cast(srcMemory.virtualAddress()), dstOrigin, srcOrigin, sizeIn,