SWDEV-497145 - Use rocr copyOnEngine API for staged copies

- Refactor the blit code and clean up the ASAN instrumentation.
- Use a unified function for ROCr copies.
- Enable the shader copy path for the unpinned writeBuffer/readBuffer paths.
- Set GPU_FORCE_BLIT_COPY_SIZE=16, which means BLIT copy is used for pinned copies or unpinned H2D/D2H copies smaller than 16 KB.

Change-Id: I42045cca79234b340dbf53dafb93044199736ae4
[ROCm/clr commit: 7863eb92dc]
2024-11-08 05:51:17 +00:00
@@ -48,21 +48,6 @@ inline Memory& DmaBlitManager::gpuMem(device::Memory& mem) const {
  return static_cast<Memory&>(mem);
 }

-// ================================================================================================
-bool DmaBlitManager::readMemoryStaged(Memory& srcMemory, void* dstHost, Memory& xferBuf,
-                                      size_t origin, size_t& offset, size_t& totalSize,
-                                      size_t xferSize) const {
-  const_address src = srcMemory.getDeviceMemory();
-  address staging = xferBuf.getDeviceMemory();
-
-  // Copy data from device to host
-  src += origin + offset;
-  address dst = reinterpret_cast<address>(dstHost) + offset;
-  bool ret = hsaCopyStaged(src, dst, totalSize, staging, false);
-
-  return ret;
-}
-
 // ================================================================================================
 bool DmaBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
                                const amd::Coord3D& origin, const amd::Coord3D& size,
@@ -77,77 +62,15 @@ bool DmaBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
    gpu().Barriers().WaitCurrent();
    return HostBlitManager::readBuffer(srcMemory, dstHost, origin, size, entire, copyMetadata);
  } else {
-    size_t srcSize = size[0];
-    size_t offset = 0;
-    size_t pinSize = dev().settings().pinnedXferSize_;
-    pinSize = std::min(pinSize, srcSize);
-
-    // Check if a pinned transfer can be executed
-    if (pinSize && (srcSize > MinSizeForPinnedTransfer)) {
-      // Align offset to 4K boundary
-      char* tmpHost = const_cast<char*>(
-          amd::alignDown(reinterpret_cast<const char*>(dstHost), PinnedMemoryAlignment));
-
-      // Find the partial size for unaligned copy
-      size_t partial = reinterpret_cast<const char*>(dstHost) - tmpHost;
-
-      amd::Memory* pinned = nullptr;
-      bool first = true;
-      size_t tmpSize;
-      size_t pinAllocSize;
-
-      // Copy memory, using pinning
-      while (srcSize > 0) {
-        // If it's the first iterarion, then readjust the copy size
-        // to include alignment
-        if (first) {
-          pinAllocSize = amd::alignUp(pinSize + partial, PinnedMemoryAlignment);
-          tmpSize = std::min(pinAllocSize - partial, srcSize);
-          first = false;
-        } else {
-          tmpSize = std::min(pinSize, srcSize);
-          pinAllocSize = amd::alignUp(tmpSize, PinnedMemoryAlignment);
-          partial = 0;
-        }
-        amd::Coord3D dst(partial, 0, 0);
-        amd::Coord3D srcPin(origin[0] + offset, 0, 0);
-        amd::Coord3D copySizePin(tmpSize, 0, 0);
-        size_t partial2;
-
-        // Allocate a GPU resource for pinning
-        pinned = pinHostMemory(tmpHost, pinAllocSize, partial2);
-        if (pinned != nullptr) {
-          // Get device memory for this virtual device
-          Memory* dstMemory = dev().getRocMemory(pinned);
-          const KernelBlitManager *kb = dynamic_cast<const KernelBlitManager*>(this);
-          if (!kb->copyBuffer(gpuMem(srcMemory), *dstMemory, srcPin, dst,
-                              copySizePin)) {
-            LogWarning("DmaBlitManager::readBuffer failed a pinned copy!");
-            gpu().addPinnedMem(pinned);
-            break;
-          }
-          gpu().addPinnedMem(pinned);
-        } else {
-          LogWarning("DmaBlitManager::readBuffer failed to pin a resource!");
-          break;
-        }
-        srcSize -= tmpSize;
-        offset += tmpSize;
-        tmpHost = reinterpret_cast<char*>(tmpHost) + tmpSize + partial;
-      }
-    }
-
-    if (0 != srcSize) {
-      Memory& xferBuf = dev().xferRead().acquire();
-
-      // Read memory using a staging resource
-      if (!readMemoryStaged(gpuMem(srcMemory), dstHost, xferBuf, origin[0], offset, srcSize,
-                            srcSize)) {
-        LogError("DmaBlitManager::readBuffer failed!");
+    size_t copySize = size[0];
+    if (0 != copySize) {
+      const_address addrSrc = gpuMem(srcMemory).getDeviceMemory() + origin[0];
+      address addrDst = reinterpret_cast<address>(dstHost);
+      constexpr bool kHostToDev = false;
+      if(!hsaCopyStaged(addrSrc, addrDst, copySize, kHostToDev, copyMetadata)) {
+        LogError("DmaBlitManager::readBuffer staged copy failed!");
        return false;
      }
-
-      dev().xferRead().release(gpu(), xferBuf);
    }
  }

@@ -170,8 +93,6 @@ bool DmaBlitManager::readBufferRect(device::Memory& srcMemory, void* dstHost,
    gpu().Barriers().WaitCurrent();
    return HostBlitManager::readBufferRect(srcMemory, dstHost, bufRect, hostRect, size, entire, copyMetadata);
  } else {
-    Memory& xferBuf = dev().xferRead().acquire();
-    address staging = xferBuf.getDeviceMemory();
    const_address src = gpuMem(srcMemory).getDeviceMemory();

    size_t srcOffset;
@@ -184,13 +105,12 @@ bool DmaBlitManager::readBufferRect(device::Memory& srcMemory, void* dstHost,

        // Copy data from device to host - line by line
        address dst = reinterpret_cast<address>(dstHost) + dstOffset;
-        bool retval = hsaCopyStaged(src + srcOffset, dst, size[0], staging, false);
+        bool retval = hsaCopyStaged(src + srcOffset, dst, size[0], false, copyMetadata);
        if (!retval) {
          return retval;
        }
      }
    }
-    dev().xferRead().release(gpu(), xferBuf);
  }

  return true;
@@ -216,20 +136,6 @@ bool DmaBlitManager::readImage(device::Memory& srcMemory, void* dstHost,
  return true;
 }

-// ================================================================================================
-bool DmaBlitManager::writeMemoryStaged(const void* srcHost, Memory& dstMemory, address staging,
-                                       size_t origin, size_t& offset, size_t& totalSize,
-                                       size_t xferSize) const {
-  address dst = dstMemory.getDeviceMemory();
-
-  // Copy data from host to device
-  dst += origin + offset;
-  const_address src = reinterpret_cast<const_address>(srcHost) + offset;
-  bool retval = hsaCopyStaged(src, dst, totalSize, staging, true);
-
-  return retval;
-}
-
 // ================================================================================================
 bool DmaBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemory,
                                 const amd::Coord3D& origin, const amd::Coord3D& size,
@@ -241,79 +147,17 @@ bool DmaBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemory,
    gpu().releaseGpuMemoryFence();
    return HostBlitManager::writeBuffer(srcHost, dstMemory, origin, size, entire, copyMetadata);
  } else {
-    // HSA copy functionality with a possible async operation
-    gpu().releaseGpuMemoryFence(kSkipCpuWait);
-
-    size_t dstSize = size[0];
-    size_t tmpSize = 0;
-    size_t offset = 0;
-    size_t pinSize = dev().settings().pinnedXferSize_;
-    pinSize = std::min(pinSize, dstSize);
-
-    // Check if a pinned transfer can be executed
-    if (pinSize && (dstSize > MinSizeForPinnedTransfer)) {
-      // Align offset to 4K boundary
-      char* tmpHost = const_cast<char*>(
-          amd::alignDown(reinterpret_cast<const char*>(srcHost), PinnedMemoryAlignment));
-
-      // Find the partial size for unaligned copy
-      size_t partial = reinterpret_cast<const char*>(srcHost) - tmpHost;
-
-      amd::Memory* pinned = nullptr;
-      bool first = true;
-      size_t tmpSize;
-      size_t pinAllocSize;
-
-      // Copy memory, using pinning
-      while (dstSize > 0) {
-        // If it's the first iterarion, then readjust the copy size
-        // to include alignment
-        if (first) {
-          pinAllocSize = amd::alignUp(pinSize + partial, PinnedMemoryAlignment);
-          tmpSize = std::min(pinAllocSize - partial, dstSize);
-          first = false;
-        } else {
-          tmpSize = std::min(pinSize, dstSize);
-          pinAllocSize = amd::alignUp(tmpSize, PinnedMemoryAlignment);
-          partial = 0;
-        }
-        amd::Coord3D src(partial, 0, 0);
-        amd::Coord3D dstPin(origin[0] + offset, 0, 0);
-        amd::Coord3D copySizePin(tmpSize, 0, 0);
-        size_t partial2;
-
-        // Allocate a GPU resource for pinning
-        pinned = pinHostMemory(tmpHost, pinAllocSize, partial2);
-
-        if (pinned != nullptr) {
-          // Get device memory for this virtual device
-          Memory* srcMemory = dev().getRocMemory(pinned);
-          const KernelBlitManager *kb = dynamic_cast<const KernelBlitManager*>(this);
-          if (!kb->copyBuffer(*srcMemory, gpuMem(dstMemory), src, dstPin,
-                              copySizePin)) {
-            LogWarning("DmaBlitManager::writeBuffer failed a pinned copy!");
-            gpu().addPinnedMem(pinned);
-            break;
-          }
-          gpu().addPinnedMem(pinned);
-        } else {
-          LogWarning("DmaBlitManager::writeBuffer failed to pin a resource!");
-          break;
-        }
-        dstSize -= tmpSize;
-        offset += tmpSize;
-        tmpHost = reinterpret_cast<char*>(tmpHost) + tmpSize + partial;
-      }
-    }
-
-    if (dstSize != 0) {
-      address staging = gpu().Staging().Acquire(
-        std::min(dstSize, dev().settings().stagedXferSize_));
+    size_t copySize = size[0];

+    // For small copies use managed staging buffers which can be non blocking
+    if (copySize != 0) {
+      address dstAddr = gpuMem(dstMemory).getDeviceMemory() + origin[0];
+      const_address srcAddr = reinterpret_cast<const_address>(srcHost);
      // Write memory using a staging resource
-      if (!writeMemoryStaged(srcHost, gpuMem(dstMemory), staging, origin[0], offset, dstSize,
-                             dstSize)) {
-        LogError("DmaBlitManager::writeBuffer failed!");
+      constexpr bool kHostToDev = true;
+      bool result = hsaCopyStaged(srcAddr, dstAddr, copySize, kHostToDev, copyMetadata);
+      if (!result) {
+        LogError("DmaBlitManager::writeBuffer staging copy failed!");
        return false;
      }
    }
@@ -336,8 +180,6 @@ bool DmaBlitManager::writeBufferRect(const void* srcHost, device::Memory& dstMem
    return HostBlitManager::writeBufferRect(srcHost, dstMemory, hostRect, bufRect, size, entire,
                                            copyMetadata);
  } else {
-    address staging = gpu().Staging().Acquire(
-      std::min(size[0], dev().settings().stagedXferSize_));
    address dst = static_cast<roc::Memory&>(dstMemory).getDeviceMemory();

    size_t srcOffset;
@@ -350,7 +192,8 @@ bool DmaBlitManager::writeBufferRect(const void* srcHost, device::Memory& dstMem

        // Copy data from host to device - line by line
        const_address src = reinterpret_cast<const_address>(srcHost) + srcOffset;
-        bool retval = hsaCopyStaged(src, dst + dstOffset, size[0], staging, true);
+        constexpr bool kHostToDev = true;
+        bool retval = hsaCopyStaged(src, dst + dstOffset, size[0], kHostToDev, copyMetadata);
        if (!retval) {
          return retval;
        }
@@ -632,43 +475,10 @@ bool DmaBlitManager::copyImage(device::Memory& srcMemory, device::Memory& dstMem
 }

 // ================================================================================================
-bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory,
-                             const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin,
-                             const amd::Coord3D& size, amd::CopyMetadata copyMetadata) const {
-  address src = reinterpret_cast<address>(srcMemory.getDeviceMemory());
-  address dst = reinterpret_cast<address>(dstMemory.getDeviceMemory());
-
-  gpu().releaseGpuMemoryFence(kSkipCpuWait);
-
-  src += srcOrigin[0];
-  dst += dstOrigin[0];
-
-  // Just call copy function for full profile
+inline bool DmaBlitManager::rocrCopyBuffer(address dst, hsa_agent_t& dstAgent,
+                                           const_address src, hsa_agent_t& srcAgent, size_t size,
+                                           amd::CopyMetadata& copyMetadata) const {
  hsa_status_t status = HSA_STATUS_SUCCESS;
-  if (dev().agent_profile() == HSA_PROFILE_FULL) {
-    // Stall GPU, sicne CPU copy is possible
-    gpu().Barriers().WaitCurrent();
-    status = hsa_memory_copy(dst, src, size[0]);
-    if (status != HSA_STATUS_SUCCESS) {
-      LogPrintfError("Hsa copy of data failed with code %d", status);
-    }
-    return (status == HSA_STATUS_SUCCESS);
-  }
-
-  hsa_agent_t srcAgent;
-  hsa_agent_t dstAgent;
-
-  if (&srcMemory.dev() == &dstMemory.dev()) {
-    // Detect the agents for memory allocations
-    srcAgent =
-      (srcMemory.isHostMemDirectAccess()) ? dev().getCpuAgent() : dev().getBackendDevice();
-    dstAgent =
-      (dstMemory.isHostMemDirectAccess()) ? dev().getCpuAgent() : dev().getBackendDevice();
-  }
-  else {
-    srcAgent = srcMemory.dev().getBackendDevice();
-    dstAgent = dstMemory.dev().getBackendDevice();
-  }

  uint32_t copyMask = 0;
  uint32_t freeEngineMask = 0;
@@ -707,9 +517,11 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory,
  }

  // Check if host wait has to be forced
-  bool forceHostWait = forceHostWaitFunc(size[0]);
+  bool forceHostWait = forceHostWaitFunc(size);

-  auto wait_events = gpu().Barriers().WaitingSignal(engine);
+  constexpr bool kIgnoreHostWait = false;
+  // Ignore waiting on any previous kernel dispatch and queue a signal to ROCr copy api instead
+  auto wait_events = gpu().Barriers().WaitingSignal(engine, kIgnoreHostWait);
  hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp(),
                                                      forceHostWait);

@@ -740,11 +552,11 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory,
      ClPrint(amd::LOG_DEBUG, amd::LOG_COPY,
              "HSA Async Copy on copy_engine=0x%x, dst=0x%zx, src=0x%zx, "
              "size=%ld, forceSDMA=%d, wait_event=0x%zx, completion_signal=0x%zx", copyEngine,
-              dst, src, size[0], forceSDMA, (wait_events.size() != 0) ? wait_events[0].handle : 0,
+              dst, src, size, forceSDMA, (wait_events.size() != 0) ? wait_events[0].handle : 0,
              active.handle);

      status = hsa_amd_memory_async_copy_on_engine(dst, dstAgent, src, srcAgent,
-                                                  size[0], wait_events.size(),
+                                                  size, wait_events.size(),
                                                  wait_events.data(), active, copyEngine,
                                                  forceSDMA);
    } else {
@@ -756,11 +568,11 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory,
    ClPrint(amd::LOG_DEBUG, amd::LOG_COPY,
            "HSA Async Copy dst=0x%zx, src=0x%zx, size=%ld, wait_event=0x%zx, "
            "completion_signal=0x%zx",
-            dst, src, size[0], (wait_events.size() != 0) ? wait_events[0].handle : 0,
+            dst, src, size, (wait_events.size() != 0) ? wait_events[0].handle : 0,
            active.handle);

    status = hsa_amd_memory_async_copy(dst, dstAgent, src, srcAgent,
-        size[0], wait_events.size(), wait_events.data(), active);
+        size, wait_events.size(), wait_events.data(), active);
  }

  if (status == HSA_STATUS_SUCCESS) {
@@ -773,93 +585,104 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory,
  return (status == HSA_STATUS_SUCCESS);
 }

+
+// ================================================================================================
+// Copies size[0] bytes from srcMemory (offset srcOrigin[0]) to dstMemory (offset dstOrigin[0]).
+// The addresses and the HSA agent of each side are resolved here; the transfer itself is
+// delegated to rocrCopyBuffer(), whose result is returned. Only the first component of the
+// origins and of size is used.
+bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory,
+                             const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin,
+                             const amd::Coord3D& size, amd::CopyMetadata& copyMetadata) const {
+  // Fetch both base addresses, then release the GPU memory fence (kSkipCpuWait)
+  address srcBase = reinterpret_cast<address>(srcMemory.getDeviceMemory());
+  address dstBase = reinterpret_cast<address>(dstMemory.getDeviceMemory());
+  gpu().releaseGpuMemoryFence(kSkipCpuWait);
+
+  // Apply the linear origins to obtain the actual copy addresses
+  const address copySrc = srcBase + srcOrigin[0];
+  const address copyDst = dstBase + dstOrigin[0];
+
+  // Same-device case: CPU agent for host direct access memory, else this device's agent
+  auto localAgent = [this](const Memory& mem) {
+    return mem.isHostMemDirectAccess() ? dev().getCpuAgent() : dev().getBackendDevice();
+  };
+
+  // Different devices: each side uses the backend agent of the device that owns its memory
+  const bool sameDevice = (&srcMemory.dev() == &dstMemory.dev());
+  hsa_agent_t srcAgent = sameDevice ? localAgent(srcMemory) : srcMemory.dev().getBackendDevice();
+  hsa_agent_t dstAgent = sameDevice ? localAgent(dstMemory) : dstMemory.dev().getBackendDevice();
+
+  return rocrCopyBuffer(copyDst, dstAgent, copySrc, srcAgent, size[0], copyMetadata);
+}
+
 // ================================================================================================
 bool DmaBlitManager::hsaCopyStaged(const_address hostSrc, address hostDst, size_t size,
-                                   address staging, bool hostToDev) const {
+                                   bool hostToDev, amd::CopyMetadata& copyMetadata)  const {
  // Stall GPU, sicne CPU copy is possible
  gpu().releaseGpuMemoryFence(hostToDev);

-  // No allocation is necessary for Full Profile
-  hsa_status_t status;
-  if (dev().agent_profile() == HSA_PROFILE_FULL) {
-    status = hsa_memory_copy(hostDst, hostSrc, size);
-    if (status != HSA_STATUS_SUCCESS) {
-      LogPrintfError("Hsa copy of data failed with code %d", status);
-    }
-    return (status == HSA_STATUS_SUCCESS);
-  }
-
  size_t totalSize = size;
-  size_t offset = 0;
+  size_t stagedCopyOffset = 0;
+  bool status = true;
+  Memory* xferBuf = nullptr;
+  address stagingBuffer = 0;
+  size_t maxStagedXferSize = dev().settings().stagedXferSize_;

-  address hsaBuffer = staging;
+  if (!hostToDev) {
+    // Get static staging buffer as we need to wait until copy on GPU completes to copy
+    // it back to the unpinned buffer
+    xferBuf = &dev().xferRead().acquire();
+    stagingBuffer = xferBuf->getDeviceMemory();
+  }

  // Allocate requested size of memory
  while (totalSize > 0) {
-    size = std::min(totalSize, dev().settings().stagedXferSize_);
+    size = std::min(totalSize, maxStagedXferSize);
+
+    hsa_agent_t srcAgent;
+    hsa_agent_t dstAgent;

    // Copy data from Host to Device
    if (hostToDev) {
-      const hsa_agent_t srcAgent = dev().getCpuAgent();
+      hsa_agent_t srcAgent = dev().getCpuAgent();
+      hsa_agent_t dstAgent = dev().getBackendDevice();

-      HwQueueEngine engine = HwQueueEngine::Unknown;
-      if (srcAgent.handle == dev().getBackendDevice().handle) {
-        engine = HwQueueEngine::SdmaWrite;
+      // Get an address from managed staging buffer
+      stagingBuffer = gpu().Staging().Acquire(std::min(size, maxStagedXferSize));
+
+      address dst = hostDst + stagedCopyOffset;
+      memcpy(stagingBuffer, hostSrc + stagedCopyOffset, size);
+      ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "HSA Async Copy staged H2D");
+      status = rocrCopyBuffer(dst, dstAgent, stagingBuffer, srcAgent, size, copyMetadata);
+      if (!status) {
+        break;
      }
-      gpu().Barriers().SetActiveEngine(engine);
-      auto wait_events = gpu().Barriers().WaitingSignal(engine);
-      hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp());
-
-      memcpy(hsaBuffer, hostSrc + offset, size);
-      status = hsa_amd_memory_async_copy(
-          hostDst + offset, dev().getBackendDevice(), hsaBuffer, srcAgent, size,
-          wait_events.size(), wait_events.data(), active);
-      ClPrint(amd::LOG_DEBUG, amd::LOG_COPY,
-          "HSA Async Copy staged H2D dst=0x%zx, src=0x%zx, size=%ld, completion_signal=0x%zx",
-          hostDst + offset, hsaBuffer, size, active.handle);
-
-      if (status != HSA_STATUS_SUCCESS) {
-        gpu().Barriers().ResetCurrentSignal();
-        LogPrintfError("Hsa copy from host to device failed with code %d", status);
-        return false;
-      }
-      totalSize -= size;
-      if (totalSize > 0) {
-        // Wait if there are extra copies, which don't fit in a single staging buffer
-        gpu().Barriers().WaitCurrent();
-      }
-      offset += size;
-      continue;
-    }
-
-    const hsa_agent_t dstAgent = dev().getCpuAgent();
-
-    HwQueueEngine engine = HwQueueEngine::Unknown;
-    if (dstAgent.handle == dev().getBackendDevice().handle) {
-      engine = HwQueueEngine::SdmaRead;
-    }
-    gpu().Barriers().SetActiveEngine(engine);
-    auto wait_events = gpu().Barriers().WaitingSignal(engine);
-    hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp());
-
-    // Copy data from Device to Host
-    status = hsa_amd_memory_async_copy(
-        hsaBuffer, dstAgent, hostSrc + offset, dev().getBackendDevice(), size,
-        wait_events.size(), wait_events.data(), active);
-    ClPrint(amd::LOG_DEBUG, amd::LOG_COPY,
-            "HSA Async Copy staged D2H dst=0x%zx, src=0x%zx, size=%ld, completion_signal=0x%zx",
-            hsaBuffer, hostSrc + offset, size, active.handle);
-
-    if (status == HSA_STATUS_SUCCESS) {
-      gpu().Barriers().WaitCurrent();
-      memcpy(hostDst + offset, hsaBuffer, size);
    } else {
-      gpu().Barriers().ResetCurrentSignal();
-      LogPrintfError("Hsa copy from device to host failed with code %d", status);
-      return false;
+      dstAgent = dev().getCpuAgent();
+      srcAgent = dev().getBackendDevice();
+
+      const_address src = static_cast<const_address>(hostSrc) + stagedCopyOffset;
+      ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "HSA Async Copy staged D2H");
+      status = rocrCopyBuffer(stagingBuffer, dstAgent, src, srcAgent, size, copyMetadata);
+      if (status) {
+        gpu().Barriers().WaitCurrent();
+        memcpy(hostDst + stagedCopyOffset, stagingBuffer, size);
+      } else {
+        break;
+      }
    }
+
    totalSize -= size;
-    offset += size;
+    stagedCopyOffset += size;
+  }
+
+  if (!hostToDev) {
+    dev().xferRead().release(gpu(), *xferBuf);
+  }
+
+  if (!status) {
+    return false;
  }

  gpu().addSystemScope();
@@ -1829,13 +1652,13 @@ bool KernelBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
    synchronize();
    return result;
  } else {
-    size_t pinSize = size[0];
+    size_t totalSize = size[0];

    // Check if a pinned transfer can be executed with a single pin
-
-    if (((pinSize <= dev().settings().pinnedXferSize_) && (pinSize > MinSizeForPinnedTransfer))) {
+    if (((totalSize <= dev().settings().pinnedXferSize_) &&
+         (totalSize > MinSizeForPinnedTransfer))) {
      size_t partial;
-      amd::Memory* amdMemory = pinHostMemory(dstHost, pinSize, partial);
+      amd::Memory* amdMemory = pinHostMemory(dstHost, totalSize, partial);

      if (amdMemory == nullptr) {
        // Force SW copy
@@ -1857,7 +1680,55 @@ bool KernelBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
      // Add pinned memory for a later release
      gpu().addPinnedMem(amdMemory);
    } else {
-      result = DmaBlitManager::readBuffer(srcMemory, dstHost, origin, size, entire, copyMetadata);
+      // Do a staging copy
+      bool useShaderCopyPath = setup_.disableHwlCopyBuffer_                         ||
+                               (totalSize <= dev().settings().sdmaCopyThreshold_)   ||
+                               (copyMetadata.copyEnginePreference_ ==
+                                amd::CopyMetadata::CopyEnginePreference::BLIT);
+
+      if (!useShaderCopyPath) {
+        // HSA copy using a staging resource
+        result = DmaBlitManager::readBuffer(srcMemory, dstHost, origin, size,
+                                            entire, copyMetadata);
+      }
+      if (!result) {
+        // Blit copy using the static read staging buffer, one staged chunk at a time.
+        // Each chunk reads from srcBase + stagedCopyOffset; srcBase itself is never advanced.
+        const address srcBase = gpuMem(srcMemory).getDeviceMemory();
+        address dstAddr = reinterpret_cast<address>(dstHost);
+        amd::Coord3D dstOrigin(0, 0, 0);
+        size_t copySize = 0;
+        size_t stagedCopyOffset = 0;
+        size_t maxStagedXferSize = dev().settings().stagedXferSize_;
+        Memory& xferBuf = dev().xferRead().acquire();
+        address xferBufAddr = xferBuf.getDeviceMemory();
+        constexpr bool kAttachSignal = true;
+        while (totalSize > 0) {
+          copySize = std::min(totalSize, maxStagedXferSize);
+          address srcAddr = srcBase + stagedCopyOffset;
+          ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Blit staging D2H copy stg buf=%p, src=%p, "
+                  "dstOrigin=%zu, size=%zu", xferBufAddr, srcAddr, dstOrigin[0], copySize);
+          // Flush caches for coherency after the copy as we need to std::memcpy
+          // from staging buffer to unpinned dst. Also attach a signal to the dispatch packet
+          // itself that we can wait on without extra barrier packet.
+          gpu().addSystemScope();
+          result = shaderCopyBuffer(xferBufAddr, srcAddr, dstOrigin, origin, copySize,
+                                    entire, dev().settings().limit_blit_wg_, copyMetadata,
+                                    kAttachSignal);
+          if (!result) {
+            break;
+          }
+          // Wait on current signal of previous blit copy
+          gpu().Barriers().WaitCurrent();
+          ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "memcpy host dst=%p, stg buf=%p, size=%zu",
+                  (void*)(dstAddr + stagedCopyOffset), xferBufAddr, copySize);
+          memcpy(dstAddr + stagedCopyOffset, xferBufAddr, copySize);
+          totalSize -= copySize;
+          stagedCopyOffset += copySize;
+        }

+        dev().xferRead().release(gpu(), xferBuf);
+      }
    }
  }

@@ -1934,16 +1805,20 @@ bool KernelBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemo
    synchronize();
    return result;
  } else {
-    size_t pinSize = size[0];
-
+    size_t totalSize = size[0];
+    ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Unpinned write path");
+    // If size > min pinned size, do a pinning copy, since we are limited by staging buffer size
    // Check if a pinned transfer can be executed with a single pin
-    if ((pinSize <= dev().settings().pinnedXferSize_) && (pinSize > MinSizeForPinnedTransfer)) {
+    if ((totalSize <= dev().settings().pinnedXferSize_) &&
+        (totalSize > MinSizeForPinnedTransfer)) {
+      ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Pinned write copy for size=%ld", totalSize);
      size_t partial;
-      amd::Memory* amdMemory = pinHostMemory(srcHost, pinSize, partial);
+      amd::Memory* amdMemory = pinHostMemory(srcHost, totalSize, partial);

      if (amdMemory == nullptr) {
        // Force SW copy
-        result = DmaBlitManager::writeBuffer(srcHost, dstMemory, origin, size, entire, copyMetadata);
+        result = DmaBlitManager::writeBuffer(srcHost, dstMemory, origin,
+                                             size, entire, copyMetadata);
        synchronize();
        return result;
      }
@@ -1960,7 +1835,47 @@ bool KernelBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemo
      // Add pinned memory for a later release
      gpu().addPinnedMem(amdMemory);
    } else {
-      result = DmaBlitManager::writeBuffer(srcHost, dstMemory, origin, size, entire, copyMetadata);
+      // Do a staging copy
+      bool useShaderCopyPath = setup_.disableHwlCopyBuffer_                         ||
+                               (totalSize <= dev().settings().sdmaCopyThreshold_)   ||
+                               (copyMetadata.copyEnginePreference_ ==
+                                amd::CopyMetadata::CopyEnginePreference::BLIT);
+
+      if (!useShaderCopyPath) {
+        // HSA copy using a staging resource
+        result = DmaBlitManager::writeBuffer(srcHost, dstMemory, origin,
+                                             size, entire, copyMetadata);
+      }
+
+      if (!result) {
+        // Blit copy using managed staging buffers, one staged chunk at a time.
+        // Each chunk writes to dstBase + stagedCopyOffset; dstBase itself is never advanced.
+        const address dstBase = gpuMem(dstMemory).getDeviceMemory();
+        const_address srcAddr = reinterpret_cast<const_address>(srcHost);
+        amd::Coord3D srcOrigin(0, 0, 0);
+        size_t copySize = 0;
+        size_t stagedCopyOffset = 0;
+        size_t maxStagedXferSize = dev().settings().stagedXferSize_;
+        while (totalSize > 0) {
+          copySize = std::min(totalSize, maxStagedXferSize);
+          // Get an address from managed staging buffer
+          address stagingBuffer = gpu().Staging().Acquire(std::min(copySize, maxStagedXferSize));
+          address dstAddr = dstBase + stagedCopyOffset;
+          ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "memcpy stg buf=%p, host src=%p, size=%zu",
+                  stagingBuffer, (void*)(srcAddr + stagedCopyOffset), copySize);
+          memcpy(stagingBuffer, srcAddr + stagedCopyOffset, copySize);
+          ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Blit staging H2D copy dst=%p, stg buf=%p, "
+                  "dstOrigin=%zu, size=%zu", dstAddr, stagingBuffer, origin[0], copySize);
+          result = shaderCopyBuffer(dstAddr, stagingBuffer,
+                                    origin, srcOrigin, copySize,
+                                    entire, dev().settings().limit_blit_wg_, copyMetadata);
+          if (!result) {
+            break;
+          }
+          totalSize -= copySize;
+          stagedCopyOffset += copySize;
+        }
+      }
    }
  }

@@ -2230,6 +2145,68 @@ bool KernelBlitManager::fillBuffer3D(device::Memory& memory, const void* pattern
  ShouldNotReachHere();
  return false;
 }
+
+// ================================================================================================
+// Dispatches the BlitCopyBuffer kernel to copy sizeIn[0] bytes between two raw addresses.
+// The origins are handed to setArgument() next to the base addresses (presumably as byte
+// offsets - confirm against setArgument). 'entire' and 'copyMetadata' are not used here;
+// 'blitWg' caps the number of work-groups and 'attachSignal' is forwarded to
+// submitKernelInternal(). Returns the result of submitKernelInternal().
+bool KernelBlitManager::shaderCopyBuffer(address dst, address src,
+                                         const amd::Coord3D& dstOrigin,
+                                         const amd::Coord3D& srcOrigin,
+                                         const amd::Coord3D& sizeIn, bool entire,
+                                         const uint32_t blitWg,
+                                         amd::CopyMetadata copyMetadata,
+                                         bool attachSignal) const {
+  constexpr uint32_t kBlitType = BlitCopyBuffer;
+  constexpr uint32_t kMaxAlignment = 2 * sizeof(uint64_t);  // 16 bytes
+  amd::Coord3D size(sizeIn[0]);
+  // Use 16-byte elements only when both origins are 16-byte multiples, else 4-byte elements
+  bool aligned = ((srcOrigin[0] % kMaxAlignment) == 0) && ((dstOrigin[0] % kMaxAlignment) == 0);
+  uint32_t aligned_size = (aligned) ? kMaxAlignment : sizeof(uint32_t);

+  // Convert the byte count into a count of aligned_size elements, rounding up on a remainder
+  uint32_t remainder = size[0] % aligned_size;
+  size.c[0] /= aligned_size;
+  size.c[0] += (remainder != 0) ? 1 : 0;

+  // Program the dispatch dimensions: at most blitWg work-groups, in whole work-groups
+  const size_t localWorkSize = (aligned) ? 512 : 1024;
+  size_t globalWorkSize = std::min(blitWg * localWorkSize, size[0]);
+  globalWorkSize = amd::alignUp(globalWorkSize, localWorkSize);

+  // Program kernels arguments for the blit operation
+  // Program source origin
+  setArgument(kernels_[kBlitType], 0, sizeof(src), reinterpret_cast<void*>(src),
+              srcOrigin[0], nullptr, true);

+  // Program destination origin
+  setArgument(kernels_[kBlitType], 1, sizeof(dst), reinterpret_cast<void*>(dst),
+              dstOrigin[0], nullptr, true);

+  uint64_t copySize = sizeIn[0];  // original byte count, before the element conversion
+  setArgument(kernels_[kBlitType], 2, sizeof(copySize), &copySize);
+  setArgument(kernels_[kBlitType], 3, sizeof(remainder), &remainder);
+  setArgument(kernels_[kBlitType], 4, sizeof(aligned_size), &aligned_size);

+  // End pointer is the aligned copy size and destination offset
+  uint64_t end_ptr = reinterpret_cast<uint64_t>(dst) + dstOrigin[0] + sizeIn[0] - remainder;
+  setArgument(kernels_[kBlitType], 5, sizeof(end_ptr), &end_ptr);
+  uint32_t next_chunk = globalWorkSize;  // presumably the per-pass stride - confirm in kernel
+  setArgument(kernels_[kBlitType], 6, sizeof(next_chunk), &next_chunk);

+  // Create ND range object for the kernel's execution
+  amd::NDRangeContainer ndrange(1, nullptr, &globalWorkSize, &localWorkSize);

+  // Execute the blit
+  address parameters = captureArguments(kernels_[kBlitType]);
+  bool result = gpu().submitKernelInternal(ndrange, *kernels_[kBlitType], parameters, nullptr,
+                                           0, nullptr, nullptr, attachSignal);
+  releaseArguments(parameters);
+  return result;
+}
+
 // ================================================================================================
 bool KernelBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& dstMemory,
                                   const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin,
@@ -2238,32 +2215,28 @@ bool KernelBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& ds
  amd::ScopedLock k(lockXferOps_);
  bool result = false;
  bool p2p = false;
-  uint32_t blit_wg_ = dev().settings().limit_blit_wg_;
+  uint32_t blitWg = dev().settings().limit_blit_wg_;

  if (&gpuMem(srcMemory).dev() != &gpuMem(dstMemory).dev()) {
    if (sizeIn[0] > dev().settings().sdma_p2p_threshold_) {
      p2p = true;
    } else {
      constexpr uint32_t kLimitWgForKernelP2p = 16;
-      blit_wg_ = kLimitWgForKernelP2p;
+      blitWg = kLimitWgForKernelP2p;
    }
  }

-  bool asan = false;
  bool ipcShared = srcMemory.owner()->ipcShared() || dstMemory.owner()->ipcShared();
-#if defined(__clang__)
-#if __has_feature(address_sanitizer)
-  asan = true;
-#endif
-#endif

-  bool useShaderCopyPath = setup_.disableHwlCopyBuffer_ ||
-      (sizeIn[0] <= dev().settings().sdmaCopyThreshold_) ||
-      (!(p2p || asan || ipcShared) &&
-           (!srcMemory.isHostMemDirectAccess() && !dstMemory.isHostMemDirectAccess() &&
-            !(copyMetadata.copyEnginePreference_ ==
-              amd::CopyMetadata::CopyEnginePreference::SDMA)) ||
-       (copyMetadata.copyEnginePreference_ == amd::CopyMetadata::CopyEnginePreference::BLIT));
+  bool useShaderCopyPath = setup_.disableHwlCopyBuffer_                       ||
+                          (sizeIn[0] <= dev().settings().sdmaCopyThreshold_)  ||
+                          (!(p2p || ipcShared) &&
+                          (!srcMemory.isHostMemDirectAccess()
+                            && !dstMemory.isHostMemDirectAccess() &&
+                            !(copyMetadata.copyEnginePreference_ ==
+                              amd::CopyMetadata::CopyEnginePreference::SDMA)) ||
+                          (copyMetadata.copyEnginePreference_ ==
+                           amd::CopyMetadata::CopyEnginePreference::BLIT));

  if (!useShaderCopyPath) {
    if (amd::IS_HIP) {
@@ -2275,60 +2248,15 @@ bool KernelBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& ds
        gpu().SetCopyCommandType(CL_COMMAND_READ_BUFFER);
      }
    }
-    result = DmaBlitManager::copyBuffer(srcMemory, dstMemory, srcOrigin, dstOrigin, sizeIn, entire,
-                                        copyMetadata);
+    result = DmaBlitManager::copyBuffer(srcMemory, dstMemory, srcOrigin, dstOrigin, sizeIn,
+                                        entire, copyMetadata);
  }

  if (!result) {
-    constexpr uint32_t kBlitType = BlitCopyBuffer;
-    constexpr uint32_t kMaxAlignment = 2 * sizeof(uint64_t);
-    amd::Coord3D size(sizeIn[0]);
-
-    // Check alignments for source and destination
-    bool aligned = ((srcOrigin[0] % kMaxAlignment) == 0) && ((dstOrigin[0] % kMaxAlignment) == 0);
-    uint32_t aligned_size = (aligned) ? kMaxAlignment : sizeof(uint32_t);
-
-    // Setup copy size accordingly to the alignment
-    uint32_t remainder = size[0] % aligned_size;
-    size.c[0] /= aligned_size;
-    size.c[0] += (remainder != 0) ? 1 : 0;
-
-    // Program the dispatch dimensions
-    const size_t localWorkSize = (aligned) ? 512 : 1024;
-    size_t globalWorkSize = std::min(blit_wg_ * localWorkSize, size[0]);
-    globalWorkSize = amd::alignUp(globalWorkSize, localWorkSize);
-
-    // Program kernels arguments for the blit operation
-    cl_mem mem = as_cl<amd::Memory>(srcMemory.owner());
-    // Program source origin
-    uint64_t srcOffset = srcOrigin[0];
-    setArgument(kernels_[kBlitType], 0, sizeof(cl_mem), &mem, srcOffset, &srcMemory);
-    mem = as_cl<amd::Memory>(dstMemory.owner());
-    // Program destinaiton origin
-    uint64_t dstOffset = dstOrigin[0];
-    setArgument(kernels_[kBlitType], 1, sizeof(cl_mem), &mem, dstOffset, &dstMemory);
-
-    uint64_t copySize = sizeIn[0];
-    setArgument(kernels_[kBlitType], 2, sizeof(copySize), &copySize);
-
-    setArgument(kernels_[kBlitType], 3, sizeof(remainder), &remainder);
-    setArgument(kernels_[kBlitType], 4, sizeof(aligned_size), &aligned_size);
-
-    // End pointer is the aligned copy size and destination offset
-    uint64_t end_ptr = dstMemory.virtualAddress() + dstOffset + sizeIn[0] - remainder;
-
-    setArgument(kernels_[kBlitType], 5, sizeof(end_ptr), &end_ptr);
-
-    uint32_t next_chunk = globalWorkSize;
-    setArgument(kernels_[kBlitType], 6, sizeof(next_chunk), &next_chunk);
-
-    // Create ND range object for the kernel's execution
-    amd::NDRangeContainer ndrange(1, nullptr, &globalWorkSize, &localWorkSize);
-
-    // Execute the blit
-    address parameters = captureArguments(kernels_[kBlitType]);
-    result = gpu().submitKernelInternal(ndrange, *kernels_[kBlitType], parameters, nullptr);
-    releaseArguments(parameters);
+    result = shaderCopyBuffer(reinterpret_cast<address>(dstMemory.virtualAddress()),
+                              reinterpret_cast<address>(srcMemory.virtualAddress()),
+                              dstOrigin, srcOrigin, sizeIn,
+                              entire, blitWg, copyMetadata);
  }

  synchronize();
@@ -231,7 +231,11 @@ class DmaBlitManager : public device::HostBlitManager {
  //! taking into account the Hsail profile supported by Hsa Agent
  bool hsaCopy(const Memory& srcMemory, const Memory& dstMemory, const amd::Coord3D& srcOrigin,
               const amd::Coord3D& dstOrigin, const amd::Coord3D& size,
-               amd::CopyMetadata copyMetadata) const;
+               amd::CopyMetadata& copyMetadata) const;
+
+  inline bool rocrCopyBuffer(address dst, hsa_agent_t& dstAgent,
+                             const_address src, hsa_agent_t& srcAgent, size_t size,
+                             amd::CopyMetadata& copyMetadata) const;

  const size_t MinSizeForPinnedTransfer;
  bool completeOperation_;                    //!< DMA blit manager must complete operation
@@ -248,33 +252,13 @@ class DmaBlitManager : public device::HostBlitManager {
  //! Disable operator=
  DmaBlitManager& operator=(const DmaBlitManager&);

-  //! Reads video memory, using a staged buffer
-  bool readMemoryStaged(Memory& srcMemory,  //!< Source memory object
-                        void* dstHost,      //!< Destination host memory
-                        Memory& xferBuf,    //!< Staged buffer for read
-                        size_t origin,      //!< Original offset in the source memory
-                        size_t& offset,     //!< Offset for the current copy pointer
-                        size_t& totalSize,  //!< Total size for copy region
-                        size_t xferSize     //!< Transfer size
-                        ) const;
-
-  //! Write into video memory, using a staged buffer
-  bool writeMemoryStaged(const void* srcHost,  //!< Source host memory
-                         Memory& dstMemory,    //!< Destination memory object
-                         address staging,      //!< Staged buffer for write
-                         size_t origin,        //!< Original offset in the destination memory
-                         size_t& offset,       //!< Offset for the current copy pointer
-                         size_t& totalSize,    //!< Total size for the copy region
-                         size_t xferSize       //!< Transfer size
-                         ) const;
-
  //! Assists in transferring data from Host to Local or vice versa
  //! taking into account the Hsail profile supported by Hsa Agent
-  bool hsaCopyStaged(const_address hostSrc,  //!< Contains source data to be copied
-                     address hostDst,        //!< Destination buffer address for copying
-                     size_t size,            //!< Size of data to copy in bytes
-                     address staging,        //!< Staging resource
-                     bool hostToDev          //!< True if data is copied from Host To Device
+  bool hsaCopyStaged(const_address hostSrc,           //!< Contains source data to be copied
+                     address hostDst,                 //!< Destination buffer address for copying
+                     size_t size,                     //!< Size of data to copy in bytes
+                     bool hostToDev,                  //!< True if data is copied from H2D
+                     amd::CopyMetadata& copyMetadata  //!< Memory copy MetaData
                     ) const;

  bool forceHostWaitFunc(size_t copy_size) const;
@@ -583,6 +567,12 @@ class KernelBlitManager : public DmaBlitManager {
    return (dev().info().imageSupport_) ? BlitTotal : BlitLinearTotal;
  }

+  //! Copies a buffer using the shader path
+  bool shaderCopyBuffer(address dst, address src,
+                        const amd::Coord3D& dstOrigin, const amd::Coord3D& srcOrigin,
+                        const amd::Coord3D& size, bool entire, const uint32_t blitWg,
+                        amd::CopyMetadata copyMetadata, bool attachSignal = false) const;
+
  //! Disable copy constructor
  KernelBlitManager(const KernelBlitManager&);

@@ -1202,7 +1202,8 @@ bool Device::populateOCLDeviceConstants() {
                                               &info_.globalMemCacheLineSize_)) {
    return false;
  }
-  assert(info_.globalMemCacheLineSize_ > 0);
+  info_.globalMemCacheLineSize_ = (info_.globalMemCacheLineSize_ != 0) ?
+                                         info_.globalMemCacheLineSize_ : 64;

  uint32_t cachesize[4] = {0};
  if (HSA_STATUS_SUCCESS !=
@@ -57,8 +57,7 @@ bool LightningKernel::postLoad() {
  }
  kernargSegmentAlignment_ =
      amd::alignUp(std::max(kernargSegmentAlignment_, 128u),
-                   device().info().globalMemCacheLineSize_ > 0 ?
-                       device().info().globalMemCacheLineSize_ : 64);
+                   device().info().globalMemCacheLineSize_);

  // Set the workgroup information for the kernel
  workGroupInfo_.availableLDSSize_ = device().info().localMemSizePerCU_;
@@ -497,7 +497,8 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal(
 }

 // ================================================================================================
-std::vector<hsa_signal_t>& VirtualGPU::HwQueueTracker::WaitingSignal(HwQueueEngine engine) {
+std::vector<hsa_signal_t>& VirtualGPU::HwQueueTracker::WaitingSignal(HwQueueEngine engine,
+                                                                     bool forceHostWait) {
  bool explicit_wait = false;
  // Reset all current waiting signals
  waiting_signals_.clear();
@@ -545,8 +546,8 @@ std::vector<hsa_signal_t>& VirtualGPU::HwQueueTracker::WaitingSignal(HwQueueEngi
      const Settings& settings = gpu_.dev().settings();
      // Actively wait on CPU to avoid extra overheads of signal tracking on GPU.
      // For small copies set forced wait
-      if (!WaitForSignal<true>(external_signals_[i]->signal_, false,
-                               external_signals_[i]->flags_.forceHostWait_)) {
+      if (!WaitForSignal<true>(external_signals_[i]->signal_, false, forceHostWait ?
+                               external_signals_[i]->flags_.forceHostWait_ : false)) {
        if (settings.cpu_wait_for_signal_) {
          // Wait on CPU for completion if requested
          CpuWaitForSignal(external_signals_[i]);
@@ -713,6 +714,9 @@ bool VirtualGPU::processMemObjects(const amd::Kernel& kernel, const_address para
      else {
        uint32_t index = desc.info_.arrayIndex_;
        mem = memories[index];
+        const void* globalAddress = *reinterpret_cast<const void* const*>(params + desc.offset_);
+        ClPrint(amd::LOG_INFO, amd::LOG_KERN,
+            "Arg%d: %s %s = ptr:%p", i, desc.typeName_.c_str(), desc.name_.c_str(), globalAddress);
        if (mem == nullptr) {
          //! This condition is for SVM fine-grain
          if (dev().isFineGrainedSystem(true)) {
@@ -839,7 +843,7 @@ static inline void packet_store_release(uint32_t* packet, uint16_t header, uint1
 // ================================================================================================
 template <typename AqlPacket>
 bool VirtualGPU::dispatchGenericAqlPacket(
-  AqlPacket* packet, uint16_t header, uint16_t rest, bool blocking) {
+  AqlPacket* packet, uint16_t header, uint16_t rest, bool blocking, bool attach_signal) {
  const uint32_t queueSize = gpu_queue_->size;
  const uint32_t queueMask = queueSize - 1;
  const uint32_t sw_queue_size = queueMask;
@@ -847,6 +851,7 @@ bool VirtualGPU::dispatchGenericAqlPacket(
  // Check for queue full and wait if needed.
  uint64_t index = hsa_queue_add_write_index_screlease(gpu_queue_, 1);
  uint64_t read = hsa_queue_load_read_index_relaxed(gpu_queue_);
+
  if (addSystemScope_) {
    header &= ~(HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE |
                HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
@@ -858,15 +863,15 @@ bool VirtualGPU::dispatchGenericAqlPacket(
  auto expected_fence_state = extractAqlBits(header, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
                         HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE);

-  if (fence_state_ == amd::Device::kCacheStateSystem &&
-      expected_fence_state == amd::Device::kCacheStateSystem) {
+  if (fence_state_ == amd::Device::kCacheStateSystem
+      && expected_fence_state == amd::Device::kCacheStateSystem) {
    header = dispatchPacketHeader_;
    fence_dirty_ = true;
  }

  fence_state_ = static_cast<Device::CacheState>(expected_fence_state);

-  if (timestamp_ != nullptr) {
+  if (timestamp_ != nullptr || attach_signal) {
    // Get active signal for current dispatch if profiling or an attached signal is required
    packet->completion_signal = Barriers().ActiveSignal(kInitSignalValueOne, timestamp_);

@@ -967,7 +972,7 @@ void VirtualGPU::dispatchBlockingWait() {
 // ================================================================================================
 bool VirtualGPU::dispatchAqlPacket(hsa_kernel_dispatch_packet_t* packet, uint16_t header,
                                   uint16_t rest, bool blocking, bool capturing,
-                                   const uint8_t* aqlPacket) {
+                                   const uint8_t* aqlPacket, bool attach_signal) {
  if (capturing == true) {
    packet->header = header;
    packet->setup = rest;
@@ -975,13 +980,13 @@ bool VirtualGPU::dispatchAqlPacket(hsa_kernel_dispatch_packet_t* packet, uint16_
    return true;
  } else {
    dispatchBlockingWait();
-    return dispatchGenericAqlPacket(packet, header, rest, blocking);
+    return dispatchGenericAqlPacket(packet, header, rest, blocking, attach_signal);
  }
 }
 // ================================================================================================
-bool VirtualGPU::dispatchAqlPacket(
-  hsa_barrier_and_packet_t* packet, uint16_t header, uint16_t rest, bool blocking) {
-  return dispatchGenericAqlPacket(packet, header, rest, blocking);
+bool VirtualGPU::dispatchAqlPacket(hsa_barrier_and_packet_t* packet, uint16_t header, uint16_t rest,
+                                   bool blocking, bool attach_signal) {
+  return dispatchGenericAqlPacket(packet, header, rest, blocking, attach_signal);
 }

 // ================================================================================================
@@ -1074,10 +1079,9 @@ void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal,
    barrier_packet_.completion_signal = signal;
  }

-  // Reset fence_dirty_ and addSystemScope_ flag if we submit a barrier with system scopes
+  // Reset fence_dirty_ flag if we submit a barrier with system scopes
  if (cache_state == amd::Device::kCacheStateSystem) {
    fence_dirty_ = false;
-    addSystemScope_ = false;
  }

  while ((index - hsa_queue_load_read_index_scacquire(gpu_queue_)) >= queueMask);
@@ -1424,7 +1428,7 @@ bool VirtualGPU::ManagedBuffer::Create() {

 // ================================================================================================
 address VirtualGPU::ManagedBuffer::Acquire(uint32_t size) {
-  auto alignment = gpu_.dev().info().globalMemCacheLineSize_;
+  auto alignment = amd::alignUp(256u, gpu_.dev().info().globalMemCacheLineSize_);
  address result = nullptr;
  result = amd::alignUp(pool_base_ + pool_cur_offset_, alignment);
  const size_t pool_new_usage = (result + size) - pool_base_;
@@ -1713,7 +1717,8 @@ void VirtualGPU::submitReadMemory(amd::ReadMemoryCommand& cmd) {
  bool imageBuffer = false;

  // Force buffer read for IMAGE1D_BUFFER
-  if ((type == CL_COMMAND_READ_IMAGE) && (cmd.source().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) {
+  if ((type == CL_COMMAND_READ_IMAGE) &&
+      (cmd.source().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) {
    type = CL_COMMAND_READ_BUFFER;
    imageBuffer = true;
  }
@@ -1732,7 +1737,8 @@ void VirtualGPU::submitReadMemory(amd::ReadMemoryCommand& cmd) {
        result = blitMgr().copyBuffer(*devMem, *hostMemory, origin, dstOrigin, size,
                                      cmd.isEntireMemory(), cmd.copyMetadata());
      } else {
-        result = blitMgr().readBuffer(*devMem, dst, origin, size, cmd.isEntireMemory(), cmd.copyMetadata());
+        result = blitMgr().readBuffer(*devMem, dst, origin, size,
+                                      cmd.isEntireMemory(), cmd.copyMetadata());
      }
      break;
    }
@@ -1752,7 +1758,8 @@ void VirtualGPU::submitReadMemory(amd::ReadMemoryCommand& cmd) {
      break;
    }
    case CL_COMMAND_READ_IMAGE: {
-      if ((cmd.source().parent() != nullptr) && (cmd.source().parent()->getType() == CL_MEM_OBJECT_BUFFER)) {
+      if ((cmd.source().parent() != nullptr) &&
+          (cmd.source().parent()->getType() == CL_MEM_OBJECT_BUFFER)) {
        Image* imageBuffer = static_cast<Image*>(devMem);
        // Check if synchronization has to be performed
        if (nullptr != imageBuffer->CopyImageBuffer()) {
@@ -1772,7 +1779,8 @@ void VirtualGPU::submitReadMemory(amd::ReadMemoryCommand& cmd) {
        amd::Coord3D dstOrigin(offset);
        result =
            blitMgr().copyImageToBuffer(*devMem, *hostMemory, cmd.origin(), dstOrigin, size,
-                                        cmd.isEntireMemory(), cmd.rowPitch(), cmd.slicePitch(), cmd.copyMetadata());
+                                        cmd.isEntireMemory(), cmd.rowPitch(),
+                                        cmd.slicePitch(), cmd.copyMetadata());
      } else {
        result = blitMgr().readImage(*devMem, dst, cmd.origin(), size, cmd.rowPitch(),
                                     cmd.slicePitch(), cmd.isEntireMemory(), cmd.copyMetadata());
@@ -1839,7 +1847,8 @@ void VirtualGPU::submitWriteMemory(amd::WriteMemoryCommand& cmd) {
        result = blitMgr().copyBuffer(*hostMemory, *devMem, srcOrigin, origin, size,
                                      cmd.isEntireMemory(), cmd.copyMetadata());
      } else {
-        result = blitMgr().writeBuffer(src, *devMem, origin, size, cmd.isEntireMemory(), cmd.copyMetadata());
+        result = blitMgr().writeBuffer(src, *devMem, origin, size,
+                                       cmd.isEntireMemory(), cmd.copyMetadata());
      }
      break;
    }
@@ -3131,9 +3140,9 @@ void VirtualGPU::HiddenHeapInit() { const_cast<Device&>(dev()).HiddenHeapInit(*t

 // ================================================================================================
 bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
-    const amd::Kernel& kernel, const_address parameters, void* eventHandle,
+    const amd::Kernel& kernel, const_address parameters, void* event_handle,
    uint32_t sharedMemBytes, amd::NDRangeKernelCommand* vcmd,
-    hsa_kernel_dispatch_packet_t* aql_packet) {
+    hsa_kernel_dispatch_packet_t* aql_packet, bool attach_signal) {
  device::Kernel* devKernel = const_cast<device::Kernel*>(kernel.getDeviceKernel(dev()));
  Kernel& gpuKernel = static_cast<Kernel&>(*devKernel);
  size_t ldsUsage = gpuKernel.WorkgroupGroupSegmentByteSize();
@@ -3482,7 +3491,6 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
      addSystemScope_ = true;
    }

-
    // Copy scheduler's AQL packet for possible relaunch from the scheduler itself
    if (aql_packet != nullptr) {
      *aql_packet = dispatchPacket;
@@ -3504,7 +3512,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
    } else {
      if (!dispatchAqlPacket(&dispatchPacket, aqlHeaderWithOrder,
                             (sizes.dimensions() << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS),
-                             GPU_FLUSH_ON_EXECUTION)) {
+                             GPU_FLUSH_ON_EXECUTION, false, nullptr, attach_signal)) {
        return false;
      }
    }
@@ -30,6 +30,7 @@
 #include "rocprintf.hpp"
 #include "hsa/hsa_ven_amd_aqlprofile.h"
 #include "rocsched.hpp"
+#include "device/device.hpp"

 namespace amd::roc {
 class Device;
@@ -270,7 +271,8 @@ class VirtualGPU : public device::VirtualDevice {
    HwQueueEngine GetActiveEngine() const { return engine_; }

    //! Returns the last submitted signal for a wait
-    std::vector<hsa_signal_t>& WaitingSignal(HwQueueEngine engine = HwQueueEngine::Compute);
+    std::vector<hsa_signal_t>& WaitingSignal(HwQueueEngine engine = HwQueueEngine::Compute,
+                                             bool forceHostWait = true);

    //! Resets current signal back to the previous one. It's necessary in a case of ROCr failure.
    void ResetCurrentSignal();
@@ -341,8 +343,8 @@ class VirtualGPU : public device::VirtualDevice {
                            void* event_handle,  //!< Handle to OCL event for debugging
                            uint32_t sharedMemBytes = 0, //!< Shared memory size
                            amd::NDRangeKernelCommand* vcmd = nullptr, //!< Original launch command
-                            hsa_kernel_dispatch_packet_t* aql_packet = nullptr  //!< Scheduler launch
-                            );
+                            hsa_kernel_dispatch_packet_t* aql_packet = nullptr,  //!< Scheduler launch
+                            bool attach_signal = false);
  void submitNativeFn(amd::NativeFnCommand& cmd);
  void submitMarker(amd::Marker& cmd);
  void submitAccumulate(amd::AccumulateCommand& cmd);
@@ -420,7 +422,10 @@ class VirtualGPU : public device::VirtualDevice {

  void hasPendingDispatch() { hasPendingDispatch_ = true; }
  bool IsPendingDispatch() const { return (hasPendingDispatch_) ? true : false; }
-  void addSystemScope() { addSystemScope_ = true; }
+  void addSystemScope() {
+    addSystemScope_ = true;
+    fence_state_ = amd::Device::CacheState::kCacheStateInvalid;
+  }
  void SetCopyCommandType(cl_command_type type) { copy_command_type_ = type; }

  HwQueueTracker& Barriers() { return barriers_; }
@@ -444,11 +449,12 @@ class VirtualGPU : public device::VirtualDevice {
                                amd::AccumulateCommand* vcmd = nullptr);
  bool dispatchAqlPacket(hsa_kernel_dispatch_packet_t* packet, uint16_t header, uint16_t rest,
                         bool blocking = true, bool capturing = false,
-                         const uint8_t* aqlPacket = nullptr);
+                         const uint8_t* aqlPacket = nullptr, bool attach_signal = false);
  bool dispatchAqlPacket(hsa_barrier_and_packet_t* packet, uint16_t header,
-                        uint16_t rest, bool blocking = true);
+                        uint16_t rest, bool blocking = true, bool attach_signal = false);
  template <typename AqlPacket> bool dispatchGenericAqlPacket(AqlPacket* packet, uint16_t header,
-                                                              uint16_t rest, bool blocking);
+                                                              uint16_t rest, bool blocking,
+                                                              bool attach_signal = false);

  bool dispatchCounterAqlPacket(hsa_ext_amd_aql_pm4_packet_t* packet, const uint32_t gfxVersion,
                                bool blocking, const hsa_ven_amd_aqlprofile_1_00_pfn_t* extApi);
@@ -85,7 +85,7 @@ release(size_t, GPU_PINNED_MIN_XFER_SIZE, 128,                                \
 release(size_t, GPU_RESOURCE_CACHE_SIZE, 64,                                  \
        "The resource cache size in MB")                                      \
 release(size_t, GPU_MAX_SUBALLOC_SIZE, 4096,                                  \
-        "The maximum size accepted for suballocaitons in KB")                 \
+        "The maximum size accepted for suballocations in KB")                 \
 release(size_t, GPU_NUM_MEM_DEPENDENCY, 256,                                  \
        "Number of memory objects for dependency tracking")                   \
 release(size_t, GPU_XFER_BUFFER_SIZE, 0,                                      \
@@ -105,7 +105,7 @@ release(bool, GPU_USE_DEVICE_QUEUE, false,                                    \
 release(bool, AMD_THREAD_TRACE_ENABLE, true,                                  \
        "Enable thread trace extension")                                      \
 release(uint, OPENCL_VERSION, 200,                                            \
-        "Force GPU opencl verison")                                           \
+        "Force GPU opencl version")                                           \
 release(bool, HSA_LOCAL_MEMORY_ENABLE, true,                                  \
        "Enable HSA device local memory usage")                               \
 release(uint, HSA_KERNARG_POOL_SIZE, 1024 * 1024,                             \
@@ -186,7 +186,7 @@ release(bool, AMD_DIRECT_DISPATCH, false,                                     \
 release(uint, HIP_HIDDEN_FREE_MEM, 0,                                         \
        "Reserve free mem reporting in Mb"                                    \
        "0 = Disable")                                                        \
-release(size_t, GPU_FORCE_BLIT_COPY_SIZE, 0,                                  \
+release(size_t, GPU_FORCE_BLIT_COPY_SIZE, 16,                                 \
        "Use Blit until this size(in KB) for copies")                         \
 release(uint, ROC_ACTIVE_WAIT_TIMEOUT, 0,                                     \
        "Forces active wait of GPU interrup for the timeout(us)")             \