diff --git a/projects/clr/rocclr/device/rocm/rocblit.cpp b/projects/clr/rocclr/device/rocm/rocblit.cpp
index 737529aa0e..e96cabbcea 100644
--- a/projects/clr/rocclr/device/rocm/rocblit.cpp
+++ b/projects/clr/rocclr/device/rocm/rocblit.cpp
@@ -48,21 +48,6 @@ inline Memory& DmaBlitManager::gpuMem(device::Memory& mem) const {
   return static_cast<Memory&>(mem);
 }
 
-// ================================================================================================
-bool DmaBlitManager::readMemoryStaged(Memory& srcMemory, void* dstHost, Memory& xferBuf,
-                                      size_t origin, size_t& offset, size_t& totalSize,
-                                      size_t xferSize) const {
-  const_address src = srcMemory.getDeviceMemory();
-  address staging = xferBuf.getDeviceMemory();
-
-  // Copy data from device to host
-  src += origin + offset;
-  address dst = reinterpret_cast<address>(dstHost) + offset;
-  bool ret = hsaCopyStaged(src, dst, totalSize, staging, false);
-
-  return ret;
-}
-
 // ================================================================================================
 bool DmaBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
                                 const amd::Coord3D& origin, const amd::Coord3D& size,
@@ -77,77 +62,15 @@ bool DmaBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
     gpu().Barriers().WaitCurrent();
     return HostBlitManager::readBuffer(srcMemory, dstHost, origin, size, entire, copyMetadata);
   } else {
-    size_t srcSize = size[0];
-    size_t offset = 0;
-    size_t pinSize = dev().settings().pinnedXferSize_;
-    pinSize = std::min(pinSize, srcSize);
-
-    // Check if a pinned transfer can be executed
-    if (pinSize && (srcSize > MinSizeForPinnedTransfer)) {
-      // Align offset to 4K boundary
-      char* tmpHost = const_cast<char*>(
-          amd::alignDown(reinterpret_cast<const char*>(dstHost), PinnedMemoryAlignment));
-
-      // Find the partial size for unaligned copy
-      size_t partial = reinterpret_cast<const char*>(dstHost) - tmpHost;
-
-      amd::Memory* pinned = nullptr;
-      bool first = true;
-      size_t tmpSize;
-      size_t pinAllocSize;
-
-      // Copy memory, using pinning
-      while (srcSize > 0) {
-        // If it's the first iterarion, then readjust the copy size
-        // to include alignment
-        if (first) {
-          pinAllocSize = amd::alignUp(pinSize + partial, PinnedMemoryAlignment);
-          tmpSize = std::min(pinAllocSize - partial, srcSize);
-          first = false;
-        } else {
-          tmpSize = std::min(pinSize, srcSize);
-          pinAllocSize = amd::alignUp(tmpSize, PinnedMemoryAlignment);
-          partial = 0;
-        }
-        amd::Coord3D dst(partial, 0, 0);
-        amd::Coord3D srcPin(origin[0] + offset, 0, 0);
-        amd::Coord3D copySizePin(tmpSize, 0, 0);
-        size_t partial2;
-
-        // Allocate a GPU resource for pinning
-        pinned = pinHostMemory(tmpHost, pinAllocSize, partial2);
-        if (pinned != nullptr) {
-          // Get device memory for this virtual device
-          Memory* dstMemory = dev().getRocMemory(pinned);
-          const KernelBlitManager *kb = dynamic_cast<const KernelBlitManager*>(this);
-          if (!kb->copyBuffer(gpuMem(srcMemory), *dstMemory, srcPin, dst,
-                              copySizePin)) {
-            LogWarning("DmaBlitManager::readBuffer failed a pinned copy!");
-            gpu().addPinnedMem(pinned);
-            break;
-          }
-          gpu().addPinnedMem(pinned);
-        } else {
-          LogWarning("DmaBlitManager::readBuffer failed to pin a resource!");
-          break;
-        }
-        srcSize -= tmpSize;
-        offset += tmpSize;
-        tmpHost = reinterpret_cast<char*>(tmpHost) + tmpSize + partial;
-      }
-    }
-
-    if (0 != srcSize) {
-      Memory& xferBuf = dev().xferRead().acquire();
-
-      // Read memory using a staging resource
-      if (!readMemoryStaged(gpuMem(srcMemory), dstHost, xferBuf, origin[0], offset, srcSize,
-                            srcSize)) {
-        LogError("DmaBlitManager::readBuffer failed!");
+    size_t copySize = size[0];
+    if (0 != copySize) {
+      const_address addrSrc = gpuMem(srcMemory).getDeviceMemory() + origin[0];
+      address addrDst = reinterpret_cast<address>(dstHost);
+      constexpr bool kHostToDev = false;
+      if(!hsaCopyStaged(addrSrc, addrDst, copySize, kHostToDev, copyMetadata)) {
+        LogError("DmaBlitManager::readBuffer staged copy failed!");
         return false;
       }
-
-      dev().xferRead().release(gpu(), xferBuf);
     }
   }
 
@@ -170,8 +93,6 @@ bool DmaBlitManager::readBufferRect(device::Memory& srcMemory, void* dstHost,
     gpu().Barriers().WaitCurrent();
     return HostBlitManager::readBufferRect(srcMemory, dstHost, bufRect, hostRect, size, entire, copyMetadata);
   } else {
-    Memory& xferBuf = dev().xferRead().acquire();
-    address staging = xferBuf.getDeviceMemory();
     const_address src = gpuMem(srcMemory).getDeviceMemory();
 
     size_t srcOffset;
@@ -184,13 +105,12 @@ bool DmaBlitManager::readBufferRect(device::Memory& srcMemory, void* dstHost,
 
         // Copy data from device to host - line by line
         address dst = reinterpret_cast<address>(dstHost) + dstOffset;
-        bool retval = hsaCopyStaged(src + srcOffset, dst, size[0], staging, false);
+        bool retval = hsaCopyStaged(src + srcOffset, dst, size[0], false, copyMetadata);
         if (!retval) {
           return retval;
         }
       }
     }
-    dev().xferRead().release(gpu(), xferBuf);
   }
 
   return true;
@@ -216,20 +136,6 @@ bool DmaBlitManager::readImage(device::Memory& srcMemory, void* dstHost,
   return true;
 }
 
-// ================================================================================================
-bool DmaBlitManager::writeMemoryStaged(const void* srcHost, Memory& dstMemory, address staging,
-                                       size_t origin, size_t& offset, size_t& totalSize,
-                                       size_t xferSize) const {
-  address dst = dstMemory.getDeviceMemory();
-
-  // Copy data from host to device
-  dst += origin + offset;
-  const_address src = reinterpret_cast<const_address>(srcHost) + offset;
-  bool retval = hsaCopyStaged(src, dst, totalSize, staging, true);
-
-  return retval;
-}
-
 // ================================================================================================
 bool DmaBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemory,
                                  const amd::Coord3D& origin, const amd::Coord3D& size,
@@ -241,79 +147,17 @@ bool DmaBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemory,
     gpu().releaseGpuMemoryFence();
     return HostBlitManager::writeBuffer(srcHost, dstMemory, origin, size, entire, copyMetadata);
   } else {
-    // HSA copy functionality with a possible async operation
-    gpu().releaseGpuMemoryFence(kSkipCpuWait);
-
-    size_t dstSize = size[0];
-    size_t tmpSize = 0;
-    size_t offset = 0;
-    size_t pinSize = dev().settings().pinnedXferSize_;
-    pinSize = std::min(pinSize, dstSize);
-
-    // Check if a pinned transfer can be executed
-    if (pinSize && (dstSize > MinSizeForPinnedTransfer)) {
-      // Align offset to 4K boundary
-      char* tmpHost = const_cast<char*>(
-          amd::alignDown(reinterpret_cast<const char*>(srcHost), PinnedMemoryAlignment));
-
-      // Find the partial size for unaligned copy
-      size_t partial = reinterpret_cast<const char*>(srcHost) - tmpHost;
-
-      amd::Memory* pinned = nullptr;
-      bool first = true;
-      size_t tmpSize;
-      size_t pinAllocSize;
-
-      // Copy memory, using pinning
-      while (dstSize > 0) {
-        // If it's the first iterarion, then readjust the copy size
-        // to include alignment
-        if (first) {
-          pinAllocSize = amd::alignUp(pinSize + partial, PinnedMemoryAlignment);
-          tmpSize = std::min(pinAllocSize - partial, dstSize);
-          first = false;
-        } else {
-          tmpSize = std::min(pinSize, dstSize);
-          pinAllocSize = amd::alignUp(tmpSize, PinnedMemoryAlignment);
-          partial = 0;
-        }
-        amd::Coord3D src(partial, 0, 0);
-        amd::Coord3D dstPin(origin[0] + offset, 0, 0);
-        amd::Coord3D copySizePin(tmpSize, 0, 0);
-        size_t partial2;
-
-        // Allocate a GPU resource for pinning
-        pinned = pinHostMemory(tmpHost, pinAllocSize, partial2);
-
-        if (pinned != nullptr) {
-          // Get device memory for this virtual device
-          Memory* srcMemory = dev().getRocMemory(pinned);
-          const KernelBlitManager *kb = dynamic_cast<const KernelBlitManager*>(this);
-          if (!kb->copyBuffer(*srcMemory, gpuMem(dstMemory), src, dstPin,
-                              copySizePin)) {
-            LogWarning("DmaBlitManager::writeBuffer failed a pinned copy!");
-            gpu().addPinnedMem(pinned);
-            break;
-          }
-          gpu().addPinnedMem(pinned);
-        } else {
-          LogWarning("DmaBlitManager::writeBuffer failed to pin a resource!");
-          break;
-        }
-        dstSize -= tmpSize;
-        offset += tmpSize;
-        tmpHost = reinterpret_cast<char*>(tmpHost) + tmpSize + partial;
-      }
-    }
-
-    if (dstSize != 0) {
-      address staging = gpu().Staging().Acquire(
-        std::min(dstSize, dev().settings().stagedXferSize_));
+    size_t copySize = size[0];
 
+    // For small copies use managed staging buffers which can be non blocking
+    if (copySize != 0) {
+      address dstAddr = gpuMem(dstMemory).getDeviceMemory() + origin[0];
+      const_address srcAddr = reinterpret_cast<const_address>(srcHost);
       // Write memory using a staging resource
-      if (!writeMemoryStaged(srcHost, gpuMem(dstMemory), staging, origin[0], offset, dstSize,
-                             dstSize)) {
-        LogError("DmaBlitManager::writeBuffer failed!");
+      constexpr bool kHostToDev = true;
+      bool result = hsaCopyStaged(srcAddr, dstAddr, copySize, kHostToDev, copyMetadata);
+      if (!result) {
+        LogError("DmaBlitManager::writeBuffer staging copy failed!");
         return false;
       }
     }
@@ -336,8 +180,6 @@ bool DmaBlitManager::writeBufferRect(const void* srcHost, device::Memory& dstMem
     return HostBlitManager::writeBufferRect(srcHost, dstMemory, hostRect, bufRect, size, entire,
                                             copyMetadata);
   } else {
-    address staging = gpu().Staging().Acquire(
-      std::min(size[0], dev().settings().stagedXferSize_));
     address dst = static_cast<roc::Memory&>(dstMemory).getDeviceMemory();
 
     size_t srcOffset;
@@ -350,7 +192,8 @@ bool DmaBlitManager::writeBufferRect(const void* srcHost, device::Memory& dstMem
 
         // Copy data from host to device - line by line
         const_address src = reinterpret_cast<const_address>(srcHost) + srcOffset;
-        bool retval = hsaCopyStaged(src, dst + dstOffset, size[0], staging, true);
+        constexpr bool kHostToDev = true;
+        bool retval = hsaCopyStaged(src, dst + dstOffset, size[0], kHostToDev, copyMetadata);
         if (!retval) {
           return retval;
         }
@@ -632,43 +475,10 @@ bool DmaBlitManager::copyImage(device::Memory& srcMemory, device::Memory& dstMem
 }
 
 // ================================================================================================
-bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory,
-                             const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin,
-                             const amd::Coord3D& size, amd::CopyMetadata copyMetadata) const {
-  address src = reinterpret_cast<address>(srcMemory.getDeviceMemory());
-  address dst = reinterpret_cast<address>(dstMemory.getDeviceMemory());
-
-  gpu().releaseGpuMemoryFence(kSkipCpuWait);
-
-  src += srcOrigin[0];
-  dst += dstOrigin[0];
-
-  // Just call copy function for full profile
+inline bool DmaBlitManager::rocrCopyBuffer(address dst, hsa_agent_t& dstAgent,
+                                           const_address src, hsa_agent_t& srcAgent, size_t size,
+                                           amd::CopyMetadata& copyMetadata) const {
   hsa_status_t status = HSA_STATUS_SUCCESS;
-  if (dev().agent_profile() == HSA_PROFILE_FULL) {
-    // Stall GPU, sicne CPU copy is possible
-    gpu().Barriers().WaitCurrent();
-    status = hsa_memory_copy(dst, src, size[0]);
-    if (status != HSA_STATUS_SUCCESS) {
-      LogPrintfError("Hsa copy of data failed with code %d", status);
-    }
-    return (status == HSA_STATUS_SUCCESS);
-  }
-
-  hsa_agent_t srcAgent;
-  hsa_agent_t dstAgent;
-
-  if (&srcMemory.dev() == &dstMemory.dev()) {
-    // Detect the agents for memory allocations
-    srcAgent =
-      (srcMemory.isHostMemDirectAccess()) ? dev().getCpuAgent() : dev().getBackendDevice();
-    dstAgent =
-      (dstMemory.isHostMemDirectAccess()) ? dev().getCpuAgent() : dev().getBackendDevice();
-  }
-  else {
-    srcAgent = srcMemory.dev().getBackendDevice();
-    dstAgent = dstMemory.dev().getBackendDevice();
-  }
 
   uint32_t copyMask = 0;
   uint32_t freeEngineMask = 0;
@@ -707,9 +517,11 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory,
   }
 
   // Check if host wait has to be forced
-  bool forceHostWait = forceHostWaitFunc(size[0]);
+  bool forceHostWait = forceHostWaitFunc(size);
 
-  auto wait_events = gpu().Barriers().WaitingSignal(engine);
+  constexpr bool kIgnoreHostWait = false;
+  // Ignore waiting on any previous kernel dispatch and queue a signal to ROCr copy api instead
+  auto wait_events = gpu().Barriers().WaitingSignal(engine, kIgnoreHostWait);
   hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp(),
                                                       forceHostWait);
 
@@ -740,11 +552,11 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory,
       ClPrint(amd::LOG_DEBUG, amd::LOG_COPY,
               "HSA Async Copy on copy_engine=0x%x, dst=0x%zx, src=0x%zx, "
               "size=%ld, forceSDMA=%d, wait_event=0x%zx, completion_signal=0x%zx", copyEngine,
-              dst, src, size[0], forceSDMA, (wait_events.size() != 0) ? wait_events[0].handle : 0,
+              dst, src, size, forceSDMA, (wait_events.size() != 0) ? wait_events[0].handle : 0,
               active.handle);
 
       status = hsa_amd_memory_async_copy_on_engine(dst, dstAgent, src, srcAgent,
-                                                  size[0], wait_events.size(),
+                                                  size, wait_events.size(),
                                                   wait_events.data(), active, copyEngine,
                                                   forceSDMA);
     } else {
@@ -756,11 +568,11 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory,
     ClPrint(amd::LOG_DEBUG, amd::LOG_COPY,
             "HSA Async Copy dst=0x%zx, src=0x%zx, size=%ld, wait_event=0x%zx, "
             "completion_signal=0x%zx",
-            dst, src, size[0], (wait_events.size() != 0) ? wait_events[0].handle : 0,
+            dst, src, size, (wait_events.size() != 0) ? wait_events[0].handle : 0,
             active.handle);
 
     status = hsa_amd_memory_async_copy(dst, dstAgent, src, srcAgent,
-        size[0], wait_events.size(), wait_events.data(), active);
+        size, wait_events.size(), wait_events.data(), active);
   }
 
   if (status == HSA_STATUS_SUCCESS) {
@@ -773,93 +585,104 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory,
   return (status == HSA_STATUS_SUCCESS);
 }
 
+
+// ================================================================================================
+bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory,
+                             const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin,
+                             const amd::Coord3D& size, amd::CopyMetadata& copyMetadata) const {
+  address src = reinterpret_cast<address>(srcMemory.getDeviceMemory());
+  address dst = reinterpret_cast<address>(dstMemory.getDeviceMemory());
+
+  gpu().releaseGpuMemoryFence(kSkipCpuWait);
+
+  src += srcOrigin[0];  // linear copy: only the first coordinate of each origin is applied
+  dst += dstOrigin[0];
+
+  hsa_agent_t srcAgent;
+  hsa_agent_t dstAgent;
+
+  if (&srcMemory.dev() == &dstMemory.dev()) {
+    // Same device: host-accessible allocations use the CPU agent, others the backend agent
+    srcAgent =
+      (srcMemory.isHostMemDirectAccess()) ? dev().getCpuAgent() : dev().getBackendDevice();
+    dstAgent =
+      (dstMemory.isHostMemDirectAccess()) ? dev().getCpuAgent() : dev().getBackendDevice();
+  }
+  else {
+    srcAgent = srcMemory.dev().getBackendDevice();  // different devices: each side's own agent
+    dstAgent = dstMemory.dev().getBackendDevice();
+  }
+
+  return rocrCopyBuffer(dst, dstAgent, src, srcAgent, size[0], copyMetadata);
+}
+
 // ================================================================================================
 bool DmaBlitManager::hsaCopyStaged(const_address hostSrc, address hostDst, size_t size,
-                                   address staging, bool hostToDev) const {
+                                   bool hostToDev, amd::CopyMetadata& copyMetadata)  const {
   // Stall GPU, sicne CPU copy is possible
   gpu().releaseGpuMemoryFence(hostToDev);
 
-  // No allocation is necessary for Full Profile
-  hsa_status_t status;
-  if (dev().agent_profile() == HSA_PROFILE_FULL) {
-    status = hsa_memory_copy(hostDst, hostSrc, size);
-    if (status != HSA_STATUS_SUCCESS) {
-      LogPrintfError("Hsa copy of data failed with code %d", status);
-    }
-    return (status == HSA_STATUS_SUCCESS);
-  }
-
   size_t totalSize = size;
-  size_t offset = 0;
+  size_t stagedCopyOffset = 0;
+  bool status = true;
+  Memory* xferBuf = nullptr;
+  address stagingBuffer = 0;
+  size_t maxStagedXferSize = dev().settings().stagedXferSize_;
 
-  address hsaBuffer = staging;
+  if (!hostToDev) {
+    // Get static staging buffer as we need to wait until copy on GPU completes to copy
+    // it back to the unpinned buffer
+    xferBuf = &dev().xferRead().acquire();
+    stagingBuffer = xferBuf->getDeviceMemory();
+  }
 
   // Allocate requested size of memory
   while (totalSize > 0) {
-    size = std::min(totalSize, dev().settings().stagedXferSize_);
+    size = std::min(totalSize, maxStagedXferSize);
+
+    hsa_agent_t srcAgent;
+    hsa_agent_t dstAgent;
 
     // Copy data from Host to Device
     if (hostToDev) {
-      const hsa_agent_t srcAgent = dev().getCpuAgent();
+      srcAgent = dev().getCpuAgent();       // H2D: source side of the copy is the CPU agent
+      dstAgent = dev().getBackendDevice();  // H2D: destination side is the backend agent
 
-      HwQueueEngine engine = HwQueueEngine::Unknown;
-      if (srcAgent.handle == dev().getBackendDevice().handle) {
-        engine = HwQueueEngine::SdmaWrite;
+      // Get an address from managed staging buffer
+      stagingBuffer = gpu().Staging().Acquire(std::min(size, maxStagedXferSize));
+
+      address dst = hostDst + stagedCopyOffset;
+      memcpy(stagingBuffer, hostSrc + stagedCopyOffset, size);
+      ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "HSA Async Copy staged H2D");
+      status = rocrCopyBuffer(dst, dstAgent, stagingBuffer, srcAgent, size, copyMetadata);
+      if (!status) {
+        break;
       }
-      gpu().Barriers().SetActiveEngine(engine);
-      auto wait_events = gpu().Barriers().WaitingSignal(engine);
-      hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp());
-
-      memcpy(hsaBuffer, hostSrc + offset, size);
-      status = hsa_amd_memory_async_copy(
-          hostDst + offset, dev().getBackendDevice(), hsaBuffer, srcAgent, size,
-          wait_events.size(), wait_events.data(), active);
-      ClPrint(amd::LOG_DEBUG, amd::LOG_COPY,
-          "HSA Async Copy staged H2D dst=0x%zx, src=0x%zx, size=%ld, completion_signal=0x%zx",
-          hostDst + offset, hsaBuffer, size, active.handle);
-
-      if (status != HSA_STATUS_SUCCESS) {
-        gpu().Barriers().ResetCurrentSignal();
-        LogPrintfError("Hsa copy from host to device failed with code %d", status);
-        return false;
-      }
-      totalSize -= size;
-      if (totalSize > 0) {
-        // Wait if there are extra copies, which don't fit in a single staging buffer
-        gpu().Barriers().WaitCurrent();
-      }
-      offset += size;
-      continue;
-    }
-
-    const hsa_agent_t dstAgent = dev().getCpuAgent();
-
-    HwQueueEngine engine = HwQueueEngine::Unknown;
-    if (dstAgent.handle == dev().getBackendDevice().handle) {
-      engine = HwQueueEngine::SdmaRead;
-    }
-    gpu().Barriers().SetActiveEngine(engine);
-    auto wait_events = gpu().Barriers().WaitingSignal(engine);
-    hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp());
-
-    // Copy data from Device to Host
-    status = hsa_amd_memory_async_copy(
-        hsaBuffer, dstAgent, hostSrc + offset, dev().getBackendDevice(), size,
-        wait_events.size(), wait_events.data(), active);
-    ClPrint(amd::LOG_DEBUG, amd::LOG_COPY,
-            "HSA Async Copy staged D2H dst=0x%zx, src=0x%zx, size=%ld, completion_signal=0x%zx",
-            hsaBuffer, hostSrc + offset, size, active.handle);
-
-    if (status == HSA_STATUS_SUCCESS) {
-      gpu().Barriers().WaitCurrent();
-      memcpy(hostDst + offset, hsaBuffer, size);
     } else {
-      gpu().Barriers().ResetCurrentSignal();
-      LogPrintfError("Hsa copy from device to host failed with code %d", status);
-      return false;
+      dstAgent = dev().getCpuAgent();
+      srcAgent = dev().getBackendDevice();
+
+      const_address src = static_cast<const_address>(hostSrc) + stagedCopyOffset;
+      ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "HSA Async Copy staged D2H");
+      status = rocrCopyBuffer(stagingBuffer, dstAgent, src, srcAgent, size, copyMetadata);
+      if (status) {
+        gpu().Barriers().WaitCurrent();
+        memcpy(hostDst + stagedCopyOffset, stagingBuffer, size);
+      } else {
+        break;
+      }
     }
+
     totalSize -= size;
-    offset += size;
+    stagedCopyOffset += size;
+  }
+
+  if (!hostToDev) {
+    dev().xferRead().release(gpu(), *xferBuf);
+  }
+
+  if (!status) {
+    return false;
   }
 
   gpu().addSystemScope();
@@ -1829,13 +1652,13 @@ bool KernelBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
     synchronize();
     return result;
   } else {
-    size_t pinSize = size[0];
+    size_t totalSize = size[0];
 
     // Check if a pinned transfer can be executed with a single pin
-
-    if (((pinSize <= dev().settings().pinnedXferSize_) && (pinSize > MinSizeForPinnedTransfer))) {
+    if (((totalSize <= dev().settings().pinnedXferSize_) &&
+         (totalSize > MinSizeForPinnedTransfer))) {
       size_t partial;
-      amd::Memory* amdMemory = pinHostMemory(dstHost, pinSize, partial);
+      amd::Memory* amdMemory = pinHostMemory(dstHost, totalSize, partial);
 
       if (amdMemory == nullptr) {
         // Force SW copy
@@ -1857,7 +1680,55 @@ bool KernelBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
       // Add pinned memory for a later release
       gpu().addPinnedMem(amdMemory);
     } else {
-      result = DmaBlitManager::readBuffer(srcMemory, dstHost, origin, size, entire, copyMetadata);
+      // Do a staging copy
+      bool useShaderCopyPath = setup_.disableHwlCopyBuffer_                         ||
+                               (totalSize <= dev().settings().sdmaCopyThreshold_)   ||
+                               (copyMetadata.copyEnginePreference_ ==
+                                amd::CopyMetadata::CopyEnginePreference::BLIT);
+
+      if (!useShaderCopyPath) {
+        // HSA copy using a staging resource
+        result = DmaBlitManager::readBuffer(srcMemory, dstHost, origin, size,
+                                            entire, copyMetadata);
+      }
+      if (!result) {
+        // Shader (blit) copy through the read staging buffer, one staged chunk per iteration
+        const address srcBase = gpuMem(srcMemory).getDeviceMemory();
+        address dstAddr = reinterpret_cast<address>(dstHost);
+        amd::Coord3D dstOrigin(0, 0, 0);
+        size_t copySize = 0;
+        size_t stagedCopyOffset = 0;
+        size_t maxStagedXferSize = dev().settings().stagedXferSize_;
+        Memory& xferBuf = dev().xferRead().acquire();
+        address xferBufAddr = xferBuf.getDeviceMemory();
+
+        constexpr bool kAttachSignal = true;
+        while (totalSize > 0) {
+          copySize = std::min(totalSize, maxStagedXferSize);
+          address srcAddr = srcBase + stagedCopyOffset;  // offset is cumulative: add to base only
+          ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Blit staging D2H copy stg buf=%p, src=%p, "
+                  "dstOrigin=%zu, size=%zu", xferBufAddr, srcAddr, dstOrigin[0], copySize);
+          // Flush caches for coherency after the copy as we need to std::memcpy
+          // from staging buffer to unpinned dst. Also attach a signal to the dispatch packet
+          // itself that we can wait on without extra barrier packet.
+          gpu().addSystemScope();
+          result = shaderCopyBuffer(xferBufAddr, srcAddr, dstOrigin, origin, copySize,
+                                    entire, dev().settings().limit_blit_wg_, copyMetadata,
+                                    kAttachSignal);
+          if (!result) {
+            break;
+          }
+          // Wait on the signal of the blit just submitted before reading the staging buffer
+          gpu().Barriers().WaitCurrent();
+          ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "memcpy host dst=%p, stg buf=%p, size=%zu",
+                  (void*)(dstAddr + stagedCopyOffset), xferBufAddr, copySize);
+          memcpy(dstAddr + stagedCopyOffset, xferBufAddr, copySize);
+          totalSize -= copySize;
+          stagedCopyOffset += copySize;
+        }
+
+        dev().xferRead().release(gpu(), xferBuf);
+      }
     }
   }
 
@@ -1934,16 +1805,20 @@ bool KernelBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemo
     synchronize();
     return result;
   } else {
-    size_t pinSize = size[0];
-
+    size_t totalSize = size[0];
+    ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Unpinned write path");
+    // If size > min pinned size, do a pinning copy, since we are limited by staging buffer size
     // Check if a pinned transfer can be executed with a single pin
-    if ((pinSize <= dev().settings().pinnedXferSize_) && (pinSize > MinSizeForPinnedTransfer)) {
+    if ((totalSize <= dev().settings().pinnedXferSize_) &&
+        (totalSize > MinSizeForPinnedTransfer)) {
+      ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Pinned write copy for size=%zu", totalSize);
       size_t partial;
-      amd::Memory* amdMemory = pinHostMemory(srcHost, pinSize, partial);
+      amd::Memory* amdMemory = pinHostMemory(srcHost, totalSize, partial);
 
       if (amdMemory == nullptr) {
         // Force SW copy
-        result = DmaBlitManager::writeBuffer(srcHost, dstMemory, origin, size, entire, copyMetadata);
+        result = DmaBlitManager::writeBuffer(srcHost, dstMemory, origin,
+                                             size, entire, copyMetadata);
         synchronize();
         return result;
       }
@@ -1960,7 +1835,47 @@ bool KernelBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemo
       // Add pinned memory for a later release
       gpu().addPinnedMem(amdMemory);
     } else {
-      result = DmaBlitManager::writeBuffer(srcHost, dstMemory, origin, size, entire, copyMetadata);
+      // Do a staging copy
+      bool useShaderCopyPath = setup_.disableHwlCopyBuffer_                         ||
+                               (totalSize <= dev().settings().sdmaCopyThreshold_)   ||
+                               (copyMetadata.copyEnginePreference_ ==
+                                amd::CopyMetadata::CopyEnginePreference::BLIT);
+
+      if (!useShaderCopyPath) {
+        // HSA copy using a staging resource
+        result = DmaBlitManager::writeBuffer(srcHost, dstMemory, origin,
+                                             size, entire, copyMetadata);
+      }
+
+      if (!result) {
+        // Shader (blit) copy through managed staging buffers, one staged chunk per iteration
+        const address dstBase = gpuMem(dstMemory).getDeviceMemory();
+        const_address srcAddr = reinterpret_cast<const_address>(srcHost);
+        amd::Coord3D srcOrigin(0, 0, 0);
+        size_t copySize = 0;
+        size_t stagedCopyOffset = 0;
+        size_t maxStagedXferSize = dev().settings().stagedXferSize_;
+
+        while (totalSize > 0) {
+          copySize = std::min(totalSize, maxStagedXferSize);
+          // Get an address from managed staging buffer (copySize is already clamped above)
+          address stagingBuffer = gpu().Staging().Acquire(copySize);
+          address dstAddr = dstBase + stagedCopyOffset;  // offset is cumulative: add to base only
+          ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "memcpy stg buf=%p, host src=%p, size=%zu",
+                  stagingBuffer, (void*)(srcAddr + stagedCopyOffset), copySize);
+          memcpy(stagingBuffer, srcAddr + stagedCopyOffset, copySize);
+          ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Blit staging H2D copy dst=%p, stg buf=%p, "
+                  "dstOrigin=%zu, size=%zu", dstAddr, stagingBuffer, origin[0], copySize);
+          result = shaderCopyBuffer(dstAddr, stagingBuffer,
+                                    origin, srcOrigin, copySize,
+                                    entire, dev().settings().limit_blit_wg_, copyMetadata);
+          if (!result) {
+            break;
+          }
+          totalSize -= copySize;
+          stagedCopyOffset += copySize;
+        }
+      }
     }
   }
 
@@ -2230,6 +2145,68 @@ bool KernelBlitManager::fillBuffer3D(device::Memory& memory, const void* pattern
   ShouldNotReachHere();
   return false;
 }
+
+// ================================================================================================
+bool KernelBlitManager::shaderCopyBuffer(address dst, address src,
+                                         const amd::Coord3D& dstOrigin,
+                                         const amd::Coord3D& srcOrigin,
+                                         const amd::Coord3D& sizeIn, bool entire,
+                                         const uint32_t blitWg,
+                                         amd::CopyMetadata copyMetadata,  // not read in this body
+                                         bool attachSignal) const {
+  constexpr uint32_t kBlitType = BlitCopyBuffer;
+  constexpr uint32_t kMaxAlignment = 2 * sizeof(uint64_t);
+  amd::Coord3D size(sizeIn[0]);
+
+  // Use 16-byte elements only when both origins are 16-byte aligned, else 4-byte elements
+  bool aligned = ((srcOrigin[0] % kMaxAlignment) == 0) && ((dstOrigin[0] % kMaxAlignment) == 0);
+  uint32_t aligned_size = (aligned) ? kMaxAlignment : sizeof(uint32_t);
+
+  // Convert the byte size into a count of aligned_size elements, rounding up
+  uint32_t remainder = size[0] % aligned_size;
+  size.c[0] /= aligned_size;
+  size.c[0] += (remainder != 0) ? 1 : 0;
+
+  // Program the dispatch dimensions: capped at blitWg workgroups, rounded up to whole groups
+  const size_t localWorkSize = (aligned) ? 512 : 1024;
+  size_t globalWorkSize = std::min(blitWg * localWorkSize, size[0]);
+  globalWorkSize = amd::alignUp(globalWorkSize, localWorkSize);
+
+  // Program kernel arguments for the blit operation
+  // Program source address and origin
+  setArgument(kernels_[kBlitType], 0, sizeof(src), reinterpret_cast<void*>(src),
+              srcOrigin[0], nullptr, true);
+
+  // Program destination address and origin
+  setArgument(kernels_[kBlitType], 1, sizeof(dst), reinterpret_cast<void*>(dst),
+              dstOrigin[0], nullptr, true);
+
+  uint64_t copySize = sizeIn[0];
+  setArgument(kernels_[kBlitType], 2, sizeof(copySize), &copySize);
+
+  setArgument(kernels_[kBlitType], 3, sizeof(remainder), &remainder);
+  setArgument(kernels_[kBlitType], 4, sizeof(aligned_size), &aligned_size);
+
+  // End pointer: destination address + origin + byte size, minus the unaligned remainder
+  uint64_t end_ptr = reinterpret_cast<uint64_t>(dst) + dstOrigin[0] + sizeIn[0] - remainder;
+
+  setArgument(kernels_[kBlitType], 5, sizeof(end_ptr), &end_ptr);
+
+  uint32_t next_chunk = globalWorkSize;
+  setArgument(kernels_[kBlitType], 6, sizeof(next_chunk), &next_chunk);
+
+  // Create ND range object for the kernel's execution
+  amd::NDRangeContainer ndrange(1, nullptr, &globalWorkSize, &localWorkSize);
+
+  // Execute the blit; attachSignal is forwarded to the internal kernel submission
+  address parameters = captureArguments(kernels_[kBlitType]);
+  bool result = gpu().submitKernelInternal(ndrange, *kernels_[kBlitType], parameters, nullptr,
+                                           0, nullptr, nullptr, attachSignal);
+  releaseArguments(parameters);
+
+  return result;
+}
+
 // ================================================================================================
 bool KernelBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& dstMemory,
                                    const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin,
@@ -2238,32 +2215,28 @@ bool KernelBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& ds
   amd::ScopedLock k(lockXferOps_);
   bool result = false;
   bool p2p = false;
-  uint32_t blit_wg_ = dev().settings().limit_blit_wg_;
+  uint32_t blitWg = dev().settings().limit_blit_wg_;
 
   if (&gpuMem(srcMemory).dev() != &gpuMem(dstMemory).dev()) {
     if (sizeIn[0] > dev().settings().sdma_p2p_threshold_) {
       p2p = true;
     } else {
       constexpr uint32_t kLimitWgForKernelP2p = 16;
-      blit_wg_ = kLimitWgForKernelP2p;
+      blitWg = kLimitWgForKernelP2p;
     }
   }
 
-  bool asan = false;
   bool ipcShared = srcMemory.owner()->ipcShared() || dstMemory.owner()->ipcShared();
-#if defined(__clang__)
-#if __has_feature(address_sanitizer)
-  asan = true;
-#endif
-#endif
 
-  bool useShaderCopyPath = setup_.disableHwlCopyBuffer_ ||
-      (sizeIn[0] <= dev().settings().sdmaCopyThreshold_) ||
-      (!(p2p || asan || ipcShared) &&
-           (!srcMemory.isHostMemDirectAccess() && !dstMemory.isHostMemDirectAccess() &&
-            !(copyMetadata.copyEnginePreference_ ==
-              amd::CopyMetadata::CopyEnginePreference::SDMA)) ||
-       (copyMetadata.copyEnginePreference_ == amd::CopyMetadata::CopyEnginePreference::BLIT));
+  bool useShaderCopyPath = setup_.disableHwlCopyBuffer_                       ||
+                          (sizeIn[0] <= dev().settings().sdmaCopyThreshold_)  ||
+                          (!(p2p || ipcShared) &&
+                          (!srcMemory.isHostMemDirectAccess()
+                            && !dstMemory.isHostMemDirectAccess() &&
+                            !(copyMetadata.copyEnginePreference_ ==
+                              amd::CopyMetadata::CopyEnginePreference::SDMA)) ||
+                          (copyMetadata.copyEnginePreference_ ==
+                           amd::CopyMetadata::CopyEnginePreference::BLIT));
 
   if (!useShaderCopyPath) {
     if (amd::IS_HIP) {
@@ -2275,60 +2248,15 @@ bool KernelBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& ds
         gpu().SetCopyCommandType(CL_COMMAND_READ_BUFFER);
       }
     }
-    result = DmaBlitManager::copyBuffer(srcMemory, dstMemory, srcOrigin, dstOrigin, sizeIn, entire,
-                                        copyMetadata);
+    result = DmaBlitManager::copyBuffer(srcMemory, dstMemory, srcOrigin, dstOrigin, sizeIn,
+                                        entire, copyMetadata);
   }
 
   if (!result) {
-    constexpr uint32_t kBlitType = BlitCopyBuffer;
-    constexpr uint32_t kMaxAlignment = 2 * sizeof(uint64_t);
-    amd::Coord3D size(sizeIn[0]);
-
-    // Check alignments for source and destination
-    bool aligned = ((srcOrigin[0] % kMaxAlignment) == 0) && ((dstOrigin[0] % kMaxAlignment) == 0);
-    uint32_t aligned_size = (aligned) ? kMaxAlignment : sizeof(uint32_t);
-
-    // Setup copy size accordingly to the alignment
-    uint32_t remainder = size[0] % aligned_size;
-    size.c[0] /= aligned_size;
-    size.c[0] += (remainder != 0) ? 1 : 0;
-
-    // Program the dispatch dimensions
-    const size_t localWorkSize = (aligned) ? 512 : 1024;
-    size_t globalWorkSize = std::min(blit_wg_ * localWorkSize, size[0]);
-    globalWorkSize = amd::alignUp(globalWorkSize, localWorkSize);
-
-    // Program kernels arguments for the blit operation
-    cl_mem mem = as_cl<amd::Memory>(srcMemory.owner());
-    // Program source origin
-    uint64_t srcOffset = srcOrigin[0];
-    setArgument(kernels_[kBlitType], 0, sizeof(cl_mem), &mem, srcOffset, &srcMemory);
-    mem = as_cl<amd::Memory>(dstMemory.owner());
-    // Program destinaiton origin
-    uint64_t dstOffset = dstOrigin[0];
-    setArgument(kernels_[kBlitType], 1, sizeof(cl_mem), &mem, dstOffset, &dstMemory);
-
-    uint64_t copySize = sizeIn[0];
-    setArgument(kernels_[kBlitType], 2, sizeof(copySize), &copySize);
-
-    setArgument(kernels_[kBlitType], 3, sizeof(remainder), &remainder);
-    setArgument(kernels_[kBlitType], 4, sizeof(aligned_size), &aligned_size);
-
-    // End pointer is the aligned copy size and destination offset
-    uint64_t end_ptr = dstMemory.virtualAddress() + dstOffset + sizeIn[0] - remainder;
-
-    setArgument(kernels_[kBlitType], 5, sizeof(end_ptr), &end_ptr);
-
-    uint32_t next_chunk = globalWorkSize;
-    setArgument(kernels_[kBlitType], 6, sizeof(next_chunk), &next_chunk);
-
-    // Create ND range object for the kernel's execution
-    amd::NDRangeContainer ndrange(1, nullptr, &globalWorkSize, &localWorkSize);
-
-    // Execute the blit
-    address parameters = captureArguments(kernels_[kBlitType]);
-    result = gpu().submitKernelInternal(ndrange, *kernels_[kBlitType], parameters, nullptr);
-    releaseArguments(parameters);
+    result = shaderCopyBuffer(reinterpret_cast<address>(dstMemory.virtualAddress()),
+                              reinterpret_cast<address>(srcMemory.virtualAddress()),
+                              dstOrigin, srcOrigin, sizeIn,
+                              entire, blitWg, copyMetadata);
   }
 
   synchronize();
diff --git a/projects/clr/rocclr/device/rocm/rocblit.hpp b/projects/clr/rocclr/device/rocm/rocblit.hpp
index 537fb4f628..63c5b0737c 100644
--- a/projects/clr/rocclr/device/rocm/rocblit.hpp
+++ b/projects/clr/rocclr/device/rocm/rocblit.hpp
@@ -231,7 +231,11 @@ class DmaBlitManager : public device::HostBlitManager {
   //! taking into account the Hsail profile supported by Hsa Agent
   bool hsaCopy(const Memory& srcMemory, const Memory& dstMemory, const amd::Coord3D& srcOrigin,
                const amd::Coord3D& dstOrigin, const amd::Coord3D& size,
-               amd::CopyMetadata copyMetadata) const;
+               amd::CopyMetadata& copyMetadata) const;
+
+  inline bool rocrCopyBuffer(address dst, hsa_agent_t& dstAgent,
+                             const_address src, hsa_agent_t& srcAgent, size_t size,
+                             amd::CopyMetadata& copyMetadata) const;
 
   const size_t MinSizeForPinnedTransfer;
   bool completeOperation_;                    //!< DMA blit manager must complete operation
@@ -248,33 +252,13 @@ class DmaBlitManager : public device::HostBlitManager {
   //! Disable operator=
   DmaBlitManager& operator=(const DmaBlitManager&);
 
-  //! Reads video memory, using a staged buffer
-  bool readMemoryStaged(Memory& srcMemory,  //!< Source memory object
-                        void* dstHost,      //!< Destination host memory
-                        Memory& xferBuf,    //!< Staged buffer for read
-                        size_t origin,      //!< Original offset in the source memory
-                        size_t& offset,     //!< Offset for the current copy pointer
-                        size_t& totalSize,  //!< Total size for copy region
-                        size_t xferSize     //!< Transfer size
-                        ) const;
-
-  //! Write into video memory, using a staged buffer
-  bool writeMemoryStaged(const void* srcHost,  //!< Source host memory
-                         Memory& dstMemory,    //!< Destination memory object
-                         address staging,      //!< Staged buffer for write
-                         size_t origin,        //!< Original offset in the destination memory
-                         size_t& offset,       //!< Offset for the current copy pointer
-                         size_t& totalSize,    //!< Total size for the copy region
-                         size_t xferSize       //!< Transfer size
-                         ) const;
-
   //! Assits in transferring data from Host to Local or vice versa
   //! taking into account the Hsail profile supported by Hsa Agent
-  bool hsaCopyStaged(const_address hostSrc,  //!< Contains source data to be copied
-                     address hostDst,        //!< Destination buffer address for copying
-                     size_t size,            //!< Size of data to copy in bytes
-                     address staging,        //!< Staging resource
-                     bool hostToDev          //!< True if data is copied from Host To Device
+  bool hsaCopyStaged(const_address hostSrc,           //!< Contains source data to be copied
+                     address hostDst,                 //!< Destination buffer address for copying
+                     size_t size,                     //!< Size of data to copy in bytes
+                     bool hostToDev,                  //!< True if data is copied from H2D
+                     amd::CopyMetadata& copyMetadata  //!< Memory copy MetaData
                      ) const;
 
   bool forceHostWaitFunc(size_t copy_size) const;
@@ -583,6 +567,12 @@ class KernelBlitManager : public DmaBlitManager {
     return (dev().info().imageSupport_) ? BlitTotal : BlitLinearTotal;
   }
 
+  //! Copies a buffer using the shader path
+  bool shaderCopyBuffer(address dst, address src,
+                        const amd::Coord3D& dstOrigin, const amd::Coord3D& srcOrigin,
+                        const amd::Coord3D& size, bool entire, const uint32_t blitWg,
+                        amd::CopyMetadata copyMetadata, bool attachSignal = false) const;
+
   //! Disable copy constructor
   KernelBlitManager(const KernelBlitManager&);
 
diff --git a/projects/clr/rocclr/device/rocm/rocdevice.cpp b/projects/clr/rocclr/device/rocm/rocdevice.cpp
index 648b63f83a..8fc76ecdf8 100644
--- a/projects/clr/rocclr/device/rocm/rocdevice.cpp
+++ b/projects/clr/rocclr/device/rocm/rocdevice.cpp
@@ -1202,7 +1202,8 @@ bool Device::populateOCLDeviceConstants() {
                                                &info_.globalMemCacheLineSize_)) {
     return false;
   }
-  assert(info_.globalMemCacheLineSize_ > 0);
+  info_.globalMemCacheLineSize_ = (info_.globalMemCacheLineSize_ != 0) ?
+                                         info_.globalMemCacheLineSize_ : 64;
 
   uint32_t cachesize[4] = {0};
   if (HSA_STATUS_SUCCESS !=
diff --git a/projects/clr/rocclr/device/rocm/rockernel.cpp b/projects/clr/rocclr/device/rocm/rockernel.cpp
index ad55e08b85..b79cffbc44 100644
--- a/projects/clr/rocclr/device/rocm/rockernel.cpp
+++ b/projects/clr/rocclr/device/rocm/rockernel.cpp
@@ -57,8 +57,7 @@ bool LightningKernel::postLoad() {
   }
   kernargSegmentAlignment_ =
       amd::alignUp(std::max(kernargSegmentAlignment_, 128u),
-                   device().info().globalMemCacheLineSize_ > 0 ?
-                       device().info().globalMemCacheLineSize_ : 64);
+                   device().info().globalMemCacheLineSize_);
 
   // Set the workgroup information for the kernel
   workGroupInfo_.availableLDSSize_ = device().info().localMemSizePerCU_;
diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp
index 7f13ae5797..7d52ce0cf9 100644
--- a/projects/clr/rocclr/device/rocm/rocvirtual.cpp
+++ b/projects/clr/rocclr/device/rocm/rocvirtual.cpp
@@ -497,7 +497,8 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal(
 }
 
 // ================================================================================================
-std::vector<hsa_signal_t>& VirtualGPU::HwQueueTracker::WaitingSignal(HwQueueEngine engine) {
+std::vector<hsa_signal_t>& VirtualGPU::HwQueueTracker::WaitingSignal(HwQueueEngine engine,
+                                                                     bool forceHostWait) {
   bool explicit_wait = false;
   // Reset all current waiting signals
   waiting_signals_.clear();
@@ -545,8 +546,8 @@ std::vector<hsa_signal_t>& VirtualGPU::HwQueueTracker::WaitingSignal(HwQueueEngi
       const Settings& settings = gpu_.dev().settings();
       // Actively wait on CPU to avoid extra overheads of signal tracking on GPU.
       // For small copies set forced wait
-      if (!WaitForSignal<true>(external_signals_[i]->signal_, false,
-                               external_signals_[i]->flags_.forceHostWait_)) {
+      if (!WaitForSignal<true>(external_signals_[i]->signal_, false, forceHostWait ?
+                               external_signals_[i]->flags_.forceHostWait_ : false)) {
         if (settings.cpu_wait_for_signal_) {
           // Wait on CPU for completion if requested
           CpuWaitForSignal(external_signals_[i]);
@@ -713,6 +714,9 @@ bool VirtualGPU::processMemObjects(const amd::Kernel& kernel, const_address para
       else {
         uint32_t index = desc.info_.arrayIndex_;
         mem = memories[index];
+        const void* globalAddress = *reinterpret_cast<const void* const*>(params + desc.offset_);
+        ClPrint(amd::LOG_INFO, amd::LOG_KERN,
+            "Arg%d: %s %s = ptr:%p", i, desc.typeName_.c_str(), desc.name_.c_str(), globalAddress);
         if (mem == nullptr) {
           //! This condition is for SVM fine-grain
           if (dev().isFineGrainedSystem(true)) {
@@ -839,7 +843,7 @@ static inline void packet_store_release(uint32_t* packet, uint16_t header, uint1
 // ================================================================================================
 template <typename AqlPacket>
 bool VirtualGPU::dispatchGenericAqlPacket(
-  AqlPacket* packet, uint16_t header, uint16_t rest, bool blocking) {
+  AqlPacket* packet, uint16_t header, uint16_t rest, bool blocking, bool attach_signal) {
   const uint32_t queueSize = gpu_queue_->size;
   const uint32_t queueMask = queueSize - 1;
   const uint32_t sw_queue_size = queueMask;
@@ -847,6 +851,7 @@ bool VirtualGPU::dispatchGenericAqlPacket(
   // Check for queue full and wait if needed.
   uint64_t index = hsa_queue_add_write_index_screlease(gpu_queue_, 1);
   uint64_t read = hsa_queue_load_read_index_relaxed(gpu_queue_);
+
   if (addSystemScope_) {
     header &= ~(HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE |
                 HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
@@ -858,15 +863,15 @@ bool VirtualGPU::dispatchGenericAqlPacket(
   auto expected_fence_state = extractAqlBits(header, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
                          HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE);
 
-  if (fence_state_ == amd::Device::kCacheStateSystem &&
-      expected_fence_state == amd::Device::kCacheStateSystem) {
+  if (fence_state_ == amd::Device::kCacheStateSystem
+      && expected_fence_state == amd::Device::kCacheStateSystem) {
     header = dispatchPacketHeader_;
     fence_dirty_ = true;
   }
 
   fence_state_ = static_cast<Device::CacheState>(expected_fence_state);
 
-  if (timestamp_ != nullptr) {
+  if (timestamp_ != nullptr || attach_signal) {
     // Get active signal for current dispatch if profiling is necessary
     packet->completion_signal = Barriers().ActiveSignal(kInitSignalValueOne, timestamp_);
 
@@ -967,7 +972,7 @@ void VirtualGPU::dispatchBlockingWait() {
 // ================================================================================================
 bool VirtualGPU::dispatchAqlPacket(hsa_kernel_dispatch_packet_t* packet, uint16_t header,
                                    uint16_t rest, bool blocking, bool capturing,
-                                   const uint8_t* aqlPacket) {
+                                   const uint8_t* aqlPacket, bool attach_signal) {
   if (capturing == true) {
     packet->header = header;
     packet->setup = rest;
@@ -975,13 +980,13 @@ bool VirtualGPU::dispatchAqlPacket(hsa_kernel_dispatch_packet_t* packet, uint16_
     return true;
   } else {
     dispatchBlockingWait();
-    return dispatchGenericAqlPacket(packet, header, rest, blocking);
+    return dispatchGenericAqlPacket(packet, header, rest, blocking, attach_signal);
   }
 }
 // ================================================================================================
-bool VirtualGPU::dispatchAqlPacket(
-  hsa_barrier_and_packet_t* packet, uint16_t header, uint16_t rest, bool blocking) {
-  return dispatchGenericAqlPacket(packet, header, rest, blocking);
+bool VirtualGPU::dispatchAqlPacket(hsa_barrier_and_packet_t* packet, uint16_t header, uint16_t rest,
+                                   bool blocking, bool attach_signal) {
+  return dispatchGenericAqlPacket(packet, header, rest, blocking, attach_signal);
 }
 
 // ================================================================================================
@@ -1074,10 +1079,9 @@ void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal,
     barrier_packet_.completion_signal = signal;
   }
 
-  // Reset fence_dirty_ and addSystemScope_ flag if we submit a barrier with system scopes
+  // Reset fence_dirty_ flag if we submit a barrier with system scopes
   if (cache_state == amd::Device::kCacheStateSystem) {
     fence_dirty_ = false;
-    addSystemScope_ = false;
   }
 
   while ((index - hsa_queue_load_read_index_scacquire(gpu_queue_)) >= queueMask);
@@ -1424,7 +1428,7 @@ bool VirtualGPU::ManagedBuffer::Create() {
 
 // ================================================================================================
 address VirtualGPU::ManagedBuffer::Acquire(uint32_t size) {
-  auto alignment = gpu_.dev().info().globalMemCacheLineSize_;
+  auto alignment = amd::alignUp(256u, gpu_.dev().info().globalMemCacheLineSize_);
   address result = nullptr;
   result = amd::alignUp(pool_base_ + pool_cur_offset_, alignment);
   const size_t pool_new_usage = (result + size) - pool_base_;
@@ -1713,7 +1717,8 @@ void VirtualGPU::submitReadMemory(amd::ReadMemoryCommand& cmd) {
   bool imageBuffer = false;
 
   // Force buffer read for IMAGE1D_BUFFER
-  if ((type == CL_COMMAND_READ_IMAGE) && (cmd.source().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) {
+  if ((type == CL_COMMAND_READ_IMAGE) &&
+      (cmd.source().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) {
     type = CL_COMMAND_READ_BUFFER;
     imageBuffer = true;
   }
@@ -1732,7 +1737,8 @@ void VirtualGPU::submitReadMemory(amd::ReadMemoryCommand& cmd) {
         result = blitMgr().copyBuffer(*devMem, *hostMemory, origin, dstOrigin, size,
                                       cmd.isEntireMemory(), cmd.copyMetadata());
       } else {
-        result = blitMgr().readBuffer(*devMem, dst, origin, size, cmd.isEntireMemory(), cmd.copyMetadata());
+        result = blitMgr().readBuffer(*devMem, dst, origin, size,
+                                      cmd.isEntireMemory(), cmd.copyMetadata());
       }
       break;
     }
@@ -1752,7 +1758,8 @@ void VirtualGPU::submitReadMemory(amd::ReadMemoryCommand& cmd) {
       break;
     }
     case CL_COMMAND_READ_IMAGE: {
-      if ((cmd.source().parent() != nullptr) && (cmd.source().parent()->getType() == CL_MEM_OBJECT_BUFFER)) {
+      if ((cmd.source().parent() != nullptr) &&
+          (cmd.source().parent()->getType() == CL_MEM_OBJECT_BUFFER)) {
         Image* imageBuffer = static_cast<Image*>(devMem);
         // Check if synchronization has to be performed
         if (nullptr != imageBuffer->CopyImageBuffer()) {
@@ -1772,7 +1779,8 @@ void VirtualGPU::submitReadMemory(amd::ReadMemoryCommand& cmd) {
         amd::Coord3D dstOrigin(offset);
         result =
             blitMgr().copyImageToBuffer(*devMem, *hostMemory, cmd.origin(), dstOrigin, size,
-                                        cmd.isEntireMemory(), cmd.rowPitch(), cmd.slicePitch(), cmd.copyMetadata());
+                                        cmd.isEntireMemory(), cmd.rowPitch(),
+                                        cmd.slicePitch(), cmd.copyMetadata());
       } else {
         result = blitMgr().readImage(*devMem, dst, cmd.origin(), size, cmd.rowPitch(),
                                      cmd.slicePitch(), cmd.isEntireMemory(), cmd.copyMetadata());
@@ -1839,7 +1847,8 @@ void VirtualGPU::submitWriteMemory(amd::WriteMemoryCommand& cmd) {
         result = blitMgr().copyBuffer(*hostMemory, *devMem, srcOrigin, origin, size,
                                       cmd.isEntireMemory(), cmd.copyMetadata());
       } else {
-        result = blitMgr().writeBuffer(src, *devMem, origin, size, cmd.isEntireMemory(), cmd.copyMetadata());
+        result = blitMgr().writeBuffer(src, *devMem, origin, size,
+                                       cmd.isEntireMemory(), cmd.copyMetadata());
       }
       break;
     }
@@ -3131,9 +3140,9 @@ void VirtualGPU::HiddenHeapInit() { const_cast<Device&>(dev()).HiddenHeapInit(*t
 
 // ================================================================================================
 bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
-    const amd::Kernel& kernel, const_address parameters, void* eventHandle,
+    const amd::Kernel& kernel, const_address parameters, void* event_handle,
     uint32_t sharedMemBytes, amd::NDRangeKernelCommand* vcmd,
-    hsa_kernel_dispatch_packet_t* aql_packet) {
+    hsa_kernel_dispatch_packet_t* aql_packet, bool attach_signal) {
   device::Kernel* devKernel = const_cast<device::Kernel*>(kernel.getDeviceKernel(dev()));
   Kernel& gpuKernel = static_cast<Kernel&>(*devKernel);
   size_t ldsUsage = gpuKernel.WorkgroupGroupSegmentByteSize();
@@ -3482,7 +3491,6 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
       addSystemScope_ = true;
     }
 
-
     // Copy scheduler's AQL packet for possible relaunch from the scheduler itself
     if (aql_packet != nullptr) {
       *aql_packet = dispatchPacket;
@@ -3504,7 +3512,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
     } else {
       if (!dispatchAqlPacket(&dispatchPacket, aqlHeaderWithOrder,
                              (sizes.dimensions() << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS),
-                             GPU_FLUSH_ON_EXECUTION)) {
+                             GPU_FLUSH_ON_EXECUTION, false, nullptr, attach_signal)) {
         return false;
       }
     }
diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.hpp b/projects/clr/rocclr/device/rocm/rocvirtual.hpp
index d404ee57ba..ce252991f9 100644
--- a/projects/clr/rocclr/device/rocm/rocvirtual.hpp
+++ b/projects/clr/rocclr/device/rocm/rocvirtual.hpp
@@ -30,6 +30,7 @@
 #include "rocprintf.hpp"
 #include "hsa/hsa_ven_amd_aqlprofile.h"
 #include "rocsched.hpp"
+#include "device/device.hpp"
 
 namespace amd::roc {
 class Device;
@@ -270,7 +271,8 @@ class VirtualGPU : public device::VirtualDevice {
     HwQueueEngine GetActiveEngine() const { return engine_; }
 
     //! Returns the last submitted signal for a wait
-    std::vector<hsa_signal_t>& WaitingSignal(HwQueueEngine engine = HwQueueEngine::Compute);
+    std::vector<hsa_signal_t>& WaitingSignal(HwQueueEngine engine = HwQueueEngine::Compute,
+                                             bool forceHostWait = true);
 
     //! Resets current signal back to the previous one. It's necessary in a case of ROCr failure.
     void ResetCurrentSignal();
@@ -341,8 +343,8 @@ class VirtualGPU : public device::VirtualDevice {
                             void* event_handle,  //!< Handle to OCL event for debugging
                             uint32_t sharedMemBytes = 0, //!< Shared memory size
                             amd::NDRangeKernelCommand* vcmd = nullptr, //!< Original launch command
-                            hsa_kernel_dispatch_packet_t* aql_packet = nullptr  //!< Scheduler launch
-                            );
+                            hsa_kernel_dispatch_packet_t* aql_packet = nullptr,  //!< Scheduler launch
+                            bool attach_signal = false);
   void submitNativeFn(amd::NativeFnCommand& cmd);
   void submitMarker(amd::Marker& cmd);
   void submitAccumulate(amd::AccumulateCommand& cmd);
@@ -420,7 +422,10 @@ class VirtualGPU : public device::VirtualDevice {
 
   void hasPendingDispatch() { hasPendingDispatch_ = true; }
   bool IsPendingDispatch() const { return (hasPendingDispatch_) ? true : false; }
-  void addSystemScope() { addSystemScope_ = true; }
+  void addSystemScope() {
+    addSystemScope_ = true;
+    fence_state_ = amd::Device::CacheState::kCacheStateInvalid;
+  }
   void SetCopyCommandType(cl_command_type type) { copy_command_type_ = type; }
 
   HwQueueTracker& Barriers() { return barriers_; }
@@ -444,11 +449,12 @@ class VirtualGPU : public device::VirtualDevice {
                                 amd::AccumulateCommand* vcmd = nullptr);
   bool dispatchAqlPacket(hsa_kernel_dispatch_packet_t* packet, uint16_t header, uint16_t rest,
                          bool blocking = true, bool capturing = false,
-                         const uint8_t* aqlPacket = nullptr);
+                         const uint8_t* aqlPacket = nullptr, bool attach_signal = false);
   bool dispatchAqlPacket(hsa_barrier_and_packet_t* packet, uint16_t header,
-                        uint16_t rest, bool blocking = true);
+                        uint16_t rest, bool blocking = true, bool attach_signal = false);
   template <typename AqlPacket> bool dispatchGenericAqlPacket(AqlPacket* packet, uint16_t header,
-                                                              uint16_t rest, bool blocking);
+                                                              uint16_t rest, bool blocking,
+                                                              bool attach_signal = false);
 
   bool dispatchCounterAqlPacket(hsa_ext_amd_aql_pm4_packet_t* packet, const uint32_t gfxVersion,
                                 bool blocking, const hsa_ven_amd_aqlprofile_1_00_pfn_t* extApi);
diff --git a/projects/clr/rocclr/utils/flags.hpp b/projects/clr/rocclr/utils/flags.hpp
index 37f74857e1..8d60e3ae28 100644
--- a/projects/clr/rocclr/utils/flags.hpp
+++ b/projects/clr/rocclr/utils/flags.hpp
@@ -85,7 +85,7 @@ release(size_t, GPU_PINNED_MIN_XFER_SIZE, 128,                                \
 release(size_t, GPU_RESOURCE_CACHE_SIZE, 64,                                  \
         "The resource cache size in MB")                                      \
 release(size_t, GPU_MAX_SUBALLOC_SIZE, 4096,                                  \
-        "The maximum size accepted for suballocaitons in KB")                 \
+        "The maximum size accepted for suballocations in KB")                 \
 release(size_t, GPU_NUM_MEM_DEPENDENCY, 256,                                  \
         "Number of memory objects for dependency tracking")                   \
 release(size_t, GPU_XFER_BUFFER_SIZE, 0,                                      \
@@ -105,7 +105,7 @@ release(bool, GPU_USE_DEVICE_QUEUE, false,                                    \
 release(bool, AMD_THREAD_TRACE_ENABLE, true,                                  \
         "Enable thread trace extension")                                      \
 release(uint, OPENCL_VERSION, 200,                                            \
-        "Force GPU opencl verison")                                           \
+        "Force GPU opencl version")                                           \
 release(bool, HSA_LOCAL_MEMORY_ENABLE, true,                                  \
         "Enable HSA device local memory usage")                               \
 release(uint, HSA_KERNARG_POOL_SIZE, 1024 * 1024,                             \
@@ -186,7 +186,7 @@ release(bool, AMD_DIRECT_DISPATCH, false,                                     \
 release(uint, HIP_HIDDEN_FREE_MEM, 0,                                         \
         "Reserve free mem reporting in Mb"                                    \
         "0 = Disable")                                                        \
-release(size_t, GPU_FORCE_BLIT_COPY_SIZE, 0,                                  \
+release(size_t, GPU_FORCE_BLIT_COPY_SIZE, 16,                                 \
         "Use Blit until this size(in KB) for copies")                         \
 release(uint, ROC_ACTIVE_WAIT_TIMEOUT, 0,                                     \
         "Forces active wait of GPU interrup for the timeout(us)")             \