diff --git a/rocclr/device/rocm/rocblit.cpp b/rocclr/device/rocm/rocblit.cpp
index f8d2d22a96..3a7285c2ad 100755
--- a/rocclr/device/rocm/rocblit.cpp
+++ b/rocclr/device/rocm/rocblit.cpp
@@ -35,11 +35,8 @@ DmaBlitManager::DmaBlitManager(VirtualGPU& gpu, Setup setup)
       context_(nullptr) {}
 
 inline void DmaBlitManager::synchronize() const {
-  // todo TS tracking isn't implemented
-  gpu().releaseGpuMemoryFence();
-
   if (syncOperation_) {
-    //        gpu().waitAllEngines();
+    gpu().releaseGpuMemoryFence();
     gpu().releasePinnedMem();
   }
 }
@@ -65,6 +62,9 @@ bool DmaBlitManager::readMemoryStaged(Memory& srcMemory, void* dstHost, Memory&
 bool DmaBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
                                 const amd::Coord3D& origin, const amd::Coord3D& size,
                                 bool entire) const {
+  // HSA copy functionality with a possible async operation, hence make sure GPU is done
+  gpu().releaseGpuMemoryFence();
+
   // Use host copy if memory has direct access
   if (setup_.disableReadBuffer_ ||
       (srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached())) {
@@ -149,6 +149,9 @@ bool DmaBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
 bool DmaBlitManager::readBufferRect(device::Memory& srcMemory, void* dstHost,
                                     const amd::BufferRect& bufRect, const amd::BufferRect& hostRect,
                                     const amd::Coord3D& size, bool entire) const {
+  // HSA copy functionality with a possible async operation, hence make sure GPU is done
+  gpu().releaseGpuMemoryFence();
+
   // Use host copy if memory has direct access
   if (setup_.disableReadBufferRect_ ||
       (srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached())) {
@@ -184,6 +187,9 @@ bool DmaBlitManager::readBufferRect(device::Memory& srcMemory, void* dstHost,
 bool DmaBlitManager::readImage(device::Memory& srcMemory, void* dstHost, const amd::Coord3D& origin,
                                const amd::Coord3D& size, size_t rowPitch, size_t slicePitch,
                                bool entire) const {
+  // HSA copy functionality with a possible async operation, hence make sure GPU is done
+  gpu().releaseGpuMemoryFence();
+
   if (setup_.disableReadImage_) {
     return HostBlitManager::readImage(srcMemory, dstHost, origin, size, rowPitch, slicePitch,
                                       entire);
@@ -213,6 +219,9 @@ bool DmaBlitManager::writeMemoryStaged(const void* srcHost, Memory& dstMemory, M
 bool DmaBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemory,
                                  const amd::Coord3D& origin, const amd::Coord3D& size,
                                  bool entire) const {
+  // HSA copy functionality with a possible async operation, hence make sure GPU is done
+  gpu().releaseGpuMemoryFence();
+
   // Use host copy if memory has direct access
   if (setup_.disableWriteBuffer_ || dstMemory.isHostMemDirectAccess() ||
       gpuMem(dstMemory).IsPersistentDirectMap()) {
@@ -300,6 +309,9 @@ bool DmaBlitManager::writeBufferRect(const void* srcHost, device::Memory& dstMem
                                      const amd::BufferRect& hostRect,
                                      const amd::BufferRect& bufRect, const amd::Coord3D& size,
                                      bool entire) const {
+  // HSA copy functionality with a possible async operation, hence make sure GPU is done
+  gpu().releaseGpuMemoryFence();
+
   // Use host copy if memory has direct access
   if (setup_.disableWriteBufferRect_ || dstMemory.isHostMemDirectAccess() ||
       gpuMem(dstMemory).IsPersistentDirectMap()) {
@@ -335,6 +347,9 @@ bool DmaBlitManager::writeBufferRect(const void* srcHost, device::Memory& dstMem
 bool DmaBlitManager::writeImage(const void* srcHost, device::Memory& dstMemory,
                                 const amd::Coord3D& origin, const amd::Coord3D& size,
                                 size_t rowPitch, size_t slicePitch, bool entire) const {
+  // HSA copy functionality with a possible async operation, hence make sure GPU is done
+  gpu().releaseGpuMemoryFence();
+
   if (setup_.disableWriteImage_) {
     return HostBlitManager::writeImage(srcHost, dstMemory, origin, size, rowPitch, slicePitch,
                                        entire);
@@ -350,6 +365,9 @@ bool DmaBlitManager::writeImage(const void* srcHost, device::Memory& dstMemory,
 bool DmaBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& dstMemory,
                                 const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin,
                                 const amd::Coord3D& size, bool entire) const {
+  // HSA copy functionality with a possible async operation, hence make sure GPU is done
+  gpu().releaseGpuMemoryFence();
+
   if (setup_.disableCopyBuffer_ ||
       (srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached() &&
       (dev().agent_profile() != HSA_PROFILE_FULL) && dstMemory.isHostMemDirectAccess())) {
@@ -364,6 +382,9 @@ bool DmaBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& dstMe
 bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& dstMemory,
                                     const amd::BufferRect& srcRect, const amd::BufferRect& dstRect,
                                     const amd::Coord3D& size, bool entire) const {
+  // HSA copy functionality with a possible async operation, hence make sure GPU is done
+  gpu().releaseGpuMemoryFence();
+
   if (setup_.disableCopyBufferRect_ ||
       (srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached() &&
        dstMemory.isHostMemDirectAccess())) {
@@ -471,6 +492,9 @@ bool DmaBlitManager::copyImageToBuffer(device::Memory& srcMemory, device::Memory
                                        const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin,
                                        const amd::Coord3D& size, bool entire, size_t rowPitch,
                                        size_t slicePitch) const {
+  // HSA copy functionality with a possible async operation, hence make sure GPU is done
+  gpu().releaseGpuMemoryFence();
+
   bool result = false;
 
   if (setup_.disableCopyImageToBuffer_) {
@@ -512,6 +536,9 @@ bool DmaBlitManager::copyBufferToImage(device::Memory& srcMemory, device::Memory
                                        const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin,
                                        const amd::Coord3D& size, bool entire, size_t rowPitch,
                                        size_t slicePitch) const {
+  // HSA copy functionality with a possible async operation, hence make sure GPU is done
+  gpu().releaseGpuMemoryFence();
+
   bool result = false;
 
   if (setup_.disableCopyBufferToImage_) {
@@ -550,6 +577,9 @@ bool DmaBlitManager::copyBufferToImage(device::Memory& srcMemory, device::Memory
 bool DmaBlitManager::copyImage(device::Memory& srcMemory, device::Memory& dstMemory,
                                const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin,
                                const amd::Coord3D& size, bool entire) const {
+  // HSA copy functionality with a possible async operation, hence make sure GPU is done
+  gpu().releaseGpuMemoryFence();
+
   bool result = false;
 
   if (setup_.disableCopyImage_) {
@@ -1626,6 +1656,8 @@ bool KernelBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
 
   if (dev().info().largeBar_ && size[0] <= kMaxD2hMemcpySize) {
     if ((srcMemory.owner()->getHostMem() == nullptr) && (srcMemory.owner()->getSvmPtr() != nullptr)) {
+      // CPU read ahead, hence release GPU memory
+      gpu().releaseGpuMemoryFence();
       void* src = srcMemory.owner()->getSvmPtr();
       std::memcpy(dstHost, src, size[0]);
       // Set HASPENDINGDISPATCH_ FLAG. That will force L2 invalidation on flush
@@ -1729,11 +1761,13 @@ bool KernelBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemo
 
   if (dev().info().largeBar_ && size[0] <= kMaxH2dMemcpySize) {
     if ((dstMemory.owner()->getHostMem() == nullptr) && (dstMemory.owner()->getSvmPtr() != nullptr)) {
+      // CPU write ahead, hence release GPU memory
+      gpu().releaseGpuMemoryFence();
       void* dst = dstMemory.owner()->getSvmPtr();
       std::memcpy(dst, srcHost, size[0]);
-      // Set HASPENDINGDISPATCH_ FLAG. Then synchronize() will use barrier to invalidate cache
+      // Set HASPENDINGDISPATCH_ FLAG. Then releaseGpuMemoryFence() will use barrier to invalidate cache
       gpu().hasPendingDispatch();
-      synchronize();
+      gpu().releaseGpuMemoryFence();
       return true;
     }
   }
@@ -1972,6 +2006,7 @@ bool KernelBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& ds
     result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters, nullptr);
     releaseArguments(parameters);
   } else {
+    // Otherwise delegate the copy to DmaBlitManager::copyBuffer
     result = DmaBlitManager::copyBuffer(srcMemory, dstMemory, srcOrigin, dstOrigin, sizeIn, entire);
   }
 
diff --git a/rocclr/device/rocm/rocvirtual.cpp b/rocclr/device/rocm/rocvirtual.cpp
index 62173325c7..7a7e3efc1b 100644
--- a/rocclr/device/rocm/rocvirtual.cpp
+++ b/rocclr/device/rocm/rocvirtual.cpp
@@ -922,9 +922,6 @@ void VirtualGPU::submitReadMemory(amd::ReadMemoryCommand& cmd) {
   // Make sure VirtualGPU has an exclusive access to the resources
   amd::ScopedLock lock(execution());
 
-  // Wait on a kernel if one is outstanding
-  releaseGpuMemoryFence();
-
   profilingBegin(cmd);
 
   size_t offset = 0;
@@ -1030,9 +1027,6 @@ void VirtualGPU::submitWriteMemory(amd::WriteMemoryCommand& cmd) {
   // Make sure VirtualGPU has an exclusive access to the resources
   amd::ScopedLock lock(execution());
 
-  // Wait on a kernel if one is outstanding
-  releaseGpuMemoryFence();
-
   profilingBegin(cmd);
 
   size_t offset = 0;
@@ -1229,9 +1223,6 @@ void VirtualGPU::submitCopyMemory(amd::CopyMemoryCommand& cmd) {
   // Make sure VirtualGPU has an exclusive access to the resources
   amd::ScopedLock lock(execution());
 
-  // Wait on a kernel if one is outstanding
-  releaseGpuMemoryFence();
-
   profilingBegin(cmd);
 
   cl_command_type type = cmd.type();
@@ -1249,9 +1240,6 @@ void VirtualGPU::submitSvmCopyMemory(amd::SvmCopyMemoryCommand& cmd) {
   // Make sure VirtualGPU has an exclusive access to the resources
   amd::ScopedLock lock(execution());
 
-  // in-order semantics: previous commands need to be done before we start
-  releaseGpuMemoryFence();
-
   profilingBegin(cmd);
   // no op for FGS supported device
   if (!dev().isFineGrainedSystem(true)) {
@@ -1286,6 +1274,9 @@ void VirtualGPU::submitSvmCopyMemory(amd::SvmCopyMemoryCommand& cmd) {
     if ((nullptr == srcMem && nullptr == dstMem) || // both not in svm space
         dev().forceFineGrain(srcMem) ||
         dev().forceFineGrain(dstMem)) {
+      // Wait on a kernel if one is outstanding
+      releaseGpuMemoryFence();
+
       // If these are from different contexts, then one of them could be in the device memory
       // This is fine, since spec doesn't allow for copies with pointers from different contexts
       amd::Os::fastMemcpy(cmd.dst(), cmd.src(), cmd.srcSize());
@@ -1328,9 +1319,6 @@ void VirtualGPU::submitCopyMemoryP2P(amd::CopyMemoryP2PCommand& cmd) {
   // Make sure VirtualGPU has an exclusive access to the resources
   amd::ScopedLock lock(execution());
 
-  // Wait on a kernel if one is outstanding
-  releaseGpuMemoryFence();
-
   profilingBegin(cmd);
 
   Memory* srcDevMem = static_cast<roc::Memory*>(
@@ -1424,9 +1412,6 @@ void VirtualGPU::submitSvmMapMemory(amd::SvmMapMemoryCommand& cmd) {
   // Make sure VirtualGPU has an exclusive access to the resources
   amd::ScopedLock lock(execution());
 
-  // Wait on a kernel if one is outstanding
-  releaseGpuMemoryFence();
-
   profilingBegin(cmd);
 
   // no op for FGS supported device
@@ -1447,6 +1432,7 @@ void VirtualGPU::submitSvmMapMemory(amd::SvmMapMemoryCommand& cmd) {
           LogError("submitSVMMapMemory() - copy failed");
           cmd.setStatus(CL_MAP_FAILURE);
         }
+        // Wait on a kernel if one is outstanding
         releaseGpuMemoryFence();
         const void* mappedPtr = hsaMapMemory->owner()->getHostMem();
         amd::Os::fastMemcpy(cmd.svmPtr(), mappedPtr, cmd.size()[0]);
@@ -1463,9 +1449,6 @@ void VirtualGPU::submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd) {
   // Make sure VirtualGPU has an exclusive access to the resources
   amd::ScopedLock lock(execution());
 
-  // Wait on a kernel if one is outstanding
-  releaseGpuMemoryFence();
-
   profilingBegin(cmd);
 
   // no op for FGS supported device
@@ -1476,6 +1459,8 @@ void VirtualGPU::submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd) {
 
     if (memory->mapMemory() != nullptr) {
       if (writeMapInfo->isUnmapWrite()) {
+        // Wait on a kernel if one is outstanding
+        releaseGpuMemoryFence();
         amd::Coord3D srcOrigin(0, 0, 0);
         Memory* hsaMapMemory = dev().getRocMemory(memory->mapMemory());
 
@@ -1503,9 +1488,6 @@ void VirtualGPU::submitMapMemory(amd::MapMemoryCommand& cmd) {
   // Make sure VirtualGPU has an exclusive access to the resources
   amd::ScopedLock lock(execution());
 
-  // Wait on a kernel if one is outstanding
-  releaseGpuMemoryFence();
-
   profilingBegin(cmd);
 
   //! @todo add multi-devices synchronization when supported.
@@ -1563,8 +1545,8 @@ void VirtualGPU::submitMapMemory(amd::MapMemoryCommand& cmd) {
         result = blitMgr().copyBuffer(*hsaMemory, *hsaMapMemory, origin, dstOrigin, size,
                                       cmd.isEntireMemory());
         void* svmPtr = devMemory->owner()->getSvmPtr();
-        if ((svmPtr != nullptr) &&
-            (hostPtr != svmPtr)) {
+        if ((svmPtr != nullptr) && (hostPtr != svmPtr)) {
+          // Wait on a kernel if one is outstanding
           releaseGpuMemoryFence();
           amd::Os::fastMemcpy(svmPtr, hostPtr, size[0]);
         }
@@ -1608,8 +1590,7 @@ void VirtualGPU::submitUnmapMemory(amd::UnmapMemoryCommand& cmd) {
     LogError("Unmap without map call");
     return;
   }
-  // Wait on a kernel if one is outstanding
-  releaseGpuMemoryFence();
+
   profilingBegin(cmd);
 
   // Force buffer write for IMAGE1D_BUFFER
@@ -1663,8 +1644,9 @@ void VirtualGPU::submitUnmapMemory(amd::UnmapMemoryCommand& cmd) {
 
           const void* svmPtr = devMemory->owner()->getSvmPtr();
           void* hostPtr = mapMemory->getHostMem();
-          if ((svmPtr != nullptr) &&
-              (hostPtr != svmPtr)) {
+          if ((svmPtr != nullptr) && (hostPtr != svmPtr)) {
+            // Wait on a kernel if one is outstanding
+            releaseGpuMemoryFence();
             amd::Os::fastMemcpy(hostPtr, svmPtr, size[0]);
           }
           result = blitMgr().copyBuffer(*hsaMapMemory, *devMemory, mapInfo->origin_, mapInfo->origin_,
@@ -1751,9 +1733,6 @@ void VirtualGPU::submitFillMemory(amd::FillMemoryCommand& cmd) {
   // Make sure VirtualGPU has an exclusive access to the resources
   amd::ScopedLock lock(execution());
 
-  // Wait on a kernel if one is outstanding
-  releaseGpuMemoryFence();
-
   profilingBegin(cmd);
 
   if (!fillMemory(cmd.type(), &cmd.memory(), cmd.pattern(), cmd.patternSize(), cmd.origin(),
@@ -1767,9 +1746,6 @@ void VirtualGPU::submitSvmFillMemory(amd::SvmFillMemoryCommand& cmd) {
   // Make sure VirtualGPU has an exclusive access to the resources
   amd::ScopedLock lock(execution());
 
-  // in-order semantics: previous commands need to be done before we start
-  releaseGpuMemoryFence();
-
   profilingBegin(cmd);
 
   amd::Memory* dstMemory = amd::MemObjMap::FindMemObj(cmd.dst());
@@ -1811,9 +1787,6 @@ void VirtualGPU::submitMigrateMemObjects(amd::MigrateMemObjectsCommand& vcmd) {
   // Make sure VirtualGPU has an exclusive access to the resources
   amd::ScopedLock lock(execution());
 
-  // Wait on a kernel if one is outstanding
-  releaseGpuMemoryFence();
-
   profilingBegin(vcmd);
 
   for (auto itr : vcmd.memObjects()) {