From 30cf81fc939ff775e5b2ed69d2829e7c0116045e Mon Sep 17 00:00:00 2001
From: German Andryeyev <German.Andryeyev@amd.com>
Date: Thu, 7 Jan 2021 16:41:30 -0500
Subject: [PATCH] Add HSA signal global tracking logic.

Implement the global class for signals tracking per device queue.
Switch to the new tracking mechanism.

Change-Id: I3c4dda04b34e6d18d6a95510d84102909633b415


[ROCm/clr commit: 8698aeef0d35bb363d9340613e04b5af3fd47ffa]
---
 projects/clr/rocclr/device/rocm/rocblit.cpp   | 172 ++++++-------
 projects/clr/rocclr/device/rocm/rocblit.hpp   |  19 +-
 projects/clr/rocclr/device/rocm/rocdevice.cpp |   5 +
 projects/clr/rocclr/device/rocm/rocmemory.cpp |   3 +-
 .../clr/rocclr/device/rocm/rocvirtual.cpp     | 227 +++++-------------
 .../clr/rocclr/device/rocm/rocvirtual.hpp     | 147 +++++++++++-
 6 files changed, 284 insertions(+), 289 deletions(-)

diff --git a/projects/clr/rocclr/device/rocm/rocblit.cpp b/projects/clr/rocclr/device/rocm/rocblit.cpp
index 320c8885fc..e983af887a 100644
--- a/projects/clr/rocclr/device/rocm/rocblit.cpp
+++ b/projects/clr/rocclr/device/rocm/rocblit.cpp
@@ -62,12 +62,14 @@ bool DmaBlitManager::readMemoryStaged(Memory& srcMemory, void* dstHost, Memory&
 bool DmaBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
                                 const amd::Coord3D& origin, const amd::Coord3D& size,
                                 bool entire) const {
-  // HSA copy functionality with a possible async operaiton, hence make sure GPU is done
-  gpu().releaseGpuMemoryFence();
+  // HSA copy functionality with a possible async operation
+  gpu().releaseGpuMemoryFence(kIgnoreBarrier, kSkipCpuWait);
 
   // Use host copy if memory has direct access
   if (setup_.disableReadBuffer_ ||
       (srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached())) {
+    // Stall GPU before CPU access
+    gpu().Barriers().WaitCurrent();
     return HostBlitManager::readBuffer(srcMemory, dstHost, origin, size, entire);
   } else {
     size_t srcSize = size[0];
@@ -149,12 +151,14 @@ bool DmaBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
 bool DmaBlitManager::readBufferRect(device::Memory& srcMemory, void* dstHost,
                                     const amd::BufferRect& bufRect, const amd::BufferRect& hostRect,
                                     const amd::Coord3D& size, bool entire) const {
-  // HSA copy functionality with a possible async operaiton, hence make sure GPU is done
+  // HSA copy functionality with a possible async operation
   gpu().releaseGpuMemoryFence();
 
   // Use host copy if memory has direct access
   if (setup_.disableReadBufferRect_ ||
       (srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached())) {
+    // Stall GPU before CPU access
+    gpu().Barriers().WaitCurrent();
     return HostBlitManager::readBufferRect(srcMemory, dstHost, bufRect, hostRect, size, entire);
   } else {
     Memory& xferBuf = dev().xferRead().acquire();
@@ -187,7 +191,7 @@ bool DmaBlitManager::readBufferRect(device::Memory& srcMemory, void* dstHost,
 bool DmaBlitManager::readImage(device::Memory& srcMemory, void* dstHost, const amd::Coord3D& origin,
                                const amd::Coord3D& size, size_t rowPitch, size_t slicePitch,
                                bool entire) const {
-  // HSA copy functionality with a possible async operaiton, hence make sure GPU is done
+  // HSA copy functionality with a possible async operation
   gpu().releaseGpuMemoryFence();
 
   if (setup_.disableReadImage_) {
@@ -219,14 +223,16 @@ bool DmaBlitManager::writeMemoryStaged(const void* srcHost, Memory& dstMemory, M
 bool DmaBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemory,
                                  const amd::Coord3D& origin, const amd::Coord3D& size,
                                  bool entire) const {
-  // HSA copy functionality with a possible async operaiton, hence make sure GPU is done
-  gpu().releaseGpuMemoryFence();
-
   // Use host copy if memory has direct access
   if (setup_.disableWriteBuffer_ || dstMemory.isHostMemDirectAccess() ||
       gpuMem(dstMemory).IsPersistentDirectMap()) {
+    // Stall GPU before CPU access
+    gpu().releaseGpuMemoryFence();
     return HostBlitManager::writeBuffer(srcHost, dstMemory, origin, size, entire);
   } else {
+    // HSA copy functionality with a possible async operation
+    gpu().releaseGpuMemoryFence(kIgnoreBarrier, kSkipCpuWait);
+
     size_t dstSize = size[0];
     size_t tmpSize = 0;
     size_t offset = 0;
@@ -309,7 +315,7 @@ bool DmaBlitManager::writeBufferRect(const void* srcHost, device::Memory& dstMem
                                      const amd::BufferRect& hostRect,
                                      const amd::BufferRect& bufRect, const amd::Coord3D& size,
                                      bool entire) const {
-  // HSA copy functionality with a possible async operaiton, hence make sure GPU is done
+  // HSA copy functionality with a possible async operation
   gpu().releaseGpuMemoryFence();
 
   // Use host copy if memory has direct access
@@ -347,7 +353,7 @@ bool DmaBlitManager::writeBufferRect(const void* srcHost, device::Memory& dstMem
 bool DmaBlitManager::writeImage(const void* srcHost, device::Memory& dstMemory,
                                 const amd::Coord3D& origin, const amd::Coord3D& size,
                                 size_t rowPitch, size_t slicePitch, bool entire) const {
-  // HSA copy functionality with a possible async operaiton, hence make sure GPU is done
+  // HSA copy functionality with a possible async operation
   gpu().releaseGpuMemoryFence();
 
   if (setup_.disableWriteImage_) {
@@ -365,12 +371,11 @@ bool DmaBlitManager::writeImage(const void* srcHost, device::Memory& dstMemory,
 bool DmaBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& dstMemory,
                                 const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin,
                                 const amd::Coord3D& size, bool entire) const {
-  // HSA copy functionality with a possible async operaiton, hence make sure GPU is done
-  gpu().releaseGpuMemoryFence();
-
   if (setup_.disableCopyBuffer_ ||
       (srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached() &&
       (dev().agent_profile() != HSA_PROFILE_FULL) && dstMemory.isHostMemDirectAccess())) {
+    // Stall GPU before CPU access
+    gpu().releaseGpuMemoryFence();
     return HostBlitManager::copyBuffer(srcMemory, dstMemory, srcOrigin, dstOrigin, size);
   } else {
     return hsaCopy(gpuMem(srcMemory), gpuMem(dstMemory), srcOrigin, dstOrigin, size);
@@ -383,14 +388,14 @@ bool DmaBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& dstMe
 bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& dstMemory,
                                     const amd::BufferRect& srcRect, const amd::BufferRect& dstRect,
                                     const amd::Coord3D& size, bool entire) const {
-  // HSA copy functionality with a possible async operaiton, hence make sure GPU is done
-  gpu().releaseGpuMemoryFence();
-
   if (setup_.disableCopyBufferRect_ ||
       (srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached() &&
        dstMemory.isHostMemDirectAccess())) {
+    // Stall GPU before CPU access
+    gpu().releaseGpuMemoryFence();
     return HostBlitManager::copyBufferRect(srcMemory, dstMemory, srcRect, dstRect, size, entire);
   } else {
+    gpu().releaseGpuMemoryFence(kIgnoreBarrier, kSkipCpuWait);
 
     void* src = gpuMem(srcMemory).getDeviceMemory();
     void* dst = gpuMem(dstMemory).getDeviceMemory();
@@ -436,25 +441,21 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d
     }
 
     if (isSubwindowRectCopy ) {
-      hsa_signal_store_relaxed(completion_signal_, kInitSignalValueOne);
+      hsa_signal_t wait = gpu().Barriers().WaitSignal();
+      hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp());
 
       // Copy memory line by line
-      hsa_status_t status =
-          hsa_amd_memory_async_copy_rect(&dstMem, &offset, &srcMem, &offset, &dim, agent,
-                                    direction, 0, nullptr, completion_signal_);
+      hsa_status_t status = hsa_amd_memory_async_copy_rect(&dstMem, &offset,
+          &srcMem, &offset, &dim, agent, direction, 1, &wait, active);
       if (status != HSA_STATUS_SUCCESS) {
         LogPrintfError("DMA buffer failed with code %d", status);
         return false;
       }
-
-      if (!WaitForSignal(completion_signal_)) {
-        LogError("Async copy failed");
-        return false;
-      }
     } else {
       // Fall to line by line copies
       const hsa_signal_value_t kInitVal = size[2] * size[1];
-      hsa_signal_store_relaxed(completion_signal_, kInitVal);
+      hsa_signal_t wait = gpu().Barriers().WaitSignal();
+      hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitVal, gpu().timestamp());
 
       for (size_t z = 0; z < size[2]; ++z) {
         for (size_t y = 0; y < size[1]; ++y) {
@@ -462,10 +463,10 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d
           size_t dstOffset = dstRect.offset(0, y, z);
 
           // Copy memory line by line
-          hsa_status_t status =
-              hsa_amd_memory_async_copy((reinterpret_cast<address>(dst) + dstOffset), dstAgent,
-                                        (reinterpret_cast<const_address>(src) + srcOffset), srcAgent,
-                                        size[0], 0, nullptr, completion_signal_);
+          hsa_status_t status = hsa_amd_memory_async_copy(
+              (reinterpret_cast<address>(dst) + dstOffset), dstAgent,
+              (reinterpret_cast<const_address>(src) + srcOffset), srcAgent,
+              size[0], 1, &wait, active);
           gpu().setLastCommandSDMA(true) ;
           if (status != HSA_STATUS_SUCCESS) {
             LogPrintfError("DMA buffer failed with code %d", status);
@@ -473,14 +474,10 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d
           }
         }
       }
-
-      if (!WaitForSignal(completion_signal_)) {
-        LogError("Async copy failed");
-        return false;
-      }
     }
-
   }
+  // Explicit wait for now, until runtime could distinguish compute and sdma operations
+  gpu().Barriers().WaitCurrent();
   return true;
 }
 
@@ -489,12 +486,9 @@ bool DmaBlitManager::copyImageToBuffer(device::Memory& srcMemory, device::Memory
                                        const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin,
                                        const amd::Coord3D& size, bool entire, size_t rowPitch,
                                        size_t slicePitch) const {
-  // HSA copy functionality with a possible async operaiton, hence make sure GPU is done
-  if (!dev().settings().barrier_sync_ && !gpu().isLastCommandSDMA()) {
-    gpu().releaseGpuMemoryFence(true);
-  } else {
-    gpu().releaseGpuMemoryFence();
-  }
+  // HSA copy functionality with a possible async operation, hence make sure GPU is done
+  bool force_barrier = !dev().settings().barrier_sync_ && !gpu().isLastCommandSDMA();
+  gpu().releaseGpuMemoryFence(force_barrier);
 
   bool result = false;
 
@@ -504,9 +498,6 @@ bool DmaBlitManager::copyImageToBuffer(device::Memory& srcMemory, device::Memory
   } else {
     Image& srcImage = static_cast<roc::Image&>(srcMemory);
     Buffer& dstBuffer = static_cast<roc::Buffer&>(dstMemory);
-
-    // Use ROC path for a transfer
-    // Note: it doesn't support SDMA
     address dstHost = reinterpret_cast<address>(dstBuffer.getDeviceMemory()) + dstOrigin[0];
 
     // Use ROCm path for a transfer.
@@ -540,12 +531,9 @@ bool DmaBlitManager::copyBufferToImage(device::Memory& srcMemory, device::Memory
                                        const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin,
                                        const amd::Coord3D& size, bool entire, size_t rowPitch,
                                        size_t slicePitch) const {
-  // HSA copy functionality with a possible async operaiton, hence make sure GPU is done
-  if (!dev().settings().barrier_sync_ && !gpu().isLastCommandSDMA()) {
-    gpu().releaseGpuMemoryFence(true);
-  } else {
-    gpu().releaseGpuMemoryFence();
-  }
+  // HSA copy functionality with a possible async operation, hence make sure GPU is done
+  bool force_barrier = !dev().settings().barrier_sync_ && !gpu().isLastCommandSDMA();
+  gpu().releaseGpuMemoryFence(force_barrier);
 
   bool result = false;
 
@@ -588,7 +576,7 @@ bool DmaBlitManager::copyBufferToImage(device::Memory& srcMemory, device::Memory
 bool DmaBlitManager::copyImage(device::Memory& srcMemory, device::Memory& dstMemory,
                                const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin,
                                const amd::Coord3D& size, bool entire) const {
-  // HSA copy functionality with a possible async operaiton, hence make sure GPU is done
+  // HSA copy functionality with a possible async operation, hence make sure GPU is done
   gpu().releaseGpuMemoryFence();
 
   bool result = false;
@@ -610,9 +598,8 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory,
   address src = reinterpret_cast<address>(srcMemory.getDeviceMemory());
   address dst = reinterpret_cast<address>(dstMemory.getDeviceMemory());
 
-  if (!dev().settings().barrier_sync_ && !gpu().isLastCommandSDMA()) {
-    gpu().releaseGpuMemoryFence(true);
-  }
+  bool force_barrier = !dev().settings().barrier_sync_ && !gpu().isLastCommandSDMA();
+  gpu().releaseGpuMemoryFence(force_barrier, kSkipCpuWait);
 
   src += srcOrigin[0];
   dst += dstOrigin[0];
@@ -620,6 +607,8 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory,
   // Just call copy function for full profile
   hsa_status_t status;
   if (dev().agent_profile() == HSA_PROFILE_FULL) {
+    // Stall GPU, since CPU copy is possible
+    gpu().Barriers().WaitCurrent();
     status = hsa_memory_copy(dst, src, size[0]);
     if (status != HSA_STATUS_SUCCESS) {
       LogPrintfError("Hsa copy of data failed with code %d", status);
@@ -649,21 +638,15 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory,
     srcAgent = dstAgent = dev().getBackendDevice();
   }
 
-  hsa_signal_store_relaxed(completion_signal_, kInitSignalValueOne);
-
+  hsa_signal_t wait = gpu().Barriers().WaitSignal();
+  hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp());
   // Use SDMA to transfer the data
-  status = hsa_amd_memory_async_copy(dst, dstAgent, src, srcAgent, size[0], 0, nullptr,
-                                     completion_signal_);
+  status = hsa_amd_memory_async_copy(dst, dstAgent, src, srcAgent, size[0], 1, &wait, active);
   gpu().setLastCommandSDMA(true);
+  // Explicit wait for now, until runtime could distinguish compute and sdma operations
+  gpu().Barriers().WaitCurrent();
   if (status == HSA_STATUS_SUCCESS) {
-    hsa_signal_value_t val;
-
-    if (!WaitForSignal(completion_signal_)) {
-      LogError("Async copy failed");
-      status = HSA_STATUS_ERROR;
-    } else {
-      gpu().addSystemScope();
-    }
+    gpu().addSystemScope();
   } else {
     LogPrintfError("Hsa copy from host to device failed with code %d", status);
   }
@@ -674,6 +657,10 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory,
 // ================================================================================================
 bool DmaBlitManager::hsaCopyStaged(const_address hostSrc, address hostDst, size_t size,
                                    address staging, bool hostToDev) const {
+  // Stall GPU, since CPU copy is possible
+  bool force_barrier = !dev().settings().barrier_sync_ && !gpu().isLastCommandSDMA();
+  gpu().releaseGpuMemoryFence(force_barrier);
+
   // No allocation is necessary for Full Profile
   hsa_status_t status;
   if (dev().agent_profile() == HSA_PROFILE_FULL) {
@@ -688,14 +675,11 @@ bool DmaBlitManager::hsaCopyStaged(const_address hostSrc, address hostDst, size_
   size_t offset = 0;
 
   address hsaBuffer = staging;
-  if (!dev().settings().barrier_sync_ && !gpu().isLastCommandSDMA()) {
-    gpu().releaseGpuMemoryFence(true);
-  }
 
   // Allocate requested size of memory
   while (totalSize > 0) {
     size = std::min(totalSize, dev().settings().stagedXferSize_);
-    hsa_signal_silent_store_relaxed(completion_signal_, kInitSignalValueOne);
+    hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp());
 
     // Copy data from Host to Device
     if (hostToDev) {
@@ -707,17 +691,13 @@ bool DmaBlitManager::hsaCopyStaged(const_address hostSrc, address hostDst, size_
 
       memcpy(hsaBuffer, hostSrc + offset, size);
       status = hsa_amd_memory_async_copy(hostDst + offset, dev().getBackendDevice(), hsaBuffer,
-                                         srcAgent, size, 0, nullptr, completion_signal_);
+                                         srcAgent, size, 0, nullptr, active);
       gpu().setLastCommandSDMA(true);
-      if (status == HSA_STATUS_SUCCESS) {
-        if (!WaitForSignal(completion_signal_)) {
-          LogError("Async copy failed");
-          return false;
-        }
-      } else {
+      if (status != HSA_STATUS_SUCCESS) {
         LogPrintfError("Hsa copy from host to device failed with code %d", status);
         return false;
       }
+      gpu().Barriers().WaitCurrent();
       totalSize -= size;
       offset += size;
       continue;
@@ -730,15 +710,11 @@ bool DmaBlitManager::hsaCopyStaged(const_address hostSrc, address hostDst, size_
         (size <= dev().settings().sdmaCopyThreshold_) ? dev().getBackendDevice() : dev().getCpuAgent();
 
     // Copy data from Device to Host
-    status =
-        hsa_amd_memory_async_copy(hsaBuffer, dstAgent, hostSrc + offset,
-                                  dev().getBackendDevice(), size, 0, nullptr, completion_signal_);
+    status = hsa_amd_memory_async_copy(hsaBuffer, dstAgent, hostSrc + offset,
+        dev().getBackendDevice(), size, 0, nullptr, active);
     gpu().setLastCommandSDMA(true);
     if (status == HSA_STATUS_SUCCESS) {
-      if (!WaitForSignal(completion_signal_)) {
-        LogError("Async copy failed");
-        return false;
-      }
+      gpu().Barriers().WaitCurrent();
       memcpy(hostDst + offset, hsaBuffer, size);
     } else {
       LogPrintfError("Hsa copy from device to host failed with code %d", status);
@@ -1083,11 +1059,7 @@ bool KernelBlitManager::copyBufferToImageKernel(device::Memory& srcMemory,
   releaseArguments(parameters);
   if (releaseView) {
     // todo SRD programming could be changed to avoid a stall
-    if(!dev().settings().barrier_sync_) {
-      gpu().releaseGpuMemoryFence(true);
-   } else {
-     gpu().releaseGpuMemoryFence();
-   }
+    gpu().releaseGpuMemoryFence();
     dstView->owner()->release();
   }
 
@@ -1285,11 +1257,7 @@ bool KernelBlitManager::copyImageToBufferKernel(device::Memory& srcMemory,
   releaseArguments(parameters);
   if (releaseView) {
     // todo SRD programming could be changed to avoid a stall
-    if(!dev().settings().barrier_sync_) {
-      gpu().releaseGpuMemoryFence(true);
-    } else {
-      gpu().releaseGpuMemoryFence();
-    }
+    gpu().releaseGpuMemoryFence();
     srcView->owner()->release();
   }
 
@@ -1465,6 +1433,8 @@ bool KernelBlitManager::readImage(device::Memory& srcMemory, void* dstHost,
 
   // Use host copy if memory has direct access
   if (setup_.disableReadImage_ || (srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached())) {
+    // Stall GPU before CPU access
+    gpu().releaseGpuMemoryFence();
     result = HostBlitManager::readImage(srcMemory, dstHost, origin, size, rowPitch, slicePitch, entire);
     synchronize();
     return result;
@@ -1510,6 +1480,8 @@ bool KernelBlitManager::writeImage(const void* srcHost, device::Memory& dstMemor
 
   // Use host copy if memory has direct access
   if (setup_.disableWriteImage_ || dstMemory.isHostMemDirectAccess()) {
+    // Stall GPU before CPU access
+    gpu().releaseGpuMemoryFence();
     result = HostBlitManager::writeImage(srcHost, dstMemory, origin, size, rowPitch, slicePitch, entire);
     synchronize();
     return result;
@@ -1704,6 +1676,8 @@ bool KernelBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
 
   // Use host copy if memory has direct access
   if (setup_.disableReadBuffer_ || (srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached())) {
+    // Stall GPU before CPU access
+    gpu().releaseGpuMemoryFence();
     result = HostBlitManager::readBuffer(srcMemory, dstHost, origin, size, entire);
     synchronize();
     return result;
@@ -1753,6 +1727,8 @@ bool KernelBlitManager::readBufferRect(device::Memory& srcMemory, void* dstHost,
   // Use host copy if memory has direct access
   if (setup_.disableReadBufferRect_ ||
       (srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached())) {
+    // Stall GPU before CPU access
+    gpu().releaseGpuMemoryFence();
     result = HostBlitManager::readBufferRect(srcMemory, dstHost, bufRect, hostRect, size, entire);
     synchronize();
     return result;
@@ -1814,6 +1790,8 @@ bool KernelBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemo
   // Use host copy if memory has direct access
   if (setup_.disableWriteBuffer_ || dstMemory.isHostMemDirectAccess() ||
       gpuMem(dstMemory).IsPersistentDirectMap()) {
+    // Stall GPU before CPU access
+    gpu().releaseGpuMemoryFence();
     result = HostBlitManager::writeBuffer(srcHost, dstMemory, origin, size, entire);
     synchronize();
     return result;
@@ -1864,6 +1842,8 @@ bool KernelBlitManager::writeBufferRect(const void* srcHost, device::Memory& dst
   // Use host copy if memory has direct access
   if (setup_.disableWriteBufferRect_ || dstMemory.isHostMemDirectAccess() ||
       gpuMem(dstMemory).IsPersistentDirectMap()) {
+    // Stall GPU before CPU access
+    gpu().releaseGpuMemoryFence();
     result = HostBlitManager::writeBufferRect(srcHost, dstMemory, hostRect, bufRect, size, entire);
     synchronize();
     return result;
@@ -1913,6 +1893,8 @@ bool KernelBlitManager::fillBuffer(device::Memory& memory, const void* pattern,
 
   // Use host fill if memory has direct access
   if (setup_.disableFillBuffer_ || memory.isHostMemDirectAccess()) {
+    // Stall GPU before CPU access
+    gpu().releaseGpuMemoryFence();
     result = HostBlitManager::fillBuffer(memory, pattern, patternSize, origin, size, entire);
     synchronize();
     return result;
@@ -2074,6 +2056,8 @@ bool KernelBlitManager::fillImage(device::Memory& memory, const void* pattern,
 
   // Use host fill if memory has direct access
   if (setup_.disableFillImage_ || memory.isHostMemDirectAccess()) {
+    // Stall GPU before CPU access
+    gpu().releaseGpuMemoryFence();
     result = HostBlitManager::fillImage(memory, pattern, origin, size, entire);
     synchronize();
     return result;
diff --git a/projects/clr/rocclr/device/rocm/rocblit.hpp b/projects/clr/rocclr/device/rocm/rocblit.hpp
index c4f4b89ba9..5f344ff1af 100644
--- a/projects/clr/rocclr/device/rocm/rocblit.hpp
+++ b/projects/clr/rocclr/device/rocm/rocblit.hpp
@@ -40,6 +40,9 @@ class Kernel;
 class Memory;
 class VirtualGPU;
 
+constexpr bool kSkipCpuWait = true;
+constexpr bool kIgnoreBarrier = false;
+
 //! DMA Blit Manager
 class DmaBlitManager : public device::HostBlitManager {
  public:
@@ -49,19 +52,10 @@ class DmaBlitManager : public device::HostBlitManager {
                  );
 
   //! Destructor
-  virtual ~DmaBlitManager() {
-    if (completion_signal_.handle != 0) {
-      hsa_signal_destroy(completion_signal_);
-    }
-  }
+  virtual ~DmaBlitManager() {}
 
   //! Creates DmaBlitManager object
-  virtual bool create(amd::Device& device) {
-    if (HSA_STATUS_SUCCESS != hsa_signal_create(0, 0, nullptr, &completion_signal_)) {
-      return false;
-    }
-    return true;
-  }
+  virtual bool create(amd::Device& device) { return true; }
 
   //! Copies a buffer object to system memory
   virtual bool readBuffer(device::Memory& srcMemory,   //!< Source memory object
@@ -225,9 +219,6 @@ class DmaBlitManager : public device::HostBlitManager {
                          size_t xferSize       //!< Transfer size
                          ) const;
 
-  //! Handle of ROC Device object
-  hsa_signal_t completion_signal_;
-
   //! Assits in transferring data from Host to Local or vice versa
   //! taking into account the Hsail profile supported by Hsa Agent
   bool hsaCopyStaged(const_address hostSrc,  //!< Contains source data to be copied
diff --git a/projects/clr/rocclr/device/rocm/rocdevice.cpp b/projects/clr/rocclr/device/rocm/rocdevice.cpp
index 7ed5ffc5e6..a19987e3c2 100644
--- a/projects/clr/rocclr/device/rocm/rocdevice.cpp
+++ b/projects/clr/rocclr/device/rocm/rocdevice.cpp
@@ -260,7 +260,12 @@ Device::~Device() {
     context().svmFree(coopHostcallBuffer_);
     coopHostcallBuffer_ = nullptr;
   }
+
+  if (0 != prefetch_signal_.handle) {
+    hsa_signal_destroy(prefetch_signal_);
+  }
 }
+
 bool NullDevice::initCompiler(bool isOffline) {
 #if defined(WITH_COMPILER_LIB)
   // Initialize the compiler handle if has already not been initialized
diff --git a/projects/clr/rocclr/device/rocm/rocmemory.cpp b/projects/clr/rocclr/device/rocm/rocmemory.cpp
index 2f20890cee..9c1055d140 100644
--- a/projects/clr/rocclr/device/rocm/rocmemory.cpp
+++ b/projects/clr/rocclr/device/rocm/rocmemory.cpp
@@ -199,8 +199,9 @@ void Memory::cpuUnmap(device::VirtualDevice& vDev) {
                                     amd::Coord3D(size()), true)) {
       LogError("[OCL] Fail sync the device memory on cpuUnmap");
     }
+    // Wait on CPU for the transfer
+    static_cast<roc::VirtualGPU&>(vDev).releaseGpuMemoryFence();
   }
-
   decIndMapCount();
 }
 
diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp
index bb9cfdfacf..bc65024fcf 100644
--- a/projects/clr/rocclr/device/rocm/rocvirtual.cpp
+++ b/projects/clr/rocclr/device/rocm/rocvirtual.cpp
@@ -456,34 +456,12 @@ bool VirtualGPU::dispatchGenericAqlPacket(
   // Check for queue full and wait if needed.
   uint64_t index = hsa_queue_add_write_index_screlease(gpu_queue_, size);
   uint64_t read = hsa_queue_load_read_index_relaxed(gpu_queue_);
-  hsa_signal_t signal;
 
-  // TODO: placeholder to setup the kernel to populate start and end timestamp.
   if (timestamp_ != nullptr) {
     // Pool size must grow to the size of pending AQL packets
     const uint32_t pool_size = index - read;
-    if (pool_size >= signal_pool_.size()) {
-      ProfilingSignal profiling_signal = {};
-      if (HSA_STATUS_SUCCESS != hsa_signal_create(0, 0, nullptr, &profiling_signal.signal_)) {
-        LogPrintfError("Failed signal allocation id = %d", pool_size);
-        return false;
-      }
-      signal_pool_.push_back(profiling_signal);
-      assert(queueSize >= signal_pool_.size() && "Pool will be reallocated!");
-    }
-    // Move index inside the valid pool
-    ++current_signal_ %= signal_pool_.size();
-    // Find signal slot
-    ProfilingSignal* profilingSignal = &signal_pool_[current_signal_];
-    // Make sure we save the old results in the TS structure
-    if (profilingSignal->ts_ != nullptr) {
-      profilingSignal->ts_->checkGpuTime();
-    }
-    // Update the new TS with the signal info
-    timestamp_->setProfilingSignal(profilingSignal);
-    packet->completion_signal = profilingSignal->signal_;
-    profilingSignal->ts_ = timestamp_;
-    timestamp_->setAgent(gpu_device_);
+    // Get active signal for current dispatch if profiling is necessary
+    packet->completion_signal = Barriers().ActiveSignal(kInitSignalValueOne, timestamp_, pool_size);
   }
 
   // Make sure the slot is free for usage
@@ -494,23 +472,11 @@ bool VirtualGPU::dispatchGenericAqlPacket(
   // Add blocking command if the original value of read index was behind of the queue size
   if (blocking || (index - read) >= queueMask) {
     if (packet->completion_signal.handle == 0) {
-      packet->completion_signal = barrier_signal_;
+      packet->completion_signal = Barriers().ActiveSignal();
     }
-    signal = packet->completion_signal;
-    // Initialize signal for a wait
-    hsa_signal_store_relaxed(signal, kInitSignalValueOne);
     blocking = true;
   }
 
-  // If runtime doesn't use the barrier, then make sure it tracks the last submitted command
-  if (!dev().settings().barrier_sync_) {
-    // Initialize signal for a wait
-    assert(packet->completion_signal.handle != 0 &&
-        "There is no HSA signal associated with the last command!");
-    hsa_signal_store_relaxed(packet->completion_signal, kInitSignalValueOne);
-    last_signal_ = packet->completion_signal;
-  }
-
   // Insert packet(s)
   // NOTE: need multiple packets to dispatch the performance counter
   //       packet blob of the legacy devices (gfx8)
@@ -550,12 +516,10 @@ bool VirtualGPU::dispatchGenericAqlPacket(
 
   // Wait on signal ?
   if (blocking) {
-    if (!WaitForSignal(signal)) {
-      LogPrintfError("Failed signal [0x%lx] wait", signal.handle);
+    if (!Barriers().WaitCurrent()) {
+      LogPrintfError("Failed blocking queue wait with signal [0x%lx]", packet->completion_signal.handle);
       return false;
     }
-    // Reset the pool of signals
-    current_signal_ = 0;
   }
 
   return true;
@@ -601,6 +565,8 @@ bool VirtualGPU::dispatchCounterAqlPacket(hsa_ext_amd_aql_pm4_packet_t* packet,
 
   return false;
 }
+
+// ================================================================================================
 void VirtualGPU::dispatchBarrierPacket(const hsa_barrier_and_packet_t* packet) {
   assert(packet->completion_signal.handle != 0);
   const uint32_t queueSize = gpu_queue_->size;
@@ -631,6 +597,7 @@ void VirtualGPU::dispatchBarrierPacket(const hsa_barrier_and_packet_t* packet) {
           packet->dep_signal[3], packet->dep_signal[4], packet->completion_signal);
 }
 
+// ================================================================================================
 void VirtualGPU::dispatchGenericBarrierPacket(hsa_barrier_and_packet_t* packet,
                                        uint16_t packetHeader, hsa_signal_t signal) {
   const uint32_t queueSize = gpu_queue_->size;
@@ -641,30 +608,8 @@ void VirtualGPU::dispatchGenericBarrierPacket(hsa_barrier_and_packet_t* packet,
   if (signal.handle == 0) {
     // Pool size must grow to the size of pending AQL packets
     const uint32_t pool_size = index - read;
-    if (pool_size >= signal_pool_.size()) {
-      ProfilingSignal profiling_signal = {};
-      if (HSA_STATUS_SUCCESS != hsa_signal_create(0, 0, nullptr, &profiling_signal.signal_)) {
-        LogPrintfError("Failed signal allocation id = %d", pool_size);
-      }
-      signal_pool_.push_back(profiling_signal);
-      assert(queueSize >= signal_pool_.size() && "Pool will be reallocated!");
-    }
-    // Move index inside the valid pool
-    ++current_signal_ %= signal_pool_.size();
-    // Find signal slot
-    ProfilingSignal* profilingSignal = &signal_pool_[current_signal_];
-    // Make sure we save the old results in the TS structure
-    if (profilingSignal->ts_ != nullptr) {
-      profilingSignal->ts_->checkGpuTime();
-    }
-    if (timestamp_ != nullptr) {
-      // Update the new TS with the signal info
-      timestamp_->setProfilingSignal(profilingSignal);
-      profilingSignal->ts_ = timestamp_;
-      timestamp_->setAgent(gpu_device_);
-    }
-    packet->completion_signal = profilingSignal->signal_;
-    hsa_signal_store_relaxed(profilingSignal->signal_, kInitSignalValueOne);
+    // Get active signal for current dispatch if profiling is necessary
+    packet->completion_signal = Barriers().ActiveSignal(kInitSignalValueOne, timestamp_, pool_size);
   } else {
     assert(signal.handle != 0);
     packet->completion_signal = signal;
@@ -705,44 +650,25 @@ void VirtualGPU::ResetQueueStates() {
     // Release the pool, since runtime just completed a barrier
     // @note: Runtime can reset kernel arg pool only if the barrier with L2 invalidation was issued
     resetKernArgPool();
-  } else {
-    // Reset the pool of signals
-    current_signal_ = 0;
   }
 }
 
 // ================================================================================================
-bool VirtualGPU::releaseGpuMemoryFence(bool force_barrier) {
-  // Return if there is no pending dispatch
-  if (!hasPendingDispatch_) {
-    if (dev().settings().barrier_sync_ || !force_barrier) {
-      return false;
-    }
-  }
-  hsa_signal_t wait_signal = barrier_signal_;
+bool VirtualGPU::releaseGpuMemoryFence(bool force_barrier, bool skip_cpu_wait) {
+  if (hasPendingDispatch_ && (dev().settings().barrier_sync_ || force_barrier)) {
+    barrier_packet_.completion_signal = Barriers().ActiveSignal();
 
-  // If barrier sync was requested or runtime didn't provide the last signal
-  if (dev().settings().barrier_sync_ || force_barrier) {
-    // Initialize signal for the barrier packet.
-    hsa_signal_store_relaxed(barrier_signal_, kInitSignalValueOne);
-
-    // Dispatch barrier packet into the queue and wait till it finishes.
+    // Dispatch barrier packet into the queue; the tracker signal set above marks its completion
     dispatchBarrierPacket(&barrier_packet_);
-  }
-  else {
-    // Take the signal of the last submitted dispatch
-    wait_signal = last_signal_;
+    hasPendingDispatch_ = false;
   }
 
-  // Wait for compute work previously submitted
-  if (!WaitForSignal(wait_signal)) {
-    LogError("Waiting for compute work failed!");
-    return false;
+  // Block the CPU on the tracker's current signal, unless the caller asked to skip the wait
+  if (!skip_cpu_wait) {
+    Barriers().WaitCurrent();  // NOTE(review): a failed wait is ignored, true is still returned
+
+    ResetQueueStates();
   }
-
-  hasPendingDispatch_ = false;
-
-  ResetQueueStates();
   return true;
 }
 
@@ -800,7 +726,6 @@ VirtualGPU::VirtualGPU(Device& device, bool profiling, bool cooperative,
   }
 
   aqlHeader_ = dispatchPacketHeader_;
-  barrier_signal_.handle = 0;
 
   // Note: Virtual GPU device creation must be a thread safe operation
   roc_device_.vgpus_.resize(roc_device_.numOfVgpus_);
@@ -808,16 +733,13 @@ VirtualGPU::VirtualGPU(Device& device, bool profiling, bool cooperative,
 
 }
 
+// ================================================================================================
 VirtualGPU::~VirtualGPU() {
   delete blitMgr_;
 
   // Release the resources of signal
   releaseGpuMemoryFence();
 
-  if (barrier_signal_.handle != 0) {
-    hsa_signal_destroy(barrier_signal_);
-  }
-
   destroyPool();
 
   releasePinnedMem();
@@ -868,8 +790,7 @@ bool VirtualGPU::create() {
   gpu_queue_ = roc_device_.acquireQueue(queue_size, cooperative_, cuMask_, priority_);
   if (!gpu_queue_) return false;
 
-  if (!initPool(dev().settings().kernargPoolSize_,
-                (profiling_ || (amd::IS_HIP)) ? queue_size : 0)) {
+  if (!initPool(dev().settings().kernargPoolSize_)) {
     LogError("Couldn't allocate arguments/signals for the queue");
     return false;
   }
@@ -881,17 +802,9 @@ bool VirtualGPU::create() {
     return false;
   }
 
-  // Create signal for the barrier packet.
-  hsa_signal_t signal = {0};
-  if (HSA_STATUS_SUCCESS != hsa_signal_create(kInitSignalValueOne, 0, nullptr, &signal)) {
-    return false;
-  }
-  barrier_signal_ = signal;
-
   // Initialize barrier packet.
   memset(&barrier_packet_, 0, sizeof(barrier_packet_));
   barrier_packet_.header = kInvalidAql;
-  barrier_packet_.completion_signal = barrier_signal_;
 
   // Create a object of PrintfDbg
   printfdbg_ = new PrintfDbg(roc_device_);
@@ -912,62 +825,32 @@ bool VirtualGPU::create() {
     return false;
   }
 
+  // Allocate signal tracker for ROCr copy queue
+  if (!Barriers().Create(gpu_device())) {
+    LogError("Could not create signal for copy queue!");
+    return false;
+  }
   return true;
 }
 
-bool VirtualGPU::initPool(size_t kernarg_pool_size, uint signal_pool_count) {
+// ================================================================================================
+bool VirtualGPU::initPool(size_t kernarg_pool_size) {
   kernarg_pool_size_ = kernarg_pool_size;
   kernarg_pool_base_ = reinterpret_cast<char*>(roc_device_.hostAlloc(kernarg_pool_size_, false));
   if (kernarg_pool_base_ == nullptr) {
     return false;
   }
-
-  // Optimization :
-  // For better resource utilization runtime should create them only when required
-  // In case of HIP, Apps create short live streams which do not need more signals
-  // hence starting with smaller number 32. There is code inplace to grow the pool
-  // later when it is needed.
-  bool forced_default_pool_sz = false;
-  if (!profiling_ && (amd::IS_HIP)) {
-    forced_default_pool_sz = true;
-  }
-
-  if (signal_pool_count != 0) {
-    // Reserve signal pool for all entries in the queue, since profiling logic will save the
-    // pointer in timestamp info for the future references
-    signal_pool_.reserve(signal_pool_count);
-    // If barrier is disable, then allocate a small portion of all signals and grow the array later.
-    // @note: the optimization requires a wait for signal on reuse, which is only available when
-    // the barrier is disabled
-    constexpr uint32_t kDefaultSignalPoolSize = 32;
-    const uint32_t default_signal_pool_size =
-    (dev().settings().barrier_sync_ && !forced_default_pool_sz) ?
-        signal_pool_count : kDefaultSignalPoolSize;
-    signal_pool_.resize(default_signal_pool_size);
-    for (uint i = 0; i < default_signal_pool_size; ++i) {
-      ProfilingSignal profilingSignal;
-      if (HSA_STATUS_SUCCESS != hsa_signal_create(0, 0, nullptr, &profilingSignal.signal_)) {
-        return false;
-      }
-      signal_pool_[i] = profilingSignal;
-    }
-  }
-
   return true;
 }
 
+// ================================================================================================
 void VirtualGPU::destroyPool() {
   if (kernarg_pool_base_ != nullptr) {
     roc_device_.hostFree(kernarg_pool_base_, kernarg_pool_size_);
   }
-
-  if (signal_pool_.size() > 0) {
-    for (uint i = 0; i < signal_pool_.size(); ++i) {
-      hsa_signal_destroy(signal_pool_[i].signal_);
-    }
-  }
 }
 
+// ================================================================================================
 void* VirtualGPU::allocKernArg(size_t size, size_t alignment) {
   char* result = nullptr;
   do {
@@ -982,24 +865,21 @@ void* VirtualGPU::allocKernArg(size_t size, size_t alignment) {
       //! We can issue a barrier to avoid expensive extra memory allocations.
 
       // Initialize signal for the barrier packet.
-      hsa_signal_store_relaxed(barrier_signal_, kInitSignalValueOne);
+      barrier_packet_.completion_signal = Barriers().ActiveSignal();
 
       // Dispatch barrier packet into the queue and wait till it finishes.
       dispatchBarrierPacket(&barrier_packet_);
-      if (!WaitForSignal(barrier_signal_)) {
+      if (!Barriers().WaitCurrent()) {
         LogError("Kernel arguments reset failed");
       }
-
       resetKernArgPool();
-
-      // Reset the pool of signals
-      current_signal_ = 0;
     }
   } while (true);
 
   return result;
 }
 
+// ================================================================================================
 /* profilingBegin, when profiling is enabled, creates a timestamp to save in
 * virtualgpu's timestamp_, and calls start() to get the current host
 * timestamp.
@@ -1007,9 +887,8 @@ void* VirtualGPU::allocKernArg(size_t size, size_t alignment) {
 void VirtualGPU::profilingBegin(amd::Command& command, bool drmProfiling) {
   if (command.profilingInfo().enabled_) {
     if (timestamp_ != nullptr) {
-      LogWarning(
-          "Trying to create a second timestamp in VirtualGPU. \
-                        This could have unintended consequences.");
+      LogWarning("Trying to create a second timestamp in VirtualGPU. \
+                  This could have unintended consequences.");
       return;
     }
     // Without barrier profiling will wait for each individual signal
@@ -1018,6 +897,7 @@ void VirtualGPU::profilingBegin(amd::Command& command, bool drmProfiling) {
   }
 }
 
+// ================================================================================================
 /* profilingEnd, when profiling is enabled, checks to see if a signal was
 * created for whatever command we are running and calls end() to get the
 * current host timestamp if no signal is available. It then saves the pointer
@@ -1033,6 +913,7 @@ void VirtualGPU::profilingEnd(amd::Command& command) {
   }
 }
 
+// ================================================================================================
 void VirtualGPU::updateCommandsState(amd::Command* list) {
   Timestamp* ts = nullptr;
 
@@ -1335,8 +1216,10 @@ void VirtualGPU::submitSvmFreeMemory(amd::SvmFreeMemoryCommand& cmd) {
 // ================================================================================================
 void VirtualGPU::submitSvmPrefetchAsync(amd::SvmPrefetchAsyncCommand& cmd) {
 #if AMD_HMM_SUPPORT
+  profilingBegin(cmd);
   // Initialize signal for the barrier
-  hsa_signal_store_relaxed(barrier_signal_, kInitSignalValueOne);
+  hsa_signal_t wait = Barriers().WaitSignal();
+  hsa_signal_t active = Barriers().ActiveSignal(kInitSignalValueOne, timestamp_); 
 
   // Find the requested agent for the transfer
   hsa_agent_t agent = (cmd.cpu_access() ||
@@ -1345,16 +1228,18 @@ void VirtualGPU::submitSvmPrefetchAsync(amd::SvmPrefetchAsyncCommand& cmd) {
 
   // Initiate a prefetch command
   hsa_status_t status = hsa_amd_svm_prefetch_async(
-      const_cast<void*>(cmd.dev_ptr()), cmd.count(), agent, 0, nullptr, barrier_signal_);
+      const_cast<void*>(cmd.dev_ptr()), cmd.count(), agent, 1, &wait, active);
 
-  // Wait for the prefetch
-  if ((status != HSA_STATUS_SUCCESS) || !WaitForSignal(barrier_signal_)) {
+  // Wait for the prefetch. Should skip wait, but may require extra tracking for kernel execution.
+  if ((status != HSA_STATUS_SUCCESS) || !Barriers().WaitCurrent()) {
     LogError("hsa_amd_svm_prefetch_async failed");
     cmd.setStatus(CL_INVALID_OPERATION);
   }
 
   // Add system scope, since the prefetch scope is unclear
   addSystemScope();
+
+  profilingEnd(cmd);
 #endif // AMD_HMM_SUPPORT
 }
 
@@ -2490,9 +2375,10 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
   }
 
   if (gpuKernel.dynamicParallelism()) {
+    barrier_packet_.completion_signal.handle = 0;
     dispatchBarrierPacket(&barrier_packet_);
     static_cast<KernelBlitManager&>(blitMgr()).runScheduler(
-      getVQVirtualAddress(), schedulerParam_, schedulerQueue_, schedulerSignal_, schedulerThreads_);
+        getVQVirtualAddress(), schedulerParam_, schedulerQueue_, schedulerSignal_, schedulerThreads_);
   }
 
   // Check if image buffer write back is required
@@ -2594,9 +2480,6 @@ void VirtualGPU::submitMarker(amd::Marker& vcmd) {
       uint16_t header = kNopPacketHeader;
       hsa_signal_t sig { 0 };
       dispatchGenericBarrierPacket(&barrier_packet_, header, sig);
-      last_signal_ = barrier_packet_.completion_signal;
-      // Restore barrier signal
-      barrier_packet_.completion_signal = barrier_signal_;
     }
     profilingEnd(vcmd);
   }
@@ -2618,8 +2501,8 @@ void VirtualGPU::submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& vcmd) {
   profilingBegin(vcmd);
   if (!dev().settings().barrier_sync_) {
     // Force barrier to make sure L2 flush, since interop can be in sysmem
-    constexpr bool ForceBarrier = true;
-    releaseGpuMemoryFence(ForceBarrier);
+    constexpr bool kForceBarrier = true;
+    releaseGpuMemoryFence(kForceBarrier);
   }
   profilingEnd(vcmd);
 }
@@ -2644,6 +2527,9 @@ void VirtualGPU::flush(amd::Command* list, bool wait) {
 
 // ================================================================================================
 void VirtualGPU::addXferWrite(Memory& memory) {
+  //! @note: ROCr backend doesn't have per resource busy tracking, hence runtime has to wait
+  //!        unconditionally, before it can release pinned memory
+  releaseGpuMemoryFence();
   if (xferWriteBuffers_.size() > 7) {
     dev().xferWrite().release(*this, *xferWriteBuffers_.front());
     xferWriteBuffers_.erase(xferWriteBuffers_.begin());
@@ -2653,6 +2539,7 @@ void VirtualGPU::addXferWrite(Memory& memory) {
   xferWriteBuffers_.push_back(&memory);
 }
 
+// ================================================================================================
 void VirtualGPU::releaseXferWrite() {
   for (auto& memory : xferWriteBuffers_) {
     dev().xferWrite().release(*this, *memory);
@@ -2660,7 +2547,11 @@ void VirtualGPU::releaseXferWrite() {
   xferWriteBuffers_.resize(0);
 }
 
+// ================================================================================================
 void VirtualGPU::addPinnedMem(amd::Memory* mem) {
+  //! @note: ROCr backend doesn't have per resource busy tracking, hence runtime has to wait
+  //!        unconditionally, before it can release pinned memory
+  releaseGpuMemoryFence();
   if (nullptr == findPinnedMem(mem->getHostMem(), mem->getSize())) {
     if (pinnedMems_.size() > 7) {
       pinnedMems_.front()->release();
@@ -2672,6 +2563,7 @@ void VirtualGPU::addPinnedMem(amd::Memory* mem) {
   }
 }
 
+// ================================================================================================
 void VirtualGPU::releasePinnedMem() {
   for (auto& amdMemory : pinnedMems_) {
     amdMemory->release();
@@ -2679,6 +2571,7 @@ void VirtualGPU::releasePinnedMem() {
   pinnedMems_.resize(0);
 }
 
+// ================================================================================================
 amd::Memory* VirtualGPU::findPinnedMem(void* addr, size_t size) {
   for (auto& amdMemory : pinnedMems_) {
     if ((amdMemory->getHostMem() == addr) && (size <= amdMemory->getSize())) {
diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.hpp b/projects/clr/rocclr/device/rocm/rocvirtual.hpp
index 7f8b93f49d..413159cf3f 100644
--- a/projects/clr/rocclr/device/rocm/rocvirtual.hpp
+++ b/projects/clr/rocclr/device/rocm/rocvirtual.hpp
@@ -36,10 +36,11 @@ class Memory;
 class Timestamp;
 
 struct ProfilingSignal : public amd::HeapObject {
-  hsa_signal_t signal_;  //!< HSA signal to track profiling information
-  Timestamp* ts_;        //!< Timestamp object associated with the signal
+  hsa_signal_t signal_; //!< HSA signal to track the operation and its profiling information
+  Timestamp* ts_;       //!< Timestamp object associated with the signal, nullptr if none
+  bool done_;           //!< True when no operation is pending on the signal (initial state)
 
-  ProfilingSignal() : ts_(nullptr) { signal_.handle = 0; }
+  ProfilingSignal() : ts_(nullptr), done_(true) { signal_.handle = 0; }
 };
 
 // Initial HSA signal value
@@ -111,13 +112,19 @@ class Timestamp {
       hsa_amd_profiling_dispatch_time_t time;
 
       if (splittedDispatch_) {
-        uint64_t start = UINT64_MAX;
+        uint64_t start = std::numeric_limits<uint64_t>::max();
         uint64_t end = 0;
         for (auto it = splittedSignals_.begin(); it < splittedSignals_.end(); it++) {
           if (hsa_signal_load_relaxed(profilingSignal_->signal_) > 0) {
             WaitForSignal(*it);
           }
           hsa_amd_profiling_get_dispatch_time(agent_, *it, &time);
+          if ((time.end - time.start) == 0) {
+            hsa_amd_profiling_async_copy_time_t time_sdma = {};
+            hsa_amd_profiling_get_async_copy_time(profilingSignal_->signal_, &time_sdma);
+            time.start = time_sdma.start;
+            time.end = time_sdma.end;
+          }
           if (time.start < start) {
             start = time.start;
           }
@@ -133,10 +140,18 @@ class Timestamp {
           WaitForSignal(profilingSignal_->signal_);
         }
         hsa_amd_profiling_get_dispatch_time(agent_, profilingSignal_->signal_, &time);
-        start_ = time.start * ticksToTime_;
-        end_ = time.end * ticksToTime_;
+        if ((time.end - time.start) == 0) {
+          hsa_amd_profiling_async_copy_time_t time_sdma = {};
+          hsa_amd_profiling_get_async_copy_time(profilingSignal_->signal_, &time_sdma);
+          start_ = time_sdma.start * ticksToTime_;
+          end_ = time_sdma.end * ticksToTime_;
+        } else {
+          start_ = time.start * ticksToTime_;
+          end_ = time.end * ticksToTime_;
+        }
       }
       profilingSignal_->ts_ = nullptr;
+      profilingSignal_->done_ = true;
       profilingSignal_ = nullptr;
     }
   }
@@ -192,6 +207,109 @@ class VirtualGPU : public device::VirtualDevice {
     size_t maxMemObjectsInQueue_;     //!< Maximum number of mem objects in the queue
   };
 
+  class HwQueueTracker : public amd::EmbeddedObject {
+   public:
+    HwQueueTracker() {}
+
+    ~HwQueueTracker() {
+      for (auto* signal : signal_list_) {
+        if (signal->signal_.handle != 0) {
+          hsa_signal_destroy(signal->signal_);
+        }
+        delete signal;
+      }
+    }
+
+    //! Creates the initial pool of signals for tracking HW operations; false on any failure
+    bool Create(hsa_agent_t agent) {
+      constexpr size_t kSignalListSize = 16;
+      signal_list_.reserve(kSignalListSize);
+      for (size_t i = 0; i < kSignalListSize; ++i) {
+        ProfilingSignal* signal = new ProfilingSignal();
+        if (signal == nullptr) return false;
+        signal_list_.push_back(signal);  // The list owns it now: no leak or null slot on failure
+        if (HSA_STATUS_SUCCESS != hsa_signal_create(0, 1, &agent, &signal->signal_)) {
+          return false;
+        }
+      }
+      agent_ = agent;
+      return true;
+    }
+
+    //! Picks the next signal slot, waits until it is free, re-arms it with init_val, returns it
+    hsa_signal_t ActiveSignal(hsa_signal_value_t init_val = kInitSignalValueOne,
+                              Timestamp* ts = nullptr, uint32_t queue_size = 0) {
+      // If queue size grows, then add more signals to avoid more frequent stalls
+      if (queue_size > signal_list_.size()) {
+        ProfilingSignal* signal = new ProfilingSignal();
+        if ((signal != nullptr) &&
+            (HSA_STATUS_SUCCESS == hsa_signal_create(0, 1, &agent_, &signal->signal_))) {
+          signal_list_.push_back(signal);
+        } else {
+          delete signal;  // Growth is best effort: don't leak the slot if HSA signal creation fails
+        }
+      }
+      // Find valid index
+      ++current_id_ %= signal_list_.size();
+
+      // Make sure the previous operation on the current signal is done
+      WaitCurrent();
+
+      // Have to wait for the next signal in the queue to avoid a race condition between
+      // a GPU waiter (which may not be triggered yet) and the CPU signal reset below
+      WaitNext();
+
+      // Reset the signal and return
+      hsa_signal_silent_store_relaxed(signal_list_[current_id_]->signal_, init_val);
+      signal_list_[current_id_]->done_ = false;
+      if (ts != nullptr) {
+        if (!sdma_profiling_) {
+          hsa_amd_profiling_async_copy_enable(true);
+          sdma_profiling_ = true;
+        }
+        signal_list_[current_id_]->ts_ = ts;
+        ts->setProfilingSignal(signal_list_[current_id_]);
+        ts->setAgent(agent_);
+      }
+      return signal_list_[current_id_]->signal_;
+    }
+
+    //! Wait for the current active signal. Can idle the queue
+    bool WaitCurrent() { return WaitIndex(current_id_); }
+
+    //! Returns the last submitted signal for a wait
+    hsa_signal_t WaitSignal() const { return signal_list_[current_id_]->signal_; }
+
+   private:
+    //! Wait for the next active signal
+    void WaitNext() {
+      size_t next = (current_id_ + 1) % signal_list_.size();
+      WaitIndex(next);
+    }
+
+    //! Wait for the signal at the given index; returns false only if the wait itself failed
+    bool WaitIndex(size_t index) {
+      // Nothing to wait for if the pool was never created or the slot is already done
+      if ((index < signal_list_.size()) && !signal_list_[index]->done_) {
+        // Update timestamp values if requested (checkGpuTime() also waits for the signal)
+        if (signal_list_[index]->ts_ != nullptr) {
+          signal_list_[index]->ts_->checkGpuTime();
+        } else {
+          if (!WaitForSignal(signal_list_[index]->signal_)) {
+            LogPrintfError("Failed signal [0x%lx] wait", signal_list_[index]->signal_.handle);
+            return false;
+          }
+          signal_list_[index]->done_ = true;
+        }
+      }
+      return true;
+    }
+    std::vector<ProfilingSignal*> signal_list_;  //!< The pool of all signals for processing
+    size_t      current_id_ = 0;          //!< Last submitted signal
+    hsa_agent_t agent_ = {};              //!< HSA device agent
+    bool        sdma_profiling_ = false;  //!< Don't enable SDMA profiling by default
+  };
+
   VirtualGPU(Device& device, bool profiling = false, bool cooperative = false,
              const std::vector<uint32_t>& cuMask = {},
              amd::CommandQueue::Priority priority = amd::CommandQueue::Priority::Normal);
@@ -256,7 +374,7 @@ class VirtualGPU : public device::VirtualDevice {
    *
    * @return bool true if Wait returned successfully, false otherwise
    */
-  bool releaseGpuMemoryFence(bool force_barrier    = false);
+  bool releaseGpuMemoryFence(bool force_barrier = false, bool skip_cpu_wait = false);
 
   hsa_agent_t gpu_device() { return gpu_device_; }
   hsa_queue_t* gpu_queue() { return gpu_queue_; }
@@ -297,6 +415,10 @@ class VirtualGPU : public device::VirtualDevice {
   void addSystemScope() { addSystemScope_ = true; }
   void SetCopyCommandType(cl_command_type type) { copy_command_type_ = type; }
 
+  HwQueueTracker& Barriers() { return barriers_; }  //!< Signal tracker owned by this virtual GPU
+
+  Timestamp* timestamp() const { return timestamp_; }  //!< Current timestamp_, may be nullptr
+
   // } roc OpenCL integration
  private:
   bool dispatchAqlPacket(hsa_kernel_dispatch_packet_t* packet, uint16_t header,
@@ -316,7 +438,7 @@ class VirtualGPU : public device::VirtualDevice {
   void initializeDispatchPacket(hsa_kernel_dispatch_packet_t* packet,
                                 amd::NDRangeContainer& sizes);
 
-  bool initPool(size_t kernarg_pool_size, uint signal_pool_count);
+  bool initPool(size_t kernarg_pool_size);
   void destroyPool();
 
   void* allocKernArg(size_t size, size_t alignment);
@@ -368,7 +490,7 @@ class VirtualGPU : public device::VirtualDevice {
       uint32_t cooperative_        : 1; //!< Cooperative launch is enabled
       uint32_t addSystemScope_     : 1; //!< Insert a system scope to the next aql
       uint32_t isLastCommandSDMA_  : 1; //!< Keep track if the last command was SDMA and 
-                                               //!< not send Barrier packets if barrier_sync is 0
+                                        //!< not send Barrier packets if barrier_sync is 0
     };
     uint32_t  state_;
   };
@@ -379,8 +501,7 @@ class VirtualGPU : public device::VirtualDevice {
   hsa_agent_t gpu_device_;  //!< Physical device
   hsa_queue_t* gpu_queue_;  //!< Queue associated with a gpu
   hsa_barrier_and_packet_t barrier_packet_;
-  hsa_signal_t barrier_signal_;
-  hsa_signal_t last_signal_ = {};  //!< Last submitted signal
+
   uint32_t dispatch_id_;  //!< This variable must be updated atomically.
   Device& roc_device_;    //!< roc device object
   PrintfDbg* printfdbg_;
@@ -396,12 +517,12 @@ class VirtualGPU : public device::VirtualDevice {
   hsa_queue_t* schedulerQueue_;
   hsa_signal_t schedulerSignal_;
 
+  HwQueueTracker  barriers_;      //!< Tracks active barriers in ROCr
+
   char* kernarg_pool_base_;
   size_t kernarg_pool_size_;
   uint kernarg_pool_cur_offset_;
 
-  std::vector<ProfilingSignal> signal_pool_;  //!< Pool of signals for profiling
-  uint32_t current_signal_ = 0;               //!< Current avaialble signal in the pool
   friend class Timestamp;
 
   //  PM4 packet for gfx8 performance counter