Add HSA signal global tracking logic.

Implement a global class for tracking HSA signals per device queue. Switch to the new tracking mechanism. Change-Id: I3c4dda04b34e6d18d6a95510d84102909633b415 [ROCm/clr commit: 8698aeef0d]
2021-01-07 16:41:30 -05:00
parent a962e2d0b3
commit 30cf81fc93
6 changed files with 284 additions and 289 deletions
@@ -62,12 +62,14 @@ bool DmaBlitManager::readMemoryStaged(Memory& srcMemory, void* dstHost, Memory&
 bool DmaBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
                                const amd::Coord3D& origin, const amd::Coord3D& size,
                                bool entire) const {
-  // HSA copy functionality with a possible async operaiton, hence make sure GPU is done
-  gpu().releaseGpuMemoryFence();
+  // HSA copy functionality with a possible async operation
+  gpu().releaseGpuMemoryFence(kIgnoreBarrier, kSkipCpuWait);

  // Use host copy if memory has direct access
  if (setup_.disableReadBuffer_ ||
      (srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached())) {
+    // Stall GPU before CPU access
+    gpu().Barriers().WaitCurrent();
    return HostBlitManager::readBuffer(srcMemory, dstHost, origin, size, entire);
  } else {
    size_t srcSize = size[0];
@@ -149,12 +151,14 @@ bool DmaBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
 bool DmaBlitManager::readBufferRect(device::Memory& srcMemory, void* dstHost,
                                    const amd::BufferRect& bufRect, const amd::BufferRect& hostRect,
                                    const amd::Coord3D& size, bool entire) const {
-  // HSA copy functionality with a possible async operaiton, hence make sure GPU is done
+  // HSA copy functionality with a possible async operation
  gpu().releaseGpuMemoryFence();

  // Use host copy if memory has direct access
  if (setup_.disableReadBufferRect_ ||
      (srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached())) {
+    // Stall GPU before CPU access
+    gpu().Barriers().WaitCurrent();
    return HostBlitManager::readBufferRect(srcMemory, dstHost, bufRect, hostRect, size, entire);
  } else {
    Memory& xferBuf = dev().xferRead().acquire();
@@ -187,7 +191,7 @@ bool DmaBlitManager::readBufferRect(device::Memory& srcMemory, void* dstHost,
 bool DmaBlitManager::readImage(device::Memory& srcMemory, void* dstHost, const amd::Coord3D& origin,
                               const amd::Coord3D& size, size_t rowPitch, size_t slicePitch,
                               bool entire) const {
-  // HSA copy functionality with a possible async operaiton, hence make sure GPU is done
+  // HSA copy functionality with a possible async operation
  gpu().releaseGpuMemoryFence();

  if (setup_.disableReadImage_) {
@@ -219,14 +223,16 @@ bool DmaBlitManager::writeMemoryStaged(const void* srcHost, Memory& dstMemory, M
 bool DmaBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemory,
                                 const amd::Coord3D& origin, const amd::Coord3D& size,
                                 bool entire) const {
-  // HSA copy functionality with a possible async operaiton, hence make sure GPU is done
-  gpu().releaseGpuMemoryFence();
-
  // Use host copy if memory has direct access
  if (setup_.disableWriteBuffer_ || dstMemory.isHostMemDirectAccess() ||
      gpuMem(dstMemory).IsPersistentDirectMap()) {
+    // Stall GPU before CPU access
+    gpu().releaseGpuMemoryFence();
    return HostBlitManager::writeBuffer(srcHost, dstMemory, origin, size, entire);
  } else {
+    // HSA copy functionality with a possible async operation
+    gpu().releaseGpuMemoryFence(kIgnoreBarrier, kSkipCpuWait);
+
    size_t dstSize = size[0];
    size_t tmpSize = 0;
    size_t offset = 0;
@@ -309,7 +315,7 @@ bool DmaBlitManager::writeBufferRect(const void* srcHost, device::Memory& dstMem
                                     const amd::BufferRect& hostRect,
                                     const amd::BufferRect& bufRect, const amd::Coord3D& size,
                                     bool entire) const {
-  // HSA copy functionality with a possible async operaiton, hence make sure GPU is done
+  // HSA copy functionality with a possible async operation
  gpu().releaseGpuMemoryFence();

  // Use host copy if memory has direct access
@@ -347,7 +353,7 @@ bool DmaBlitManager::writeBufferRect(const void* srcHost, device::Memory& dstMem
 bool DmaBlitManager::writeImage(const void* srcHost, device::Memory& dstMemory,
                                const amd::Coord3D& origin, const amd::Coord3D& size,
                                size_t rowPitch, size_t slicePitch, bool entire) const {
-  // HSA copy functionality with a possible async operaiton, hence make sure GPU is done
+  // HSA copy functionality with a possible async operation
  gpu().releaseGpuMemoryFence();

  if (setup_.disableWriteImage_) {
@@ -365,12 +371,11 @@ bool DmaBlitManager::writeImage(const void* srcHost, device::Memory& dstMemory,
 bool DmaBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& dstMemory,
                                const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin,
                                const amd::Coord3D& size, bool entire) const {
-  // HSA copy functionality with a possible async operaiton, hence make sure GPU is done
-  gpu().releaseGpuMemoryFence();
-
  if (setup_.disableCopyBuffer_ ||
      (srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached() &&
      (dev().agent_profile() != HSA_PROFILE_FULL) && dstMemory.isHostMemDirectAccess())) {
+    // Stall GPU before CPU access
+    gpu().releaseGpuMemoryFence();
    return HostBlitManager::copyBuffer(srcMemory, dstMemory, srcOrigin, dstOrigin, size);
  } else {
    return hsaCopy(gpuMem(srcMemory), gpuMem(dstMemory), srcOrigin, dstOrigin, size);
@@ -383,14 +388,14 @@ bool DmaBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& dstMe
 bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& dstMemory,
                                    const amd::BufferRect& srcRect, const amd::BufferRect& dstRect,
                                    const amd::Coord3D& size, bool entire) const {
-  // HSA copy functionality with a possible async operaiton, hence make sure GPU is done
-  gpu().releaseGpuMemoryFence();
-
  if (setup_.disableCopyBufferRect_ ||
      (srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached() &&
       dstMemory.isHostMemDirectAccess())) {
+    // Stall GPU before CPU access
+    gpu().releaseGpuMemoryFence();
    return HostBlitManager::copyBufferRect(srcMemory, dstMemory, srcRect, dstRect, size, entire);
  } else {
+    gpu().releaseGpuMemoryFence(kIgnoreBarrier, kSkipCpuWait);

    void* src = gpuMem(srcMemory).getDeviceMemory();
    void* dst = gpuMem(dstMemory).getDeviceMemory();
@@ -436,25 +441,21 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d
    }

    if (isSubwindowRectCopy ) {
-      hsa_signal_store_relaxed(completion_signal_, kInitSignalValueOne);
+      hsa_signal_t wait = gpu().Barriers().WaitSignal();
+      hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp());

      // Copy memory line by line
-      hsa_status_t status =
-          hsa_amd_memory_async_copy_rect(&dstMem, &offset, &srcMem, &offset, &dim, agent,
-                                    direction, 0, nullptr, completion_signal_);
+      hsa_status_t status = hsa_amd_memory_async_copy_rect(&dstMem, &offset,
+          &srcMem, &offset, &dim, agent, direction, 1, &wait, active);
      if (status != HSA_STATUS_SUCCESS) {
        LogPrintfError("DMA buffer failed with code %d", status);
        return false;
      }
-
-      if (!WaitForSignal(completion_signal_)) {
-        LogError("Async copy failed");
-        return false;
-      }
    } else {
      // Fall to line by line copies
      const hsa_signal_value_t kInitVal = size[2] * size[1];
-      hsa_signal_store_relaxed(completion_signal_, kInitVal);
+      hsa_signal_t wait = gpu().Barriers().WaitSignal();
+      hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitVal, gpu().timestamp());

      for (size_t z = 0; z < size[2]; ++z) {
        for (size_t y = 0; y < size[1]; ++y) {
@@ -462,10 +463,10 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d
          size_t dstOffset = dstRect.offset(0, y, z);

          // Copy memory line by line
-          hsa_status_t status =
-              hsa_amd_memory_async_copy((reinterpret_cast<address>(dst) + dstOffset), dstAgent,
-                                        (reinterpret_cast<const_address>(src) + srcOffset), srcAgent,
-                                        size[0], 0, nullptr, completion_signal_);
+          hsa_status_t status = hsa_amd_memory_async_copy(
+              (reinterpret_cast<address>(dst) + dstOffset), dstAgent,
+              (reinterpret_cast<const_address>(src) + srcOffset), srcAgent,
+              size[0], 1, &wait, active);
          gpu().setLastCommandSDMA(true) ;
          if (status != HSA_STATUS_SUCCESS) {
            LogPrintfError("DMA buffer failed with code %d", status);
@@ -473,14 +474,10 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d
          }
        }
      }
-
-      if (!WaitForSignal(completion_signal_)) {
-        LogError("Async copy failed");
-        return false;
-      }
    }
-
  }
+  // Explicit wait for now, until the runtime can distinguish compute and SDMA operations
+  gpu().Barriers().WaitCurrent();
  return true;
 }

@@ -489,12 +486,9 @@ bool DmaBlitManager::copyImageToBuffer(device::Memory& srcMemory, device::Memory
                                       const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin,
                                       const amd::Coord3D& size, bool entire, size_t rowPitch,
                                       size_t slicePitch) const {
-  // HSA copy functionality with a possible async operaiton, hence make sure GPU is done
-  if (!dev().settings().barrier_sync_ && !gpu().isLastCommandSDMA()) {
-    gpu().releaseGpuMemoryFence(true);
-  } else {
-    gpu().releaseGpuMemoryFence();
-  }
+  // HSA copy functionality with a possible async operation, hence make sure GPU is done
+  bool force_barrier = !dev().settings().barrier_sync_ && !gpu().isLastCommandSDMA();
+  gpu().releaseGpuMemoryFence(force_barrier);

  bool result = false;

@@ -504,9 +498,6 @@ bool DmaBlitManager::copyImageToBuffer(device::Memory& srcMemory, device::Memory
  } else {
    Image& srcImage = static_cast<roc::Image&>(srcMemory);
    Buffer& dstBuffer = static_cast<roc::Buffer&>(dstMemory);
-
-    // Use ROC path for a transfer
-    // Note: it doesn't support SDMA
    address dstHost = reinterpret_cast<address>(dstBuffer.getDeviceMemory()) + dstOrigin[0];

    // Use ROCm path for a transfer.
@@ -540,12 +531,9 @@ bool DmaBlitManager::copyBufferToImage(device::Memory& srcMemory, device::Memory
                                       const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin,
                                       const amd::Coord3D& size, bool entire, size_t rowPitch,
                                       size_t slicePitch) const {
-  // HSA copy functionality with a possible async operaiton, hence make sure GPU is done
-  if (!dev().settings().barrier_sync_ && !gpu().isLastCommandSDMA()) {
-    gpu().releaseGpuMemoryFence(true);
-  } else {
-    gpu().releaseGpuMemoryFence();
-  }
+  // HSA copy functionality with a possible async operation, hence make sure GPU is done
+  bool force_barrier = !dev().settings().barrier_sync_ && !gpu().isLastCommandSDMA();
+  gpu().releaseGpuMemoryFence(force_barrier);

  bool result = false;

@@ -588,7 +576,7 @@ bool DmaBlitManager::copyBufferToImage(device::Memory& srcMemory, device::Memory
 bool DmaBlitManager::copyImage(device::Memory& srcMemory, device::Memory& dstMemory,
                               const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin,
                               const amd::Coord3D& size, bool entire) const {
-  // HSA copy functionality with a possible async operaiton, hence make sure GPU is done
+  // HSA copy functionality with a possible async operation, hence make sure GPU is done
  gpu().releaseGpuMemoryFence();

  bool result = false;
@@ -610,9 +598,8 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory,
  address src = reinterpret_cast<address>(srcMemory.getDeviceMemory());
  address dst = reinterpret_cast<address>(dstMemory.getDeviceMemory());

-  if (!dev().settings().barrier_sync_ && !gpu().isLastCommandSDMA()) {
-    gpu().releaseGpuMemoryFence(true);
-  }
+  bool force_barrier = !dev().settings().barrier_sync_ && !gpu().isLastCommandSDMA();
+  gpu().releaseGpuMemoryFence(force_barrier, kSkipCpuWait);

  src += srcOrigin[0];
  dst += dstOrigin[0];
@@ -620,6 +607,8 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory,
  // Just call copy function for full profile
  hsa_status_t status;
  if (dev().agent_profile() == HSA_PROFILE_FULL) {
+    // Stall GPU, since CPU copy is possible
+    gpu().Barriers().WaitCurrent();
    status = hsa_memory_copy(dst, src, size[0]);
    if (status != HSA_STATUS_SUCCESS) {
      LogPrintfError("Hsa copy of data failed with code %d", status);
@@ -649,21 +638,15 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory,
    srcAgent = dstAgent = dev().getBackendDevice();
  }

-  hsa_signal_store_relaxed(completion_signal_, kInitSignalValueOne);
-
+  hsa_signal_t wait = gpu().Barriers().WaitSignal();
+  hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp());
  // Use SDMA to transfer the data
-  status = hsa_amd_memory_async_copy(dst, dstAgent, src, srcAgent, size[0], 0, nullptr,
-                                     completion_signal_);
+  status = hsa_amd_memory_async_copy(dst, dstAgent, src, srcAgent, size[0], 1, &wait, active);
  gpu().setLastCommandSDMA(true);
+  // Explicit wait for now, until the runtime can distinguish compute and SDMA operations
+  gpu().Barriers().WaitCurrent();
  if (status == HSA_STATUS_SUCCESS) {
-    hsa_signal_value_t val;
-
-    if (!WaitForSignal(completion_signal_)) {
-      LogError("Async copy failed");
-      status = HSA_STATUS_ERROR;
-    } else {
-      gpu().addSystemScope();
-    }
+    gpu().addSystemScope();
  } else {
    LogPrintfError("Hsa copy from host to device failed with code %d", status);
  }
@@ -674,6 +657,10 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory,
 // ================================================================================================
 bool DmaBlitManager::hsaCopyStaged(const_address hostSrc, address hostDst, size_t size,
                                   address staging, bool hostToDev) const {
+  // Stall GPU, since CPU copy is possible
+  bool force_barrier = !dev().settings().barrier_sync_ && !gpu().isLastCommandSDMA();
+  gpu().releaseGpuMemoryFence(force_barrier);
+
  // No allocation is necessary for Full Profile
  hsa_status_t status;
  if (dev().agent_profile() == HSA_PROFILE_FULL) {
@@ -688,14 +675,11 @@ bool DmaBlitManager::hsaCopyStaged(const_address hostSrc, address hostDst, size_
  size_t offset = 0;

  address hsaBuffer = staging;
-  if (!dev().settings().barrier_sync_ && !gpu().isLastCommandSDMA()) {
-    gpu().releaseGpuMemoryFence(true);
-  }

  // Allocate requested size of memory
  while (totalSize > 0) {
    size = std::min(totalSize, dev().settings().stagedXferSize_);
-    hsa_signal_silent_store_relaxed(completion_signal_, kInitSignalValueOne);
+    hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp());

    // Copy data from Host to Device
    if (hostToDev) {
@@ -707,17 +691,13 @@ bool DmaBlitManager::hsaCopyStaged(const_address hostSrc, address hostDst, size_

      memcpy(hsaBuffer, hostSrc + offset, size);
      status = hsa_amd_memory_async_copy(hostDst + offset, dev().getBackendDevice(), hsaBuffer,
-                                         srcAgent, size, 0, nullptr, completion_signal_);
+                                         srcAgent, size, 0, nullptr, active);
      gpu().setLastCommandSDMA(true);
-      if (status == HSA_STATUS_SUCCESS) {
-        if (!WaitForSignal(completion_signal_)) {
-          LogError("Async copy failed");
-          return false;
-        }
-      } else {
+      if (status != HSA_STATUS_SUCCESS) {
        LogPrintfError("Hsa copy from host to device failed with code %d", status);
        return false;
      }
+      gpu().Barriers().WaitCurrent();
      totalSize -= size;
      offset += size;
      continue;
@@ -730,15 +710,11 @@ bool DmaBlitManager::hsaCopyStaged(const_address hostSrc, address hostDst, size_
        (size <= dev().settings().sdmaCopyThreshold_) ? dev().getBackendDevice() : dev().getCpuAgent();

    // Copy data from Device to Host
-    status =
-        hsa_amd_memory_async_copy(hsaBuffer, dstAgent, hostSrc + offset,
-                                  dev().getBackendDevice(), size, 0, nullptr, completion_signal_);
+    status = hsa_amd_memory_async_copy(hsaBuffer, dstAgent, hostSrc + offset,
+        dev().getBackendDevice(), size, 0, nullptr, active);
    gpu().setLastCommandSDMA(true);
    if (status == HSA_STATUS_SUCCESS) {
-      if (!WaitForSignal(completion_signal_)) {
-        LogError("Async copy failed");
-        return false;
-      }
+      gpu().Barriers().WaitCurrent();
      memcpy(hostDst + offset, hsaBuffer, size);
    } else {
      LogPrintfError("Hsa copy from device to host failed with code %d", status);
@@ -1083,11 +1059,7 @@ bool KernelBlitManager::copyBufferToImageKernel(device::Memory& srcMemory,
  releaseArguments(parameters);
  if (releaseView) {
    // todo SRD programming could be changed to avoid a stall
-    if(!dev().settings().barrier_sync_) {
-      gpu().releaseGpuMemoryFence(true);
-   } else {
-     gpu().releaseGpuMemoryFence();
-   }
+    gpu().releaseGpuMemoryFence();
    dstView->owner()->release();
  }

@@ -1285,11 +1257,7 @@ bool KernelBlitManager::copyImageToBufferKernel(device::Memory& srcMemory,
  releaseArguments(parameters);
  if (releaseView) {
    // todo SRD programming could be changed to avoid a stall
-    if(!dev().settings().barrier_sync_) {
-      gpu().releaseGpuMemoryFence(true);
-    } else {
-      gpu().releaseGpuMemoryFence();
-    }
+    gpu().releaseGpuMemoryFence();
    srcView->owner()->release();
  }

@@ -1465,6 +1433,8 @@ bool KernelBlitManager::readImage(device::Memory& srcMemory, void* dstHost,

  // Use host copy if memory has direct access
  if (setup_.disableReadImage_ || (srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached())) {
+    // Stall GPU before CPU access
+    gpu().releaseGpuMemoryFence();
    result = HostBlitManager::readImage(srcMemory, dstHost, origin, size, rowPitch, slicePitch, entire);
    synchronize();
    return result;
@@ -1510,6 +1480,8 @@ bool KernelBlitManager::writeImage(const void* srcHost, device::Memory& dstMemor

  // Use host copy if memory has direct access
  if (setup_.disableWriteImage_ || dstMemory.isHostMemDirectAccess()) {
+    // Stall GPU before CPU access
+    gpu().releaseGpuMemoryFence();
    result = HostBlitManager::writeImage(srcHost, dstMemory, origin, size, rowPitch, slicePitch, entire);
    synchronize();
    return result;
@@ -1704,6 +1676,8 @@ bool KernelBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,

  // Use host copy if memory has direct access
  if (setup_.disableReadBuffer_ || (srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached())) {
+    // Stall GPU before CPU access
+    gpu().releaseGpuMemoryFence();
    result = HostBlitManager::readBuffer(srcMemory, dstHost, origin, size, entire);
    synchronize();
    return result;
@@ -1753,6 +1727,8 @@ bool KernelBlitManager::readBufferRect(device::Memory& srcMemory, void* dstHost,
  // Use host copy if memory has direct access
  if (setup_.disableReadBufferRect_ ||
      (srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached())) {
+    // Stall GPU before CPU access
+    gpu().releaseGpuMemoryFence();
    result = HostBlitManager::readBufferRect(srcMemory, dstHost, bufRect, hostRect, size, entire);
    synchronize();
    return result;
@@ -1814,6 +1790,8 @@ bool KernelBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemo
  // Use host copy if memory has direct access
  if (setup_.disableWriteBuffer_ || dstMemory.isHostMemDirectAccess() ||
      gpuMem(dstMemory).IsPersistentDirectMap()) {
+    // Stall GPU before CPU access
+    gpu().releaseGpuMemoryFence();
    result = HostBlitManager::writeBuffer(srcHost, dstMemory, origin, size, entire);
    synchronize();
    return result;
@@ -1864,6 +1842,8 @@ bool KernelBlitManager::writeBufferRect(const void* srcHost, device::Memory& dst
  // Use host copy if memory has direct access
  if (setup_.disableWriteBufferRect_ || dstMemory.isHostMemDirectAccess() ||
      gpuMem(dstMemory).IsPersistentDirectMap()) {
+    // Stall GPU before CPU access
+    gpu().releaseGpuMemoryFence();
    result = HostBlitManager::writeBufferRect(srcHost, dstMemory, hostRect, bufRect, size, entire);
    synchronize();
    return result;
@@ -1913,6 +1893,8 @@ bool KernelBlitManager::fillBuffer(device::Memory& memory, const void* pattern,

  // Use host fill if memory has direct access
  if (setup_.disableFillBuffer_ || memory.isHostMemDirectAccess()) {
+    // Stall GPU before CPU access
+    gpu().releaseGpuMemoryFence();
    result = HostBlitManager::fillBuffer(memory, pattern, patternSize, origin, size, entire);
    synchronize();
    return result;
@@ -2074,6 +2056,8 @@ bool KernelBlitManager::fillImage(device::Memory& memory, const void* pattern,

  // Use host fill if memory has direct access
  if (setup_.disableFillImage_ || memory.isHostMemDirectAccess()) {
+    // Stall GPU before CPU access
+    gpu().releaseGpuMemoryFence();
    result = HostBlitManager::fillImage(memory, pattern, origin, size, entire);
    synchronize();
    return result;
@@ -40,6 +40,9 @@ class Kernel;
 class Memory;
 class VirtualGPU;

+constexpr bool kSkipCpuWait = true;
+constexpr bool kIgnoreBarrier = false;
+
 //! DMA Blit Manager
 class DmaBlitManager : public device::HostBlitManager {
 public:
@@ -49,19 +52,10 @@ class DmaBlitManager : public device::HostBlitManager {
                 );

  //! Destructor
-  virtual ~DmaBlitManager() {
-    if (completion_signal_.handle != 0) {
-      hsa_signal_destroy(completion_signal_);
-    }
-  }
+  virtual ~DmaBlitManager() {}

  //! Creates DmaBlitManager object
-  virtual bool create(amd::Device& device) {
-    if (HSA_STATUS_SUCCESS != hsa_signal_create(0, 0, nullptr, &completion_signal_)) {
-      return false;
-    }
-    return true;
-  }
+  virtual bool create(amd::Device& device) { return true; }

  //! Copies a buffer object to system memory
  virtual bool readBuffer(device::Memory& srcMemory,   //!< Source memory object
@@ -225,9 +219,6 @@ class DmaBlitManager : public device::HostBlitManager {
                         size_t xferSize       //!< Transfer size
                         ) const;

-  //! Handle of ROC Device object
-  hsa_signal_t completion_signal_;
-
  //! Assits in transferring data from Host to Local or vice versa
  //! taking into account the Hsail profile supported by Hsa Agent
  bool hsaCopyStaged(const_address hostSrc,  //!< Contains source data to be copied
@@ -260,7 +260,12 @@ Device::~Device() {
    context().svmFree(coopHostcallBuffer_);
    coopHostcallBuffer_ = nullptr;
  }
+
+  if (0 != prefetch_signal_.handle) {
+    hsa_signal_destroy(prefetch_signal_);
+  }
 }
+
 bool NullDevice::initCompiler(bool isOffline) {
 #if defined(WITH_COMPILER_LIB)
  // Initialize the compiler handle if has already not been initialized
@@ -199,8 +199,9 @@ void Memory::cpuUnmap(device::VirtualDevice& vDev) {
                                    amd::Coord3D(size()), true)) {
      LogError("[OCL] Fail sync the device memory on cpuUnmap");
    }
+    // Wait on CPU for the transfer
+    static_cast<roc::VirtualGPU&>(vDev).releaseGpuMemoryFence();
  }
-
  decIndMapCount();
 }

@@ -456,34 +456,12 @@ bool VirtualGPU::dispatchGenericAqlPacket(
  // Check for queue full and wait if needed.
  uint64_t index = hsa_queue_add_write_index_screlease(gpu_queue_, size);
  uint64_t read = hsa_queue_load_read_index_relaxed(gpu_queue_);
-  hsa_signal_t signal;

-  // TODO: placeholder to setup the kernel to populate start and end timestamp.
  if (timestamp_ != nullptr) {
    // Pool size must grow to the size of pending AQL packets
    const uint32_t pool_size = index - read;
-    if (pool_size >= signal_pool_.size()) {
-      ProfilingSignal profiling_signal = {};
-      if (HSA_STATUS_SUCCESS != hsa_signal_create(0, 0, nullptr, &profiling_signal.signal_)) {
-        LogPrintfError("Failed signal allocation id = %d", pool_size);
-        return false;
-      }
-      signal_pool_.push_back(profiling_signal);
-      assert(queueSize >= signal_pool_.size() && "Pool will be reallocated!");
-    }
-    // Move index inside the valid pool
-    ++current_signal_ %= signal_pool_.size();
-    // Find signal slot
-    ProfilingSignal* profilingSignal = &signal_pool_[current_signal_];
-    // Make sure we save the old results in the TS structure
-    if (profilingSignal->ts_ != nullptr) {
-      profilingSignal->ts_->checkGpuTime();
-    }
-    // Update the new TS with the signal info
-    timestamp_->setProfilingSignal(profilingSignal);
-    packet->completion_signal = profilingSignal->signal_;
-    profilingSignal->ts_ = timestamp_;
-    timestamp_->setAgent(gpu_device_);
+    // Get active signal for current dispatch if profiling is necessary
+    packet->completion_signal = Barriers().ActiveSignal(kInitSignalValueOne, timestamp_, pool_size);
  }

  // Make sure the slot is free for usage
@@ -494,23 +472,11 @@ bool VirtualGPU::dispatchGenericAqlPacket(
  // Add blocking command if the original value of read index was behind of the queue size
  if (blocking || (index - read) >= queueMask) {
    if (packet->completion_signal.handle == 0) {
-      packet->completion_signal = barrier_signal_;
+      packet->completion_signal = Barriers().ActiveSignal();
    }
-    signal = packet->completion_signal;
-    // Initialize signal for a wait
-    hsa_signal_store_relaxed(signal, kInitSignalValueOne);
    blocking = true;
  }

-  // If runtime doesn't use the barrier, then make sure it tracks the last submitted command
-  if (!dev().settings().barrier_sync_) {
-    // Initialize signal for a wait
-    assert(packet->completion_signal.handle != 0 &&
-        "There is no HSA signal associated with the last command!");
-    hsa_signal_store_relaxed(packet->completion_signal, kInitSignalValueOne);
-    last_signal_ = packet->completion_signal;
-  }
-
  // Insert packet(s)
  // NOTE: need multiple packets to dispatch the performance counter
  //       packet blob of the legacy devices (gfx8)
@@ -550,12 +516,10 @@ bool VirtualGPU::dispatchGenericAqlPacket(

  // Wait on signal ?
  if (blocking) {
-    if (!WaitForSignal(signal)) {
-      LogPrintfError("Failed signal [0x%lx] wait", signal.handle);
+    if (!Barriers().WaitCurrent()) {
+      LogPrintfError("Failed blocking queue wait with signal [0x%lx]", packet->completion_signal.handle);
      return false;
    }
-    // Reset the pool of signals
-    current_signal_ = 0;
  }

  return true;
@@ -601,6 +565,8 @@ bool VirtualGPU::dispatchCounterAqlPacket(hsa_ext_amd_aql_pm4_packet_t* packet,

  return false;
 }
+
+// ================================================================================================
 void VirtualGPU::dispatchBarrierPacket(const hsa_barrier_and_packet_t* packet) {
  assert(packet->completion_signal.handle != 0);
  const uint32_t queueSize = gpu_queue_->size;
@@ -631,6 +597,7 @@ void VirtualGPU::dispatchBarrierPacket(const hsa_barrier_and_packet_t* packet) {
          packet->dep_signal[3], packet->dep_signal[4], packet->completion_signal);
 }

+// ================================================================================================
 void VirtualGPU::dispatchGenericBarrierPacket(hsa_barrier_and_packet_t* packet,
                                       uint16_t packetHeader, hsa_signal_t signal) {
  const uint32_t queueSize = gpu_queue_->size;
@@ -641,30 +608,8 @@ void VirtualGPU::dispatchGenericBarrierPacket(hsa_barrier_and_packet_t* packet,
  if (signal.handle == 0) {
    // Pool size must grow to the size of pending AQL packets
    const uint32_t pool_size = index - read;
-    if (pool_size >= signal_pool_.size()) {
-      ProfilingSignal profiling_signal = {};
-      if (HSA_STATUS_SUCCESS != hsa_signal_create(0, 0, nullptr, &profiling_signal.signal_)) {
-        LogPrintfError("Failed signal allocation id = %d", pool_size);
-      }
-      signal_pool_.push_back(profiling_signal);
-      assert(queueSize >= signal_pool_.size() && "Pool will be reallocated!");
-    }
-    // Move index inside the valid pool
-    ++current_signal_ %= signal_pool_.size();
-    // Find signal slot
-    ProfilingSignal* profilingSignal = &signal_pool_[current_signal_];
-    // Make sure we save the old results in the TS structure
-    if (profilingSignal->ts_ != nullptr) {
-      profilingSignal->ts_->checkGpuTime();
-    }
-    if (timestamp_ != nullptr) {
-      // Update the new TS with the signal info
-      timestamp_->setProfilingSignal(profilingSignal);
-      profilingSignal->ts_ = timestamp_;
-      timestamp_->setAgent(gpu_device_);
-    }
-    packet->completion_signal = profilingSignal->signal_;
-    hsa_signal_store_relaxed(profilingSignal->signal_, kInitSignalValueOne);
+    // Get active signal for current dispatch if profiling is necessary
+    packet->completion_signal = Barriers().ActiveSignal(kInitSignalValueOne, timestamp_, pool_size);
  } else {
    assert(signal.handle != 0);
    packet->completion_signal = signal;
@@ -705,44 +650,25 @@ void VirtualGPU::ResetQueueStates() {
    // Release the pool, since runtime just completed a barrier
    // @note: Runtime can reset kernel arg pool only if the barrier with L2 invalidation was issued
    resetKernArgPool();
-  } else {
-    // Reset the pool of signals
-    current_signal_ = 0;
  }
 }

 // ================================================================================================
-bool VirtualGPU::releaseGpuMemoryFence(bool force_barrier) {
-  // Return if there is no pending dispatch
-  if (!hasPendingDispatch_) {
-    if (dev().settings().barrier_sync_ || !force_barrier) {
-      return false;
-    }
-  }
-  hsa_signal_t wait_signal = barrier_signal_;
+bool VirtualGPU::releaseGpuMemoryFence(bool force_barrier, bool skip_cpu_wait) {
+  if (hasPendingDispatch_ && (dev().settings().barrier_sync_ || force_barrier)) {
+    barrier_packet_.completion_signal = Barriers().ActiveSignal();

-  // If barrier sync was requested or runtime didn't provide the last signal
-  if (dev().settings().barrier_sync_ || force_barrier) {
-    // Initialize signal for the barrier packet.
-    hsa_signal_store_relaxed(barrier_signal_, kInitSignalValueOne);
-
-    // Dispatch barrier packet into the queue and wait till it finishes.
+    // Dispatch barrier packet into the queue
    dispatchBarrierPacket(&barrier_packet_);
-  }
-  else {
-    // Take the signal of the last submitted dispatch
-    wait_signal = last_signal_;
+    hasPendingDispatch_ = false;
  }

-  // Wait for compute work previously submitted
-  if (!WaitForSignal(wait_signal)) {
-    LogError("Waiting for compute work failed!");
-    return false;
+  // Check if runtime could skip CPU wait
+  if (!skip_cpu_wait) {
+    Barriers().WaitCurrent();
+
+    ResetQueueStates();
  }
-
-  hasPendingDispatch_ = false;
-
-  ResetQueueStates();
  return true;
 }

@@ -800,7 +726,6 @@ VirtualGPU::VirtualGPU(Device& device, bool profiling, bool cooperative,
  }

  aqlHeader_ = dispatchPacketHeader_;
-  barrier_signal_.handle = 0;

  // Note: Virtual GPU device creation must be a thread safe operation
  roc_device_.vgpus_.resize(roc_device_.numOfVgpus_);
@@ -808,16 +733,13 @@ VirtualGPU::VirtualGPU(Device& device, bool profiling, bool cooperative,

 }

+// ================================================================================================
 VirtualGPU::~VirtualGPU() {
  delete blitMgr_;

  // Release the resources of signal
  releaseGpuMemoryFence();

-  if (barrier_signal_.handle != 0) {
-    hsa_signal_destroy(barrier_signal_);
-  }
-
  destroyPool();

  releasePinnedMem();
@@ -868,8 +790,7 @@ bool VirtualGPU::create() {
  gpu_queue_ = roc_device_.acquireQueue(queue_size, cooperative_, cuMask_, priority_);
  if (!gpu_queue_) return false;

-  if (!initPool(dev().settings().kernargPoolSize_,
-                (profiling_ || (amd::IS_HIP)) ? queue_size : 0)) {
+  if (!initPool(dev().settings().kernargPoolSize_)) {
    LogError("Couldn't allocate arguments/signals for the queue");
    return false;
  }
@@ -881,17 +802,9 @@ bool VirtualGPU::create() {
    return false;
  }

-  // Create signal for the barrier packet.
-  hsa_signal_t signal = {0};
-  if (HSA_STATUS_SUCCESS != hsa_signal_create(kInitSignalValueOne, 0, nullptr, &signal)) {
-    return false;
-  }
-  barrier_signal_ = signal;
-
  // Initialize barrier packet.
  memset(&barrier_packet_, 0, sizeof(barrier_packet_));
  barrier_packet_.header = kInvalidAql;
-  barrier_packet_.completion_signal = barrier_signal_;

  // Create a object of PrintfDbg
  printfdbg_ = new PrintfDbg(roc_device_);
@@ -912,62 +825,32 @@ bool VirtualGPU::create() {
    return false;
  }

+  // Allocate signal tracker for ROCr copy queue
+  if (!Barriers().Create(gpu_device())) {
+    LogError("Could not create signal for copy queue!");
+    return false;
+  }
  return true;
 }

-bool VirtualGPU::initPool(size_t kernarg_pool_size, uint signal_pool_count) {
+// ================================================================================================
+bool VirtualGPU::initPool(size_t kernarg_pool_size) {
  kernarg_pool_size_ = kernarg_pool_size;
  kernarg_pool_base_ = reinterpret_cast<char*>(roc_device_.hostAlloc(kernarg_pool_size_, false));
  if (kernarg_pool_base_ == nullptr) {
    return false;
  }
-
-  // Optimization :
-  // For better resource utilization runtime should create them only when required
-  // In case of HIP, Apps create short live streams which do not need more signals
-  // hence starting with smaller number 32. There is code inplace to grow the pool
-  // later when it is needed.
-  bool forced_default_pool_sz = false;
-  if (!profiling_ && (amd::IS_HIP)) {
-    forced_default_pool_sz = true;
-  }
-
-  if (signal_pool_count != 0) {
-    // Reserve signal pool for all entries in the queue, since profiling logic will save the
-    // pointer in timestamp info for the future references
-    signal_pool_.reserve(signal_pool_count);
-    // If barrier is disable, then allocate a small portion of all signals and grow the array later.
-    // @note: the optimization requires a wait for signal on reuse, which is only available when
-    // the barrier is disabled
-    constexpr uint32_t kDefaultSignalPoolSize = 32;
-    const uint32_t default_signal_pool_size =
-    (dev().settings().barrier_sync_ && !forced_default_pool_sz) ?
-        signal_pool_count : kDefaultSignalPoolSize;
-    signal_pool_.resize(default_signal_pool_size);
-    for (uint i = 0; i < default_signal_pool_size; ++i) {
-      ProfilingSignal profilingSignal;
-      if (HSA_STATUS_SUCCESS != hsa_signal_create(0, 0, nullptr, &profilingSignal.signal_)) {
-        return false;
-      }
-      signal_pool_[i] = profilingSignal;
-    }
-  }
-
  return true;
 }

+// ================================================================================================
 void VirtualGPU::destroyPool() {
  if (kernarg_pool_base_ != nullptr) {
    roc_device_.hostFree(kernarg_pool_base_, kernarg_pool_size_);
  }
-
-  if (signal_pool_.size() > 0) {
-    for (uint i = 0; i < signal_pool_.size(); ++i) {
-      hsa_signal_destroy(signal_pool_[i].signal_);
-    }
-  }
 }

+// ================================================================================================
 void* VirtualGPU::allocKernArg(size_t size, size_t alignment) {
  char* result = nullptr;
  do {
@@ -982,24 +865,21 @@ void* VirtualGPU::allocKernArg(size_t size, size_t alignment) {
      //! We can issue a barrier to avoid expensive extra memory allocations.

      // Initialize signal for the barrier packet.
-      hsa_signal_store_relaxed(barrier_signal_, kInitSignalValueOne);
+      barrier_packet_.completion_signal = Barriers().ActiveSignal();

      // Dispatch barrier packet into the queue and wait till it finishes.
      dispatchBarrierPacket(&barrier_packet_);
-      if (!WaitForSignal(barrier_signal_)) {
+      if (!Barriers().WaitCurrent()) {
        LogError("Kernel arguments reset failed");
      }
-
      resetKernArgPool();
-
-      // Reset the pool of signals
-      current_signal_ = 0;
    }
  } while (true);

  return result;
 }

+// ================================================================================================
 /* profilingBegin, when profiling is enabled, creates a timestamp to save in
 * virtualgpu's timestamp_, and calls start() to get the current host
 * timestamp.
@@ -1007,9 +887,8 @@ void* VirtualGPU::allocKernArg(size_t size, size_t alignment) {
 void VirtualGPU::profilingBegin(amd::Command& command, bool drmProfiling) {
  if (command.profilingInfo().enabled_) {
    if (timestamp_ != nullptr) {
-      LogWarning(
-          "Trying to create a second timestamp in VirtualGPU. \
-                        This could have unintended consequences.");
+      LogWarning("Trying to create a second timestamp in VirtualGPU. \
+                  This could have unintended consequences.");
      return;
    }
    // Without barrier profiling will wait for each individual signal
@@ -1018,6 +897,7 @@ void VirtualGPU::profilingBegin(amd::Command& command, bool drmProfiling) {
  }
 }

+// ================================================================================================
 /* profilingEnd, when profiling is enabled, checks to see if a signal was
 * created for whatever command we are running and calls end() to get the
 * current host timestamp if no signal is available. It then saves the pointer
@@ -1033,6 +913,7 @@ void VirtualGPU::profilingEnd(amd::Command& command) {
  }
 }

+// ================================================================================================
 void VirtualGPU::updateCommandsState(amd::Command* list) {
  Timestamp* ts = nullptr;

@@ -1335,8 +1216,10 @@ void VirtualGPU::submitSvmFreeMemory(amd::SvmFreeMemoryCommand& cmd) {
 // ================================================================================================
 void VirtualGPU::submitSvmPrefetchAsync(amd::SvmPrefetchAsyncCommand& cmd) {
 #if AMD_HMM_SUPPORT
+  profilingBegin(cmd);
  // Initialize signal for the barrier
-  hsa_signal_store_relaxed(barrier_signal_, kInitSignalValueOne);
+  hsa_signal_t wait = Barriers().WaitSignal();
+  hsa_signal_t active = Barriers().ActiveSignal(kInitSignalValueOne, timestamp_); 

  // Find the requested agent for the transfer
  hsa_agent_t agent = (cmd.cpu_access() ||
@@ -1345,16 +1228,18 @@ void VirtualGPU::submitSvmPrefetchAsync(amd::SvmPrefetchAsyncCommand& cmd) {

  // Initiate a prefetch command
  hsa_status_t status = hsa_amd_svm_prefetch_async(
-      const_cast<void*>(cmd.dev_ptr()), cmd.count(), agent, 0, nullptr, barrier_signal_);
+      const_cast<void*>(cmd.dev_ptr()), cmd.count(), agent, 1, &wait, active);

-  // Wait for the prefetch
-  if ((status != HSA_STATUS_SUCCESS) || !WaitForSignal(barrier_signal_)) {
+  // Wait for the prefetch. Should skip wait, but may require extra tracking for kernel execution.
+  if ((status != HSA_STATUS_SUCCESS) || !Barriers().WaitCurrent()) {
    LogError("hsa_amd_svm_prefetch_async failed");
    cmd.setStatus(CL_INVALID_OPERATION);
  }

  // Add system scope, since the prefetch scope is unclear
  addSystemScope();
+
+  profilingEnd(cmd);
 #endif // AMD_HMM_SUPPORT
 }

@@ -2490,9 +2375,10 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
  }

  if (gpuKernel.dynamicParallelism()) {
+    barrier_packet_.completion_signal.handle = 0;
    dispatchBarrierPacket(&barrier_packet_);
    static_cast<KernelBlitManager&>(blitMgr()).runScheduler(
-      getVQVirtualAddress(), schedulerParam_, schedulerQueue_, schedulerSignal_, schedulerThreads_);
+        getVQVirtualAddress(), schedulerParam_, schedulerQueue_, schedulerSignal_, schedulerThreads_);
  }

  // Check if image buffer write back is required
@@ -2594,9 +2480,6 @@ void VirtualGPU::submitMarker(amd::Marker& vcmd) {
      uint16_t header = kNopPacketHeader;
      hsa_signal_t sig { 0 };
      dispatchGenericBarrierPacket(&barrier_packet_, header, sig);
-      last_signal_ = barrier_packet_.completion_signal;
-      // Restore barrier signal
-      barrier_packet_.completion_signal = barrier_signal_;
    }
    profilingEnd(vcmd);
  }
@@ -2618,8 +2501,8 @@ void VirtualGPU::submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& vcmd) {
  profilingBegin(vcmd);
  if (!dev().settings().barrier_sync_) {
    // Force barrier to make sure L2 flush, since interop can be in sysmem
-    constexpr bool ForceBarrier = true;
-    releaseGpuMemoryFence(ForceBarrier);
+    constexpr bool kForceBarrier = true;
+    releaseGpuMemoryFence(kForceBarrier);
  }
  profilingEnd(vcmd);
 }
@@ -2644,6 +2527,9 @@ void VirtualGPU::flush(amd::Command* list, bool wait) {

 // ================================================================================================
 void VirtualGPU::addXferWrite(Memory& memory) {
+  //! @note: ROCr backend doesn't have per resource busy tracking, hence runtime has to wait
+  //!        unconditionally, before it can release pinned memory
+  releaseGpuMemoryFence();
  if (xferWriteBuffers_.size() > 7) {
    dev().xferWrite().release(*this, *xferWriteBuffers_.front());
    xferWriteBuffers_.erase(xferWriteBuffers_.begin());
@@ -2653,6 +2539,7 @@ void VirtualGPU::addXferWrite(Memory& memory) {
  xferWriteBuffers_.push_back(&memory);
 }

+// ================================================================================================
 void VirtualGPU::releaseXferWrite() {
  for (auto& memory : xferWriteBuffers_) {
    dev().xferWrite().release(*this, *memory);
@@ -2660,7 +2547,11 @@ void VirtualGPU::releaseXferWrite() {
  xferWriteBuffers_.resize(0);
 }

+// ================================================================================================
 void VirtualGPU::addPinnedMem(amd::Memory* mem) {
+  //! @note: ROCr backend doesn't have per resource busy tracking, hence runtime has to wait
+  //!        unconditionally, before it can release pinned memory
+  releaseGpuMemoryFence();
  if (nullptr == findPinnedMem(mem->getHostMem(), mem->getSize())) {
    if (pinnedMems_.size() > 7) {
      pinnedMems_.front()->release();
@@ -2672,6 +2563,7 @@ void VirtualGPU::addPinnedMem(amd::Memory* mem) {
  }
 }

+// ================================================================================================
 void VirtualGPU::releasePinnedMem() {
  for (auto& amdMemory : pinnedMems_) {
    amdMemory->release();
@@ -2679,6 +2571,7 @@ void VirtualGPU::releasePinnedMem() {
  pinnedMems_.resize(0);
 }

+// ================================================================================================
 amd::Memory* VirtualGPU::findPinnedMem(void* addr, size_t size) {
  for (auto& amdMemory : pinnedMems_) {
    if ((amdMemory->getHostMem() == addr) && (size <= amdMemory->getSize())) {
@@ -36,10 +36,11 @@ class Memory;
 class Timestamp;

 struct ProfilingSignal : public amd::HeapObject {
-  hsa_signal_t signal_;  //!< HSA signal to track profiling information
-  Timestamp* ts_;        //!< Timestamp object associated with the signal
+  hsa_signal_t signal_; //!< HSA signal to track profiling information
+  Timestamp* ts_;       //!< Timestamp object associated with the signal
+  bool done_;           //!< True if signal is done

-  ProfilingSignal() : ts_(nullptr) { signal_.handle = 0; }
+  ProfilingSignal() : ts_(nullptr), done_(true) { signal_.handle = 0; }
 };

 // Initial HSA signal value
@@ -111,13 +112,19 @@ class Timestamp {
      hsa_amd_profiling_dispatch_time_t time;

      if (splittedDispatch_) {
-        uint64_t start = UINT64_MAX;
+        uint64_t start = std::numeric_limits<uint64_t>::max();
        uint64_t end = 0;
        for (auto it = splittedSignals_.begin(); it < splittedSignals_.end(); it++) {
          if (hsa_signal_load_relaxed(profilingSignal_->signal_) > 0) {
            WaitForSignal(*it);
          }
          hsa_amd_profiling_get_dispatch_time(agent_, *it, &time);
+          if ((time.end - time.start) == 0) {
+            hsa_amd_profiling_async_copy_time_t time_sdma = {};
+            hsa_amd_profiling_get_async_copy_time(profilingSignal_->signal_, &time_sdma);
+            time.start = time_sdma.start;
+            time.end = time_sdma.end;
+          }
          if (time.start < start) {
            start = time.start;
          }
@@ -133,10 +140,18 @@ class Timestamp {
          WaitForSignal(profilingSignal_->signal_);
        }
        hsa_amd_profiling_get_dispatch_time(agent_, profilingSignal_->signal_, &time);
-        start_ = time.start * ticksToTime_;
-        end_ = time.end * ticksToTime_;
+        if ((time.end - time.start) == 0) {
+          hsa_amd_profiling_async_copy_time_t time_sdma = {};
+          hsa_amd_profiling_get_async_copy_time(profilingSignal_->signal_, &time_sdma);
+          start_ = time_sdma.start * ticksToTime_;
+          end_ = time_sdma.end * ticksToTime_;
+        } else {
+          start_ = time.start * ticksToTime_;
+          end_ = time.end * ticksToTime_;
+        }
      }
      profilingSignal_->ts_ = nullptr;
+      profilingSignal_->done_ = true;
      profilingSignal_ = nullptr;
    }
  }
@@ -192,6 +207,109 @@ class VirtualGPU : public device::VirtualDevice {
    size_t maxMemObjectsInQueue_;     //!< Maximum number of mem objects in the queue
  };

+  //! Tracks HW operations on the queue with a ring of reusable HSA signals: ActiveSignal()
+  //! advances the ring and rearms the next signal, WaitCurrent() waits for the last armed one
+  class HwQueueTracker : public amd::EmbeddedObject {
+   public:
+    HwQueueTracker() {}
+
+    ~HwQueueTracker() {
+      for (auto& signal: signal_list_) {
+        if (signal->signal_.handle != 0) {
+          hsa_signal_destroy(signal->signal_);
+        }
+        delete signal;
+      }
+    }
+
+    //! Creates the initial pool of signals for tracking of HW operations on the queue
+    bool Create(hsa_agent_t agent) {
+      constexpr size_t kSignalListSize = 16;
+      for (size_t i = 0; i < kSignalListSize; ++i) {
+        ProfilingSignal* signal = new ProfilingSignal();
+        if (signal == nullptr) return false;
+        // The list owns the object now, so the destructor releases it on a later failure
+        signal_list_.push_back(signal);
+        if (HSA_STATUS_SUCCESS != hsa_signal_create(0, 1, &agent, &signal->signal_)) {
+          return false;
+        }
+      }
+      agent_ = agent;
+      return true;
+    }
+
+    //! Finds a free signal for the upcoming operation
+    hsa_signal_t ActiveSignal(hsa_signal_value_t init_val = kInitSignalValueOne,
+                              Timestamp* ts = nullptr, uint32_t queue_size = 0) {
+      // If queue size grows, then add more signals to avoid more frequent stalls
+      if (queue_size > signal_list_.size()) {
+        ProfilingSignal* signal = new ProfilingSignal();
+        if ((signal != nullptr) &&
+            (HSA_STATUS_SUCCESS == hsa_signal_create(0, 1, &agent_, &signal->signal_))) {
+          signal_list_.push_back(signal);
+        } else {
+          delete signal;  // Don't leak the tracker object if the HSA signal wasn't created
+        }
+      }
+      // Find valid index
+      current_id_ = (current_id_ + 1) % signal_list_.size();
+
+      // Make sure the previous operation on the current signal is done
+      WaitCurrent();
+
+      // Have to wait the next signal in the queue to avoid a race condition between
+      // a GPU waiter(which may be not triggered yet) and CPU signal reset below
+      WaitNext();
+
+      // Reset the signal and return
+      hsa_signal_silent_store_relaxed(signal_list_[current_id_]->signal_, init_val);
+      signal_list_[current_id_]->done_ = false;
+      if (ts != nullptr) {
+        if (!sdma_profiling_) {
+          hsa_amd_profiling_async_copy_enable(true);
+          sdma_profiling_ = true;
+        }
+        signal_list_[current_id_]->ts_ = ts;
+        ts->setProfilingSignal(signal_list_[current_id_]);
+        ts->setAgent(agent_);
+      }
+      return signal_list_[current_id_]->signal_;
+    }
+
+    //! Wait for the current active signal. Can idle the queue
+    bool WaitCurrent() { return WaitIndex(current_id_); }
+
+    //! Returns the last submitted signal for a wait
+    hsa_signal_t WaitSignal() const { return signal_list_[current_id_]->signal_; }
+
+   private:
+    //! Wait for the next signal in the ring (the one following the current index)
+    void WaitNext() { WaitIndex((current_id_ + 1) % signal_list_.size()); }
+
+    //! Wait for the signal at the provided index, unless it's already marked as done.
+    //! Returns false only if the CPU wait on the signal failed
+    bool WaitIndex(size_t index) {
+      // Nothing to wait for if the signal has been marked as done already
+      if (!signal_list_[index]->done_) {
+        // Update timestamp values if requested; checkGpuTime() is expected to mark the signal done
+        if (signal_list_[index]->ts_ != nullptr) {
+          signal_list_[index]->ts_->checkGpuTime();
+        } else {
+          if (!WaitForSignal(signal_list_[index]->signal_)) {
+            LogPrintfError("Failed signal [0x%lx] wait", signal_list_[index]->signal_.handle);
+            return false;
+          }
+          signal_list_[index]->done_ = true;
+        }
+      }
+      return true;
+    }
+    std::vector<ProfilingSignal*> signal_list_;  //!< The pool of all signals for processing
+    size_t      current_id_ = 0;          //!< Last submitted signal
+    hsa_agent_t agent_ = {};              //!< HSA device agent
+    bool        sdma_profiling_ = false;  //!< Don't enable SDMA profiling by default
+  };
+
  VirtualGPU(Device& device, bool profiling = false, bool cooperative = false,
             const std::vector<uint32_t>& cuMask = {},
             amd::CommandQueue::Priority priority = amd::CommandQueue::Priority::Normal);
@@ -256,7 +374,7 @@ class VirtualGPU : public device::VirtualDevice {
   *
   * @return bool true if Wait returned successfully, false otherwise
   */
-  bool releaseGpuMemoryFence(bool force_barrier    = false);
+  bool releaseGpuMemoryFence(bool force_barrier = false, bool skip_cpu_wait = false);

  hsa_agent_t gpu_device() { return gpu_device_; }
  hsa_queue_t* gpu_queue() { return gpu_queue_; }
@@ -297,6 +415,10 @@ class VirtualGPU : public device::VirtualDevice {
  void addSystemScope() { addSystemScope_ = true; }
  void SetCopyCommandType(cl_command_type type) { copy_command_type_ = type; }

+  HwQueueTracker& Barriers() { return barriers_; }
+
+  Timestamp* timestamp() const { return timestamp_; }
+
  // } roc OpenCL integration
 private:
  bool dispatchAqlPacket(hsa_kernel_dispatch_packet_t* packet, uint16_t header,
@@ -316,7 +438,7 @@ class VirtualGPU : public device::VirtualDevice {
  void initializeDispatchPacket(hsa_kernel_dispatch_packet_t* packet,
                                amd::NDRangeContainer& sizes);

-  bool initPool(size_t kernarg_pool_size, uint signal_pool_count);
+  bool initPool(size_t kernarg_pool_size);
  void destroyPool();

  void* allocKernArg(size_t size, size_t alignment);
@@ -368,7 +490,7 @@ class VirtualGPU : public device::VirtualDevice {
      uint32_t cooperative_        : 1; //!< Cooperative launch is enabled
      uint32_t addSystemScope_     : 1; //!< Insert a system scope to the next aql
      uint32_t isLastCommandSDMA_  : 1; //!< Keep track if the last command was SDMA and 
-                                               //!< not send Barrier packets if barrier_sync is 0
+                                        //!< not send Barrier packets if barrier_sync is 0
    };
    uint32_t  state_;
  };
@@ -379,8 +501,7 @@ class VirtualGPU : public device::VirtualDevice {
  hsa_agent_t gpu_device_;  //!< Physical device
  hsa_queue_t* gpu_queue_;  //!< Queue associated with a gpu
  hsa_barrier_and_packet_t barrier_packet_;
-  hsa_signal_t barrier_signal_;
-  hsa_signal_t last_signal_ = {};  //!< Last submitted signal
+
  uint32_t dispatch_id_;  //!< This variable must be updated atomically.
  Device& roc_device_;    //!< roc device object
  PrintfDbg* printfdbg_;
@@ -396,12 +517,12 @@ class VirtualGPU : public device::VirtualDevice {
  hsa_queue_t* schedulerQueue_;
  hsa_signal_t schedulerSignal_;

+  HwQueueTracker  barriers_;      //!< Tracks active barriers in ROCr
+
  char* kernarg_pool_base_;
  size_t kernarg_pool_size_;
  uint kernarg_pool_cur_offset_;

-  std::vector<ProfilingSignal> signal_pool_;  //!< Pool of signals for profiling
-  uint32_t current_signal_ = 0;               //!< Current avaialble signal in the pool
  friend class Timestamp;

  //  PM4 packet for gfx8 performance counter