From f96e9733787d2b0fb41e1b356189dc0ddc666b17 Mon Sep 17 00:00:00 2001
From: German Andryeyev <German.Andryeyev@amd.com>
Date: Fri, 22 Jan 2021 16:42:35 -0500
Subject: [PATCH] SWDEV-257787 - Add engine tracking per signal

- The logic tracks compute and SDMA read/write operations and
  applies signals when necessary
- ROC_CPU_WAIT_FOR_SIGNAL, ROC_SYSTEM_SCOPE_SIGNAL
  and ROC_SKIP_COPY_SYNC were added to control the tracking

Change-Id: I9e8e6174c63bf7784f7ab00964e2918c8667d364


[ROCm/clr commit: dbc7abaecf6a932570e40a134998246d88e7c2cd]
---
 projects/clr/rocclr/device/rocm/rocblit.cpp   |  71 ++++--
 projects/clr/rocclr/device/rocm/rocblit.hpp   |   3 -
 projects/clr/rocclr/device/rocm/rocdefs.hpp   |   6 +-
 .../clr/rocclr/device/rocm/rocsettings.cpp    |   9 +
 .../clr/rocclr/device/rocm/rocsettings.hpp    |   5 +-
 .../clr/rocclr/device/rocm/rocvirtual.cpp     | 217 +++++++++++++++++-
 .../clr/rocclr/device/rocm/rocvirtual.hpp     | 154 +++----------
 projects/clr/rocclr/utils/flags.hpp           |   6 +
 8 files changed, 312 insertions(+), 159 deletions(-)
diff --git a/projects/clr/rocclr/device/rocm/rocblit.cpp b/projects/clr/rocclr/device/rocm/rocblit.cpp
index e16b40ec37..d25e8e2f4f 100644
--- a/projects/clr/rocclr/device/rocm/rocblit.cpp
+++ b/projects/clr/rocclr/device/rocm/rocblit.cpp
@@ -433,18 +433,27 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d
     hsa_dim3_t offset = { 0, 0 ,0 };
 
 
-    if ((srcRect.rowPitch_ % 4 != 0)     ||
-        (srcRect.slicePitch_ % 4 != 0)    ||
-        (dstRect.rowPitch_ % 4 != 0)     ||
+    if ((srcRect.rowPitch_ % 4 != 0)    ||
+        (srcRect.slicePitch_ % 4 != 0)  ||
+        (dstRect.rowPitch_ % 4 != 0)    ||
         (dstRect.slicePitch_ % 4 != 0)) {
       isSubwindowRectCopy = false;
     }
 
+    HwQueueEngine engine = HwQueueEngine::Unknown;
+    if ((srcAgent.handle == dev().getCpuAgent().handle) &&
+        (dstAgent.handle != dev().getCpuAgent().handle)) {
+      engine = HwQueueEngine::SdmaWrite;
+    } else if ((srcAgent.handle != dev().getCpuAgent().handle) &&
+              (dstAgent.handle == dev().getCpuAgent().handle)) {
+      engine = HwQueueEngine::SdmaRead;
+    }
+
+    hsa_signal_t* wait_event = gpu().Barriers().WaitingSignal(engine);
+    uint32_t num_wait_events = (wait_event == nullptr) ? 0 : 1;
+
     if (isSubwindowRectCopy ) {
-      hsa_signal_t wait = gpu().Barriers().WaitSignal();
       hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp());
-      uint32_t num_wait_events = (wait.handle == 0) ? 0 : 1;
-      hsa_signal_t* wait_event = (wait.handle == 0) ? nullptr : &wait;
 
       // Copy memory line by line
       hsa_status_t status = hsa_amd_memory_async_copy_rect(&dstMem, &offset,
@@ -457,10 +466,7 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d
     } else {
       // Fall to line by line copies
       const hsa_signal_value_t kInitVal = size[2] * size[1];
-      hsa_signal_t wait = gpu().Barriers().WaitSignal();
       hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitVal, gpu().timestamp());
-      uint32_t num_wait_events = (wait.handle == 0) ? 0 : 1;
-      hsa_signal_t* wait_event = (wait.handle == 0) ? nullptr : &wait;
 
       for (size_t z = 0; z < size[2]; ++z) {
         for (size_t y = 0; y < size[1]; ++y) {
@@ -472,18 +478,18 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d
               (reinterpret_cast<address>(dst) + dstOffset), dstAgent,
               (reinterpret_cast<const_address>(src) + srcOffset), srcAgent,
               size[0], num_wait_events, wait_event, active);
-          gpu().setLastCommandSDMA(true) ;
           if (status != HSA_STATUS_SUCCESS) {
             gpu().Barriers().ResetCurrentSignal();
             LogPrintfError("DMA buffer failed with code %d", status);
             return false;
+          } else {
+            gpu().setLastCommandSDMA(true);
           }
         }
       }
     }
   }
-  // Explicit wait for now, until runtime could distinguish compute and sdma operations
-  gpu().Barriers().WaitCurrent();
+
   return true;
 }
 
@@ -644,18 +650,24 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory,
     srcAgent = dstAgent = dev().getBackendDevice();
   }
 
-  hsa_signal_t wait = gpu().Barriers().WaitSignal();
-  hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp());
-  uint32_t num_wait_events = (wait.handle == 0) ? 0 : 1;
-  hsa_signal_t* wait_event = (wait.handle == 0) ? nullptr : &wait;
+  HwQueueEngine engine = HwQueueEngine::Unknown;
+  if ((srcAgent.handle == dev().getCpuAgent().handle) &&
+      (dstAgent.handle != dev().getCpuAgent().handle)) {
+    engine = HwQueueEngine::SdmaWrite;
+  } else if ((srcAgent.handle != dev().getCpuAgent().handle) &&
+             (dstAgent.handle == dev().getCpuAgent().handle)) {
+    engine = HwQueueEngine::SdmaRead;
+  }
+
+  hsa_signal_t* wait_event = gpu().Barriers().WaitingSignal(engine);
+  uint32_t num_wait_events = (wait_event == nullptr) ? 0 : 1;
+  hsa_signal_t      active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp());
 
   // Use SDMA to transfer the data
   status = hsa_amd_memory_async_copy(dst, dstAgent, src, srcAgent,
       size[0], num_wait_events, wait_event, active);
-  gpu().setLastCommandSDMA(true);
-  // Explicit wait for now, until runtime could distinguish compute and sdma operations
-  gpu().Barriers().WaitCurrent();
   if (status == HSA_STATUS_SUCCESS) {
+    gpu().setLastCommandSDMA(true);
     gpu().addSystemScope();
   } else {
     gpu().Barriers().ResetCurrentSignal();
@@ -690,7 +702,6 @@ bool DmaBlitManager::hsaCopyStaged(const_address hostSrc, address hostDst, size_
   // Allocate requested size of memory
   while (totalSize > 0) {
     size = std::min(totalSize, dev().settings().stagedXferSize_);
-    hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp());
 
     // Copy data from Host to Device
     if (hostToDev) {
@@ -700,14 +711,22 @@ bool DmaBlitManager::hsaCopyStaged(const_address hostSrc, address hostDst, size_
       const hsa_agent_t srcAgent =
           (size <= dev().settings().sdmaCopyThreshold_) ? dev().getBackendDevice() : dev().getCpuAgent();
 
+      HwQueueEngine engine = HwQueueEngine::Unknown;
+      if (srcAgent.handle == dev().getBackendDevice().handle) {
+        engine = HwQueueEngine::SdmaWrite;
+      }
+      gpu().Barriers().SetActiveEngine(engine);
+      hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp());
+
       memcpy(hsaBuffer, hostSrc + offset, size);
       status = hsa_amd_memory_async_copy(hostDst + offset, dev().getBackendDevice(), hsaBuffer,
                                          srcAgent, size, 0, nullptr, active);
-      gpu().setLastCommandSDMA(true);
       if (status != HSA_STATUS_SUCCESS) {
         gpu().Barriers().ResetCurrentSignal();
         LogPrintfError("Hsa copy from host to device failed with code %d", status);
         return false;
+      } else {
+        gpu().setLastCommandSDMA(true);
       }
       gpu().Barriers().WaitCurrent();
       totalSize -= size;
@@ -721,14 +740,22 @@ bool DmaBlitManager::hsaCopyStaged(const_address hostSrc, address hostDst, size_
     const hsa_agent_t dstAgent =
         (size <= dev().settings().sdmaCopyThreshold_) ? dev().getBackendDevice() : dev().getCpuAgent();
 
+    HwQueueEngine engine = HwQueueEngine::Unknown;
+    if (dstAgent.handle == dev().getBackendDevice().handle) {
+      engine = HwQueueEngine::SdmaRead;
+    }
+    gpu().Barriers().SetActiveEngine(engine);
+    hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp());
+
     // Copy data from Device to Host
     status = hsa_amd_memory_async_copy(hsaBuffer, dstAgent, hostSrc + offset,
         dev().getBackendDevice(), size, 0, nullptr, active);
-    gpu().setLastCommandSDMA(true);
     if (status == HSA_STATUS_SUCCESS) {
+      gpu().setLastCommandSDMA(true);
       gpu().Barriers().WaitCurrent();
       memcpy(hostDst + offset, hsaBuffer, size);
     } else {
+      gpu().Barriers().ResetCurrentSignal();
       LogPrintfError("Hsa copy from device to host failed with code %d", status);
       return false;
     }
diff --git a/projects/clr/rocclr/device/rocm/rocblit.hpp b/projects/clr/rocclr/device/rocm/rocblit.hpp
index 5f344ff1af..a81480f21d 100644
--- a/projects/clr/rocclr/device/rocm/rocblit.hpp
+++ b/projects/clr/rocclr/device/rocm/rocblit.hpp
@@ -40,9 +40,6 @@ class Kernel;
 class Memory;
 class VirtualGPU;
 
-constexpr bool kSkipCpuWait = true;
-constexpr bool kIgnoreBarrier = false;
-
 //! DMA Blit Manager
 class DmaBlitManager : public device::HostBlitManager {
  public:
diff --git a/projects/clr/rocclr/device/rocm/rocdefs.hpp b/projects/clr/rocclr/device/rocm/rocdefs.hpp
index 28284d76e0..44827de45d 100644
--- a/projects/clr/rocclr/device/rocm/rocdefs.hpp
+++ b/projects/clr/rocclr/device/rocm/rocdefs.hpp
@@ -33,11 +33,15 @@ static constexpr uint DeviceQueueMaskSize = 32;
 //! Set to match the number of pipes, which is 8.
 static constexpr uint kMaxAsyncQueues = 8;
 
+constexpr bool kSkipCpuWait = true;
+constexpr bool kIgnoreBarrier = false;
+
 enum HwQueueEngine : uint32_t {
   Compute   = 0,
   SdmaRead  = 1,
   SdmaWrite = 2,
-  Unknown   = 3
+  Unknown   = 3,
+  External  = 4
 };
 
 } // namespace roc
diff --git a/projects/clr/rocclr/device/rocm/rocsettings.cpp b/projects/clr/rocclr/device/rocm/rocsettings.cpp
index 4222061abb..e2f62210f5 100644
--- a/projects/clr/rocclr/device/rocm/rocsettings.cpp
+++ b/projects/clr/rocclr/device/rocm/rocsettings.cpp
@@ -28,6 +28,7 @@
 
 namespace roc {
 
+// ================================================================================================
 Settings::Settings() {
   // Initialize the HSA device default settings
 
@@ -91,8 +92,15 @@ Settings::Settings() {
 
   rocr_backend_ = true;
   barrier_sync_ = (!flagIsDefault(ROC_BARRIER_SYNC)) ? ROC_BARRIER_SYNC : true;
+
+  cpu_wait_for_signal_ = !AMD_DIRECT_DISPATCH;
+  cpu_wait_for_signal_ = (!flagIsDefault(ROC_CPU_WAIT_FOR_SIGNAL)) ?
+                          ROC_CPU_WAIT_FOR_SIGNAL : cpu_wait_for_signal_;
+  system_scope_signal_ = ROC_SYSTEM_SCOPE_SIGNAL;
+  skip_copy_sync_      = ROC_SKIP_COPY_SYNC;
 }
 
+// ================================================================================================
 bool Settings::create(bool fullProfile, uint32_t gfxipMajor, uint32_t gfxipMinor, bool enableXNACK,
                       bool coop_groups) {
   customHostAllocator_ = false;
@@ -169,6 +177,7 @@ bool Settings::create(bool fullProfile, uint32_t gfxipMajor, uint32_t gfxipMinor
   return true;
 }
 
+// ================================================================================================
 void Settings::override() {
   // Limit reported workgroup size
   if (GPU_MAX_WORKGROUP_SIZE != 0) {
diff --git a/projects/clr/rocclr/device/rocm/rocsettings.hpp b/projects/clr/rocclr/device/rocm/rocsettings.hpp
index e0e29c0d27..5abace8b85 100644
--- a/projects/clr/rocclr/device/rocm/rocsettings.hpp
+++ b/projects/clr/rocclr/device/rocm/rocsettings.hpp
@@ -52,7 +52,10 @@ class Settings : public device::Settings {
       uint stagedXferWrite_ : 1;        //!< Uses a staged buffer write
       uint imageBufferWar_ : 1;         //!< Image buffer workaround for Gfx10
       uint barrier_sync_ : 1;           //!< Use AQL barrier command to sync with CPU
-      uint reserved_ : 23;
+      uint cpu_wait_for_signal_ : 1;    //!< Wait for HSA signal on CPU
+      uint system_scope_signal_ : 1;    //!< HSA signal is visible to the entire system
+      uint skip_copy_sync_ : 1;         //!< Ignore explicit HSA signal waits for copy functionality
+      uint reserved_ : 20;
     };
     uint value_;
   };
diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp
index 7340d99e70..a4a6257bb2 100644
--- a/projects/clr/rocclr/device/rocm/rocvirtual.cpp
+++ b/projects/clr/rocclr/device/rocm/rocvirtual.cpp
@@ -35,10 +35,12 @@
 #include "amd_hsa_kernel_code.h"
 
 #include <fstream>
-#include <vector>
-#include <string>
 #include <limits>
+#include <memory>
+#include <string>
 #include <thread>
+#include <vector>
+
 
 /**
 * HSA image object size in bytes (see HSAIL spec)
@@ -100,6 +102,37 @@ static unsigned extractAqlBits(unsigned v, unsigned pos, unsigned width) {
   return (v >> pos) & ((1 << width) - 1);
 };
 
+// ================================================================================================
+void Timestamp::checkGpuTime() {
+  if (HwProfiling()) {
+    uint64_t  start = std::numeric_limits<uint64_t>::max();
+    uint64_t  end = 0;
+
+    for (auto it : signals_) {
+      if (hsa_signal_load_relaxed(it->signal_) > 0) {
+        WaitForSignal(it->signal_);
+      }
+      hsa_amd_profiling_dispatch_time_t time = {};
+      if (it->engine_ == HwQueueEngine::Compute) {
+        hsa_amd_profiling_get_dispatch_time(agent_, it->signal_, &time);
+      } else {
+        hsa_amd_profiling_async_copy_time_t time_sdma = {};
+        hsa_amd_profiling_get_async_copy_time(it->signal_, &time_sdma);
+        time.start = time_sdma.start;
+        time.end = time_sdma.end;
+      }
+      start = std::min(time.start, start);
+      end = std::max(time.end, end);
+      it->ts_ = nullptr;
+      it->done_ = true;
+    }
+    signals_.clear();
+    start_ = start * ticksToTime_;
+    end_ = end * ticksToTime_;
+  }
+}
+
+// ================================================================================================
 bool VirtualGPU::MemoryDependency::create(size_t numMemObj) {
   if (numMemObj > 0) {
     // Allocate the array of memory objects for dependency tracking
@@ -114,6 +147,7 @@ bool VirtualGPU::MemoryDependency::create(size_t numMemObj) {
   return true;
 }
 
+// ================================================================================================
 void VirtualGPU::MemoryDependency::validate(VirtualGPU& gpu, const Memory* memory, bool readOnly) {
   bool flushL1Cache = false;
 
@@ -170,6 +204,7 @@ void VirtualGPU::MemoryDependency::validate(VirtualGPU& gpu, const Memory* memor
   numMemObjectsInQueue_++;
 }
 
+// ================================================================================================
 void VirtualGPU::MemoryDependency::clear(bool all) {
   if (numMemObjectsInQueue_ > 0) {
     size_t i, j;
@@ -205,6 +240,143 @@ void VirtualGPU::MemoryDependency::clear(bool all) {
   }
 }
 
+// ================================================================================================
+VirtualGPU::HwQueueTracker::~HwQueueTracker() {  // Releases all HSA signals owned by the pool
+  for (auto& signal: signal_list_) {  // Slots stay null if Create() bailed out part-way
+    if ((signal != nullptr) && (signal->signal_.handle != 0)) {
+      hsa_signal_destroy(signal->signal_);
+    }
+    delete signal;  // Deleting a null slot is a no-op
+  }
+}
+
+// ================================================================================================
+bool VirtualGPU::HwQueueTracker::Create() {
+  constexpr size_t kSignalListSize = 16;
+  signal_list_.resize(kSignalListSize);
+
+  hsa_agent_t agent = gpu_.gpu_device();
+  const Settings& settings = gpu_.dev().settings();
+  hsa_agent_t* agents = (settings.system_scope_signal_) ? nullptr : &agent;
+  uint32_t num_agents = (settings.system_scope_signal_) ? 0 : 1;
+
+  for (uint i = 0; i < kSignalListSize; ++i) {
+    std::unique_ptr<ProfilingSignal> signal(new ProfilingSignal());
+    if ((signal == nullptr) ||
+        (HSA_STATUS_SUCCESS != hsa_signal_create(0, num_agents, agents, &signal->signal_))) {
+      return false;
+    }
+    signal_list_[i] = signal.release();
+  }
+  return true;
+}
+
+// ================================================================================================
+hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal(
+    hsa_signal_value_t init_val, Timestamp* ts, uint32_t queue_size) {
+  // If queue size grows, then add more signals to avoid more frequent stalls
+  if (queue_size > signal_list_.size()) {
+    std::unique_ptr<ProfilingSignal> signal(new ProfilingSignal());
+    if (signal != nullptr) {
+      hsa_agent_t agent = gpu_.gpu_device();
+      const Settings& settings = gpu_.dev().settings();
+      hsa_agent_t* agents = (settings.system_scope_signal_) ? nullptr : &agent;
+      uint32_t num_agents = (settings.system_scope_signal_) ? 0 : 1;
+
+      if (HSA_STATUS_SUCCESS == hsa_signal_create(0, num_agents, agents, &signal->signal_)) {
+        signal_list_.push_back(signal.release());
+      }
+    }
+  }
+  // Find valid index
+  ++current_id_ %= signal_list_.size();
+
+  // Make sure the previous operation on the current signal is done
+  WaitCurrent();
+
+  // Have to wait the next signal in the queue to avoid a race condition between
+  // a GPU waiter(which may be not triggered yet) and CPU signal reset below
+  WaitNext();
+
+  // Reset the signal and return
+  hsa_signal_silent_store_relaxed(signal_list_[current_id_]->signal_, init_val);
+  signal_list_[current_id_]->done_ = false;
+  signal_list_[current_id_]->engine_ = engine_;
+  if (ts != 0) {
+    if (!sdma_profiling_) {
+      hsa_amd_profiling_async_copy_enable(true);
+      sdma_profiling_ = true;
+    }
+    signal_list_[current_id_]->ts_ = ts;
+    ts->AddProfilingSignal(signal_list_[current_id_]);
+  }
+  return signal_list_[current_id_]->signal_;
+}
+
+// ================================================================================================
+hsa_signal_t* VirtualGPU::HwQueueTracker::WaitingSignal(HwQueueEngine engine) {
+  bool explicit_wait = false;
+  hsa_signal_t* signal = nullptr;
+  // Does runtime switch the active engine?
+  if (engine != engine_) {
+    // Yes, return the signal from the previous operation for a wait
+    engine_ = engine;
+    explicit_wait = true;
+  } else {
+    // Unknown engine in use, hence return a wait signal always
+    if (engine == HwQueueEngine::Unknown) {
+      explicit_wait = true;
+    } else {
+      // Check if skip wait optimization is enabled. It will try to predict the same engine in ROCr
+      // and ignore signal wait, relying on in-order engine execution
+      const Settings& settings = gpu_.dev().settings();
+      if (!settings.skip_copy_sync_ && (engine != HwQueueEngine::Compute)) {
+        explicit_wait = true;
+      }
+    }
+  }
+  // Check if a wait is required
+  if (explicit_wait) {
+    ProfilingSignal* prof_signal;
+    // Check if there is an external signal
+    if (external_signal_ != nullptr) {
+      prof_signal = external_signal_;
+      external_signal_ = nullptr;
+    } else {
+      prof_signal = signal_list_[current_id_];
+    }
+    // Early signal status check
+    if (hsa_signal_load_relaxed(prof_signal->signal_) > 0) {
+      const Settings& settings = gpu_.dev().settings();
+      // Wait on CPU if requested
+      if (settings.cpu_wait_for_signal_) {
+        CpuWaitForSignal(prof_signal);
+      } else {
+        return &prof_signal->signal_;
+      }
+    }
+  }
+  return signal;
+}
+
+// ================================================================================================
+bool VirtualGPU::HwQueueTracker::CpuWaitForSignal(ProfilingSignal* signal) {
+  // Block the host until the signal completes; nothing to do if it is already marked done
+  if (!signal->done_) {
+    // Profiled signals are waited on (and marked done) inside Timestamp::checkGpuTime()
+    if (signal->ts_ != nullptr) {
+      signal->ts_->checkGpuTime();
+    } else {
+      if (!WaitForSignal(signal->signal_)) {
+        LogPrintfError("Failed signal [0x%lx] wait", signal->signal_.handle);  // log the raw handle
+        return false;
+      }
+      signal->done_ = true;
+    }
+  }
+  return true;  // Returns false only when the unprofiled wait itself fails
+}
+
 // ================================================================================================
 void VirtualGPU::HwQueueTracker::ResetCurrentSignal() {
   // Reset the signal and return
@@ -537,6 +709,16 @@ bool VirtualGPU::dispatchGenericAqlPacket(
 // ================================================================================================
 bool VirtualGPU::dispatchAqlPacket(
   hsa_kernel_dispatch_packet_t* packet, uint16_t header, uint16_t rest, bool blocking) {
+  hsa_signal_t* wait = Barriers().WaitingSignal();
+  // AQL dispatch doesn't support dependent signals and extra barrier packet must be generated
+  if (wait != nullptr) {
+    barrier_packet_.dep_signal[0] = *wait;
+    constexpr bool kSkipSignal = true;
+    dispatchBarrierPacket(&barrier_packet_, kNopPacketHeader, kSkipSignal);
+  } else {
+    barrier_packet_.dep_signal[0] = hsa_signal_t{};
+  }
+
   return dispatchGenericAqlPacket(packet, header, rest, blocking);
 }
 
@@ -587,6 +769,9 @@ void VirtualGPU::dispatchBarrierPacket(hsa_barrier_and_packet_t* packet,
   if (!skipSignal) {
     // Pool size must grow to the size of pending AQL packets
     const uint32_t pool_size = index - read;
+    hsa_signal_t* wait = Barriers().WaitingSignal();
+    packet->dep_signal[0] = (wait != nullptr) ? *wait : hsa_signal_t{};
+
     // Get active signal for current dispatch if profiling is necessary
     packet->completion_signal = Barriers().ActiveSignal(kInitSignalValueOne, timestamp_,
                                                         pool_size);
@@ -663,6 +848,7 @@ VirtualGPU::VirtualGPU(Device& device, bool profiling, bool cooperative,
       schedulerParam_(nullptr),
       schedulerQueue_(nullptr),
       schedulerSignal_({0}),
+      barriers_(*this),
       cuMask_(cuMask),
       priority_(priority),
       copy_command_type_(0)
@@ -804,7 +990,7 @@ bool VirtualGPU::create() {
   }
 
   // Allocate signal tracker for ROCr copy queue
-  if (!Barriers().Create(gpu_device())) {
+  if (!Barriers().Create()) {
     LogError("Could not create signal for copy queue!");
     return false;
   }
@@ -867,7 +1053,7 @@ void VirtualGPU::profilingBegin(amd::Command& command, bool drmProfiling) {
       return;
     }
     // Without barrier profiling will wait for each individual signal
-    timestamp_ = new Timestamp();
+    timestamp_ = new Timestamp(dev().getBackendDevice());
     timestamp_->start();
   }
 }
@@ -1193,10 +1379,9 @@ void VirtualGPU::submitSvmPrefetchAsync(amd::SvmPrefetchAsyncCommand& cmd) {
 #if AMD_HMM_SUPPORT
   profilingBegin(cmd);
   // Initialize signal for the barrier
-  hsa_signal_t wait = Barriers().WaitSignal();
-  hsa_signal_t active = Barriers().ActiveSignal(kInitSignalValueOne, timestamp_);
-  uint32_t num_wait_events = (wait.handle == 0) ? 0 : 1;
-  hsa_signal_t* wait_event = (wait.handle == 0) ? nullptr : &wait;
+  hsa_signal_t* wait_event = Barriers().WaitingSignal(HwQueueEngine::Unknown);
+  hsa_signal_t      active = Barriers().ActiveSignal(kInitSignalValueOne, timestamp_);
+  uint32_t num_wait_events = (wait_event == nullptr) ? 0 : 1;
 
   // Find the requested agent for the transfer
   hsa_agent_t agent = (cmd.cpu_access() ||
@@ -1207,7 +1392,7 @@ void VirtualGPU::submitSvmPrefetchAsync(amd::SvmPrefetchAsyncCommand& cmd) {
   hsa_status_t status = hsa_amd_svm_prefetch_async(
       const_cast<void*>(cmd.dev_ptr()), cmd.count(), agent, num_wait_events, wait_event, active);
 
-  // Wait for the prefetch. Should skip wait, but may require extra tracking for kernel execution.
+  // Wait for the prefetch. Should skip wait, but may require extra tracking for kernel execution
   if ((status != HSA_STATUS_SUCCESS) || !Barriers().WaitCurrent()) {
     Barriers().ResetCurrentSignal();
     LogError("hsa_amd_svm_prefetch_async failed");
@@ -2376,6 +2561,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
   }
   return true;
 }
+
 /**
  * @brief Api to dispatch a kernel for execution. The implementation
  * parses the input object, an instance of virtual command to obtain
@@ -2385,10 +2571,11 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
  * It also parses the kernel arguments buffer to inject into Hsa Runtime
  * the list of kernel parameters.
  */
+ // ================================================================================================
 void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
   if (vcmd.cooperativeGroups() || vcmd.cooperativeMultiDeviceGroups()) {
     // Wait for the execution on the current queue, since the coop groups will use the device queue
-    releaseGpuMemoryFence();
+    releaseGpuMemoryFence(kIgnoreBarrier, kSkipCpuWait);
 
     // Get device queue for exclusive GPU access
     VirtualGPU* queue = dev().xferQueue();
@@ -2398,6 +2585,9 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
 
     queue->profilingBegin(vcmd);
 
+    // Add a dependency into the device queue on the current queue
+    queue->Barriers().SetExternalSignal(Barriers().GetLastSignal());
+
     if (vcmd.cooperativeGroups()) {
       // Initialize GWS if it's cooperative groups launch
       uint32_t workgroups = 0;
@@ -2420,7 +2610,11 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
       vcmd.setStatus(CL_INVALID_OPERATION);
     }
     // Wait for the execution on the device queue. Keep the current queue in-order
-    queue->releaseGpuMemoryFence();
+    queue->releaseGpuMemoryFence(kIgnoreBarrier, kSkipCpuWait);
+
+    // Add a dependency into the current queue on the coop queue
+    Barriers().SetExternalSignal(queue->Barriers().GetLastSignal());
+    hasPendingDispatch_ = true;
 
     queue->profilingEnd(vcmd);
   } else {
@@ -2440,6 +2634,7 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
   }
 }
 
+// ================================================================================================
 void VirtualGPU::submitNativeFn(amd::NativeFnCommand& cmd) {
   // std::cout<<__FUNCTION__<<" not implemented"<<"*********"<<std::endl;
 }
diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.hpp b/projects/clr/rocclr/device/rocm/rocvirtual.hpp
index a989b389f7..029fbb6240 100644
--- a/projects/clr/rocclr/device/rocm/rocvirtual.hpp
+++ b/projects/clr/rocclr/device/rocm/rocvirtual.hpp
@@ -92,46 +92,17 @@ class Timestamp {
 
   void AddProfilingSignal(ProfilingSignal* signal) { signals_.push_back(signal); }
 
-  const bool HwProfiling() const { return (signals_.size() > 0) ? true : false; }
+  const bool HwProfiling() const { return !signals_.empty(); }
 
-  void setAgent(hsa_agent_t agent) { agent_ = agent; }
-
-  Timestamp()
+  Timestamp(hsa_agent_t agent)
     : start_(std::numeric_limits<uint64_t>::max())
-    , end_(0) {
-    agent_.handle = 0;
-  }
+    , end_(0)
+    , agent_(agent) {}
 
   ~Timestamp() {}
 
   //! Finds execution ticks on GPU
-  void checkGpuTime() {
-    if (HwProfiling()) {
-      hsa_amd_profiling_dispatch_time_t time = {};
-
-      uint64_t start = std::numeric_limits<uint64_t>::max();
-      uint64_t end = 0;
-      for (auto it : signals_) {
-        if (hsa_signal_load_relaxed(it->signal_) > 0) {
-          WaitForSignal(it->signal_);
-        }
-        hsa_amd_profiling_get_dispatch_time(agent_, it->signal_, &time);
-        if ((time.end - time.start) == 0) {
-          hsa_amd_profiling_async_copy_time_t time_sdma = {};
-          hsa_amd_profiling_get_async_copy_time(it->signal_, &time_sdma);
-          time.start = time_sdma.start;
-          time.end = time_sdma.end;
-        }
-        start = std::min(time.start, start);
-        end = std::max(time.end, end);
-        it->ts_ = nullptr;
-        it->done_ = true;
-      }
-      signals_.clear();
-      start_ = start * ticksToTime_;
-      end_ = end * ticksToTime_;
-    }
-  }
+  void checkGpuTime();
 
   // Start a timestamp (get timestamp from OS)
   void start() { start_ = amd::Os::timeNanos(); }
@@ -183,113 +154,54 @@ class VirtualGPU : public device::VirtualDevice {
 
   class HwQueueTracker : public amd::EmbeddedObject {
    public:
-    HwQueueTracker() {}
+    HwQueueTracker(const VirtualGPU& gpu): gpu_(gpu) {}
 
-    ~HwQueueTracker() {
-      for (auto& signal: signal_list_) {
-        if (signal->signal_.handle != 0) {
-          hsa_signal_destroy(signal->signal_);
-        }
-        delete signal;
-      }
-    }
+    ~HwQueueTracker();
 
     //! Creates a pool of signals for tracking of HW operations on the queue
-    bool Create(hsa_agent_t agent) {
-      constexpr size_t kSignalListSize = 16;
-      signal_list_.resize(kSignalListSize);
-      for (uint i = 0; i < kSignalListSize; ++i) {
-        ProfilingSignal* signal = new ProfilingSignal();
-        if ((signal == nullptr) || (HSA_STATUS_SUCCESS != hsa_signal_create(
-                                    0, 1, &agent, &signal->signal_))) {
-          return false;
-        }
-        signal_list_[i] = signal;
-      }
-      agent_ = agent;
-      return true;
-    }
+    bool Create();
 
     //! Finds a free signal for the upcomming operation
     hsa_signal_t ActiveSignal(hsa_signal_value_t init_val = kInitSignalValueOne,
-                              Timestamp* ts = nullptr, uint32_t queue_size = 0) {
-      // If queue size grows, then add more signals to avoid more frequent stalls
-      if (queue_size > signal_list_.size()) {
-        ProfilingSignal* signal = new ProfilingSignal();
-        if (signal != nullptr) {
-          if (HSA_STATUS_SUCCESS == hsa_signal_create(
-              0, 1, &agent_, &signal->signal_)) {
-            signal_list_.push_back(signal);
-          }
-        }
-      }
-      // Find valid index
-      ++current_id_ %= signal_list_.size();
-
-      // Make sure the previous operation on the current signal is done
-      WaitCurrent();
-
-      // Have to wait the next signal in the queue to avoid a race condition between
-      // a GPU waiter(which may be not triggered yet) and CPU signal reset below
-      WaitNext();
-
-      // Reset the signal and return
-      hsa_signal_silent_store_relaxed(signal_list_[current_id_]->signal_, init_val);
-      signal_list_[current_id_]->done_ = false;
-      if (ts != 0) {
-        if (!sdma_profiling_) {
-          hsa_amd_profiling_async_copy_enable(true);
-          sdma_profiling_ = true;
-        }
-        signal_list_[current_id_]->ts_ = ts;
-        ts->AddProfilingSignal(signal_list_[current_id_]);
-        ts->setAgent(agent_);
-      }
-      return signal_list_[current_id_]->signal_;
-    }
+                              Timestamp* ts = nullptr, uint32_t queue_size = 0);
 
     //! Wait for the curent active signal. Can idle the queue
-    bool WaitCurrent() { return WaitIndex(current_id_); }
+    bool WaitCurrent() { return CpuWaitForSignal(signal_list_[current_id_]); }
+
+    //! Update current active engine
+    void SetActiveEngine(HwQueueEngine engine = HwQueueEngine::Compute) { engine_ = engine; }
 
     //! Returns the last submitted signal for a wait
-    hsa_signal_t WaitSignal() {
-      //! @note Currently wait on CPU unconditionally to avoid a negative performance impact
-      WaitCurrent();
-      return hsa_signal_t{};
-    }
+    hsa_signal_t* WaitingSignal(HwQueueEngine engine = HwQueueEngine::Compute);
 
     //! Resets current signal back to the previous one. It's necessary in a case of ROCr failure.
     void ResetCurrentSignal();
 
-   private:
+    //! Inserts an external signal(submission in another queue) for dependency tracking
+    void SetExternalSignal(ProfilingSignal* signal) {
+      external_signal_ = signal;
+      engine_ = HwQueueEngine::External;
+    }
+
+    //! Returns the last submitted signal in this queue (for external dependency tracking)
+    ProfilingSignal* GetLastSignal() const { return signal_list_[current_id_]; }
+
+  private:
     //! Wait for the next active signal
     void WaitNext() {
       size_t next = (current_id_ + 1) % signal_list_.size();
-      WaitIndex(next);
+      CpuWaitForSignal(signal_list_[next]);
     }
 
     //! Wait for the provided signal
-    bool WaitIndex(size_t index) {
-      // Wait for the current signal
-      if (!signal_list_[index]->done_) {
-        // Update timestamp values if requested
-        if (signal_list_[index]->ts_ != nullptr) {
-          signal_list_[index]->ts_->checkGpuTime();
-        } else {
-          if (!WaitForSignal(signal_list_[index]->signal_)) {
-            LogPrintfError("Failed signal [0x%lx] wait", signal_list_[index]->signal_);
-            return false;
-          }
-          signal_list_[index]->done_ = true;
-        }
-      }
-      return true;
-    }
+    bool CpuWaitForSignal(ProfilingSignal* signal);
 
-    std::vector<ProfilingSignal*> signal_list_;  //!< The pool of all signals for processing
-    size_t      current_id_ = 0;          //!< Last submitted signal
-    hsa_agent_t agent_;                   //!< HSA device agent
-    bool        sdma_profiling_ = false;  //!< Don't enable SDMA profiling by default
+    HwQueueEngine engine_ = HwQueueEngine::Unknown; //!< Engine used in the current operations
+    std::vector<ProfilingSignal*> signal_list_;     //!< The pool of all signals for processing
+    ProfilingSignal*  external_signal_ = nullptr;   //!< Dependency on external signal
+    size_t current_id_ = 0;       //!< Last submitted signal
+    bool sdma_profiling_ = false; //!< If TRUE, then SDMA profiling is enabled
+    const VirtualGPU& gpu_;       //!< VirtualGPU, associated with this tracker
   };
 
   VirtualGPU(Device& device, bool profiling = false, bool cooperative = false,
@@ -358,7 +270,7 @@ class VirtualGPU : public device::VirtualDevice {
    */
   bool releaseGpuMemoryFence(bool force_barrier = false, bool skip_copy_wait = false);
 
-  hsa_agent_t gpu_device() { return gpu_device_; }
+  hsa_agent_t gpu_device() const { return gpu_device_; }
   hsa_queue_t* gpu_queue() { return gpu_queue_; }
 
   // Return pointer to PrintfDbg
diff --git a/projects/clr/rocclr/utils/flags.hpp b/projects/clr/rocclr/utils/flags.hpp
index ed9828778d..fe1b139f13 100644
--- a/projects/clr/rocclr/utils/flags.hpp
+++ b/projects/clr/rocclr/utils/flags.hpp
@@ -244,6 +244,12 @@ release(bool, ROC_ACTIVE_WAIT, false,                                         \
         "Forces unconditional active wait for GPU")                           \
 release(bool, ROC_ENABLE_LARGE_BAR, true,                                     \
         "Enable Large Bar if supported by the device")                        \
+release(bool, ROC_CPU_WAIT_FOR_SIGNAL, true,                                  \
+        "Enable CPU wait for dependent HSA signals.")                         \
+release(bool, ROC_SYSTEM_SCOPE_SIGNAL, true,                                  \
+        "Enable system scope for signals (uses interrupts).")                 \
+release(bool, ROC_SKIP_COPY_SYNC, false,                                      \
+        "Skips copy syncs if runtime can predict the same engine.")           \
 release(bool, HIP_FORCE_QUEUE_PROFILING, false,                               \
         "Force command queue profiling by default")                           \
 release(uint, PAL_FORCE_ASIC_REVISION, 0,                                     \