SWDEV-257787 - Add engine tracking per signal

- The logic will track compute and SDMA read/write operations and apply signals when necessary
- ROC_CPU_WAIT_FOR_SIGNAL, ROC_SYSTEM_SCOPE_SIGNAL and ROC_SKIP_COPY_SYNC were added to control the tracking

Change-Id: I9e8e6174c63bf7784f7ab00964e2918c8667d364
[ROCm/clr commit: dbc7abaecf]
2021-01-22 16:42:35 -05:00
@@ -433,18 +433,27 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d
    hsa_dim3_t offset = { 0, 0 ,0 };


-    if ((srcRect.rowPitch_ % 4 != 0)     ||
-        (srcRect.slicePitch_ % 4 != 0)    ||
-        (dstRect.rowPitch_ % 4 != 0)     ||
+    if ((srcRect.rowPitch_ % 4 != 0)    ||
+        (srcRect.slicePitch_ % 4 != 0)  ||
+        (dstRect.rowPitch_ % 4 != 0)    ||
        (dstRect.slicePitch_ % 4 != 0)) {
      isSubwindowRectCopy = false;
    }

+    HwQueueEngine engine = HwQueueEngine::Unknown;
+    if ((srcAgent.handle == dev().getCpuAgent().handle) &&
+        (dstAgent.handle != dev().getCpuAgent().handle)) {
+      engine = HwQueueEngine::SdmaWrite;
+    } else if ((srcAgent.handle != dev().getCpuAgent().handle) &&
+              (dstAgent.handle == dev().getCpuAgent().handle)) {
+      engine = HwQueueEngine::SdmaRead;
+    }
+
+    hsa_signal_t* wait_event = gpu().Barriers().WaitingSignal(engine);
+    uint32_t num_wait_events = (wait_event == nullptr) ? 0 : 1;
+
    if (isSubwindowRectCopy ) {
-      hsa_signal_t wait = gpu().Barriers().WaitSignal();
      hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp());
-      uint32_t num_wait_events = (wait.handle == 0) ? 0 : 1;
-      hsa_signal_t* wait_event = (wait.handle == 0) ? nullptr : &wait;

      // Copy memory line by line
      hsa_status_t status = hsa_amd_memory_async_copy_rect(&dstMem, &offset,
@@ -457,10 +466,7 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d
    } else {
      // Fall to line by line copies
      const hsa_signal_value_t kInitVal = size[2] * size[1];
-      hsa_signal_t wait = gpu().Barriers().WaitSignal();
      hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitVal, gpu().timestamp());
-      uint32_t num_wait_events = (wait.handle == 0) ? 0 : 1;
-      hsa_signal_t* wait_event = (wait.handle == 0) ? nullptr : &wait;

      for (size_t z = 0; z < size[2]; ++z) {
        for (size_t y = 0; y < size[1]; ++y) {
@@ -472,18 +478,18 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d
              (reinterpret_cast<address>(dst) + dstOffset), dstAgent,
              (reinterpret_cast<const_address>(src) + srcOffset), srcAgent,
              size[0], num_wait_events, wait_event, active);
-          gpu().setLastCommandSDMA(true) ;
          if (status != HSA_STATUS_SUCCESS) {
            gpu().Barriers().ResetCurrentSignal();
            LogPrintfError("DMA buffer failed with code %d", status);
            return false;
+          } else {
+            gpu().setLastCommandSDMA(true);
          }
        }
      }
    }
  }
-  // Explicit wait for now, until runtime could distinguish compute and sdma operations
-  gpu().Barriers().WaitCurrent();
+
  return true;
 }

@@ -644,18 +650,24 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory,
    srcAgent = dstAgent = dev().getBackendDevice();
  }

-  hsa_signal_t wait = gpu().Barriers().WaitSignal();
-  hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp());
-  uint32_t num_wait_events = (wait.handle == 0) ? 0 : 1;
-  hsa_signal_t* wait_event = (wait.handle == 0) ? nullptr : &wait;
+  HwQueueEngine engine = HwQueueEngine::Unknown;
+  if ((srcAgent.handle == dev().getCpuAgent().handle) &&
+      (dstAgent.handle != dev().getCpuAgent().handle)) {
+    engine = HwQueueEngine::SdmaWrite;
+  } else if ((srcAgent.handle != dev().getCpuAgent().handle) &&
+             (dstAgent.handle == dev().getCpuAgent().handle)) {
+    engine = HwQueueEngine::SdmaRead;
+  }
+
+  hsa_signal_t* wait_event = gpu().Barriers().WaitingSignal(engine);
+  uint32_t num_wait_events = (wait_event == nullptr) ? 0 : 1;
+  hsa_signal_t      active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp());

  // Use SDMA to transfer the data
  status = hsa_amd_memory_async_copy(dst, dstAgent, src, srcAgent,
      size[0], num_wait_events, wait_event, active);
-  gpu().setLastCommandSDMA(true);
-  // Explicit wait for now, until runtime could distinguish compute and sdma operations
-  gpu().Barriers().WaitCurrent();
  if (status == HSA_STATUS_SUCCESS) {
+    gpu().setLastCommandSDMA(true);
    gpu().addSystemScope();
  } else {
    gpu().Barriers().ResetCurrentSignal();
@@ -690,7 +702,6 @@ bool DmaBlitManager::hsaCopyStaged(const_address hostSrc, address hostDst, size_
  // Allocate requested size of memory
  while (totalSize > 0) {
    size = std::min(totalSize, dev().settings().stagedXferSize_);
-    hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp());

    // Copy data from Host to Device
    if (hostToDev) {
@@ -700,14 +711,22 @@ bool DmaBlitManager::hsaCopyStaged(const_address hostSrc, address hostDst, size_
      const hsa_agent_t srcAgent =
          (size <= dev().settings().sdmaCopyThreshold_) ? dev().getBackendDevice() : dev().getCpuAgent();

+      HwQueueEngine engine = HwQueueEngine::Unknown;
+      if (srcAgent.handle == dev().getBackendDevice().handle) {
+        engine = HwQueueEngine::SdmaWrite;
+      }
+      gpu().Barriers().SetActiveEngine(engine);
+      hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp());
+
      memcpy(hsaBuffer, hostSrc + offset, size);
      status = hsa_amd_memory_async_copy(hostDst + offset, dev().getBackendDevice(), hsaBuffer,
                                         srcAgent, size, 0, nullptr, active);
-      gpu().setLastCommandSDMA(true);
      if (status != HSA_STATUS_SUCCESS) {
        gpu().Barriers().ResetCurrentSignal();
        LogPrintfError("Hsa copy from host to device failed with code %d", status);
        return false;
+      } else {
+        gpu().setLastCommandSDMA(true);
      }
      gpu().Barriers().WaitCurrent();
      totalSize -= size;
@@ -721,14 +740,22 @@ bool DmaBlitManager::hsaCopyStaged(const_address hostSrc, address hostDst, size_
    const hsa_agent_t dstAgent =
        (size <= dev().settings().sdmaCopyThreshold_) ? dev().getBackendDevice() : dev().getCpuAgent();

+    HwQueueEngine engine = HwQueueEngine::Unknown;
+    if (dstAgent.handle == dev().getBackendDevice().handle) {
+      engine = HwQueueEngine::SdmaRead;
+    }
+    gpu().Barriers().SetActiveEngine(engine);
+    hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp());
+
    // Copy data from Device to Host
    status = hsa_amd_memory_async_copy(hsaBuffer, dstAgent, hostSrc + offset,
        dev().getBackendDevice(), size, 0, nullptr, active);
-    gpu().setLastCommandSDMA(true);
    if (status == HSA_STATUS_SUCCESS) {
+      gpu().setLastCommandSDMA(true);
      gpu().Barriers().WaitCurrent();
      memcpy(hostDst + offset, hsaBuffer, size);
    } else {
+      gpu().Barriers().ResetCurrentSignal();
      LogPrintfError("Hsa copy from device to host failed with code %d", status);
      return false;
    }
@@ -40,9 +40,6 @@ class Kernel;
 class Memory;
 class VirtualGPU;

-constexpr bool kSkipCpuWait = true;
-constexpr bool kIgnoreBarrier = false;
-
 //! DMA Blit Manager
 class DmaBlitManager : public device::HostBlitManager {
 public:
@@ -33,11 +33,15 @@ static constexpr uint DeviceQueueMaskSize = 32;
 //! Set to match the number of pipes, which is 8.
 static constexpr uint kMaxAsyncQueues = 8;

+constexpr bool kSkipCpuWait = true;
+constexpr bool kIgnoreBarrier = false;
+
 enum HwQueueEngine : uint32_t {
  Compute   = 0,
  SdmaRead  = 1,
  SdmaWrite = 2,
-  Unknown   = 3
+  Unknown   = 3,
+  External  = 4
 };

 } // namespace roc
@@ -28,6 +28,7 @@

 namespace roc {

+// ================================================================================================
 Settings::Settings() {
  // Initialize the HSA device default settings

@@ -91,8 +92,15 @@ Settings::Settings() {

  rocr_backend_ = true;
  barrier_sync_ = (!flagIsDefault(ROC_BARRIER_SYNC)) ? ROC_BARRIER_SYNC : true;
+
+  cpu_wait_for_signal_ = !AMD_DIRECT_DISPATCH;
+  cpu_wait_for_signal_ = (!flagIsDefault(ROC_CPU_WAIT_FOR_SIGNAL)) ?
+                          ROC_CPU_WAIT_FOR_SIGNAL : cpu_wait_for_signal_;
+  system_scope_signal_ = ROC_SYSTEM_SCOPE_SIGNAL;
+  skip_copy_sync_      = ROC_SKIP_COPY_SYNC;
 }

+// ================================================================================================
 bool Settings::create(bool fullProfile, uint32_t gfxipMajor, uint32_t gfxipMinor, bool enableXNACK,
                      bool coop_groups) {
  customHostAllocator_ = false;
@@ -169,6 +177,7 @@ bool Settings::create(bool fullProfile, uint32_t gfxipMajor, uint32_t gfxipMinor
  return true;
 }

+// ================================================================================================
 void Settings::override() {
  // Limit reported workgroup size
  if (GPU_MAX_WORKGROUP_SIZE != 0) {
@@ -52,7 +52,10 @@ class Settings : public device::Settings {
      uint stagedXferWrite_ : 1;        //!< Uses a staged buffer write
      uint imageBufferWar_ : 1;         //!< Image buffer workaround for Gfx10
      uint barrier_sync_ : 1;           //!< Use AQL barrier command to sync with CPU
-      uint reserved_ : 23;
+      uint cpu_wait_for_signal_ : 1;    //!< Wait for HSA signal on CPU
+      uint system_scope_signal_ : 1;    //!< HSA signal is visible to the entire system
+      uint skip_copy_sync_ : 1;         //!< Ignore explicit HSA signal waits for copy functionality
+      uint reserved_ : 20;
    };
    uint value_;
  };
@@ -35,10 +35,12 @@
 #include "amd_hsa_kernel_code.h"

 #include <fstream>
-#include <vector>
-#include <string>
 #include <limits>
+#include <memory>
+#include <string>
 #include <thread>
+#include <vector>
+

 /**
 * HSA image object size in bytes (see HSAIL spec)
@@ -100,6 +102,37 @@ static unsigned extractAqlBits(unsigned v, unsigned pos, unsigned width) {
  return (v >> pos) & ((1 << width) - 1);
 };

+// ================================================================================================
+void Timestamp::checkGpuTime() {  // Folds GPU times of all attached signals into start_/end_
+  if (HwProfiling()) {  // Nothing to do when no profiling signals are attached
+    uint64_t  start = std::numeric_limits<uint64_t>::max();
+    uint64_t  end = 0;
+
+    for (auto it : signals_) {
+      if (hsa_signal_load_relaxed(it->signal_) > 0) {  // Value above zero: wait for it first
+        WaitForSignal(it->signal_);
+      }
+      hsa_amd_profiling_dispatch_time_t time = {};
+      if (it->engine_ == HwQueueEngine::Compute) {  // Compute: query dispatch times
+        hsa_amd_profiling_get_dispatch_time(agent_, it->signal_, &time);
+      } else {  // Any other engine: query async copy times
+        hsa_amd_profiling_async_copy_time_t time_sdma = {};
+        hsa_amd_profiling_get_async_copy_time(it->signal_, &time_sdma);
+        time.start = time_sdma.start;
+        time.end = time_sdma.end;
+      }
+      start = std::min(time.start, start);  // Earliest start over all signals
+      end = std::max(time.end, end);        // Latest end over all signals
+      it->ts_ = nullptr;                    // Detach the signal from this timestamp
+      it->done_ = true;
+    }
+    signals_.clear();
+    start_ = start * ticksToTime_;  // Scaled by ticksToTime_ before storing
+    end_ = end * ticksToTime_;
+  }
+}
+
+// ================================================================================================
 bool VirtualGPU::MemoryDependency::create(size_t numMemObj) {
  if (numMemObj > 0) {
    // Allocate the array of memory objects for dependency tracking
@@ -114,6 +147,7 @@ bool VirtualGPU::MemoryDependency::create(size_t numMemObj) {
  return true;
 }

+// ================================================================================================
 void VirtualGPU::MemoryDependency::validate(VirtualGPU& gpu, const Memory* memory, bool readOnly) {
  bool flushL1Cache = false;

@@ -170,6 +204,7 @@ void VirtualGPU::MemoryDependency::validate(VirtualGPU& gpu, const Memory* memor
  numMemObjectsInQueue_++;
 }

+// ================================================================================================
 void VirtualGPU::MemoryDependency::clear(bool all) {
  if (numMemObjectsInQueue_ > 0) {
    size_t i, j;
@@ -205,6 +240,143 @@ void VirtualGPU::MemoryDependency::clear(bool all) {
  }
 }

+// ================================================================================================
+VirtualGPU::HwQueueTracker::~HwQueueTracker() {  // Destroys the HSA signals and frees the pool
+  for (auto& signal: signal_list_) {
+    if ((signal != nullptr) && (signal->signal_.handle != 0)) {  // Create() may leave null slots
+      hsa_signal_destroy(signal->signal_);
+    }
+    delete signal;  // Deleting a null slot is a no-op
+  }
+}
+
+// ================================================================================================
+bool VirtualGPU::HwQueueTracker::Create() {  // Fills the pool with kSignalListSize HSA signals
+  constexpr size_t kSignalListSize = 16;
+  signal_list_.resize(kSignalListSize);  // Every slot starts out as nullptr
+  // With system_scope_signal_ set no agent list is passed, otherwise only this queue's GPU agent
+  hsa_agent_t agent = gpu_.gpu_device();
+  const Settings& settings = gpu_.dev().settings();
+  hsa_agent_t* agents = (settings.system_scope_signal_) ? nullptr : &agent;
+  uint32_t num_agents = (settings.system_scope_signal_) ? 0 : 1;
+  // unique_ptr frees the ProfilingSignal if the HSA signal can't be created
+  for (uint i = 0; i < kSignalListSize; ++i) {
+    std::unique_ptr<ProfilingSignal> signal(new ProfilingSignal());
+    if ((signal == nullptr) ||
+        (HSA_STATUS_SUCCESS != hsa_signal_create(0, num_agents, agents, &signal->signal_))) {
+      return false;  // Slots from index i onward stay nullptr
+    }
+    signal_list_[i] = signal.release();  // The pool owns the signal from here on
+  }
+  return true;
+}
+
+// ================================================================================================
+hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal(
+    hsa_signal_value_t init_val, Timestamp* ts, uint32_t queue_size) {
+  // Pool is smaller than the reported queue size: add one more signal to avoid frequent stalls
+  if (queue_size > signal_list_.size()) {
+    std::unique_ptr<ProfilingSignal> signal(new ProfilingSignal());
+    if (signal != nullptr) {
+      hsa_agent_t agent = gpu_.gpu_device();
+      const Settings& settings = gpu_.dev().settings();
+      hsa_agent_t* agents = (settings.system_scope_signal_) ? nullptr : &agent;
+      uint32_t num_agents = (settings.system_scope_signal_) ? 0 : 1;
+      // A failed signal creation is not reported, the pool simply keeps its current size
+      if (HSA_STATUS_SUCCESS == hsa_signal_create(0, num_agents, agents, &signal->signal_)) {
+        signal_list_.push_back(signal.release());
+      }
+    }
+  }
+  // Advance to the next slot in the pool, wrapping around at the end
+  ++current_id_ %= signal_list_.size();
+
+  // The slot is about to be reused: make sure its previous operation is done
+  WaitCurrent();
+
+  // Have to wait for the next signal in the queue to avoid a race condition between
+  // a GPU waiter (which may not be triggered yet) and the CPU signal reset below
+  WaitNext();
+
+  // Re-arm the signal with init_val and record the engine of the new operation
+  hsa_signal_silent_store_relaxed(signal_list_[current_id_]->signal_, init_val);
+  signal_list_[current_id_]->done_ = false;
+  signal_list_[current_id_]->engine_ = engine_;
+  if (ts != 0) {  // Profiling requested: attach the signal to the timestamp
+    if (!sdma_profiling_) {  // Async copy profiling is switched on at first use
+      hsa_amd_profiling_async_copy_enable(true);
+      sdma_profiling_ = true;
+    }
+    signal_list_[current_id_]->ts_ = ts;
+    ts->AddProfilingSignal(signal_list_[current_id_]);
+  }
+  return signal_list_[current_id_]->signal_;
+}
+
+// ================================================================================================
+hsa_signal_t* VirtualGPU::HwQueueTracker::WaitingSignal(HwQueueEngine engine) {
+  bool explicit_wait = false;       // Set when the new operation must sync with the previous one
+  hsa_signal_t* signal = nullptr;   // Stays nullptr, returned when no dependent signal is needed
+  // Does the runtime switch the active engine?
+  if (engine != engine_) {
+    // Yes, remember the new engine and wait for the signal from the previous operation
+    engine_ = engine;
+    explicit_wait = true;
+  } else {
+    // Unknown engine in use, hence always request a wait
+    if (engine == HwQueueEngine::Unknown) {
+      explicit_wait = true;
+    } else {
+      // Same engine as before. Copies still wait unless skip-copy-sync is enabled, which tries to
+      // predict the same engine in ROCr and relies on in-order engine execution
+      const Settings& settings = gpu_.dev().settings();
+      if (!settings.skip_copy_sync_ && (engine != HwQueueEngine::Compute)) {
+        explicit_wait = true;
+      }
+    }
+  }
+  // Check if a wait is required
+  if (explicit_wait) {
+    ProfilingSignal* prof_signal;
+    // An external signal takes priority over the last submitted one and is consumed here
+    if (external_signal_ != nullptr) {
+      prof_signal = external_signal_;
+      external_signal_ = nullptr;
+    } else {
+      prof_signal = signal_list_[current_id_];
+    }
+    // Early signal status check, nothing to wait for unless the value is still above zero
+    if (hsa_signal_load_relaxed(prof_signal->signal_) > 0) {
+      const Settings& settings = gpu_.dev().settings();
+      // Wait on CPU if requested, otherwise hand the signal back to the caller as a dependency
+      if (settings.cpu_wait_for_signal_) {
+        CpuWaitForSignal(prof_signal);
+      } else {
+        return &prof_signal->signal_;
+      }
+    }
+  }
+  return signal;  // nullptr: no wait required, or the wait was already done on CPU
+}
+
+// ================================================================================================
+bool VirtualGPU::HwQueueTracker::CpuWaitForSignal(ProfilingSignal* signal) {
+  // Waits on CPU unless the signal is already marked done. Returns false if the wait fails
+  if (!signal->done_) {
+    // With a timestamp attached, checkGpuTime() does the wait and collects the GPU times
+    if (signal->ts_ != nullptr) {
+      signal->ts_->checkGpuTime();
+    } else {
+      if (!WaitForSignal(signal->signal_)) {
+        LogPrintfError("Failed signal [0x%lx] wait", signal->signal_.handle);
+        return false;
+      }
+      signal->done_ = true;
+    }
+  }
+  return true;
+}
+
 // ================================================================================================
 void VirtualGPU::HwQueueTracker::ResetCurrentSignal() {
  // Reset the signal and return
@@ -537,6 +709,16 @@ bool VirtualGPU::dispatchGenericAqlPacket(
 // ================================================================================================
 bool VirtualGPU::dispatchAqlPacket(
  hsa_kernel_dispatch_packet_t* packet, uint16_t header, uint16_t rest, bool blocking) {
+  hsa_signal_t* wait = Barriers().WaitingSignal();
+  // AQL dispatch doesn't support dependent signals and extra barrier packet must be generated
+  if (wait != nullptr) {
+    barrier_packet_.dep_signal[0] = *wait;
+    constexpr bool kSkipSignal = true;
+    dispatchBarrierPacket(&barrier_packet_, kNopPacketHeader, kSkipSignal);
+  } else {
+    barrier_packet_.dep_signal[0] = hsa_signal_t{};
+  }
+
  return dispatchGenericAqlPacket(packet, header, rest, blocking);
 }

@@ -587,6 +769,9 @@ void VirtualGPU::dispatchBarrierPacket(hsa_barrier_and_packet_t* packet,
  if (!skipSignal) {
    // Pool size must grow to the size of pending AQL packets
    const uint32_t pool_size = index - read;
+    hsa_signal_t* wait = Barriers().WaitingSignal();
+    packet->dep_signal[0] = (wait != nullptr) ? *wait : hsa_signal_t{};
+
    // Get active signal for current dispatch if profiling is necessary
    packet->completion_signal = Barriers().ActiveSignal(kInitSignalValueOne, timestamp_,
                                                        pool_size);
@@ -663,6 +848,7 @@ VirtualGPU::VirtualGPU(Device& device, bool profiling, bool cooperative,
      schedulerParam_(nullptr),
      schedulerQueue_(nullptr),
      schedulerSignal_({0}),
+      barriers_(*this),
      cuMask_(cuMask),
      priority_(priority),
      copy_command_type_(0)
@@ -804,7 +990,7 @@ bool VirtualGPU::create() {
  }

  // Allocate signal tracker for ROCr copy queue
-  if (!Barriers().Create(gpu_device())) {
+  if (!Barriers().Create()) {
    LogError("Could not create signal for copy queue!");
    return false;
  }
@@ -867,7 +1053,7 @@ void VirtualGPU::profilingBegin(amd::Command& command, bool drmProfiling) {
      return;
    }
    // Without barrier profiling will wait for each individual signal
-    timestamp_ = new Timestamp();
+    timestamp_ = new Timestamp(dev().getBackendDevice());
    timestamp_->start();
  }
 }
@@ -1193,10 +1379,9 @@ void VirtualGPU::submitSvmPrefetchAsync(amd::SvmPrefetchAsyncCommand& cmd) {
 #if AMD_HMM_SUPPORT
  profilingBegin(cmd);
  // Initialize signal for the barrier
-  hsa_signal_t wait = Barriers().WaitSignal();
-  hsa_signal_t active = Barriers().ActiveSignal(kInitSignalValueOne, timestamp_);
-  uint32_t num_wait_events = (wait.handle == 0) ? 0 : 1;
-  hsa_signal_t* wait_event = (wait.handle == 0) ? nullptr : &wait;
+  hsa_signal_t* wait_event = Barriers().WaitingSignal(HwQueueEngine::Unknown);
+  hsa_signal_t      active = Barriers().ActiveSignal(kInitSignalValueOne, timestamp_);
+  uint32_t num_wait_events = (wait_event == nullptr) ? 0 : 1;

  // Find the requested agent for the transfer
  hsa_agent_t agent = (cmd.cpu_access() ||
@@ -1207,7 +1392,7 @@ void VirtualGPU::submitSvmPrefetchAsync(amd::SvmPrefetchAsyncCommand& cmd) {
  hsa_status_t status = hsa_amd_svm_prefetch_async(
      const_cast<void*>(cmd.dev_ptr()), cmd.count(), agent, num_wait_events, wait_event, active);

-  // Wait for the prefetch. Should skip wait, but may require extra tracking for kernel execution.
+  // Wait for the prefetch. Should skip wait, but may require extra tracking for kernel execution
  if ((status != HSA_STATUS_SUCCESS) || !Barriers().WaitCurrent()) {
    Barriers().ResetCurrentSignal();
    LogError("hsa_amd_svm_prefetch_async failed");
@@ -2376,6 +2561,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
  }
  return true;
 }
+
 /**
 * @brief Api to dispatch a kernel for execution. The implementation
 * parses the input object, an instance of virtual command to obtain
@@ -2385,10 +2571,11 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
 * It also parses the kernel arguments buffer to inject into Hsa Runtime
 * the list of kernel parameters.
 */
+// ================================================================================================
 void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
  if (vcmd.cooperativeGroups() || vcmd.cooperativeMultiDeviceGroups()) {
    // Wait for the execution on the current queue, since the coop groups will use the device queue
-    releaseGpuMemoryFence();
+    releaseGpuMemoryFence(kIgnoreBarrier, kSkipCpuWait);

    // Get device queue for exclusive GPU access
    VirtualGPU* queue = dev().xferQueue();
@@ -2398,6 +2585,9 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {

    queue->profilingBegin(vcmd);

+    // Add a dependency into the device queue on the current queue
+    queue->Barriers().SetExternalSignal(Barriers().GetLastSignal());
+
    if (vcmd.cooperativeGroups()) {
      // Initialize GWS if it's cooperative groups launch
      uint32_t workgroups = 0;
@@ -2420,7 +2610,11 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
      vcmd.setStatus(CL_INVALID_OPERATION);
    }
    // Wait for the execution on the device queue. Keep the current queue in-order
-    queue->releaseGpuMemoryFence();
+    queue->releaseGpuMemoryFence(kIgnoreBarrier, kSkipCpuWait);
+
+    // Add a dependency into the current queue on the coop queue
+    Barriers().SetExternalSignal(queue->Barriers().GetLastSignal());
+    hasPendingDispatch_ = true;

    queue->profilingEnd(vcmd);
  } else {
@@ -2440,6 +2634,7 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
  }
 }

+// ================================================================================================
 void VirtualGPU::submitNativeFn(amd::NativeFnCommand& cmd) {
  // std::cout<<__FUNCTION__<<" not implemented"<<"*********"<<std::endl;
 }
@@ -92,46 +92,17 @@ class Timestamp {

  void AddProfilingSignal(ProfilingSignal* signal) { signals_.push_back(signal); }

-  const bool HwProfiling() const { return (signals_.size() > 0) ? true : false; }
+  const bool HwProfiling() const { return !signals_.empty(); }

-  void setAgent(hsa_agent_t agent) { agent_ = agent; }
-
-  Timestamp()
+  Timestamp(hsa_agent_t agent)
    : start_(std::numeric_limits<uint64_t>::max())
-    , end_(0) {
-    agent_.handle = 0;
-  }
+    , end_(0)
+    , agent_(agent) {}

  ~Timestamp() {}

  //! Finds execution ticks on GPU
-  void checkGpuTime() {
-    if (HwProfiling()) {
-      hsa_amd_profiling_dispatch_time_t time = {};
-
-      uint64_t start = std::numeric_limits<uint64_t>::max();
-      uint64_t end = 0;
-      for (auto it : signals_) {
-        if (hsa_signal_load_relaxed(it->signal_) > 0) {
-          WaitForSignal(it->signal_);
-        }
-        hsa_amd_profiling_get_dispatch_time(agent_, it->signal_, &time);
-        if ((time.end - time.start) == 0) {
-          hsa_amd_profiling_async_copy_time_t time_sdma = {};
-          hsa_amd_profiling_get_async_copy_time(it->signal_, &time_sdma);
-          time.start = time_sdma.start;
-          time.end = time_sdma.end;
-        }
-        start = std::min(time.start, start);
-        end = std::max(time.end, end);
-        it->ts_ = nullptr;
-        it->done_ = true;
-      }
-      signals_.clear();
-      start_ = start * ticksToTime_;
-      end_ = end * ticksToTime_;
-    }
-  }
+  void checkGpuTime();

  // Start a timestamp (get timestamp from OS)
  void start() { start_ = amd::Os::timeNanos(); }
@@ -183,113 +154,54 @@ class VirtualGPU : public device::VirtualDevice {

  class HwQueueTracker : public amd::EmbeddedObject {
   public:
-    HwQueueTracker() {}
+    HwQueueTracker(const VirtualGPU& gpu): gpu_(gpu) {}

-    ~HwQueueTracker() {
-      for (auto& signal: signal_list_) {
-        if (signal->signal_.handle != 0) {
-          hsa_signal_destroy(signal->signal_);
-        }
-        delete signal;
-      }
-    }
+    ~HwQueueTracker();

    //! Creates a pool of signals for tracking of HW operations on the queue
-    bool Create(hsa_agent_t agent) {
-      constexpr size_t kSignalListSize = 16;
-      signal_list_.resize(kSignalListSize);
-      for (uint i = 0; i < kSignalListSize; ++i) {
-        ProfilingSignal* signal = new ProfilingSignal();
-        if ((signal == nullptr) || (HSA_STATUS_SUCCESS != hsa_signal_create(
-                                    0, 1, &agent, &signal->signal_))) {
-          return false;
-        }
-        signal_list_[i] = signal;
-      }
-      agent_ = agent;
-      return true;
-    }
+    bool Create();

    //! Finds a free signal for the upcomming operation
    hsa_signal_t ActiveSignal(hsa_signal_value_t init_val = kInitSignalValueOne,
-                              Timestamp* ts = nullptr, uint32_t queue_size = 0) {
-      // If queue size grows, then add more signals to avoid more frequent stalls
-      if (queue_size > signal_list_.size()) {
-        ProfilingSignal* signal = new ProfilingSignal();
-        if (signal != nullptr) {
-          if (HSA_STATUS_SUCCESS == hsa_signal_create(
-              0, 1, &agent_, &signal->signal_)) {
-            signal_list_.push_back(signal);
-          }
-        }
-      }
-      // Find valid index
-      ++current_id_ %= signal_list_.size();
-
-      // Make sure the previous operation on the current signal is done
-      WaitCurrent();
-
-      // Have to wait the next signal in the queue to avoid a race condition between
-      // a GPU waiter(which may be not triggered yet) and CPU signal reset below
-      WaitNext();
-
-      // Reset the signal and return
-      hsa_signal_silent_store_relaxed(signal_list_[current_id_]->signal_, init_val);
-      signal_list_[current_id_]->done_ = false;
-      if (ts != 0) {
-        if (!sdma_profiling_) {
-          hsa_amd_profiling_async_copy_enable(true);
-          sdma_profiling_ = true;
-        }
-        signal_list_[current_id_]->ts_ = ts;
-        ts->AddProfilingSignal(signal_list_[current_id_]);
-        ts->setAgent(agent_);
-      }
-      return signal_list_[current_id_]->signal_;
-    }
+                              Timestamp* ts = nullptr, uint32_t queue_size = 0);

    //! Wait for the curent active signal. Can idle the queue
-    bool WaitCurrent() { return WaitIndex(current_id_); }
+    bool WaitCurrent() { return CpuWaitForSignal(signal_list_[current_id_]); }
+
+    //! Update current active engine
+    void SetActiveEngine(HwQueueEngine engine = HwQueueEngine::Compute) { engine_ = engine; }

    //! Returns the last submitted signal for a wait
-    hsa_signal_t WaitSignal() {
-      //! @note Currently wait on CPU unconditionally to avoid a negative performance impact
-      WaitCurrent();
-      return hsa_signal_t{};
-    }
+    hsa_signal_t* WaitingSignal(HwQueueEngine engine = HwQueueEngine::Compute);

    //! Resets current signal back to the previous one. It's necessary in a case of ROCr failure.
    void ResetCurrentSignal();

-   private:
+    //! Inserts an external signal(submission in another queue) for dependency tracking
+    void SetExternalSignal(ProfilingSignal* signal) {
+      external_signal_ = signal;
+      engine_ = HwQueueEngine::External;
+    }
+
+    //! Returns the last submitted signal, so another queue can use it as a dependency
+    ProfilingSignal* GetLastSignal() const { return signal_list_[current_id_]; }
+
+  private:
    //! Wait for the next active signal
    void WaitNext() {
      size_t next = (current_id_ + 1) % signal_list_.size();
-      WaitIndex(next);
+      CpuWaitForSignal(signal_list_[next]);
    }

    //! Wait for the provided signal
-    bool WaitIndex(size_t index) {
-      // Wait for the current signal
-      if (!signal_list_[index]->done_) {
-        // Update timestamp values if requested
-        if (signal_list_[index]->ts_ != nullptr) {
-          signal_list_[index]->ts_->checkGpuTime();
-        } else {
-          if (!WaitForSignal(signal_list_[index]->signal_)) {
-            LogPrintfError("Failed signal [0x%lx] wait", signal_list_[index]->signal_);
-            return false;
-          }
-          signal_list_[index]->done_ = true;
-        }
-      }
-      return true;
-    }
+    bool CpuWaitForSignal(ProfilingSignal* signal);

-    std::vector<ProfilingSignal*> signal_list_;  //!< The pool of all signals for processing
-    size_t      current_id_ = 0;          //!< Last submitted signal
-    hsa_agent_t agent_;                   //!< HSA device agent
-    bool        sdma_profiling_ = false;  //!< Don't enable SDMA profiling by default
+    HwQueueEngine engine_ = HwQueueEngine::Unknown; //!< Engine used in the current operations
+    std::vector<ProfilingSignal*> signal_list_;     //!< The pool of all signals for processing
+    ProfilingSignal*  external_signal_ = nullptr;   //!< Dependency on external signal
+    size_t current_id_ = 0;       //!< Last submitted signal
+    bool sdma_profiling_ = false; //!< If TRUE, then SDMA profiling is enabled
+    const VirtualGPU& gpu_;       //!< VirtualGPU, associated with this tracker
  };

  VirtualGPU(Device& device, bool profiling = false, bool cooperative = false,
@@ -358,7 +270,7 @@ class VirtualGPU : public device::VirtualDevice {
   */
  bool releaseGpuMemoryFence(bool force_barrier = false, bool skip_copy_wait = false);

-  hsa_agent_t gpu_device() { return gpu_device_; }
+  hsa_agent_t gpu_device() const { return gpu_device_; }
  hsa_queue_t* gpu_queue() { return gpu_queue_; }

  // Return pointer to PrintfDbg
@@ -244,6 +244,12 @@ release(bool, ROC_ACTIVE_WAIT, false,                                         \
        "Forces unconditional active wait for GPU")                           \
 release(bool, ROC_ENABLE_LARGE_BAR, true,                                     \
        "Enable Large Bar if supported by the device")                        \
+release(bool, ROC_CPU_WAIT_FOR_SIGNAL, true,                                  \
+        "Enable CPU wait for dependent HSA signals.")                         \
+release(bool, ROC_SYSTEM_SCOPE_SIGNAL, true,                                  \
+        "Enable system scope for signals (uses interrupts).")                 \
+release(bool, ROC_SKIP_COPY_SYNC, false,                                      \
+        "Skips copy syncs if runtime can predict the same engine.")           \
 release(bool, HIP_FORCE_QUEUE_PROFILING, false,                               \
        "Force command queue profiling by default")                           \
 release(uint, PAL_FORCE_ASIC_REVISION, 0,                                     \