diff --git a/rocclr/device/rocm/rocblit.cpp b/rocclr/device/rocm/rocblit.cpp
index e4ff536840..2f2f54f924 100644
--- a/rocclr/device/rocm/rocblit.cpp
+++ b/rocclr/device/rocm/rocblit.cpp
@@ -449,8 +449,7 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d
       engine = HwQueueEngine::SdmaRead;
     }
 
-    hsa_signal_t* wait_event = gpu().Barriers().WaitingSignal(engine);
-    uint32_t num_wait_events = (wait_event == nullptr) ? 0 : 1;
+    auto wait_events = gpu().Barriers().WaitingSignal(engine);
 
     if (isSubwindowRectCopy ) {
       hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp());
@@ -458,10 +457,10 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d
       // Copy memory line by line
       ClPrint(amd::LOG_DEBUG, amd::LOG_COPY,
               "[%zx]!\t HSA Asycn Copy Rect  wait_event=0x%zx, completion_signal=0x%zx",
-              std::this_thread::get_id(), (wait_event != nullptr) ? wait_event->handle : 0,
+              std::this_thread::get_id(), (wait_events.size() != 0) ? wait_events[0].handle : 0,
               active.handle);
       hsa_status_t status = hsa_amd_memory_async_copy_rect(&dstMem, &offset,
-          &srcMem, &offset, &dim, agent, direction, num_wait_events, wait_event, active);
+          &srcMem, &offset, &dim, agent, direction, wait_events.size(), wait_events.data(), active);
       if (status != HSA_STATUS_SUCCESS) {
         gpu().Barriers().ResetCurrentSignal();
         LogPrintfError("DMA buffer failed with code %d", status);
@@ -480,12 +479,12 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d
           // Copy memory line by line
           ClPrint(amd::LOG_DEBUG, amd::LOG_COPY,
                   "[%zx]!\t HSA Asycn Copy wait_event=0x%zx, completion_signal=0x%zx",
-                  std::this_thread::get_id(), (wait_event != nullptr) ? wait_event->handle : 0,
+                  std::this_thread::get_id(), (wait_events.size() != 0) ? wait_events[0].handle : 0,
                   active.handle);
           hsa_status_t status = hsa_amd_memory_async_copy(
               (reinterpret_cast<address>(dst) + dstOffset), dstAgent,
               (reinterpret_cast<const_address>(src) + srcOffset), srcAgent,
-              size[0], num_wait_events, wait_event, active);
+              size[0], wait_events.size(), wait_events.data(), active);  // data() is safe if empty
           if (status != HSA_STATUS_SUCCESS) {
             gpu().Barriers().ResetCurrentSignal();
             LogPrintfError("DMA buffer failed with code %d", status);
@@ -662,18 +661,17 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory,
     engine = HwQueueEngine::SdmaRead;
   }
 
-  hsa_signal_t* wait_event = gpu().Barriers().WaitingSignal(engine);
-  uint32_t num_wait_events = (wait_event == nullptr) ? 0 : 1;
-  hsa_signal_t      active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp());
+  auto wait_events = gpu().Barriers().WaitingSignal(engine);
+  hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp());
 
   // Use SDMA to transfer the data
   ClPrint(amd::LOG_DEBUG, amd::LOG_COPY,
           "[%zx]!\t HSA Asycn Copy wait_event=0x%zx, completion_signal=0x%zx",
-          std::this_thread::get_id(), (wait_event != nullptr) ? wait_event->handle : 0,
+          std::this_thread::get_id(), (wait_events.size() != 0) ? wait_events[0].handle : 0,
           active.handle);
 
   status = hsa_amd_memory_async_copy(dst, dstAgent, src, srcAgent,
-      size[0], num_wait_events, wait_event, active);
+      size[0], wait_events.size(), wait_events.data(), active);  // data() is safe if empty
   if (status == HSA_STATUS_SUCCESS) {
     gpu().addSystemScope();
   } else {
diff --git a/rocclr/device/rocm/rocvirtual.cpp b/rocclr/device/rocm/rocvirtual.cpp
index 4e355e4759..fcfb7fd8a5 100644
--- a/rocclr/device/rocm/rocvirtual.cpp
+++ b/rocclr/device/rocm/rocvirtual.cpp
@@ -389,9 +389,11 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal(
 }
 
 // ================================================================================================
-hsa_signal_t* VirtualGPU::HwQueueTracker::WaitingSignal(HwQueueEngine engine) {
+std::vector<hsa_signal_t>& VirtualGPU::HwQueueTracker::WaitingSignal(HwQueueEngine engine) {
   bool explicit_wait = false;
-  hsa_signal_t* signal = nullptr;
+  // Reset all current waiting signals
+  waiting_signals_.clear();
+
   // Does runtime switch the active engine?
   if (engine != engine_) {
     // Yes, return the signla from the previous operation for a wait
@@ -412,30 +414,32 @@ hsa_signal_t* VirtualGPU::HwQueueTracker::WaitingSignal(HwQueueEngine engine) {
   }
   // Check if a wait is required
   if (explicit_wait) {
-    ProfilingSignal* prof_signal;
-    // Check if there is an external signal
-    if (external_signal_ != nullptr) {
-      prof_signal = external_signal_;
-      external_signal_ = nullptr;
-    } else {
-      prof_signal = signal_list_[current_id_];
-    }
-    // Early signal status check
-    if (hsa_signal_load_relaxed(prof_signal->signal_) > 0) {
-      const Settings& settings = gpu_.dev().settings();
-      // Actively wait on CPU for 750 us to avoid extra overheads of signal tracking on GPU
-      if (!WaitForSignal<kTimeout750us>(prof_signal->signal_)) {
-        if (settings.cpu_wait_for_signal_) {
-          // Wait on CPU for completion if requested
-          CpuWaitForSignal(prof_signal);
-        } else {
-          // Return HSA signal for tracking on GPU
-          return &prof_signal->signal_;
+    // Add the oldest signal into the tracking for a wait
+    external_signals_.push_back(signal_list_[current_id_]);
+
+    // Validate all signals for the wait and skip already completed
+    for (ProfilingSignal* prof_signal : external_signals_) {
+      // Early signal status check
+      if (hsa_signal_load_relaxed(prof_signal->signal_) > 0) {
+        const Settings& settings = gpu_.dev().settings();
+        // Actively wait on CPU for 750 us to avoid extra overheads of signal tracking on GPU
+        if (!WaitForSignal<kTimeout750us>(prof_signal->signal_)) {
+          // Still busy after the spin: block on CPU or hand the signal over to the caller,
+          // depending on the device settings
+          if (settings.cpu_wait_for_signal_) {
+            // Wait on CPU for completion if requested
+            CpuWaitForSignal(prof_signal);
+          } else {
+            // Add HSA signal for tracking on GPU
+            waiting_signals_.push_back(prof_signal->signal_);
+          }
         }
       }
     }
+    external_signals_.clear();
   }
-  return signal;
+  // Return the array of waiting HSA signals
+  return waiting_signals_;
 }
 
 // ================================================================================================
@@ -790,18 +794,32 @@ bool VirtualGPU::dispatchGenericAqlPacket(
   return true;
 }
 
+// ================================================================================================
+void VirtualGPU::dispatchBlockingWait() {
+  // AQL dispatch doesn't support dependent signals and extra barrier packets must be
+  // generated. The number of dependency slots per barrier packet is taken from the packet
+  // itself, so the pending signals are sent in batches of that size
+  constexpr size_t kMaxDeps = sizeof(barrier_packet_.dep_signal) / sizeof(hsa_signal_t);
+  constexpr bool kSkipSignal = true;
+  auto wait_signals = Barriers().WaitingSignal();
+  for (size_t i = 0; i < wait_signals.size(); ++i) {
+    const size_t slot = i % kMaxDeps;
+    barrier_packet_.dep_signal[slot] = wait_signals[i];
+    // If runtime reached the packet limit or the count limit, then flush the barrier
+    if ((slot == (kMaxDeps - 1)) || ((i + 1) == wait_signals.size())) {
+      dispatchBarrierPacket(&barrier_packet_, kNopPacketHeader, kSkipSignal);
+      // Reset every slot, so a partially filled batch can't inherit stale dependencies
+      for (auto& dep : barrier_packet_.dep_signal) {
+        dep = hsa_signal_t{};
+      }
+    }
+  }
+}
+
 // ================================================================================================
 bool VirtualGPU::dispatchAqlPacket(
   hsa_kernel_dispatch_packet_t* packet, uint16_t header, uint16_t rest, bool blocking) {
-  hsa_signal_t* wait = Barriers().WaitingSignal();
-  // AQL dispatch doesn't support dependent signals and extra barrier packet must be generated
-  if (wait != nullptr) {
-    barrier_packet_.dep_signal[0] = *wait;
-    constexpr bool kSkipSignal = true;
-    dispatchBarrierPacket(&barrier_packet_, kNopPacketHeader, kSkipSignal);
-  } else {
-    barrier_packet_.dep_signal[0] = hsa_signal_t{};
-  }
+  dispatchBlockingWait();
 
   return dispatchGenericAqlPacket(packet, header, rest, blocking);
 }
@@ -851,10 +869,10 @@ void VirtualGPU::dispatchBarrierPacket(hsa_barrier_and_packet_t* packet,
   packet->completion_signal.handle = 0;
 
   if (!skipSignal) {
+    dispatchBlockingWait();
+
     // Pool size must grow to the size of pending AQL packets
     const uint32_t pool_size = index - read;
-    hsa_signal_t* wait = Barriers().WaitingSignal();
-    packet->dep_signal[0] = (wait != nullptr) ? *wait : hsa_signal_t{};
 
     // Get active signal for current dispatch if profiling is necessary
     packet->completion_signal = Barriers().ActiveSignal(kInitSignalValueOne, timestamp_,
@@ -1143,6 +1161,20 @@ void VirtualGPU::profilingBegin(amd::Command& command, bool drmProfiling) {
     timestamp_ = new Timestamp(this, command);
     timestamp_->start();
   }
+
+  if (AMD_DIRECT_DISPATCH) {
+    // Register HW events of the wait list as external dependencies of this queue
+    for (const auto& event : command.eventWaitList()) {
+      void* hw_event = (event->NotifyEvent() != nullptr) ?
+        event->NotifyEvent()->HwEvent() : event->HwEvent();
+      if (hw_event != nullptr) {
+        Barriers().AddExternalSignal(static_cast<ProfilingSignal*>(hw_event));
+      } else if (static_cast<amd::Command*>(event)->queue() != command.queue()) {
+        LogPrintfError("Waiting event(%p) doesn't have a HSA signal!\n", event);
+      }
+      // Otherwise assume serialization on the same queue...
+    }
+  }
 }
 
 // ================================================================================================
@@ -1157,6 +1189,9 @@ void VirtualGPU::profilingEnd(amd::Command& command) {
       timestamp_->end();
     }
     command.setData(timestamp_);
+    if (AMD_DIRECT_DISPATCH && !timestamp_->Signals().empty()) {
+      command.SetHwEvent(timestamp_->Signals().back());  // back() requires a non-empty list
+    }
     timestamp_ = nullptr;
   }
 }
@@ -1467,9 +1502,8 @@ void VirtualGPU::submitSvmPrefetchAsync(amd::SvmPrefetchAsyncCommand& cmd) {
 
   if (dev().info().hmmSupported_) {
     // Initialize signal for the barrier
-    hsa_signal_t* wait_event = Barriers().WaitingSignal(HwQueueEngine::Unknown);
-    hsa_signal_t      active = Barriers().ActiveSignal(kInitSignalValueOne, timestamp_);
-    uint32_t num_wait_events = (wait_event == nullptr) ? 0 : 1;
+    auto wait_events = Barriers().WaitingSignal(HwQueueEngine::Unknown);
+    hsa_signal_t active = Barriers().ActiveSignal(kInitSignalValueOne, timestamp_);
 
     // Find the requested agent for the transfer
     hsa_agent_t agent = (cmd.cpu_access() ||
@@ -1478,7 +1512,8 @@ void VirtualGPU::submitSvmPrefetchAsync(amd::SvmPrefetchAsyncCommand& cmd) {
 
     // Initiate a prefetch command
     hsa_status_t status = hsa_amd_svm_prefetch_async(
-        const_cast<void*>(cmd.dev_ptr()), cmd.count(), agent, num_wait_events, wait_event, active);
+        const_cast<void*>(cmd.dev_ptr()), cmd.count(), agent,
+        wait_events.size(), wait_events.data(), active);  // data() is safe if empty
 
     // Wait for the prefetch. Should skip wait, but may require extra tracking for kernel execution
     if ((status != HSA_STATUS_SUCCESS) || !Barriers().WaitCurrent()) {
@@ -2785,7 +2820,7 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
     queue->profilingBegin(vcmd);
 
     // Add a dependency into the device queue on the current queue
-    queue->Barriers().SetExternalSignal(Barriers().GetLastSignal());
+    queue->Barriers().AddExternalSignal(Barriers().GetLastSignal());
 
     if (vcmd.cooperativeGroups()) {
       // Initialize GWS if it's cooperative groups launch
@@ -2812,7 +2847,7 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
     queue->releaseGpuMemoryFence(kSkipCpuWait);
 
     // Add a dependency into the current queue on the coop queue
-    Barriers().SetExternalSignal(queue->Barriers().GetLastSignal());
+    Barriers().AddExternalSignal(queue->Barriers().GetLastSignal());
     hasPendingDispatch_ = true;
 
     queue->profilingEnd(vcmd);
@@ -2885,18 +2920,12 @@ void VirtualGPU::flush(amd::Command* list, bool wait) {
     amd::Command* current = list;
     assert(current != nullptr && "Empty batch for processing!");
 
-    // HIP tests expect callbacks processed from another thread, hence force AQL barrier always, so
-    // HSA signal callback will process HIP callback asynchronously
-    if (list->Callback() != nullptr) {
-      hasPendingDispatch_ = true;
-    }
+    // Find the last command
     while (current->getNext() != nullptr) {
       current = current->getNext();
-      if (current->Callback() != nullptr) {
-        hasPendingDispatch_ = true;
-      }
     }
-
+    // Always insert a barrier. Some tests require async SDMA wait
+    hasPendingDispatch_ = true;
     // Enable profiling, so runtime can track TS
     profilingBegin(*current);
 
diff --git a/rocclr/device/rocm/rocvirtual.hpp b/rocclr/device/rocm/rocvirtual.hpp
index bfe3dceb39..6545cfbff3 100644
--- a/rocclr/device/rocm/rocvirtual.hpp
+++ b/rocclr/device/rocm/rocvirtual.hpp
@@ -206,18 +206,18 @@ class VirtualGPU : public device::VirtualDevice {
     void SetActiveEngine(HwQueueEngine engine = HwQueueEngine::Compute) { engine_ = engine; }
 
     //! Returns the last submitted signal for a wait
-    hsa_signal_t* WaitingSignal(HwQueueEngine engine = HwQueueEngine::Compute);
+    std::vector<hsa_signal_t>& WaitingSignal(HwQueueEngine engine = HwQueueEngine::Compute);
 
     //! Resets current signal back to the previous one. It's necessary in a case of ROCr failure.
     void ResetCurrentSignal();
 
-    //! Inserts an external signal(submission in another queue) for dependency tracking
-    void SetExternalSignal(ProfilingSignal* signal) {
-      external_signal_ = signal;
+    //! Adds an external signal(submission in another queue) for dependency tracking
+    void AddExternalSignal(ProfilingSignal* signal) {
+      external_signals_.push_back(signal);
       engine_ = HwQueueEngine::External;
     }
 
-    //! Inserts an external signal(submission in another queue) for dependency tracking
+    //! Get the last active signal on the queue
     ProfilingSignal* GetLastSignal() const { return signal_list_[current_id_]; }
 
   private:
@@ -235,10 +235,11 @@ class VirtualGPU : public device::VirtualDevice {
 
     HwQueueEngine engine_ = HwQueueEngine::Unknown; //!< Engine used in the current operations
     std::vector<ProfilingSignal*> signal_list_;     //!< The pool of all signals for processing
-    ProfilingSignal*  external_signal_ = nullptr;   //!< Dependency on external signal
     size_t current_id_ = 0;       //!< Last submitted signal
     bool sdma_profiling_ = false; //!< If TRUE, then SDMA profiling is enabled
     const VirtualGPU& gpu_;       //!< VirtualGPU, associated with this tracker
+    std::vector<ProfilingSignal*> external_signals_;  //!< External signals for a wait in this queue
+    std::vector<hsa_signal_t> waiting_signals_;   //!< Current waiting signals in this queue
   };
 
   VirtualGPU(Device& device, bool profiling = false, bool cooperative = false,
@@ -354,9 +355,12 @@ class VirtualGPU : public device::VirtualDevice {
 
   void profilerAttach(bool enable = false) { profilerAttached_ = enable; }
 
-  bool isProfilerAttached() { return profilerAttached_; }
+  bool isProfilerAttached() const { return profilerAttached_; }
   // } roc OpenCL integration
  private:
+  //! Dispatches a barrier with blocking HSA signals
+  void dispatchBlockingWait();
+
   bool dispatchAqlPacket(hsa_kernel_dispatch_packet_t* packet, uint16_t header,
                          uint16_t rest, bool blocking = true);
   bool dispatchAqlPacket(hsa_barrier_and_packet_t* packet, uint16_t header,
diff --git a/rocclr/platform/command.cpp b/rocclr/platform/command.cpp
index 4e0cad1caf..b8dfb244ae 100644
--- a/rocclr/platform/command.cpp
+++ b/rocclr/platform/command.cpp
@@ -42,16 +42,22 @@
 
 namespace amd {
 
+// ================================================================================================
 Event::Event(HostQueue& queue)
     : callbacks_(NULL),
       status_(CL_INT_MAX),
+      hw_event_(nullptr),
+      notify_event_(nullptr),
       profilingInfo_(IS_PROFILER_ON || queue.properties().test(CL_QUEUE_PROFILING_ENABLE) ||
                      Agent::shouldPostEventEvents()) {
   notified_.clear();
 }
 
-Event::Event() : callbacks_(NULL), status_(CL_SUBMITTED) { notified_.clear(); }
+// ================================================================================================
+Event::Event() : callbacks_(NULL), status_(CL_SUBMITTED),
+    hw_event_(nullptr), notify_event_(nullptr) { notified_.clear(); }
 
+// ================================================================================================
 Event::~Event() {
   CallBackEntry* callback = callbacks_;
   while (callback != NULL) {
@@ -61,6 +67,7 @@ Event::~Event() {
   }
 }
 
+// ================================================================================================
 uint64_t Event::recordProfilingInfo(int32_t status, uint64_t timeStamp) {
   if (timeStamp == 0) {
     timeStamp = Os::timeNanos();
@@ -88,7 +95,7 @@ uint64_t Event::recordProfilingInfo(int32_t status, uint64_t timeStamp) {
 
 // Global epoch time since the first processed command
 uint64_t epoch = 0;
-
+// ================================================================================================
 bool Event::setStatus(int32_t status, uint64_t timeStamp) {
   assert(status <= CL_QUEUED && "invalid status");
 
@@ -157,6 +164,7 @@ bool Event::setStatus(int32_t status, uint64_t timeStamp) {
   return true;
 }
 
+// ================================================================================================
 bool Event::resetStatus(int32_t status) {
   int32_t currentStatus = this->status();
   if (currentStatus != CL_COMPLETE) {
@@ -171,6 +179,7 @@ bool Event::resetStatus(int32_t status) {
   return true;
 }
 
+// ================================================================================================
 bool Event::setCallback(int32_t status, Event::CallBackFunction callback, void* data) {
   assert(status >= CL_COMPLETE && status <= CL_QUEUED && "invalid status");
 
@@ -193,7 +202,7 @@ bool Event::setCallback(int32_t status, Event::CallBackFunction callback, void*
   return true;
 }
 
-
+// ================================================================================================
 void Event::processCallbacks(int32_t status) const {
   cl_event event = const_cast<cl_event>(as_cl(this));
   const int32_t mask = (status > CL_COMPLETE) ? status : CL_COMPLETE;
@@ -212,6 +221,7 @@ void Event::processCallbacks(int32_t status) const {
   }
 }
 
+// ================================================================================================
 bool Event::awaitCompletion() {
   if (status() > CL_COMPLETE) {
     // Notifies current command queue about waiting
@@ -219,7 +229,8 @@ bool Event::awaitCompletion() {
       return false;
     }
 
-    ClPrint(LOG_DEBUG, LOG_WAIT, "waiting for event %p to complete, current status %d", this, status());
+    ClPrint(LOG_DEBUG, LOG_WAIT, "waiting for event %p to complete, current status %d",
+      this, status());
     auto* queue = command().queue();
     if ((queue != nullptr) && queue->vdev()->ActiveWait()) {
       while (status() > CL_COMPLETE) {
@@ -262,6 +273,8 @@ bool Event::notifyCmdQueue() {
     ClPrint(LOG_DEBUG, LOG_CMD, "queue marker to command queue: %p", queue);
     command->enqueue();
     command->release();
+    // Save notification, associated with the current event
+    notify_event_ = command;
   }
   return true;
 }
@@ -306,10 +319,10 @@ void Command::enqueue() {
   // update will occur later after flush() with a wait
   if (AMD_DIRECT_DISPATCH) {
     setStatus(CL_QUEUED);
-    // The wait should be performed before the lock,
-    // otherwise signal handler may have a deadlock, but awaitCompletion() is thread safe itself
+    // Notify all commands about the waiter. Barrier will be sent in order to obtain
+    // HSA signal for a wait on the current queue
     std::for_each(eventWaitList().begin(), eventWaitList().end(),
-        std::mem_fun(&Command::awaitCompletion));
+        std::mem_fn(&Command::notifyCmdQueue));
 
     // The batch update must be lock protected to avoid a race condition
     // when multiple threads submit/flush/update the batch at the same time
diff --git a/rocclr/platform/command.hpp b/rocclr/platform/command.hpp
index 68c249b1a6..21bce75a09 100644
--- a/rocclr/platform/command.hpp
+++ b/rocclr/platform/command.hpp
@@ -93,6 +93,8 @@ class Event : public RuntimeObject {
   std::atomic<CallBackEntry*> callbacks_;  //!< linked list of callback entries.
   std::atomic<int32_t> status_;            //!< current execution status.
   std::atomic_flag notified_;              //!< Command queue was notified
+  void*  hw_event_;                        //!< HW event ID associated with SW event
+  Event* notify_event_;                    //!< Notify event, which should contain HW signal
 
  protected:
   static const EventWaitList nullWaitList;
@@ -210,6 +212,15 @@ class Event : public RuntimeObject {
 
   //! Returns the callback for this event
   const CallBackEntry* Callback() const { return callbacks_; }
+
+  //! Saves HW event, associated with the current command
+  void SetHwEvent(void* hw_event) { hw_event_ = hw_event; }
+
+  //! Returns HW event, associated with the current command
+  void* HwEvent() const { return hw_event_; }
+
+  //! Returns notify event, associated with the current command
+  Event* NotifyEvent() const { return notify_event_; }
 };
 
 /*! \brief An operation that is submitted to a command queue.