diff --git a/projects/clr/rocclr/device/rocm/rocdevice.cpp b/projects/clr/rocclr/device/rocm/rocdevice.cpp
index be27ae96da..69ab3590df 100644
--- a/projects/clr/rocclr/device/rocm/rocdevice.cpp
+++ b/projects/clr/rocclr/device/rocm/rocdevice.cpp
@@ -3012,36 +3012,22 @@ amd::Memory* Device::GetArenaMemObj(const void* ptr, size_t& offset) {
 }
 
 // ================================================================================================
-ProfilingSignal* Device::GetGlobalSignal(Timestamp* ts) const {
-  std::unique_ptr<ProfilingSignal> prof_signal(new ProfilingSignal());
-  if (prof_signal != nullptr) {
-    hsa_agent_t agent = getBackendDevice();
-    hsa_agent_t* agents = (settings().system_scope_signal_) ? nullptr : &agent;
-    uint32_t num_agents = (settings().system_scope_signal_) ? 0 : 1;
-
-    if (ts != 0) {
-      // Save HSA signal earlier to make sure the possible callback will have a valid
-      // value for processing
-      prof_signal->ts_ = ts;
-      ts->AddProfilingSignal(prof_signal.get());
-    }
-
-    if (HSA_STATUS_SUCCESS == hsa_signal_create(kInitSignalValueOne,
-                                                num_agents, agents, &prof_signal->signal_)) {
-      return prof_signal.release();
-    }
+void Device::ReleaseGlobalSignal(void* signal) const {
+  if (signal != nullptr) {
+    reinterpret_cast<ProfilingSignal*>(signal)->release();
   }
-  return nullptr;
 }
 
 // ================================================================================================
-void Device::ReleaseGlobalSignal(void* signal) const {
-  if (signal != nullptr) {
-    ProfilingSignal* prof_signal = reinterpret_cast<ProfilingSignal*>(signal);
-    if (prof_signal->signal_.handle != 0) {
-      hsa_signal_destroy(prof_signal->signal_);
+ProfilingSignal::~ProfilingSignal() {
+  if (signal_.handle != 0) {
+    if (hsa_signal_load_relaxed(signal_) > 0) {
+      LogError("Runtime shouldn't destroy a signal that is still busy!");
+      // Block until the signal drops below its initial value before it gets destroyed
+      hsa_signal_wait_scacquire(signal_, HSA_SIGNAL_CONDITION_LT, kInitSignalValueOne,
+                                kUnlimitedWait, HSA_WAIT_STATE_BLOCKED);
     }
-    delete prof_signal;
+    hsa_signal_destroy(signal_);
   }
 }
 
diff --git a/projects/clr/rocclr/device/rocm/rocdevice.hpp b/projects/clr/rocclr/device/rocm/rocdevice.hpp
index 572d817c6e..e8f4791042 100644
--- a/projects/clr/rocclr/device/rocm/rocdevice.hpp
+++ b/projects/clr/rocclr/device/rocm/rocdevice.hpp
@@ -77,7 +77,8 @@ class VirtualDevice;
 class PrintfDbg;
 class IProDevice;
 
-struct ProfilingSignal : public amd::HeapObject {
+class ProfilingSignal : public amd::ReferenceCountedObject {
+public:
   hsa_signal_t  signal_;  //!< HSA signal to track profiling information
   Timestamp*    ts_;      //!< Timestamp object associated with the signal
   HwQueueEngine engine_;  //!< Engine used with this signal
@@ -89,6 +90,8 @@ struct ProfilingSignal : public amd::HeapObject {
     , done_(true)
     , lock_("Signal Ops Lock", true)
     { signal_.handle = 0; }
+
+  virtual ~ProfilingSignal();
   amd::Monitor& LockSignalOps() { return lock_; }
 };
 
@@ -531,8 +534,6 @@ class Device : public NullDevice {
 
   virtual amd::Memory* GetArenaMemObj(const void* ptr, size_t& offset);
 
-  ProfilingSignal* GetGlobalSignal(Timestamp* ts) const;
-
  private:
   bool create();
 
diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp
index 07147600fb..96172b6aff 100644
--- a/projects/clr/rocclr/device/rocm/rocvirtual.cpp
+++ b/projects/clr/rocclr/device/rocm/rocvirtual.cpp
@@ -319,10 +319,7 @@ void VirtualGPU::MemoryDependency::clear(bool all) {
 // ================================================================================================
 VirtualGPU::HwQueueTracker::~HwQueueTracker() {
   for (auto& signal: signal_list_) {
-    if (signal->signal_.handle != 0) {
-      hsa_signal_destroy(signal->signal_);
-    }
-    delete signal;
+    signal->release();
   }
 }
 
@@ -374,6 +371,26 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal(
   // a GPU waiter(which may be not triggered yet) and CPU signal reset below
   WaitNext();
 
+  if (signal_list_[current_id_]->referenceCount() > 1) {
+    // The signal was assigned to the global marker's event, hence runtime can't reuse it
+    // and needs a new signal
+    std::unique_ptr<ProfilingSignal> signal(new ProfilingSignal());
+    hsa_agent_t agent = gpu_.gpu_device();
+    const Settings& settings = gpu_.dev().settings();
+    hsa_agent_t* agents = (settings.system_scope_signal_) ? nullptr : &agent;
+    uint32_t num_agents = (settings.system_scope_signal_) ? 0 : 1;
+    if ((signal != nullptr) &&
+        (HSA_STATUS_SUCCESS == hsa_signal_create(0, num_agents, agents, &signal->signal_))) {
+      // Drop the tracker's reference to the shared signal and track the new one instead
+      signal_list_[current_id_]->release();
+      signal_list_[current_id_] = signal.release();
+    } else {
+      // Report the failure in release builds as well: assert() compiles out there and
+      // the code below would reuse a signal that has more than one reference
+      LogError("ProfilingSignal reallocation failed! Marker has a conflict with signal reuse!");
+      assert(!"ProfilingSignal reallocation failed! Marker has a conflict with signal reuse!");
+    }
+  }
   ProfilingSignal* prof_signal = signal_list_[current_id_];
   // Reset the signal and return
   hsa_signal_silent_store_relaxed(prof_signal->signal_, init_val);
@@ -387,7 +404,23 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal(
     // If direct dispatch is enabled and the batch head isn't null, then it's a marker and
     // requires the batch update upon HSA signal completion
     if (AMD_DIRECT_DISPATCH && (ts->command().GetBatchHead() != nullptr)) {
-      assert(false && "Runtime should not have batch command in ActiveSignal!");
+      uint32_t init_value = kInitSignalValueOne;
+      // If API callback is enabled, then use a blocking signal for AQL queue.
+      // HSA signal will be acquired in SW and released after HSA signal callback
+      if (ts->command().Callback() != nullptr) {
+        ts->SetCallbackSignal(prof_signal->signal_);
+        // Blocks AQL queue from further processing
+        hsa_signal_add_relaxed(prof_signal->signal_, 1);
+        init_value += 1;
+      }
+      hsa_status_t result = hsa_amd_signal_async_handler(prof_signal->signal_,
+          HSA_SIGNAL_CONDITION_LT, init_value, &HsaAmdSignalHandler, ts);
+      if (HSA_STATUS_SUCCESS != result) {
+        LogError("hsa_amd_signal_async_handler() failed to set the handler!");
+      } else {
+        ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Set Handler: handle(0x%lx), timestamp(%p)",
+          prof_signal->signal_.handle, ts);
+      }
     }
     if (!sdma_profiling_) {
       hsa_amd_profiling_async_copy_enable(true);
@@ -872,8 +905,7 @@ bool VirtualGPU::dispatchCounterAqlPacket(hsa_ext_amd_aql_pm4_packet_t* packet,
 }
 
 // ================================================================================================
-void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader,
-  bool skipSignal, const ProfilingSignal* global_signal) {
+void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal) {
   const uint32_t queueSize = gpu_queue_->size;
   const uint32_t queueMask = queueSize - 1;
 
@@ -896,16 +928,12 @@ void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader,
   barrier_packet_.completion_signal.handle = 0;
 
   if (!skipSignal) {
-    if (global_signal != nullptr) {
-      barrier_packet_.completion_signal = global_signal->signal_;
-    } else {
-      // Pool size must grow to the size of pending AQL packets
-      const uint32_t pool_size = index - read;
+    // Pool size must grow to the size of pending AQL packets
+    const uint32_t pool_size = index - read;
 
-      // Get active signal for current dispatch if profiling is necessary
-      barrier_packet_.completion_signal =
-        Barriers().ActiveSignal(kInitSignalValueOne, timestamp_, pool_size);
-    }
+    // Get active signal for current dispatch if profiling is necessary
+    barrier_packet_.completion_signal =
+      Barriers().ActiveSignal(kInitSignalValueOne, timestamp_, pool_size);
   }
 
   while ((index - hsa_queue_load_read_index_scacquire(gpu_queue_)) >= queueMask);
@@ -1226,6 +1254,12 @@ void VirtualGPU::profilingEnd(amd::Command& command) {
     }
     command.setData(timestamp_);
 
+    // Update HW event only for batches
+    if ((AMD_DIRECT_DISPATCH) && (command.GetBatchHead() != nullptr)) {
+      timestamp_->Signals().back()->retain();
+      command.SetHwEvent(timestamp_->Signals().back());
+    }
+
     timestamp_ = nullptr;
   }
 }
@@ -2889,7 +2923,7 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
 
     queue->profilingEnd(vcmd);
   } else {
-  // Make sure VirtualGPU has an exclusive access to the resources
+    // Make sure VirtualGPU has an exclusive access to the resources
     amd::ScopedLock lock(execution());
 
     profilingBegin(vcmd);
@@ -2913,47 +2947,23 @@ void VirtualGPU::submitNativeFn(amd::NativeFnCommand& cmd) {
 // ================================================================================================
 void VirtualGPU::submitMarker(amd::Marker& vcmd) {
   if (AMD_DIRECT_DISPATCH || vcmd.profilingInfo().marker_ts_) {
-    profilingBegin(vcmd);
-    if (timestamp_ != nullptr) {
-      ProfilingSignal* prof_signal = nullptr;
-      // If direct dispatch is enabled and the batch head isn't null, then it's a marker and
-      // requires the batch update upon HSA signal completion
-      if (AMD_DIRECT_DISPATCH) {
-        assert(vcmd.GetBatchHead() != nullptr && "Marker doesn't have batch!");
+    // Make sure VirtualGPU has an exclusive access to the resources
+    amd::ScopedLock lock(execution());
+    if (vcmd.CpuWaitRequested() && hasPendingDispatch_ == false) {
+      // It should be safe to call flush directly if there are no pending dispatches without
+      // an HSA signal callback
+      flush(vcmd.GetBatchHead());
+    } else {
+      profilingBegin(vcmd);
+      if (timestamp_ != nullptr) {
+        // Submit a barrier with cache flushes.
+        dispatchBarrierPacket(kBarrierPacketHeader, false);
 
-        prof_signal = dev().GetGlobalSignal(timestamp_);
-        prof_signal->done_ = false;
-
-        assert(prof_signal != nullptr && "Failed to allocate the global HSA signal!");
-        uint32_t init_value = kInitSignalValueOne;
-        // If API callback is enabled, then use a blocking signal for AQL queue.
-        // HSA signal will be acquired in SW and released after HSA signal callback
-        if (vcmd.Callback() != nullptr) {
-          timestamp_->SetCallbackSignal(prof_signal->signal_);
-          // Blocks AQL queue from further processing
-          hsa_signal_add_relaxed(prof_signal->signal_, 1);
-          init_value += 1;
-        }
-
-        hsa_status_t result = hsa_amd_signal_async_handler(prof_signal->signal_,
-            HSA_SIGNAL_CONDITION_LT, init_value, &HsaAmdSignalHandler, timestamp_);
-        if (HSA_STATUS_SUCCESS != result) {
-          LogError("hsa_amd_signal_async_handler() failed to set the handler!");
-        } else {
-          ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Set Handler: handle(0x%lx), timestamp(%p)",
-              prof_signal->signal_.handle, prof_signal);
-        }
-        // Update HW event only for batches
-        vcmd.SetHwEvent(timestamp_->Signals().back());
+        hasPendingDispatch_ = false;
       }
-      // Submit a barrier with a cache flushes.
-      dispatchBarrierPacket(kBarrierPacketHeader, false, prof_signal);
-
-      // Don't reset the flag for direct dispatch, because the global signals are out of scope
-      // for internal barrier tracking and SDMA could lose a wait for compute
-      hasPendingDispatch_ = AMD_DIRECT_DISPATCH;
+      profilingEnd(vcmd);
     }
-    profilingEnd(vcmd);
+
   }
 }
 
diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.hpp b/projects/clr/rocclr/device/rocm/rocvirtual.hpp
index 85f938370b..f980358f20 100644
--- a/projects/clr/rocclr/device/rocm/rocvirtual.hpp
+++ b/projects/clr/rocclr/device/rocm/rocvirtual.hpp
@@ -223,6 +223,7 @@ class VirtualGPU : public device::VirtualDevice {
 
     //! Update current active engine
     void SetActiveEngine(HwQueueEngine engine = HwQueueEngine::Compute) { engine_ = engine; }
+    HwQueueEngine GetActiveEngine() const { return engine_; }
 
     //! Returns the last submitted signal for a wait
     std::vector<hsa_signal_t>& WaitingSignal(HwQueueEngine engine = HwQueueEngine::Compute);
@@ -385,8 +386,7 @@ class VirtualGPU : public device::VirtualDevice {
   template <typename AqlPacket> bool dispatchGenericAqlPacket(AqlPacket* packet, uint16_t header,
                                                               uint16_t rest, bool blocking,
                                                               size_t size = 1);
-  void dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal = false,
-                             const ProfilingSignal* global_signal = nullptr);
+  void dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal = false);
   bool dispatchCounterAqlPacket(hsa_ext_amd_aql_pm4_packet_t* packet, const uint32_t gfxVersion,
                                 bool blocking, const hsa_ven_amd_aqlprofile_1_00_pfn_t* extApi);
   void dispatchBarrierValuePacket(const hsa_amd_barrier_value_packet_t* packet,
diff --git a/projects/clr/rocclr/platform/command.cpp b/projects/clr/rocclr/platform/command.cpp
index 4c53d55c52..f4567f4179 100644
--- a/projects/clr/rocclr/platform/command.cpp
+++ b/projects/clr/rocclr/platform/command.cpp
@@ -232,11 +232,12 @@ void Event::processCallbacks(int32_t status) const {
   }
 }
 
+static constexpr bool kCpuWait = true;
 // ================================================================================================
 bool Event::awaitCompletion() {
   if (status() > CL_COMPLETE) {
-    // Notifies current command queue about waiting
-    if (!notifyCmdQueue()) {
+    // Notifies the current command queue about waiting
+    if (!notifyCmdQueue(kCpuWait)) {
       return false;
     }
 
@@ -262,7 +263,7 @@ bool Event::awaitCompletion() {
 }
 
 // ================================================================================================
-bool Event::notifyCmdQueue() {
+bool Event::notifyCmdQueue(bool cpu_wait) {
   HostQueue* queue = command().queue();
   if (AMD_DIRECT_DISPATCH) {
     ScopedLock l(notify_lock_);
@@ -271,7 +272,7 @@ bool Event::notifyCmdQueue() {
         (HwEvent() == nullptr) &&
         !notified_.test_and_set()) {
       // Make sure the queue is draining the enqueued commands.
-      amd::Command* command = new amd::Marker(*queue, false, nullWaitList, this);
+      amd::Command* command = new amd::Marker(*queue, false, nullWaitList, this, cpu_wait);
       if (command == NULL) {
         notified_.clear();
         return false;
@@ -341,7 +342,7 @@ void Command::enqueue() {
     // Notify all commands about the waiter. Barrier will be sent in order to obtain
     // HSA signal for a wait on the current queue
     std::for_each(eventWaitList().begin(), eventWaitList().end(),
-        std::mem_fun(&Command::notifyCmdQueue));
+        [](Event* event) { event->notifyCmdQueue(!kCpuWait); });
 
     // The batch update must be lock protected to avoid a race condition
     // when multiple threads submit/flush/update the batch at the same time
diff --git a/projects/clr/rocclr/platform/command.hpp b/projects/clr/rocclr/platform/command.hpp
index 36e71360a7..7282a28c1a 100644
--- a/projects/clr/rocclr/platform/command.hpp
+++ b/projects/clr/rocclr/platform/command.hpp
@@ -207,7 +207,7 @@ class Event : public RuntimeObject {
 
   /*! \brief Notifies current command queue about execution status
    */
-  bool notifyCmdQueue();
+  bool notifyCmdQueue(bool cpu_wait = false);
 
   //! RTTI internal implementation
   virtual ObjectType objectType() const { return ObjectTypeEvent; }
@@ -998,15 +998,22 @@ class ExternalSemaphoreCmd : public Command {
 
 
 class Marker : public Command {
+ private:
+  bool cpu_wait_;   //!< If true, then the marker was issued for CPU/GPU sync
+
  public:
   //! Create a new Marker
   Marker(HostQueue& queue, bool userVisible, const EventWaitList& eventWaitList = nullWaitList,
-         const Event* waitingEvent = nullptr)
-      : Command(queue, userVisible ? CL_COMMAND_MARKER : 0, eventWaitList, 0, waitingEvent) {}
+         const Event* waitingEvent = nullptr, bool cpu_wait = false)
+      : Command(queue, userVisible ? CL_COMMAND_MARKER : 0, eventWaitList, 0, waitingEvent)
+      , cpu_wait_(cpu_wait) {}
 
   //! The actual command implementation.
   virtual void submit(device::VirtualDevice& device) { device.submitMarker(*this); }
 
+  //! Check if this marker requires CPU wait
+  bool CpuWaitRequested() const { return cpu_wait_; }
+
 };
 
 /*! \brief  Maps CL objects created from external ones and syncs the contents (blocking).