diff --git a/projects/clr/hipamd/src/hip_event.cpp b/projects/clr/hipamd/src/hip_event.cpp
index 815aee9fd5..f29df7838a 100644
--- a/projects/clr/hipamd/src/hip_event.cpp
+++ b/projects/clr/hipamd/src/hip_event.cpp
@@ -64,6 +64,7 @@ hipError_t Event::query() {
   return ready() ? hipSuccess : hipErrorNotReady;
 }
 
+// ================================================================================================
 hipError_t Event::synchronize() {
   amd::ScopedLock lock(lock_);
 
@@ -76,19 +77,12 @@ hipError_t Event::synchronize() {
   // Check HW status of the ROCcrl event. Note: not all ROCclr modes support HW status
   static constexpr bool kWaitCompletion = true;
   if (!hip_device->devices()[0]->IsHwEventReady(*event_, kWaitCompletion, flags_)) {
-    if (event_->HwEvent() != nullptr) {
-      amd::Command* command = nullptr;
-      hipError_t status = recordCommand(command, event_->command().queue(), flags_);
-      command->enqueue();
-      hip_device->devices()[0]->IsHwEventReady(command->event(), kWaitCompletion, flags_);
-      command->release();
-    } else {
-      event_->awaitCompletion();
-    }
+    event_->awaitCompletion();
   }
   return hipSuccess;
 }
 
+// ================================================================================================
 bool Event::awaitEventCompletion() {
   return event_->awaitCompletion();
 }
@@ -222,8 +216,9 @@ hipError_t Event::streamWait(hipStream_t stream, uint flags) {
   return hipSuccess;
 }
 
+// ================================================================================================
 hipError_t Event::recordCommand(amd::Command*& command, amd::HostQueue* stream,
-                                uint32_t ext_flags ) {
+                                uint32_t ext_flags, bool batch_flush) {
   if (command == nullptr) {
     int32_t releaseFlags = ((ext_flags == 0) ? flags_ : ext_flags) &
                             (hipEventReleaseToDevice | hipEventReleaseToSystem |
@@ -234,11 +229,12 @@ hipError_t Event::recordCommand(amd::Command*& command, amd::HostQueue* stream,
       releaseFlags = amd::Device::kCacheStateInvalid;
     }
     // Always submit a EventMarker.
-    command = new hip::EventMarker(*stream, !kMarkerDisableFlush, true, releaseFlags);
+    command = new hip::EventMarker(*stream, !kMarkerDisableFlush, true, releaseFlags, batch_flush);
   }
   return hipSuccess;
 }
 
+// ================================================================================================
 hipError_t Event::enqueueRecordCommand(hipStream_t stream, amd::Command* command, bool record) {
   command->enqueue();
   if (event_ == &command->event()) return hipSuccess;
@@ -251,11 +247,13 @@ hipError_t Event::enqueueRecordCommand(hipStream_t stream, amd::Command* command
   return hipSuccess;
 }
 
-hipError_t Event::addMarker(hipStream_t stream, amd::Command* command, bool record) {
+// ================================================================================================
+hipError_t Event::addMarker(hipStream_t stream, amd::Command* command,
+                            bool record, bool batch_flush) {
   hip::Stream* hip_stream = hip::getStream(stream);
   // Keep the lock always at the beginning of this to avoid a race. SWDEV-277847
   amd::ScopedLock lock(lock_);
-  hipError_t status = recordCommand(command, hip_stream);
+  hipError_t status = recordCommand(command, hip_stream, 0, batch_flush);
   if (status != hipSuccess) {
     return hipSuccess;
   }
@@ -415,7 +413,7 @@ hipError_t hipEventRecord_common(hipEvent_t event, hipStream_t stream) {
     if (g_devices[e->deviceId()]->devices()[0] != &hip_stream->device()) {
       return hipErrorInvalidHandle;
     }
-    status = e->addMarker(stream, nullptr, true);
+    status = e->addMarker(stream, nullptr, true, !hip::Event::kBatchFlush);
   }
   return status;
 }
diff --git a/projects/clr/hipamd/src/hip_event.hpp b/projects/clr/hipamd/src/hip_event.hpp
index 9ebb3cb3c6..2b75ab869e 100644
--- a/projects/clr/hipamd/src/hip_event.hpp
+++ b/projects/clr/hipamd/src/hip_event.hpp
@@ -81,10 +81,11 @@ typedef struct ihipIpcEventShmem_s {
 class EventMarker : public amd::Marker {
  public:
   EventMarker(amd::HostQueue& stream, bool disableFlush, bool markerTs = false,
-              int32_t scope = amd::Device::kCacheStateInvalid)
+              int32_t scope = amd::Device::kCacheStateInvalid, bool batch_flush = true)
       : amd::Marker(stream, disableFlush) {
     profilingInfo_.enabled_ = true;
     profilingInfo_.marker_ts_ = markerTs;
+    profilingInfo_.batch_flush_ = batch_flush;
     profilingInfo_.clear();
     setEventScope(scope);
   }
@@ -101,6 +102,8 @@ class Event {
   }
 
  public:
+  constexpr static bool kBatchFlush = true;  //!< Flushes CPU command batch in direct dispatch mode
+
   Event(uint32_t flags) : flags_(flags), lock_(true) /* hipEvent_t lock*/,
                               event_(nullptr), unrecorded_(false), stream_(nullptr) {
     // No need to init event_ here as addMarker does that
@@ -123,9 +126,10 @@ class Event {
   virtual hipError_t streamWait(hipStream_t stream, uint flags);
 
   virtual hipError_t recordCommand(amd::Command*& command, amd::HostQueue* stream,
-                                   uint32_t flags = 0);
+                                   uint32_t flags = 0, bool batch_flush = true);
   virtual hipError_t enqueueRecordCommand(hipStream_t stream, amd::Command* command, bool record);
-  hipError_t addMarker(hipStream_t stream, amd::Command* command, bool record);
+  hipError_t addMarker(hipStream_t stream, amd::Command* command,
+                       bool record, bool batch_flush = true);
 
   void BindCommand(amd::Command& command, bool record) {
     amd::ScopedLock lock(lock_);
@@ -217,7 +221,8 @@ class IPCEvent : public Event {
   hipError_t enqueueStreamWaitCommand(hipStream_t stream, amd::Command* command);
   hipError_t streamWait(hipStream_t stream, uint flags);
 
-  hipError_t recordCommand(amd::Command*& command, amd::HostQueue* queue, uint32_t flags = 0);
+  hipError_t recordCommand(amd::Command*& command, amd::HostQueue* queue,
+                           uint32_t flags = 0, bool batch_flush = true) override;
   hipError_t enqueueRecordCommand(hipStream_t stream, amd::Command* command, bool record);
 };
 
diff --git a/projects/clr/hipamd/src/hip_event_ipc.cpp b/projects/clr/hipamd/src/hip_event_ipc.cpp
index 9c68e9f669..c7204619cd 100644
--- a/projects/clr/hipamd/src/hip_event_ipc.cpp
+++ b/projects/clr/hipamd/src/hip_event_ipc.cpp
@@ -139,16 +139,23 @@ hipError_t IPCEvent::streamWait(hipStream_t stream, uint flags) {
   return hipSuccess;
 }
 
-hipError_t IPCEvent::recordCommand(amd::Command*& command, amd::HostQueue* stream, uint32_t flags) {
+// ================================================================================================
+// Creates the record command for this IPC event. An unrecorded event gets a plain amd::Marker;
+// otherwise creation is delegated to Event::recordCommand() with both `flags` and `batch_flush`
+// forwarded. Passing `batch_flush` as the 3rd argument would bind it to the uint32_t flags
+// parameter and leave the base class batch_flush at its default.
+hipError_t IPCEvent::recordCommand(amd::Command*& command, amd::HostQueue* stream,
+                                   uint32_t flags, bool batch_flush) {
   bool unrecorded = isUnRecorded();
   if (unrecorded) {
     command = new amd::Marker(*stream, kMarkerDisableFlush);
   } else {
-    return Event::recordCommand(command, stream);
+    return Event::recordCommand(command, stream, flags, batch_flush);
   }
   return hipSuccess;
 }
 
+// ================================================================================================
 hipError_t IPCEvent::enqueueRecordCommand(hipStream_t stream, amd::Command* command, bool record) {
   bool unrecorded = isUnRecorded();
   if (unrecorded) {
diff --git a/projects/clr/hipamd/src/hip_stream.cpp b/projects/clr/hipamd/src/hip_stream.cpp
index 91f0472112..40ffc6ce2c 100644
--- a/projects/clr/hipamd/src/hip_stream.cpp
+++ b/projects/clr/hipamd/src/hip_stream.cpp
@@ -23,6 +23,7 @@
 #include "hip_event.hpp"
 #include "thread/monitor.hpp"
 #include "hip_prof_api.h"
+#include <atomic>
 
 namespace hip {
 
@@ -358,11 +359,15 @@ hipError_t hipStreamSynchronize_common(hipStream_t stream) {
     }
   }
   bool wait = (stream == nullptr || stream == hipStreamLegacy) ? true : false;
-  constexpr bool kDontWaitForCpu = false;
-
   auto hip_stream = hip::getStream(stream, wait);
+  // Block on CPU every DEBUG_HIP_BLOCK_SYNC-th call (0 disables) to limit unreleased CPU commands
+  bool wait_for_cpu = false;
+  if (DEBUG_HIP_BLOCK_SYNC != 0) {
+    static std::atomic<uint64_t> sync_calls{0};
+    wait_for_cpu = (++sync_calls % DEBUG_HIP_BLOCK_SYNC) == 0;
+  }
   // Wait for the current host queue
-  hip_stream->finish(kDontWaitForCpu);
+  hip_stream->finish(wait_for_cpu);
   if (stream == nullptr) {
     // null stream will sync with other streams.
     ReleaseGraphExec(hip_stream->DeviceId());
diff --git a/projects/clr/rocclr/device/device.hpp b/projects/clr/rocclr/device/device.hpp
index ef82630325..160eb84caa 100644
--- a/projects/clr/rocclr/device/device.hpp
+++ b/projects/clr/rocclr/device/device.hpp
@@ -1323,9 +1323,6 @@ class VirtualDevice : public amd::HeapObject {
   //! Returns true if device has active wait setting
   bool ActiveWait() const;
 
-  //! Returns the status of queue handler callback
-  virtual bool isHandlerPending() const = 0;
-
   //! Returns fence state of the VirtualGPU
   virtual bool isFenceDirty() const = 0;
   //! Init hidden heap for device memory allocations
diff --git a/projects/clr/rocclr/device/pal/palvirtual.hpp b/projects/clr/rocclr/device/pal/palvirtual.hpp
index 1d1ac66cbe..05258e961c 100644
--- a/projects/clr/rocclr/device/pal/palvirtual.hpp
+++ b/projects/clr/rocclr/device/pal/palvirtual.hpp
@@ -353,8 +353,6 @@ class VirtualGPU : public device::VirtualDevice {
 
   void profilerAttach(bool enable = false) {}
 
-  bool isHandlerPending() const { return false; }
-
   bool isFenceDirty() const { return false; }
 
   void HiddenHeapInit() {}
diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp
index d18e3c4c5c..23a2fd22d9 100644
--- a/projects/clr/rocclr/device/rocm/rocvirtual.cpp
+++ b/projects/clr/rocclr/device/rocm/rocvirtual.cpp
@@ -127,6 +127,9 @@ static unsigned extractAqlBits(unsigned v, unsigned pos, unsigned width) {
 
 // ================================================================================================
 void Timestamp::checkGpuTime() {
+  if (amd::IS_HIP && !amd::activity_prof::IsEnabled(OP_ID_DISPATCH)) {
+    return;
+  }
   amd::ScopedLock s(lock_);
   if (HwProfiling()) {
     uint64_t  start = std::numeric_limits<uint64_t>::max();
@@ -479,12 +482,10 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal(
           ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Set Handler: handle(0x%lx), timestamp(%p)",
             prof_signal->signal_.handle, prof_signal);
         }
-        SetHandlerPending(false);
         // Update the current command/marker with HW event
         prof_signal->retain();
         ts->command().SetHwEvent(prof_signal);
       } else if (ts->command().profilingInfo().marker_ts_) {
-        SetHandlerPending(true);
         // Update the current command/marker with HW event
         prof_signal->retain();
         ts->command().SetHwEvent(prof_signal);
@@ -1652,7 +1653,7 @@ void VirtualGPU::updateCommandsState(amd::Command* list) const {
   // also true for any command B, which falls between A and C.
   current = list;
   while (current != nullptr) {
-    if (current->profilingInfo().enabled_) {
+    if (current->profilingInfo().enabled_) {
       if (!current->data().empty()) {
         for (auto i = 0; i < current->data().size(); i++) {
           // Since this is a valid command to get a timestamp, we use the
diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.hpp b/projects/clr/rocclr/device/rocm/rocvirtual.hpp
index 3aeebe02b4..d015b321d2 100644
--- a/projects/clr/rocclr/device/rocm/rocvirtual.hpp
+++ b/projects/clr/rocclr/device/rocm/rocvirtual.hpp
@@ -248,7 +248,7 @@ class VirtualGPU : public device::VirtualDevice {
 
   class HwQueueTracker : public amd::EmbeddedObject {
    public:
-    HwQueueTracker(const VirtualGPU& gpu): gpu_(gpu), handlerPending_(false) {}
+    HwQueueTracker(const VirtualGPU& gpu): gpu_(gpu) {}
 
     ~HwQueueTracker();
 
@@ -289,12 +289,6 @@ class VirtualGPU : public device::VirtualDevice {
     //! Empty check for external signals
     bool IsExternalSignalListEmpty() const { return external_signals_.empty(); }
 
-    //! Set the status to indicate a pending handler
-    void SetHandlerPending(bool pending) { handlerPending_ = pending; }
-
-    //! Check if callback has been queued
-    bool IsHandlerPending() const { return handlerPending_; }
-
     //! Get/Set SDMA profiling
     bool GetSDMAProfiling() { return sdma_profiling_; }
     void SetSDMAProfiling(bool profile) {
@@ -319,7 +313,6 @@ class VirtualGPU : public device::VirtualDevice {
     const VirtualGPU& gpu_;       //!< VirtualGPU, associated with this tracker
     std::vector<ProfilingSignal*> external_signals_; //!< External signals for a wait in this queue
     std::vector<hsa_signal_t> waiting_signals_;   //!< Current waiting signals in this queue
-    bool handlerPending_;         //!< This indicates if we have queued a callback handler
   };
 
   VirtualGPU(Device& device, bool profiling = false, bool cooperative = false,
@@ -434,10 +427,6 @@ class VirtualGPU : public device::VirtualDevice {
 
   Timestamp* timestamp() const { return timestamp_; }
 
-  //! Indicates the status of the callback handler. The callback would process the commands
-  //! and would collect profiling data, update refcounts
-  bool isHandlerPending() const { return barriers_.IsHandlerPending(); }
-
   void* allocKernArg(size_t size, size_t alignment);
   bool isFenceDirty() const { return fence_dirty_; }
   void HiddenHeapInit();
diff --git a/projects/clr/rocclr/platform/command.cpp b/projects/clr/rocclr/platform/command.cpp
index a4133bb957..7dd5aa27f4 100644
--- a/projects/clr/rocclr/platform/command.cpp
+++ b/projects/clr/rocclr/platform/command.cpp
@@ -366,7 +366,9 @@ void Command::enqueue() {
     ScopedLock sl(queue_->vdev()->execution());
     queue_->FormSubmissionBatch(this);
 
-    if (type() == CL_COMMAND_MARKER || type() == 0 || type() == CL_COMMAND_TASK) {
+    // Enqueue flushes, except profiling markers to avoid frequent expensive callbacks
+    if (((type() == 0) && profilingInfo().batch_flush_) ||
+        (type() == CL_COMMAND_MARKER) || (type() == CL_COMMAND_TASK)) {
       // The current HSA signal tracking logic requires profiling enabled for the markers
       EnableProfiling();
       // Update batch head for the current marker. Hence the status of all commands can be
diff --git a/projects/clr/rocclr/platform/command.hpp b/projects/clr/rocclr/platform/command.hpp
index 3bccbbd2ea..e825bfff98 100644
--- a/projects/clr/rocclr/platform/command.hpp
+++ b/projects/clr/rocclr/platform/command.hpp
@@ -114,6 +114,7 @@ class Event : public RuntimeObject {
     uint64_t correlation_id_;
     bool enabled_;        //!< Profiling enabled for the wave limiter
     bool marker_ts_;      //!< TS marker
+    bool batch_flush_ = true; //!< Command can flush the batch in direct dispatch mode
 
    void clear() {
       queued_ = 0ULL;
diff --git a/projects/clr/rocclr/platform/commandqueue.cpp b/projects/clr/rocclr/platform/commandqueue.cpp
index bc38121273..33ee53c8c8 100644
--- a/projects/clr/rocclr/platform/commandqueue.cpp
+++ b/projects/clr/rocclr/platform/commandqueue.cpp
@@ -65,6 +65,15 @@ bool HostQueue::terminate() {
       // destroyed.
       Command* lastCommand = getLastQueuedCommand(true);
       if (lastCommand != nullptr) {
+        // The batch is still pending: queue a marker and wait on it instead of the last command
+        if (GetSubmittionBatch() != nullptr) {
+          if (auto marker = new Marker(*this, false); marker != nullptr) {
+            ClPrint(LOG_DEBUG, LOG_CMD, "Marker queued to ensure finish");
+            marker->enqueue();
+            lastCommand->release();  // mirrors finish(): drop the queried command's reference
+            lastCommand = marker;
+          }
+        }
         lastCommand->awaitCompletion();
         // Note that if lastCommand isn't a marker, it may not be lastEnqueueCommand_ now
         // after lastCommand->awaitCompletion() is called.
@@ -128,19 +137,13 @@ void HostQueue::finish(bool cpu_wait) {
   if (IS_HIP) {
     command = getLastQueuedCommand(true);
     if (command == nullptr) {
+      assert(GetSubmittionBatch() == nullptr &&
+        "Can't claim the queue is finished with the active batch!");
       return;
     }
   }
-  // If command doesn't contain HW event and runtime didn't request CPU wait,
-  // then force marker submit
-  bool force_marker = false;
-  if (AMD_DIRECT_DISPATCH && (command != nullptr) && !cpu_wait) {
-    void* hw_event =
-      (command->NotifyEvent() != nullptr) ? command->NotifyEvent()->HwEvent() : command->HwEvent();
-    force_marker = (hw_event == nullptr);
-  }
-  if (nullptr == command || force_marker ||
-      vdev()->isHandlerPending() || vdev()->isFenceDirty()) {
+  // Force marker if the batch wasn't sent for CPU update or fence is dirty
+  if (nullptr == command || (GetSubmittionBatch() != nullptr) || vdev()->isFenceDirty()) {
     if (nullptr != command) {
       command->release();
     }
diff --git a/projects/clr/rocclr/utils/flags.hpp b/projects/clr/rocclr/utils/flags.hpp
index c057a8cd6c..0f50070442 100644
--- a/projects/clr/rocclr/utils/flags.hpp
+++ b/projects/clr/rocclr/utils/flags.hpp
@@ -257,6 +257,8 @@ release(uint, DEBUG_HIP_FORCE_GRAPH_QUEUES, 4,                                \
         "Forces the number of streams for the graph parallel execution")      \
 release(bool, HIP_ALWAYS_USE_NEW_COMGR_UNBUNDLING_ACTION, false,              \
         "Force to always use new comgr unbundling action")                    \
+release(uint, DEBUG_HIP_BLOCK_SYNC, 50,                                       \
+        "Blocks synchronization on CPU until the callback processing is done")\
 release(bool, DEBUG_HIP_KERNARG_COPY_OPT, true,                               \
         "Enable/Disable multiple kern arg copies")                            \
 release(bool, DEBUG_CLR_USE_STDMUTEX_IN_AMD_MONITOR, false,                   \