diff --git a/hipamd/src/hip_stream.cpp b/hipamd/src/hip_stream.cpp
index 4db1dc77cc..9373749772 100644
--- a/hipamd/src/hip_stream.cpp
+++ b/hipamd/src/hip_stream.cpp
@@ -359,14 +359,9 @@ hipError_t hipStreamSynchronize_common(hipStream_t stream) {
   }
   bool wait = (stream == nullptr || stream == hipStreamLegacy) ? true : false;
   auto hip_stream = hip::getStream(stream, wait);
-  bool wait_for_cpu = false;
-  // Force blocking wait if requested. That allows to avoid a build up of unreleased CPU commands
-  if (DEBUG_HIP_BLOCK_SYNC != 0) {
-    static std::atomic<uint64_t> flush = 0;
-    wait_for_cpu = ((++flush % DEBUG_HIP_BLOCK_SYNC) == 0) ? true : false;
-  }
+
   // Wait for the current host queue
-  hip_stream->finish(wait_for_cpu);
+  hip_stream->finish();
   // Release freed memory for all memory pools on the device
   hip_stream->GetDevice()->ReleaseFreedMemory();
   return hipSuccess;
diff --git a/rocclr/device/device.hpp b/rocclr/device/device.hpp
index 99ba26d4a9..933ef7cd49 100644
--- a/rocclr/device/device.hpp
+++ b/rocclr/device/device.hpp
@@ -1332,6 +1332,9 @@ class VirtualDevice : public amd::HeapObject {
                                  const std::string& kernelName,
                                  amd::AccumulateCommand* vcmd = nullptr) = 0;
 
+  //! Returns a mutable reference to the atomic counter of outstanding HSA async handlers
+  std::atomic<uint64_t>& QueuedAsyncHandlers() const { return queued_async_handlers_; }
+
  private:
   //! Disable default copy constructor
   VirtualDevice& operator=(const VirtualDevice&);
@@ -1347,6 +1350,7 @@ class VirtualDevice : public amd::HeapObject {
 
   amd::Monitor execution_;  //!< Lock to serialise access to all device objects
   uint index_;              //!< The virtual device unique index
+  mutable std::atomic<uint64_t> queued_async_handlers_ = 0; //!< Outstanding HSA async handlers
 };
 
 }  // namespace amd::device
diff --git a/rocclr/device/rocm/rocvirtual.cpp b/rocclr/device/rocm/rocvirtual.cpp
index 23a2fd22d9..edbabd0ca2 100644
--- a/rocclr/device/rocm/rocvirtual.cpp
+++ b/rocclr/device/rocm/rocvirtual.cpp
@@ -232,11 +232,14 @@ bool HsaAmdSignalHandler(hsa_signal_value_t value, void* arg) {
   // Save callback signal
   hsa_signal_t callback_signal = ts->GetCallbackSignal();
 
+  auto gpu = ts->gpu();
+  gpu->QueuedAsyncHandlers()--;
+
   // Reset last used SDMA engine mask
-  ts->gpu()->setLastUsedSdmaEngine(0);
+  gpu->setLastUsedSdmaEngine(0);
 
   // Update the batch, since signal is complete
-  ts->gpu()->updateCommandsState(ts->command().GetBatchHead());
+  gpu->updateCommandsState(ts->command().GetBatchHead());
 
   // Reset API callback signal. It will release AQL queue and start commands processing
   if (callback_signal.handle != 0) {
@@ -474,6 +477,7 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal(
           hsa_signal_add_relaxed(prof_signal->signal_, 1);
           init_value += 1;
         }
+        gpu_.QueuedAsyncHandlers()++;
         hsa_status_t result = hsa_amd_signal_async_handler(prof_signal->signal_,
             HSA_SIGNAL_CONDITION_LT, init_value, &HsaAmdSignalHandler, ts);
         if (HSA_STATUS_SUCCESS != result) {
diff --git a/rocclr/platform/commandqueue.cpp b/rocclr/platform/commandqueue.cpp
index 49f3029c36..bcdb649c3b 100644
--- a/rocclr/platform/commandqueue.cpp
+++ b/rocclr/platform/commandqueue.cpp
@@ -141,6 +141,11 @@ void HostQueue::finish(bool cpu_wait) {
         "Can't claim the queue is finished with the active batch!");
       return;
     }
+    // Force a blocking wait once outstanding async handlers exceed DEBUG_HIP_BLOCK_SYNC; this avoids a build-up of unreleased CPU commands
+    if ((DEBUG_HIP_BLOCK_SYNC > 0) &&
+        (vdev()->QueuedAsyncHandlers().load() > DEBUG_HIP_BLOCK_SYNC)) {
+      cpu_wait = true;
+    }
   }
   // Force marker if the batch wasn't sent for CPU update or fence is dirty
   if (nullptr == command || (GetSubmissionBatch() != nullptr) || vdev()->isFenceDirty()) {