diff --git a/projects/clr/rocclr/device/rocm/rocmemory.cpp b/projects/clr/rocclr/device/rocm/rocmemory.cpp
index 4500c16036..e34eaa8785 100644
--- a/projects/clr/rocclr/device/rocm/rocmemory.cpp
+++ b/projects/clr/rocclr/device/rocm/rocmemory.cpp
@@ -750,7 +750,7 @@ bool Buffer::create() {
                     "[ROCclr] ROCCLR_MEM_HSA_SIGNAL_MEMORY signal creation failed");
             return false;
           }
-          volatile hsa_signal_value_t* signalValuePtr;
+          volatile hsa_signal_value_t* signalValuePtr = nullptr;
           if (HSA_STATUS_SUCCESS != hsa_amd_signal_value_pointer(signal_, &signalValuePtr)) {
             ClPrint(amd::LOG_ERROR, amd::LOG_MEM,
                     "[ROCclr] ROCCLR_MEM_HSA_SIGNAL_MEMORY pointer query failed");
diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp
index c2a988d1fa..2ff4c3483b 100644
--- a/projects/clr/rocclr/device/rocm/rocvirtual.cpp
+++ b/projects/clr/rocclr/device/rocm/rocvirtual.cpp
@@ -146,15 +146,13 @@ bool HsaAmdSignalHandler(hsa_signal_value_t value, void* arg) {
       ((thread = new amd::HostThread()) != nullptr && thread == amd::Thread::current()))) {
     return false;
   }
+  amd::ScopedLock sl(ts->gpu()->execution());
+  ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Handler: value(%d), timestamp(%p), handle(%lx)\n",
+    static_cast<uint32_t>(value), arg, ts->HwProfiling() ? ts->Signals()[0]->signal_.handle : 0);
+
   // Update the batch, since signal is complete
-  if (ts->Signals().size() > 0) {
-    amd::ScopedLock sl(ts->gpu()->execution());
-    ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Handler: value(%d), timestamp(%p), handle(%lx)\n",
-        static_cast<uint32_t>(value), arg, ts->Signals()[0]->signal_.handle);
-    ts->gpu()->updateCommandsState(ts->command().GetBatchHead());
-  } else {
-    LogError("Error: ROCr handler was called for untracked signal!");
-  }
+  ts->gpu()->updateCommandsState(ts->command().GetBatchHead());
+
   // Return false, so the callback will not be called again for this signal
   return false;
 }
@@ -331,6 +329,10 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal(
   prof_signal->done_ = false;
   prof_signal->engine_ = engine_;
   if (ts != 0) {
+    // Save HSA signal earlier to make sure the possible callback will have a valid
+    // value for processing
+    prof_signal->ts_ = ts;
+    ts->AddProfilingSignal(prof_signal);
     // If direct dispatch is enabled and the batch head isn't null, then it's a marker and
     // requires the batch update upon HSA signal completion
     if (AMD_DIRECT_DISPATCH && (ts->command().GetBatchHead() != nullptr)) {
@@ -347,8 +349,6 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal(
       hsa_amd_profiling_async_copy_enable(true);
       sdma_profiling_ = true;
     }
-    prof_signal->ts_ = ts;
-    ts->AddProfilingSignal(prof_signal);
   }
   return prof_signal->signal_;
 }
@@ -1127,7 +1127,7 @@ void VirtualGPU::profilingEnd(amd::Command& command) {
 }
 
 // ================================================================================================
-void VirtualGPU::updateCommandsState(amd::Command* list) {
+void VirtualGPU::updateCommandsState(amd::Command* list) const {
   Timestamp* ts = nullptr;
 
   amd::Command* current = list;
@@ -2802,6 +2802,7 @@ void VirtualGPU::submitNativeFn(amd::NativeFnCommand& cmd) {
   // std::cout<<__FUNCTION__<<" not implemented"<<"*********"<<std::endl;
 }
 
+// ================================================================================================
 void VirtualGPU::submitMarker(amd::Marker& vcmd) {
   if (vcmd.profilingInfo().marker_ts_) {
     profilingBegin(vcmd);
@@ -2811,11 +2812,16 @@ void VirtualGPU::submitMarker(amd::Marker& vcmd) {
       // for cache flushes explicitly and helps wall time
       uint16_t header = kNopPacketHeader;
       dispatchBarrierPacket(&barrier_packet_, header);
+      // Direct dispatch requires a barrier with callback and hasPendingDispatch_ triggers that
+      if (AMD_DIRECT_DISPATCH) {
+        hasPendingDispatch_ = true;
+      }
     }
     profilingEnd(vcmd);
   }
 }
 
+// ================================================================================================
 void VirtualGPU::submitAcquireExtObjects(amd::AcquireExtObjectsCommand& vcmd) {
   // Make sure VirtualGPU has an exclusive access to the resources
   amd::ScopedLock lock(execution());
@@ -2856,8 +2862,9 @@ void VirtualGPU::flush(amd::Command* list, bool wait) {
     // If runtime didn't submit a barrier, then it can't track the completion of the batch.
     // Hence runtime either has to insert a barrier unconditionally or have a CPU wait.
     // Due to performance impact of extra barriers CPU wait is selected.
-    // CPU wait is also forced if pinned buffers were used
-    skip_cpu_wait &= hasPendingDispatch_ && (pinnedMems_.size() == 0);
+    // Note: if callback will be selected to update the batch status,
+    // then the host thread can't update it also, otherwise double free may occur
+    skip_cpu_wait &= hasPendingDispatch_;
 
     releaseGpuMemoryFence(force_barrier, skip_cpu_wait);
     profilingEnd(*current);
@@ -2910,14 +2917,18 @@ void VirtualGPU::addPinnedMem(amd::Memory* mem) {
   //! @note: ROCr backend doesn't have per resource busy tracking, hence runtime has to wait
   //!        unconditionally, before it can release pinned memory
   releaseGpuMemoryFence();
-  if (nullptr == findPinnedMem(mem->getHostMem(), mem->getSize())) {
-    if (pinnedMems_.size() > 7) {
-      pinnedMems_.front()->release();
-      pinnedMems_.erase(pinnedMems_.begin());
-    }
+  if (!AMD_DIRECT_DISPATCH) {
+    if (nullptr == findPinnedMem(mem->getHostMem(), mem->getSize())) {
+      if (pinnedMems_.size() > 7) {
+        pinnedMems_.front()->release();
+        pinnedMems_.erase(pinnedMems_.begin());
+      }
 
-    // Delay destruction
-    pinnedMems_.push_back(mem);
+      // Delay destruction
+      pinnedMems_.push_back(mem);
+    }
+  } else {
+    mem->release();
   }
 }
 
diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.hpp b/projects/clr/rocclr/device/rocm/rocvirtual.hpp
index 74f3b5f0f3..d77390092c 100644
--- a/projects/clr/rocclr/device/rocm/rocvirtual.hpp
+++ b/projects/clr/rocclr/device/rocm/rocvirtual.hpp
@@ -36,7 +36,6 @@ class Device;
 class Memory;
 class Timestamp;
 
-
 struct ProfilingSignal : public amd::HeapObject {
   hsa_signal_t  signal_;  //!< HSA signal to track profiling information
   Timestamp*    ts_;      //!< Timestamp object associated with the signal
@@ -98,7 +97,7 @@ class Timestamp : public amd::HeapObject {
     , gpu_(gpu)
     , command_(command) {}
 
-  virtual ~Timestamp() {}
+  ~Timestamp() {}
 
   uint64_t getStart() {
     checkGpuTime();
@@ -236,7 +235,7 @@ class VirtualGPU : public device::VirtualDevice {
   void profilingBegin(amd::Command& command, bool drmProfiling = false);
   void profilingEnd(amd::Command& command);
 
-  void updateCommandsState(amd::Command* list);
+  void updateCommandsState(amd::Command* list) const;
 
   void submitReadMemory(amd::ReadMemoryCommand& cmd);
   void submitWriteMemory(amd::WriteMemoryCommand& cmd);
diff --git a/projects/clr/rocclr/platform/command.cpp b/projects/clr/rocclr/platform/command.cpp
index 24764754dd..405383a621 100644
--- a/projects/clr/rocclr/platform/command.cpp
+++ b/projects/clr/rocclr/platform/command.cpp
@@ -213,13 +213,17 @@ bool Event::awaitCompletion() {
 // ================================================================================================
 bool Event::notifyCmdQueue() {
   HostQueue* queue = command().queue();
-  if ((status() > CL_COMPLETE) &&
-      // Don't need to notify any marker with direct dispatch,
-      // because all markers are blocking
+  if ((status() > CL_COMPLETE) && (nullptr != queue) &&
       (!AMD_DIRECT_DISPATCH ||
+       // Don't need to notify any marker with direct dispatch,
+       // because all markers are blocking.
        ((command().type() != CL_COMMAND_MARKER) &&
-        (command().type() != 0))) &&
-      (nullptr != queue) && !notified_.test_and_set()) {
+        (command().type() != 0)) ||
+        // Don't need to notify if the current batch is empty,
+        // because that means the command was processed and extra notification
+        // will cause a stall on the host.
+        (queue->GetSubmittionBatch() != nullptr)) &&
+        !notified_.test_and_set()) {
     // Make sure the queue is draining the enqueued commands.
     amd::Command* command = new amd::Marker(*queue, false, nullWaitList, this);
     if (command == NULL) {