From ce8dad2ecc8d5514d80bd91da28c05dfd0e85a6f Mon Sep 17 00:00:00 2001
From: German Andryeyev <German.Andryeyev@amd.com>
Date: Fri, 18 Jun 2021 17:07:40 -0400
Subject: [PATCH] SWDEV-290160 - Switch to global HSA signals

The runtime can't assign internal HSA signals to HIP events, because the
HIP application can destroy the HIP stream, or the signal may be reused
internally. Switch to global HSA signals for HIP events.

Change-Id: Ieaea2d6b039e492b2e7c5112782a8f4e601e50a1
---
 rocclr/device/device.hpp          |   2 +
 rocclr/device/rocm/rocdevice.cpp  |  37 ++++++++
 rocclr/device/rocm/rocdevice.hpp  |  19 +++++
 rocclr/device/rocm/rocvirtual.cpp | 137 +++++++++++++-----------------
 rocclr/device/rocm/rocvirtual.hpp |  25 +-----
 rocclr/platform/command.cpp       |  61 ++++++++-----
 rocclr/platform/command.hpp       |   1 +
 7 files changed, 162 insertions(+), 120 deletions(-)
diff --git a/rocclr/device/device.hpp b/rocclr/device/device.hpp
index 29346d2380..7c0e13db1f 100644
--- a/rocclr/device/device.hpp
+++ b/rocclr/device/device.hpp
@@ -49,6 +49,7 @@
 #include <map>
 #include <list>
 #include <set>
+#include <unordered_set>
 #include <utility>
 
 namespace amd {
@@ -1705,6 +1706,7 @@ class Device : public RuntimeObject {
       ) const {
     return false;
   };
+  virtual void ReleaseGlobalSignal(void* signal) const {}
 
   //! Returns TRUE if the device is available for computations
   bool isOnline() const { return online_; }
diff --git a/rocclr/device/rocm/rocdevice.cpp b/rocclr/device/rocm/rocdevice.cpp
index 0551e0e983..c6e94732a6 100644
--- a/rocclr/device/rocm/rocdevice.cpp
+++ b/rocclr/device/rocm/rocdevice.cpp
@@ -2919,6 +2919,7 @@ bool Device::findLinkInfo(const hsa_amd_memory_pool_t& pool,
   return true;
 }
 
+// ================================================================================================
 void Device::getGlobalCUMask(std::string cuMaskStr) {
   if (cuMaskStr.length() != 0) {
     std::string pre = cuMaskStr.substr(0, 2);
@@ -2971,10 +2972,12 @@ void Device::getGlobalCUMask(std::string cuMaskStr) {
   }
 }
 
+// ================================================================================================
 device::Signal* Device::createSignal() const {
   return new roc::Signal();
 }
 
+// ================================================================================================
 amd::Memory* Device::GetArenaMemObj(const void* ptr, size_t& offset) {
   // If arena_mem_obj_ is null, then HMM and Xnack is disabled. Return nullptr.
   if (arena_mem_obj_ == nullptr) {
@@ -2989,5 +2992,39 @@ amd::Memory* Device::GetArenaMemObj(const void* ptr, size_t& offset) {
   return arena_mem_obj_;
 }
 
+// ================================================================================================
+//! Creates a global HSA signal; the caller owns the result. Returns nullptr on failure
+ProfilingSignal* Device::GetGlobalSignal(Timestamp* ts) const {
+  std::unique_ptr<ProfilingSignal> prof_signal(new ProfilingSignal());
+  if (prof_signal != nullptr) {
+    hsa_agent_t agent = getBackendDevice();
+    hsa_agent_t* agents = (settings().system_scope_signal_) ? nullptr : &agent;
+    uint32_t num_agents = (settings().system_scope_signal_) ? 0 : 1;
+    if (HSA_STATUS_SUCCESS == hsa_signal_create(kInitSignalValueOne,
+                                                num_agents, agents, &prof_signal->signal_)) {
+      if (ts != nullptr) {
+        // Attach the signal to the timestamp only after HSA signal creation succeeded.
+        // Otherwise the failure path would free the signal, while the timestamp
+        // could be left holding a pointer to it
+        prof_signal->ts_ = ts;
+        ts->AddProfilingSignal(prof_signal.get());
+      }
+      return prof_signal.release();
+    }
+  }
+  return nullptr;
+}
+
+// ================================================================================================
+void Device::ReleaseGlobalSignal(void* signal) const {
+  // Destroys the HSA signal, if it was created, and frees the wrapper; nullptr is a no-op
+  auto* global_signal = static_cast<ProfilingSignal*>(signal);
+  if (global_signal == nullptr) return;
+  if (global_signal->signal_.handle != 0) {
+    hsa_signal_destroy(global_signal->signal_);
+  }
+  delete global_signal;
+}
+
 } // namespace roc
 #endif  // WITHOUT_HSA_BACKEND
diff --git a/rocclr/device/rocm/rocdevice.hpp b/rocclr/device/rocm/rocdevice.hpp
index 78cfbb6241..0fb6f0b8e0 100644
--- a/rocclr/device/rocm/rocdevice.hpp
+++ b/rocclr/device/rocm/rocdevice.hpp
@@ -77,6 +77,21 @@ class VirtualDevice;
 class PrintfDbg;
 class IProDevice;
 
+struct ProfilingSignal : public amd::HeapObject {
+  hsa_signal_t  signal_;  //!< HSA signal to track profiling information (0 handle until set)
+  Timestamp*    ts_;      //!< Timestamp object associated with the signal, nullptr if none
+  HwQueueEngine engine_;  //!< Engine used with this signal, Compute by default
+  bool          done_;    //!< True if signal is done, which is also the initial state
+  amd::Monitor  lock_;    //!< Signal lock for update, exposed through LockSignalOps()
+  ProfilingSignal()
+    : ts_(nullptr)
+    , engine_(HwQueueEngine::Compute)
+    , done_(true)
+    , lock_("Signal Ops Lock", true)
+    { signal_.handle = 0; }
+  amd::Monitor& LockSignalOps() { return lock_; }
+};
+
 class Sampler : public device::Sampler {
  public:
   //! Constructor
@@ -237,6 +252,7 @@ class NullDevice : public amd::Device {
       cl_set_device_clock_mode_output_amd* pSetClockModeOutput) { return true; }
 
   virtual bool IsHwEventReady(const amd::Event& event, bool wait = false) const { return false; }
+  virtual void ReleaseGlobalSignal(void* signal) const {}
 
  protected:
   //! Initialize compiler instance and handle
@@ -405,6 +421,7 @@ class Device : public NullDevice {
                             cl_set_device_clock_mode_output_amd* pSetClockModeOutput);
 
   virtual bool IsHwEventReady(const amd::Event& event, bool wait = false) const;
+  virtual void ReleaseGlobalSignal(void* signal) const;
 
   //! Allocate host memory in terms of numa policy set by user
   void* hostNumaAlloc(size_t size, size_t alignment, bool atomics = false) const;
@@ -505,6 +522,8 @@ class Device : public NullDevice {
 
   virtual amd::Memory* GetArenaMemObj(const void* ptr, size_t& offset);
 
+  ProfilingSignal* GetGlobalSignal(Timestamp* ts) const;
+
  private:
   bool create();
 
diff --git a/rocclr/device/rocm/rocvirtual.cpp b/rocclr/device/rocm/rocvirtual.cpp
index 5f2659c76a..d31f01982c 100644
--- a/rocclr/device/rocm/rocvirtual.cpp
+++ b/rocclr/device/rocm/rocvirtual.cpp
@@ -109,7 +109,7 @@ static unsigned extractAqlBits(unsigned v, unsigned pos, unsigned width) {
 };
 
 // ================================================================================================
-void Timestamp::checkGpuTime(bool event_recycle) {
+void Timestamp::checkGpuTime() {
   if (HwProfiling()) {
     uint64_t  start = std::numeric_limits<uint64_t>::max();
     uint64_t  end = 0;
@@ -140,10 +140,6 @@ void Timestamp::checkGpuTime(bool event_recycle) {
         ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Signal = (0x%lx), start = %ld, "
           "end = %ld", it->signal_.handle, start, end);
       }
-      // The signal is reused and the upper layer can't rely on it.
-      if (event_recycle) {
-        const_cast<amd::Command&>(it->ts_->command_).SetHwEvent(nullptr);
-      }
       it->ts_ = nullptr;
       it->done_ = true;
     }
@@ -390,23 +386,7 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal(
     // If direct dispatch is enabled and the batch head isn't null, then it's a marker and
     // requires the batch update upon HSA signal completion
     if (AMD_DIRECT_DISPATCH && (ts->command().GetBatchHead() != nullptr)) {
-      uint32_t init_value = kInitSignalValueOne;
-      // If API callback is enabled, then use a blocking signal for AQL queue.
-      // HSA signal will be acquired in SW and released after HSA signal callback
-      if (ts->command().Callback() != nullptr) {
-        ts->SetCallbackSignal(prof_signal->signal_);
-        // Blocks AQL queue from further processing
-        hsa_signal_add_relaxed(prof_signal->signal_, 1);
-        init_value += 1;
-      }
-      hsa_status_t result = hsa_amd_signal_async_handler(prof_signal->signal_,
-          HSA_SIGNAL_CONDITION_LT, init_value, &HsaAmdSignalHandler, ts);
-      if (HSA_STATUS_SUCCESS != result) {
-        LogError("hsa_amd_signal_async_handler() failed to set the handler!");
-      } else {
-        ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Set Handler: handle(0x%lx), timestamp(%p)",
-            prof_signal->signal_.handle, prof_signal);
-      }
+      assert(false && "Runtime should not have batch command in ActiveSignal!");
     }
     if (!sdma_profiling_) {
       hsa_amd_profiling_async_copy_enable(true);
@@ -486,8 +466,7 @@ bool VirtualGPU::HwQueueTracker::CpuWaitForSignal(ProfilingSignal* signal) {
   if (!signal->done_) {
     // Update timestamp values if requested
     if (signal->ts_ != nullptr) {
-      static constexpr bool kEventRecycle = true;
-      signal->ts_->checkGpuTime(kEventRecycle);
+      signal->ts_->checkGpuTime();
     } else {
       ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "[%zx]!\t Host wait on completion_signal=0x%zx",
               std::this_thread::get_id(), signal->signal_.handle);
@@ -892,7 +871,8 @@ bool VirtualGPU::dispatchCounterAqlPacket(hsa_ext_amd_aql_pm4_packet_t* packet,
 }
 
 // ================================================================================================
-void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal) {
+void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader,
+  bool skipSignal, const ProfilingSignal* global_signal) {
   const uint32_t queueSize = gpu_queue_->size;
   const uint32_t queueMask = queueSize - 1;
 
@@ -915,12 +895,16 @@ void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal) {
   barrier_packet_.completion_signal.handle = 0;
 
   if (!skipSignal) {
-    // Pool size must grow to the size of pending AQL packets
-    const uint32_t pool_size = index - read;
+    if (global_signal != nullptr) {
+      barrier_packet_.completion_signal = global_signal->signal_;
+    } else {
+      // Pool size must grow to the size of pending AQL packets
+      const uint32_t pool_size = index - read;
 
-    // Get active signal for current dispatch if profiling is necessary
-    barrier_packet_.completion_signal =
-      Barriers().ActiveSignal(kInitSignalValueOne, timestamp_, pool_size);
+      // Get active signal for current dispatch if profiling is necessary
+      barrier_packet_.completion_signal =
+        Barriers().ActiveSignal(kInitSignalValueOne, timestamp_, pool_size);
+    }
   }
 
   while ((index - hsa_queue_load_read_index_scacquire(gpu_queue_)) >= queueMask);
@@ -1218,7 +1202,8 @@ void VirtualGPU::profilingBegin(amd::Command& command, bool drmProfiling) {
         (*it)->NotifyEvent()->HwEvent() : (*it)->HwEvent();
       if (hw_event != nullptr) {
         Barriers().AddExternalSignal(reinterpret_cast<ProfilingSignal*>(hw_event));
-      } else if (static_cast<amd::Command*>(*it)->queue() != command.queue()) {
+      } else if (static_cast<amd::Command*>(*it)->queue() != command.queue() &&
+                 ((*it)->status() != CL_COMPLETE)) {
         LogPrintfError("Waiting event(%p) doesn't have a HSA signal!\n", *it);
       } else {
         // Assume serialization on the same queue...
@@ -1239,10 +1224,7 @@ void VirtualGPU::profilingEnd(amd::Command& command) {
       timestamp_->end();
     }
     command.setData(timestamp_);
-    // Update HW event only for batches
-    if ((AMD_DIRECT_DISPATCH) && (command.GetBatchHead() != nullptr)) {
-      command.SetHwEvent(timestamp_->Signals().back());
-    }
+
     timestamp_ = nullptr;
   }
 }
@@ -2926,13 +2908,46 @@ void VirtualGPU::submitNativeFn(amd::NativeFnCommand& cmd) {
 
 // ================================================================================================
 void VirtualGPU::submitMarker(amd::Marker& vcmd) {
-  if (vcmd.profilingInfo().marker_ts_) {
+  if (AMD_DIRECT_DISPATCH || vcmd.profilingInfo().marker_ts_) {
     profilingBegin(vcmd);
     if (timestamp_ != nullptr) {
+      ProfilingSignal* prof_signal = nullptr;
+      // If direct dispatch is enabled and the batch head isn't null, then it's a marker and
+      // requires the batch update upon HSA signal completion
+      if (AMD_DIRECT_DISPATCH) {
+        assert(vcmd.GetBatchHead() != nullptr && "Marker doesn't have batch!");
+
+        prof_signal = dev().GetGlobalSignal(timestamp_);
+        // Validate the allocation before the signal is accessed for the first time
+        assert(prof_signal != nullptr && "Failed to allocate the global HSA signal!");
+        prof_signal->done_ = false;
+        uint32_t init_value = kInitSignalValueOne;
+        // If API callback is enabled, then use a blocking signal for AQL queue.
+        // HSA signal will be acquired in SW and released after HSA signal callback
+        if (vcmd.Callback() != nullptr) {
+          timestamp_->SetCallbackSignal(prof_signal->signal_);
+          // Blocks AQL queue from further processing
+          hsa_signal_add_relaxed(prof_signal->signal_, 1);
+          init_value += 1;
+        }
+
+        hsa_status_t result = hsa_amd_signal_async_handler(prof_signal->signal_,
+            HSA_SIGNAL_CONDITION_LT, init_value, &HsaAmdSignalHandler, timestamp_);
+        if (HSA_STATUS_SUCCESS != result) {
+          LogError("hsa_amd_signal_async_handler() failed to set the handler!");
+        } else {
+          ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Set Handler: handle(0x%lx), timestamp(%p)",
+              prof_signal->signal_.handle, prof_signal);
+        }
+        // Update HW event only for batches
+        vcmd.SetHwEvent(timestamp_->Signals().back());
+      }
       // Submit a barrier with a cache flushes.
-      dispatchBarrierPacket(kBarrierPacketHeader);
-      // Reset this flag since we already enable system scope for kBarrierPacketHeader
-      hasPendingDispatch_ = false;
+      dispatchBarrierPacket(kBarrierPacketHeader, false, prof_signal);
+
+      // Don't reset the flag for direct dispatch, because the global signals are out of scope
+      // for internal barrier tracking and SDMA could lose a wait for compute
+      hasPendingDispatch_ = AMD_DIRECT_DISPATCH;
     }
     profilingEnd(vcmd);
   }
@@ -2958,45 +2973,13 @@ void VirtualGPU::submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& vcmd) {
 
 // ================================================================================================
 void VirtualGPU::flush(amd::Command* list, bool wait) {
-  // Direct dispatch relies on HSA signal callback
-  bool skip_cpu_wait = AMD_DIRECT_DISPATCH;
+  // If barrier is requested, then wait for everything, otherwise
+  // a per dispatch wait will occur later in updateCommandsState()
+  releaseGpuMemoryFence();
+  updateCommandsState(list);
 
-  if (skip_cpu_wait) {
-    // Search for the last command in the batch to track GPU state
-    amd::Command* current = list;
-    assert(current != nullptr && "Empty batch for processing!");
-
-    // Find the last command
-    while (current->getNext() != nullptr) {
-      current = current->getNext();
-    }
-    // Always insert a barrier. Some tests rquire async SDMA wait
-    hasPendingDispatch_ = true;
-    // Enable profiling, so runtime can track TS
-    profilingBegin(*current);
-
-    // If runtime didn't submit a barrier, then it can't track the completion of the batch.
-    // Hence runtime either has to insert a barrier unconditionally or have a CPU wait.
-    // Due to performance impact of extra barriers CPU wait is selected.
-    // Note: if callback will be selected to update the batch status,
-    // then the host thread can't update it also, otherwise double free may occur
-    skip_cpu_wait &= hasPendingDispatch_;
-
-    releaseGpuMemoryFence(skip_cpu_wait);
-    profilingEnd(*current);
-  } else {
-    // If barrier is requested, then wait for everything, otherwise
-    // a per disaptch wait will occur later in updateCommandsState()
-    releaseGpuMemoryFence();
-  }
-
-  // If CPU waited for GPU, then the queue is idle
-  if (!skip_cpu_wait) {
-    updateCommandsState(list);
-
-    // Release all pinned memory
-    releasePinnedMem();
-  }
+  // Release all pinned memory
+  releasePinnedMem();
 }
 
 // ================================================================================================
diff --git a/rocclr/device/rocm/rocvirtual.hpp b/rocclr/device/rocm/rocvirtual.hpp
index aaa05acf67..64f556e412 100644
--- a/rocclr/device/rocm/rocvirtual.hpp
+++ b/rocclr/device/rocm/rocvirtual.hpp
@@ -34,23 +34,9 @@
 namespace roc {
 class Device;
 class Memory;
+struct ProfilingSignal;
 class Timestamp;
 
-struct ProfilingSignal : public amd::HeapObject {
-  amd::Monitor  lock_;    //!< Signal lock for update
-  hsa_signal_t  signal_;  //!< HSA signal to track profiling information
-  Timestamp*    ts_;      //!< Timestamp object associated with the signal
-  HwQueueEngine engine_;  //!< Engine used with this signal
-  bool          done_;    //!< True if signal is done
-  ProfilingSignal()
-    : lock_("Signal Ops Lock", true)
-    , ts_(nullptr)
-    , engine_(HwQueueEngine::Compute)
-    , done_(true)
-    { signal_.handle = 0; }
-  amd::Monitor& LockSignalOps() { return lock_; }
-};
-
 // Initial HSA signal value
 constexpr static hsa_signal_value_t kInitSignalValueOne = 1;
 
@@ -139,7 +125,7 @@ class Timestamp : public amd::HeapObject {
   const bool HwProfiling() const { return !signals_.empty(); }
 
   //! Finds execution ticks on GPU
-  void checkGpuTime(bool event_recycle = false);
+  void checkGpuTime();
 
   // Start a timestamp (get timestamp from OS)
   void start() { start_ = amd::Os::timeNanos(); }
@@ -225,8 +211,6 @@ class VirtualGPU : public device::VirtualDevice {
     //! Wait for the curent active signal. Can idle the queue
     bool WaitCurrent() {
       ProfilingSignal* signal = signal_list_[current_id_];
-      ClPrint(amd::LOG_DEBUG, amd::LOG_MISC, "[%zx]!\t WaitCurret completion_signal=0x%zx",
-        std::this_thread::get_id(), signal->signal_.handle);
       return CpuWaitForSignal(signal);
     }
 
@@ -253,8 +237,6 @@ class VirtualGPU : public device::VirtualDevice {
     void WaitNext() {
       size_t next = (current_id_ + 1) % signal_list_.size();
       ProfilingSignal* signal = signal_list_[next];
-      ClPrint(amd::LOG_DEBUG, amd::LOG_MISC, "[%zx]!\t WaitNext completion_signal=0x%zx",
-        std::this_thread::get_id(), signal->signal_.handle);
       CpuWaitForSignal(signal);
     }
 
@@ -396,7 +378,8 @@ class VirtualGPU : public device::VirtualDevice {
   template <typename AqlPacket> bool dispatchGenericAqlPacket(AqlPacket* packet, uint16_t header,
                                                               uint16_t rest, bool blocking,
                                                               size_t size = 1);
-  void dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal = false);
+  void dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal = false,
+                             const ProfilingSignal* global_signal = nullptr);
   bool dispatchCounterAqlPacket(hsa_ext_amd_aql_pm4_packet_t* packet, const uint32_t gfxVersion,
                                 bool blocking, const hsa_ven_amd_aqlprofile_1_00_pfn_t* extApi);
   void dispatchBarrierValuePacket(const hsa_amd_barrier_value_packet_t* packet,
diff --git a/rocclr/platform/command.cpp b/rocclr/platform/command.cpp
index 082deaacc9..19c29cce6b 100644
--- a/rocclr/platform/command.cpp
+++ b/rocclr/platform/command.cpp
@@ -48,6 +48,7 @@ Event::Event(HostQueue& queue)
       status_(CL_INT_MAX),
       hw_event_(nullptr),
       notify_event_(nullptr),
+      device_(&queue.device()),
       profilingInfo_(IS_PROFILER_ON || queue.properties().test(CL_QUEUE_PROFILING_ENABLE) ||
                      Agent::shouldPostEventEvents()) {
   notified_.clear();
@@ -55,7 +56,7 @@ Event::Event(HostQueue& queue)
 
 // ================================================================================================
 Event::Event() : callbacks_(NULL), status_(CL_SUBMITTED),
-    hw_event_(nullptr), notify_event_(nullptr) { notified_.clear(); }
+    hw_event_(nullptr), notify_event_(nullptr), device_(nullptr) { notified_.clear(); }
 
 // ================================================================================================
 Event::~Event() {
@@ -69,6 +70,10 @@ Event::~Event() {
   if (notify_event_ != nullptr) {
     notify_event_->release();
   }
+  // Destroy global HW event if available
+  if ((hw_event_ != nullptr) && (device_ != nullptr)) {
+    device_->ReleaseGlobalSignal(hw_event_);
+  }
 }
 
 // ================================================================================================
@@ -259,21 +264,35 @@ bool Event::awaitCompletion() {
 // ================================================================================================
 bool Event::notifyCmdQueue() {
   HostQueue* queue = command().queue();
-  if ((status() > CL_COMPLETE) && (nullptr != queue) &&
-      (!AMD_DIRECT_DISPATCH ||
-       // If HW event was assigned, then notification can be ignored, since a barrier was issued
-       (HwEvent() == nullptr)) &&
-      !notified_.test_and_set()) {
-    // Make sure the queue is draining the enqueued commands.
-    amd::Command* command = new amd::Marker(*queue, false, nullWaitList, this);
-    if (command == NULL) {
-      notified_.clear();
-      return false;
+  if (AMD_DIRECT_DISPATCH) {
+    ScopedLock l(lock_);
+    if ((status() > CL_COMPLETE) && (nullptr != queue) &&
+        // If HW event was assigned, then notification can be ignored, since a barrier was issued
+        (HwEvent() == nullptr) &&
+        !notified_.test_and_set()) {
+      // Make sure the queue is draining the enqueued commands.
+      amd::Command* command = new amd::Marker(*queue, false, nullWaitList, this);
+      if (command == NULL) {
+        notified_.clear();
+        return false;
+      }
+      ClPrint(LOG_DEBUG, LOG_CMD, "queue marker to command queue: %p", queue);
+      command->enqueue();
+      // Save notification, associated with the current event
+      notify_event_ = command;
+    }
+  } else {
+    if ((status() > CL_COMPLETE) && (nullptr != queue) && !notified_.test_and_set()) {
+      // Make sure the queue is draining the enqueued commands.
+      amd::Command* command = new amd::Marker(*queue, false, nullWaitList, this);
+      if (command == NULL) {
+        notified_.clear();
+        return false;
+      }
+      ClPrint(LOG_DEBUG, LOG_CMD, "queue marker to command queue: %p", queue);
+      command->enqueue();
+      command->release();
     }
-    ClPrint(LOG_DEBUG, LOG_CMD, "queue marker to command queue: %p", queue);
-    command->enqueue();
-    // Save notification, associated with the current event
-    notify_event_ = command;
   }
   return true;
 }
@@ -318,6 +337,7 @@ void Command::enqueue() {
   // update will occur later after flush() with a wait
   if (AMD_DIRECT_DISPATCH) {
     setStatus(CL_QUEUED);
+
     // Notify all commands about the waiter. Barrier will be sent in order to obtain
     // HSA signal for a wait on the current queue
     std::for_each(eventWaitList().begin(), eventWaitList().end(),
@@ -333,13 +353,10 @@ void Command::enqueue() {
       // Update batch head for the current marker. Hence the status of all commands can be
       // updated upon the marker completion
       SetBatchHead(queue_->GetSubmittionBatch());
-      if (profilingInfo().marker_ts_) {
-        setStatus(CL_SUBMITTED);
-        submit(*queue_->vdev());
-      } else {
-        // Flush the current batch, but skip the wait on CPU if possible to avoid a stall
-        queue_->vdev()->flush(queue_->GetSubmittionBatch());
-      }
+
+      setStatus(CL_SUBMITTED);
+      submit(*queue_->vdev());
+
       // The batch will be tracked with the marker now
       queue_->ResetSubmissionBatch();
     } else {
diff --git a/rocclr/platform/command.hpp b/rocclr/platform/command.hpp
index ec9843a1bf..02945b19ad 100644
--- a/rocclr/platform/command.hpp
+++ b/rocclr/platform/command.hpp
@@ -95,6 +95,7 @@ class Event : public RuntimeObject {
   std::atomic_flag notified_;              //!< Command queue was notified
   void*  hw_event_;                        //!< HW event ID associated with SW event
   Event* notify_event_;                    //!< Notify event, which should contain HW signal
+  const Device* device_;                   //!< Device, this event associated with
 
  protected:
   static const EventWaitList nullWaitList;