diff --git a/projects/clr/rocclr/device/device.hpp b/projects/clr/rocclr/device/device.hpp
index 29346d2380..7c0e13db1f 100644
--- a/projects/clr/rocclr/device/device.hpp
+++ b/projects/clr/rocclr/device/device.hpp
@@ -49,6 +49,7 @@
 #include <map>
 #include <list>
 #include <set>
+#include <unordered_set>
 #include <utility>
 
 namespace amd {
@@ -1705,6 +1706,7 @@ class Device : public RuntimeObject {
       ) const {
     return false;
   };
+  virtual void ReleaseGlobalSignal(void* signal) const {}
 
   //! Returns TRUE if the device is available for computations
   bool isOnline() const { return online_; }
diff --git a/projects/clr/rocclr/device/rocm/rocdevice.cpp b/projects/clr/rocclr/device/rocm/rocdevice.cpp
index 0551e0e983..c6e94732a6 100644
--- a/projects/clr/rocclr/device/rocm/rocdevice.cpp
+++ b/projects/clr/rocclr/device/rocm/rocdevice.cpp
@@ -2919,6 +2919,7 @@ bool Device::findLinkInfo(const hsa_amd_memory_pool_t& pool,
   return true;
 }
 
+// ================================================================================================
 void Device::getGlobalCUMask(std::string cuMaskStr) {
   if (cuMaskStr.length() != 0) {
     std::string pre = cuMaskStr.substr(0, 2);
@@ -2971,10 +2972,12 @@ void Device::getGlobalCUMask(std::string cuMaskStr) {
   }
 }
 
+// ================================================================================================
 device::Signal* Device::createSignal() const {
   return new roc::Signal();
 }
 
+// ================================================================================================
 amd::Memory* Device::GetArenaMemObj(const void* ptr, size_t& offset) {
   // If arena_mem_obj_ is null, then HMM and Xnack is disabled. Return nullptr.
   if (arena_mem_obj_ == nullptr) {
@@ -2989,5 +2992,39 @@ amd::Memory* Device::GetArenaMemObj(const void* ptr, size_t& offset) {
   return arena_mem_obj_;
 }
 
+// ================================================================================================
+ProfilingSignal* Device::GetGlobalSignal(Timestamp* ts) const {
+  // Returns a new signal wrapper (the caller owns it) or nullptr if the creation failed
+  std::unique_ptr<ProfilingSignal> prof_signal(new ProfilingSignal());
+  if (prof_signal == nullptr) {
+    return nullptr;
+  }
+  // With system scope signals no consumer list is passed, otherwise only this agent is listed
+  hsa_agent_t agent = getBackendDevice();
+  const hsa_agent_t* agents = (settings().system_scope_signal_) ? nullptr : &agent;
+  const uint32_t num_agents = (settings().system_scope_signal_) ? 0 : 1;
+  if (HSA_STATUS_SUCCESS != hsa_signal_create(kInitSignalValueOne,
+                                              num_agents, agents, &prof_signal->signal_)) {
+    return nullptr;  // nothing references the wrapper yet, unique_ptr destroys it
+  }
+  // Attach to the timestamp only after success, so ts never tracks a destroyed wrapper
+  if (ts != nullptr) {
+    prof_signal->ts_ = ts;
+    ts->AddProfilingSignal(prof_signal.get());
+  }
+  return prof_signal.release();
+}
+
+// ================================================================================================
+void Device::ReleaseGlobalSignal(void* signal) const {
+  // Treats the opaque pointer as a ProfilingSignal wrapper; nullptr is ignored
+  auto* const wrapper = static_cast<ProfilingSignal*>(signal);
+  if (wrapper == nullptr) return;
+  // The HSA signal is destroyed only when its handle is non-zero
+  if (wrapper->signal_.handle != 0) {
+    hsa_signal_destroy(wrapper->signal_);
+  }
+  delete wrapper;
+}
+
 } // namespace roc
 #endif  // WITHOUT_HSA_BACKEND
diff --git a/projects/clr/rocclr/device/rocm/rocdevice.hpp b/projects/clr/rocclr/device/rocm/rocdevice.hpp
index 78cfbb6241..0fb6f0b8e0 100644
--- a/projects/clr/rocclr/device/rocm/rocdevice.hpp
+++ b/projects/clr/rocclr/device/rocm/rocdevice.hpp
@@ -77,6 +77,21 @@ class VirtualDevice;
 class PrintfDbg;
 class IProDevice;
 
+struct ProfilingSignal : public amd::HeapObject {
+  hsa_signal_t  signal_;  //!< HSA signal for profiling, the handle starts as zero
+  Timestamp*    ts_;      //!< Timestamp object associated with the signal, initially nullptr
+  HwQueueEngine engine_;  //!< Engine used with this signal, compute by default
+  bool          done_;    //!< True if signal is done, which is the initial state
+  amd::Monitor  lock_;    //!< Signal lock for update, returned by LockSignalOps()
+  ProfilingSignal()
+    : signal_{0}
+    , ts_(nullptr)
+    , engine_(HwQueueEngine::Compute)
+    , done_(true)
+    , lock_("Signal Ops Lock", true) {}
+  amd::Monitor& LockSignalOps() { return lock_; }
+};
+
 class Sampler : public device::Sampler {
  public:
   //! Constructor
@@ -237,6 +252,7 @@ class NullDevice : public amd::Device {
       cl_set_device_clock_mode_output_amd* pSetClockModeOutput) { return true; }
 
   virtual bool IsHwEventReady(const amd::Event& event, bool wait = false) const { return false; }
+  virtual void ReleaseGlobalSignal(void* signal) const {}
 
  protected:
   //! Initialize compiler instance and handle
@@ -405,6 +421,7 @@ class Device : public NullDevice {
                             cl_set_device_clock_mode_output_amd* pSetClockModeOutput);
 
   virtual bool IsHwEventReady(const amd::Event& event, bool wait = false) const;
+  virtual void ReleaseGlobalSignal(void* signal) const;
 
   //! Allocate host memory in terms of numa policy set by user
   void* hostNumaAlloc(size_t size, size_t alignment, bool atomics = false) const;
@@ -505,6 +522,8 @@ class Device : public NullDevice {
 
   virtual amd::Memory* GetArenaMemObj(const void* ptr, size_t& offset);
 
+  ProfilingSignal* GetGlobalSignal(Timestamp* ts) const;
+
  private:
   bool create();
 
diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp
index 5f2659c76a..d31f01982c 100644
--- a/projects/clr/rocclr/device/rocm/rocvirtual.cpp
+++ b/projects/clr/rocclr/device/rocm/rocvirtual.cpp
@@ -109,7 +109,7 @@ static unsigned extractAqlBits(unsigned v, unsigned pos, unsigned width) {
 };
 
 // ================================================================================================
-void Timestamp::checkGpuTime(bool event_recycle) {
+void Timestamp::checkGpuTime() {
   if (HwProfiling()) {
     uint64_t  start = std::numeric_limits<uint64_t>::max();
     uint64_t  end = 0;
@@ -140,10 +140,6 @@ void Timestamp::checkGpuTime(bool event_recycle) {
         ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Signal = (0x%lx), start = %ld, "
           "end = %ld", it->signal_.handle, start, end);
       }
-      // The signal is reused and the upper layer can't rely on it.
-      if (event_recycle) {
-        const_cast<amd::Command&>(it->ts_->command_).SetHwEvent(nullptr);
-      }
       it->ts_ = nullptr;
       it->done_ = true;
     }
@@ -390,23 +386,7 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal(
     // If direct dispatch is enabled and the batch head isn't null, then it's a marker and
     // requires the batch update upon HSA signal completion
     if (AMD_DIRECT_DISPATCH && (ts->command().GetBatchHead() != nullptr)) {
-      uint32_t init_value = kInitSignalValueOne;
-      // If API callback is enabled, then use a blocking signal for AQL queue.
-      // HSA signal will be acquired in SW and released after HSA signal callback
-      if (ts->command().Callback() != nullptr) {
-        ts->SetCallbackSignal(prof_signal->signal_);
-        // Blocks AQL queue from further processing
-        hsa_signal_add_relaxed(prof_signal->signal_, 1);
-        init_value += 1;
-      }
-      hsa_status_t result = hsa_amd_signal_async_handler(prof_signal->signal_,
-          HSA_SIGNAL_CONDITION_LT, init_value, &HsaAmdSignalHandler, ts);
-      if (HSA_STATUS_SUCCESS != result) {
-        LogError("hsa_amd_signal_async_handler() failed to set the handler!");
-      } else {
-        ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Set Handler: handle(0x%lx), timestamp(%p)",
-            prof_signal->signal_.handle, prof_signal);
-      }
+      assert(false && "Runtime should not have batch command in ActiveSignal!");
     }
     if (!sdma_profiling_) {
       hsa_amd_profiling_async_copy_enable(true);
@@ -486,8 +466,7 @@ bool VirtualGPU::HwQueueTracker::CpuWaitForSignal(ProfilingSignal* signal) {
   if (!signal->done_) {
     // Update timestamp values if requested
     if (signal->ts_ != nullptr) {
-      static constexpr bool kEventRecycle = true;
-      signal->ts_->checkGpuTime(kEventRecycle);
+      signal->ts_->checkGpuTime();
     } else {
       ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "[%zx]!\t Host wait on completion_signal=0x%zx",
               std::this_thread::get_id(), signal->signal_.handle);
@@ -892,7 +871,8 @@ bool VirtualGPU::dispatchCounterAqlPacket(hsa_ext_amd_aql_pm4_packet_t* packet,
 }
 
 // ================================================================================================
-void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal) {
+void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader,
+  bool skipSignal, const ProfilingSignal* global_signal) {
   const uint32_t queueSize = gpu_queue_->size;
   const uint32_t queueMask = queueSize - 1;
 
@@ -915,12 +895,16 @@ void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal) {
   barrier_packet_.completion_signal.handle = 0;
 
   if (!skipSignal) {
-    // Pool size must grow to the size of pending AQL packets
-    const uint32_t pool_size = index - read;
+    if (global_signal != nullptr) {
+      barrier_packet_.completion_signal = global_signal->signal_;
+    } else {
+      // Pool size must grow to the size of pending AQL packets
+      const uint32_t pool_size = index - read;
 
-    // Get active signal for current dispatch if profiling is necessary
-    barrier_packet_.completion_signal =
-      Barriers().ActiveSignal(kInitSignalValueOne, timestamp_, pool_size);
+      // Get active signal for current dispatch if profiling is necessary
+      barrier_packet_.completion_signal =
+        Barriers().ActiveSignal(kInitSignalValueOne, timestamp_, pool_size);
+    }
   }
 
   while ((index - hsa_queue_load_read_index_scacquire(gpu_queue_)) >= queueMask);
@@ -1218,7 +1202,8 @@ void VirtualGPU::profilingBegin(amd::Command& command, bool drmProfiling) {
         (*it)->NotifyEvent()->HwEvent() : (*it)->HwEvent();
       if (hw_event != nullptr) {
         Barriers().AddExternalSignal(reinterpret_cast<ProfilingSignal*>(hw_event));
-      } else if (static_cast<amd::Command*>(*it)->queue() != command.queue()) {
+      } else if (static_cast<amd::Command*>(*it)->queue() != command.queue() &&
+                 ((*it)->status() != CL_COMPLETE)) {
         LogPrintfError("Waiting event(%p) doesn't have a HSA signal!\n", *it);
       } else {
         // Assume serialization on the same queue...
@@ -1239,10 +1224,7 @@ void VirtualGPU::profilingEnd(amd::Command& command) {
       timestamp_->end();
     }
     command.setData(timestamp_);
-    // Update HW event only for batches
-    if ((AMD_DIRECT_DISPATCH) && (command.GetBatchHead() != nullptr)) {
-      command.SetHwEvent(timestamp_->Signals().back());
-    }
+
     timestamp_ = nullptr;
   }
 }
@@ -2926,13 +2908,46 @@ void VirtualGPU::submitNativeFn(amd::NativeFnCommand& cmd) {
 
 // ================================================================================================
 void VirtualGPU::submitMarker(amd::Marker& vcmd) {
-  if (vcmd.profilingInfo().marker_ts_) {
+  if (AMD_DIRECT_DISPATCH || vcmd.profilingInfo().marker_ts_) {
     profilingBegin(vcmd);
     if (timestamp_ != nullptr) {
+      ProfilingSignal* prof_signal = nullptr;
+      // If direct dispatch is enabled and the batch head isn't null, then it's a marker and
+      // requires the batch update upon HSA signal completion
+      if (AMD_DIRECT_DISPATCH) {
+        assert(vcmd.GetBatchHead() != nullptr && "Marker doesn't have batch!");
+
+        prof_signal = dev().GetGlobalSignal(timestamp_);
+        // Validate the allocation before the first dereference of the signal
+        assert(prof_signal != nullptr && "Failed to allocate the global HSA signal!");
+        prof_signal->done_ = false;
+        uint32_t init_value = kInitSignalValueOne;
+        // If API callback is enabled, then use a blocking signal for AQL queue.
+        // HSA signal will be acquired in SW and released after HSA signal callback
+        if (vcmd.Callback() != nullptr) {
+          timestamp_->SetCallbackSignal(prof_signal->signal_);
+          // Blocks AQL queue from further processing
+          hsa_signal_add_relaxed(prof_signal->signal_, 1);
+          init_value += 1;
+        }
+
+        hsa_status_t result = hsa_amd_signal_async_handler(prof_signal->signal_,
+            HSA_SIGNAL_CONDITION_LT, init_value, &HsaAmdSignalHandler, timestamp_);
+        if (HSA_STATUS_SUCCESS != result) {
+          LogError("hsa_amd_signal_async_handler() failed to set the handler!");
+        } else {
+          ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Set Handler: handle(0x%lx), timestamp(%p)",
+              prof_signal->signal_.handle, prof_signal);
+        }
+        // Update HW event only for batches
+        vcmd.SetHwEvent(timestamp_->Signals().back());
+      }
       // Submit a barrier with a cache flushes.
-      dispatchBarrierPacket(kBarrierPacketHeader);
-      // Reset this flag since we already enable system scope for kBarrierPacketHeader
-      hasPendingDispatch_ = false;
+      dispatchBarrierPacket(kBarrierPacketHeader, false, prof_signal);
+
+      // Don't reset the flag for direct dispatch, because the global signals are out of scope
+      // for internal barrier tracking and SDMA could lose a wait for compute
+      hasPendingDispatch_ = AMD_DIRECT_DISPATCH;
     }
     profilingEnd(vcmd);
   }
@@ -2958,45 +2973,13 @@ void VirtualGPU::submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& vcmd) {
 
 // ================================================================================================
 void VirtualGPU::flush(amd::Command* list, bool wait) {
-  // Direct dispatch relies on HSA signal callback
-  bool skip_cpu_wait = AMD_DIRECT_DISPATCH;
+  // If barrier is requested, then wait for everything, otherwise
+  // a per dispatch wait will occur later in updateCommandsState()
+  releaseGpuMemoryFence();
+  updateCommandsState(list);
 
-  if (skip_cpu_wait) {
-    // Search for the last command in the batch to track GPU state
-    amd::Command* current = list;
-    assert(current != nullptr && "Empty batch for processing!");
-
-    // Find the last command
-    while (current->getNext() != nullptr) {
-      current = current->getNext();
-    }
-    // Always insert a barrier. Some tests rquire async SDMA wait
-    hasPendingDispatch_ = true;
-    // Enable profiling, so runtime can track TS
-    profilingBegin(*current);
-
-    // If runtime didn't submit a barrier, then it can't track the completion of the batch.
-    // Hence runtime either has to insert a barrier unconditionally or have a CPU wait.
-    // Due to performance impact of extra barriers CPU wait is selected.
-    // Note: if callback will be selected to update the batch status,
-    // then the host thread can't update it also, otherwise double free may occur
-    skip_cpu_wait &= hasPendingDispatch_;
-
-    releaseGpuMemoryFence(skip_cpu_wait);
-    profilingEnd(*current);
-  } else {
-    // If barrier is requested, then wait for everything, otherwise
-    // a per disaptch wait will occur later in updateCommandsState()
-    releaseGpuMemoryFence();
-  }
-
-  // If CPU waited for GPU, then the queue is idle
-  if (!skip_cpu_wait) {
-    updateCommandsState(list);
-
-    // Release all pinned memory
-    releasePinnedMem();
-  }
+  // Release all pinned memory
+  releasePinnedMem();
 }
 
 // ================================================================================================
diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.hpp b/projects/clr/rocclr/device/rocm/rocvirtual.hpp
index aaa05acf67..64f556e412 100644
--- a/projects/clr/rocclr/device/rocm/rocvirtual.hpp
+++ b/projects/clr/rocclr/device/rocm/rocvirtual.hpp
@@ -34,23 +34,9 @@
 namespace roc {
 class Device;
 class Memory;
+struct ProfilingSignal;
 class Timestamp;
 
-struct ProfilingSignal : public amd::HeapObject {
-  amd::Monitor  lock_;    //!< Signal lock for update
-  hsa_signal_t  signal_;  //!< HSA signal to track profiling information
-  Timestamp*    ts_;      //!< Timestamp object associated with the signal
-  HwQueueEngine engine_;  //!< Engine used with this signal
-  bool          done_;    //!< True if signal is done
-  ProfilingSignal()
-    : lock_("Signal Ops Lock", true)
-    , ts_(nullptr)
-    , engine_(HwQueueEngine::Compute)
-    , done_(true)
-    { signal_.handle = 0; }
-  amd::Monitor& LockSignalOps() { return lock_; }
-};
-
 // Initial HSA signal value
 constexpr static hsa_signal_value_t kInitSignalValueOne = 1;
 
@@ -139,7 +125,7 @@ class Timestamp : public amd::HeapObject {
   const bool HwProfiling() const { return !signals_.empty(); }
 
   //! Finds execution ticks on GPU
-  void checkGpuTime(bool event_recycle = false);
+  void checkGpuTime();
 
   // Start a timestamp (get timestamp from OS)
   void start() { start_ = amd::Os::timeNanos(); }
@@ -225,8 +211,6 @@ class VirtualGPU : public device::VirtualDevice {
     //! Wait for the curent active signal. Can idle the queue
     bool WaitCurrent() {
       ProfilingSignal* signal = signal_list_[current_id_];
-      ClPrint(amd::LOG_DEBUG, amd::LOG_MISC, "[%zx]!\t WaitCurret completion_signal=0x%zx",
-        std::this_thread::get_id(), signal->signal_.handle);
       return CpuWaitForSignal(signal);
     }
 
@@ -253,8 +237,6 @@ class VirtualGPU : public device::VirtualDevice {
     void WaitNext() {
       size_t next = (current_id_ + 1) % signal_list_.size();
       ProfilingSignal* signal = signal_list_[next];
-      ClPrint(amd::LOG_DEBUG, amd::LOG_MISC, "[%zx]!\t WaitNext completion_signal=0x%zx",
-        std::this_thread::get_id(), signal->signal_.handle);
       CpuWaitForSignal(signal);
     }
 
@@ -396,7 +378,8 @@ class VirtualGPU : public device::VirtualDevice {
   template <typename AqlPacket> bool dispatchGenericAqlPacket(AqlPacket* packet, uint16_t header,
                                                               uint16_t rest, bool blocking,
                                                               size_t size = 1);
-  void dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal = false);
+  void dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal = false,
+                             const ProfilingSignal* global_signal = nullptr);
   bool dispatchCounterAqlPacket(hsa_ext_amd_aql_pm4_packet_t* packet, const uint32_t gfxVersion,
                                 bool blocking, const hsa_ven_amd_aqlprofile_1_00_pfn_t* extApi);
   void dispatchBarrierValuePacket(const hsa_amd_barrier_value_packet_t* packet,
diff --git a/projects/clr/rocclr/platform/command.cpp b/projects/clr/rocclr/platform/command.cpp
index 082deaacc9..19c29cce6b 100644
--- a/projects/clr/rocclr/platform/command.cpp
+++ b/projects/clr/rocclr/platform/command.cpp
@@ -48,6 +48,7 @@ Event::Event(HostQueue& queue)
       status_(CL_INT_MAX),
       hw_event_(nullptr),
       notify_event_(nullptr),
+      device_(&queue.device()),
       profilingInfo_(IS_PROFILER_ON || queue.properties().test(CL_QUEUE_PROFILING_ENABLE) ||
                      Agent::shouldPostEventEvents()) {
   notified_.clear();
@@ -55,7 +56,7 @@ Event::Event(HostQueue& queue)
 
 // ================================================================================================
 Event::Event() : callbacks_(NULL), status_(CL_SUBMITTED),
-    hw_event_(nullptr), notify_event_(nullptr) { notified_.clear(); }
+    hw_event_(nullptr), notify_event_(nullptr), device_(nullptr) { notified_.clear(); }
 
 // ================================================================================================
 Event::~Event() {
@@ -69,6 +70,10 @@ Event::~Event() {
   if (notify_event_ != nullptr) {
     notify_event_->release();
   }
+  // Destroy global HW event if available
+  if ((hw_event_ != nullptr) && (device_ != nullptr)) {
+    device_->ReleaseGlobalSignal(hw_event_);
+  }
 }
 
 // ================================================================================================
@@ -259,21 +264,35 @@ bool Event::awaitCompletion() {
 // ================================================================================================
 bool Event::notifyCmdQueue() {
   HostQueue* queue = command().queue();
-  if ((status() > CL_COMPLETE) && (nullptr != queue) &&
-      (!AMD_DIRECT_DISPATCH ||
-       // If HW event was assigned, then notification can be ignored, since a barrier was issued
-       (HwEvent() == nullptr)) &&
-      !notified_.test_and_set()) {
-    // Make sure the queue is draining the enqueued commands.
-    amd::Command* command = new amd::Marker(*queue, false, nullWaitList, this);
-    if (command == NULL) {
-      notified_.clear();
-      return false;
+  if (AMD_DIRECT_DISPATCH) {
+    ScopedLock l(lock_);
+    if ((status() > CL_COMPLETE) && (nullptr != queue) &&
+        // If HW event was assigned, then notification can be ignored, since a barrier was issued
+        (HwEvent() == nullptr) &&
+        !notified_.test_and_set()) {
+      // Make sure the queue is draining the enqueued commands.
+      amd::Command* command = new amd::Marker(*queue, false, nullWaitList, this);
+      if (command == NULL) {
+        notified_.clear();
+        return false;
+      }
+      ClPrint(LOG_DEBUG, LOG_CMD, "queue marker to command queue: %p", queue);
+      command->enqueue();
+      // Save notification, associated with the current event
+      notify_event_ = command;
+    }
+  } else {
+    if ((status() > CL_COMPLETE) && (nullptr != queue) && !notified_.test_and_set()) {
+      // Make sure the queue is draining the enqueued commands.
+      amd::Command* command = new amd::Marker(*queue, false, nullWaitList, this);
+      if (command == NULL) {
+        notified_.clear();
+        return false;
+      }
+      ClPrint(LOG_DEBUG, LOG_CMD, "queue marker to command queue: %p", queue);
+      command->enqueue();
+      command->release();
     }
-    ClPrint(LOG_DEBUG, LOG_CMD, "queue marker to command queue: %p", queue);
-    command->enqueue();
-    // Save notification, associated with the current event
-    notify_event_ = command;
   }
   return true;
 }
@@ -318,6 +337,7 @@ void Command::enqueue() {
   // update will occur later after flush() with a wait
   if (AMD_DIRECT_DISPATCH) {
     setStatus(CL_QUEUED);
+
     // Notify all commands about the waiter. Barrier will be sent in order to obtain
     // HSA signal for a wait on the current queue
     std::for_each(eventWaitList().begin(), eventWaitList().end(),
@@ -333,13 +353,10 @@ void Command::enqueue() {
       // Update batch head for the current marker. Hence the status of all commands can be
       // updated upon the marker completion
       SetBatchHead(queue_->GetSubmittionBatch());
-      if (profilingInfo().marker_ts_) {
-        setStatus(CL_SUBMITTED);
-        submit(*queue_->vdev());
-      } else {
-        // Flush the current batch, but skip the wait on CPU if possible to avoid a stall
-        queue_->vdev()->flush(queue_->GetSubmittionBatch());
-      }
+
+      setStatus(CL_SUBMITTED);
+      submit(*queue_->vdev());
+
       // The batch will be tracked with the marker now
       queue_->ResetSubmissionBatch();
     } else {
diff --git a/projects/clr/rocclr/platform/command.hpp b/projects/clr/rocclr/platform/command.hpp
index ec9843a1bf..02945b19ad 100644
--- a/projects/clr/rocclr/platform/command.hpp
+++ b/projects/clr/rocclr/platform/command.hpp
@@ -95,6 +95,7 @@ class Event : public RuntimeObject {
   std::atomic_flag notified_;              //!< Command queue was notified
   void*  hw_event_;                        //!< HW event ID associated with SW event
   Event* notify_event_;                    //!< Notify event, which should contain HW signal
+  const Device* device_;                   //!< Device, this event associated with
 
  protected:
   static const EventWaitList nullWaitList;