diff --git a/projects/clr/rocclr/device/device.hpp b/projects/clr/rocclr/device/device.hpp
index 4db0dc6bc8..8e60af681b 100644
--- a/projects/clr/rocclr/device/device.hpp
+++ b/projects/clr/rocclr/device/device.hpp
@@ -1527,6 +1527,13 @@ class Device : public RuntimeObject {
     kKernArg = 2
   } MemorySegment;
 
+  typedef enum CacheState {
+    kCacheStateInvalid = -1,  //!< Initial state, no packet scope recorded yet
+    kCacheStateIgnore = 0,    //!< Release fence scope none (NOP marker packet)
+    kCacheStateAgent = 1,     //!< Release fence scope agent
+    kCacheStateSystem = 2     //!< Release fence scope system; HostQueue::finish() tests this
+  } CacheState;
+
   typedef std::pair<LinkAttribute, int32_t /* value */> LinkAttrType;
 
   static constexpr size_t kP2PStagingSize = 4 * Mi;
@@ -1749,6 +1756,11 @@ class Device : public RuntimeObject {
     return false;
   };
 
+  // Returns true if the device cache state equals |state|. Base implementation: always false
+  virtual bool IsCacheFlushed(CacheState state) const {
+    return false;
+  };
+
   virtual void getHwEventTime(const amd::Event& event, uint64_t* start, uint64_t* end) const {};
 
   virtual const uint32_t getPreferredNumaNode() const { return 0; }
diff --git a/projects/clr/rocclr/device/rocm/rocdevice.cpp b/projects/clr/rocclr/device/rocm/rocdevice.cpp
index 8b7c33f7cc..4045d7cc99 100644
--- a/projects/clr/rocclr/device/rocm/rocdevice.cpp
+++ b/projects/clr/rocclr/device/rocm/rocdevice.cpp
@@ -180,6 +180,7 @@ Device::Device(hsa_agent_t bkendDevice)
   gpuvm_segment_.handle = 0;
   gpu_fine_grained_segment_.handle = 0;
   prefetch_signal_.handle = 0;
+  cache_state_ = Device::CacheState::kCacheStateInvalid;
 }
 
 void Device::setupCpuAgent() {
@@ -2635,6 +2636,17 @@ void Device::getHwEventTime(const amd::Event& event, uint64_t* start, uint64_t*
                     start, end);
   }
 }
+
+// ================================================================================================
+bool Device::IsCacheFlushed(Device::CacheState state) const {
+  return cache_state_.load(std::memory_order_relaxed) == static_cast<int>(state);
+}
+
+// ================================================================================================
+void Device::SetCacheState(Device::CacheState state) {
+  cache_state_.store(static_cast<int>(state), std::memory_order_relaxed);
+}
+
 // ================================================================================================
 static void callbackQueue(hsa_status_t status, hsa_queue_t* queue, void* data) {
   if (status != HSA_STATUS_SUCCESS && status != HSA_STATUS_INFO_BREAK) {
diff --git a/projects/clr/rocclr/device/rocm/rocdevice.hpp b/projects/clr/rocclr/device/rocm/rocdevice.hpp
index 5004941a90..b908cad8d0 100644
--- a/projects/clr/rocclr/device/rocm/rocdevice.hpp
+++ b/projects/clr/rocclr/device/rocm/rocdevice.hpp
@@ -260,6 +260,8 @@ class NullDevice : public amd::Device {
 
   virtual bool IsHwEventReady(const amd::Event& event, bool wait = false) const { return false; }
   virtual void getHwEventTime(const amd::Event& event, uint64_t* start, uint64_t* end) const {};
+  virtual bool IsCacheFlushed(Device::CacheState state) const { return false; };
+  virtual void SetCacheState(Device::CacheState state) {};
   virtual void ReleaseGlobalSignal(void* signal) const {}
 
 #if defined(__clang__)
@@ -440,6 +442,8 @@ class Device : public NullDevice {
 
   virtual bool IsHwEventReady(const amd::Event& event, bool wait = false) const;
   virtual void getHwEventTime(const amd::Event& event, uint64_t* start, uint64_t* end) const;
+  virtual bool IsCacheFlushed(Device::CacheState state) const;
+  virtual void SetCacheState(Device::CacheState state);
   virtual void ReleaseGlobalSignal(void* signal) const;
 
   //! Allocate host memory in terms of numa policy set by user
@@ -583,6 +587,7 @@ class Device : public NullDevice {
   hsa_amd_memory_pool_t gpuvm_segment_;
   hsa_amd_memory_pool_t gpu_fine_grained_segment_;
   hsa_signal_t prefetch_signal_;    //!< Prefetch signal, used to explicitly prefetch SVM on device
+  std::atomic<int> cache_state_;    //!< Device::CacheState set from the last AQL release scope
 
   size_t gpuvm_segment_max_alloc_;
   size_t alloc_granularity_;
diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp
index b8b5d14341..7619abd224 100644
--- a/projects/clr/rocclr/device/rocm/rocvirtual.cpp
+++ b/projects/clr/rocclr/device/rocm/rocvirtual.cpp
@@ -77,6 +77,11 @@ static constexpr uint16_t kBarrierPacketHeader =
     (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
     (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
 
+static constexpr uint16_t kBarrierPacketAgentScopeHeader =
+    (HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE) | (1 << HSA_PACKET_HEADER_BARRIER) |
+    (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
+    (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
+
 static constexpr uint16_t kNopPacketHeader =
     (HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE) |
     (HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
@@ -126,7 +131,7 @@ void Timestamp::checkGpuTime() {
       }
       // Avoid profiling data for the sync barrier, in tiny performance tests the first call
       // to ROCr is very slow and that also affects the overall performance of the callback thread
-      if (command().GetBatchHead() == nullptr || command().profilingInfo().marker_ts_) {
+      if (command().GetBatchHead() == nullptr || command().profilingInfo().marker_ts_ > 0) {
         hsa_amd_profiling_dispatch_time_t time = {};
         if (it->engine_ == HwQueueEngine::Compute) {
           hsa_amd_profiling_get_dispatch_time(gpu()->gpu_device(), it->signal_, &time);
@@ -442,7 +447,7 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal(
         // Update the current command/marker with HW event
         prof_signal->retain();
         ts->command().SetHwEvent(prof_signal);
-      } else if (ts->command().profilingInfo().marker_ts_ ) {
+      } else if (ts->command().profilingInfo().marker_ts_ > 0 ) {
         // Update the current command/marker with HW event
         prof_signal->retain();
         ts->command().SetHwEvent(prof_signal);
@@ -803,11 +808,15 @@ bool VirtualGPU::dispatchGenericAqlPacket(
   uint64_t index = hsa_queue_add_write_index_screlease(gpu_queue_, size);
   uint64_t read = hsa_queue_load_read_index_relaxed(gpu_queue_);
 
+  auto cache_state = extractAqlBits(header, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
+                         HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE);
+
   if (timestamp_ != nullptr) {
     // Pool size must grow to the size of pending AQL packets
     const uint32_t pool_size = index - read;
     // Get active signal for current dispatch if profiling is necessary
-    packet->completion_signal = Barriers().ActiveSignal(kInitSignalValueOne, timestamp_, pool_size);
+    packet->completion_signal = Barriers().ActiveSignal(kInitSignalValueOne, timestamp_,
+                                                        pool_size);
   }
 
   // Make sure the slot is free for usage
@@ -845,8 +854,7 @@ bool VirtualGPU::dispatchGenericAqlPacket(
                            HSA_PACKET_HEADER_WIDTH_BARRIER),
             extractAqlBits(header, HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE,
                            HSA_PACKET_HEADER_WIDTH_SCACQUIRE_FENCE_SCOPE),
-            extractAqlBits(header, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
-                           HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE),
+            cache_state,
             rest, reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->grid_size_x,
             reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->grid_size_y,
             reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->grid_size_z,
@@ -863,6 +871,8 @@ bool VirtualGPU::dispatchGenericAqlPacket(
   //hsa_queue_store_write_index_release(gpu_queue_, index);
   hsa_signal_store_screlease(gpu_queue_->doorbell_signal, index - 1);
 
+  roc_device_.SetCacheState(static_cast<Device::CacheState>(cache_state));
+
   // Wait on signal ?
   if (blocking) {
     LogInfo("Runtime reachead the AQL queue limit. SW is much ahead of HW. Blocking AQL queue!");
@@ -957,6 +967,8 @@ void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal,
   uint64_t index = hsa_queue_add_write_index_screlease(gpu_queue_, 1);
   uint64_t read = hsa_queue_load_read_index_relaxed(gpu_queue_);
 
+  auto cache_state = extractAqlBits(packetHeader, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
+                         HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE);
   if (!skipSignal) {
     // Pool size must grow to the size of pending AQL packets
     const uint32_t pool_size = index - read;
@@ -987,11 +999,13 @@ void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal,
                          HSA_PACKET_HEADER_WIDTH_BARRIER),
           extractAqlBits(packetHeader, HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE,
                          HSA_PACKET_HEADER_WIDTH_SCACQUIRE_FENCE_SCOPE),
-          extractAqlBits(packetHeader, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
-                         HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE),
+          cache_state,
           barrier_packet_.dep_signal[0], barrier_packet_.dep_signal[1],
           barrier_packet_.dep_signal[2], barrier_packet_.dep_signal[3],
           barrier_packet_.dep_signal[4], barrier_packet_.completion_signal);
+
+  roc_device_.SetCacheState(static_cast<Device::CacheState>(cache_state));
+
   // Clear dependent signals for the next packet
   barrier_packet_.dep_signal[0] = hsa_signal_t{};
   barrier_packet_.dep_signal[1] = hsa_signal_t{};
@@ -2303,7 +2317,12 @@ void VirtualGPU::dispatchBarrierValuePacket(const hsa_amd_barrier_value_packet_t
   unsigned int* headerPtr = reinterpret_cast<unsigned int*>(&header);
   __atomic_store_n(reinterpret_cast<uint32_t*>(aql_loc), *headerPtr, __ATOMIC_RELEASE);
 
+  auto cache_state = extractAqlBits(header.header, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
+                         HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE);
+
   hsa_signal_store_screlease(gpu_queue_->doorbell_signal, index);
+  roc_device_.SetCacheState(static_cast<Device::CacheState>(cache_state));
+
   ClPrint(amd::LOG_DEBUG, amd::LOG_AQL,
           "HWq=0x%zx, BarrierValue Header = 0x%x AmdFormat = 0x%x ",
           "(type=%d, barrier=%d, acquire=%d, release=%d), "
@@ -2313,8 +2332,7 @@ void VirtualGPU::dispatchBarrierValuePacket(const hsa_amd_barrier_value_packet_t
           extractAqlBits(header.header, HSA_PACKET_HEADER_BARRIER, HSA_PACKET_HEADER_WIDTH_BARRIER),
           extractAqlBits(header.header, HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE,
                          HSA_PACKET_HEADER_WIDTH_SCACQUIRE_FENCE_SCOPE),
-          extractAqlBits(header.header, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
-                         HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE),
+          cache_state,
           packet->completion_signal, packet->value, packet->mask, packet->cond,
           HSA_SIGNAL_CONDITION_GTE, HSA_SIGNAL_CONDITION_EQ, HSA_SIGNAL_CONDITION_NE);
 }
@@ -3096,10 +3114,16 @@ void VirtualGPU::submitMarker(amd::Marker& vcmd) {
     } else {
       profilingBegin(vcmd);
       if (timestamp_ != nullptr) {
-        // Submit a barrier with a cache flushes.
-        dispatchBarrierPacket(kBarrierPacketHeader, false);
-
-        hasPendingDispatch_ = false;
+        uint32_t releaseFlags = vcmd.profilingInfo().marker_ts_;
+        if (ROC_EVENT_NO_FLUSH && releaseFlags == Device::CacheState::kCacheStateIgnore) {
+          dispatchBarrierPacket(kNopPacketHeader, false);
+        } else if (releaseFlags == Device::CacheState::kCacheStateAgent) {
+          dispatchBarrierPacket(kBarrierPacketAgentScopeHeader, false);
+        } else {
+          // Submit a barrier with a cache flushes.
+          dispatchBarrierPacket(kBarrierPacketHeader, false);
+          hasPendingDispatch_ = false;
+        }
       }
       profilingEnd(vcmd);
     }
diff --git a/projects/clr/rocclr/platform/command.cpp b/projects/clr/rocclr/platform/command.cpp
index eec70f483d..cdc09f4215 100644
--- a/projects/clr/rocclr/platform/command.cpp
+++ b/projects/clr/rocclr/platform/command.cpp
@@ -55,8 +55,14 @@ Event::Event(HostQueue& queue)
 }
 
 // ================================================================================================
-Event::Event() : callbacks_(NULL), status_(CL_SUBMITTED),
-    hw_event_(nullptr), notify_event_(nullptr), device_(nullptr) { notified_.clear(); }
+Event::Event()
+    : callbacks_(NULL),
+      status_(CL_SUBMITTED),
+      hw_event_(nullptr),
+      notify_event_(nullptr),
+      device_(nullptr) {
+  notified_.clear();
+}
 
 // ================================================================================================
 Event::~Event() {
@@ -241,7 +247,7 @@ bool Event::awaitCompletion() {
       return false;
     }
 
-    ClPrint(LOG_DEBUG, LOG_WAIT, "waiting for event %p to complete, current status %d",
+    ClPrint(LOG_DEBUG, LOG_WAIT, "Waiting for event %p to complete, current status %d",
       this, status());
     auto* queue = command().queue();
     if ((queue != nullptr) && queue->vdev()->ActiveWait()) {
@@ -256,7 +262,7 @@ bool Event::awaitCompletion() {
         lock_.wait();
       }
     }
-    ClPrint(LOG_DEBUG, LOG_WAIT, "event %p wait completed", this);
+    ClPrint(LOG_DEBUG, LOG_WAIT, "Event %p wait completed", this);
   }
 
   return status() == CL_COMPLETE;
@@ -277,7 +283,7 @@ bool Event::notifyCmdQueue(bool cpu_wait) {
         notified_.clear();
         return false;
       }
-      ClPrint(LOG_DEBUG, LOG_CMD, "queue marker to command queue: %p", queue);
+      ClPrint(LOG_DEBUG, LOG_CMD, "Queue marker to command queue: %p", queue);
       command->enqueue();
       // Save notification, associated with the current event
       notify_event_ = command;
@@ -290,7 +296,7 @@ bool Event::notifyCmdQueue(bool cpu_wait) {
         notified_.clear();
         return false;
       }
-      ClPrint(LOG_DEBUG, LOG_CMD, "queue marker to command queue: %p", queue);
+      ClPrint(LOG_DEBUG, LOG_CMD, "Queue marker to command queue: %p", queue);
       command->enqueue();
       command->release();
     }
@@ -336,7 +342,7 @@ void Command::enqueue() {
     Agent::postEventCreate(as_cl(static_cast<Event*>(this)), type_);
   }
 
-  ClPrint(LOG_DEBUG, LOG_CMD, "command is enqueued: %p", this);
+  ClPrint(LOG_DEBUG, LOG_CMD, "Command enqueued: %p", this);
 
   // Direct dispatch logic below will submit the command immediately, but the command status
   // update will occur later after flush() with a wait
@@ -360,7 +366,7 @@ void Command::enqueue() {
       EnableProfiling();
     }
 
-    if (isMarker && !profilingInfo().marker_ts_) {
+    if (isMarker && (profilingInfo().marker_ts_ == 0)) {
       // Update batch head for the current marker. Hence the status of all commands can be
       // updated upon the marker completion
       SetBatchHead(queue_->GetSubmittionBatch());
@@ -414,6 +420,7 @@ NDRangeKernelCommand::NDRangeKernelCommand(HostQueue& queue, const EventWaitList
     profilingInfo_.enabled_ = true;
     profilingInfo_.clear();
     profilingInfo_.callback_ = nullptr;
+    profilingInfo_.marker_ts_ = 1;
   }
   kernel_.retain();
 }
diff --git a/projects/clr/rocclr/platform/command.hpp b/projects/clr/rocclr/platform/command.hpp
index 395d4642e0..4ea2636ddc 100644
--- a/projects/clr/rocclr/platform/command.hpp
+++ b/projects/clr/rocclr/platform/command.hpp
@@ -102,7 +102,7 @@ class Event : public RuntimeObject {
   static const EventWaitList nullWaitList;
 
   struct ProfilingInfo {
-    ProfilingInfo(bool enabled = false) : enabled_(enabled), waves_(0), marker_ts_(false) {
+    ProfilingInfo(bool enabled = false) : enabled_(enabled), waves_(0), marker_ts_(0) {
       if (enabled) {
         clear();
         callback_ = nullptr;
@@ -113,10 +113,11 @@ class Event : public RuntimeObject {
     uint64_t submitted_;
     uint64_t start_;
     uint64_t end_;
-    bool enabled_;    //!< Profiling enabled for the wave limiter
-    uint32_t waves_;  //!< The number of waves used in a dispatch
+    bool enabled_;        //!< Profiling enabled for the wave limiter
+    uint32_t waves_;      //!< The number of waves used in a dispatch
     ProfilingCallback* callback_;
-    bool marker_ts_;
+    uint32_t marker_ts_;  //!< Marker with release scope: 5 - system, 3 - device, 1 - no scopes
+                          //!< NOTE(review): submitMarker compares this to CacheState 0/1 - confirm
     void clear() {
       queued_ = 0ULL;
       submitted_ = 0ULL;
diff --git a/projects/clr/rocclr/platform/commandqueue.cpp b/projects/clr/rocclr/platform/commandqueue.cpp
index e8b914971d..daae776bd5 100644
--- a/projects/clr/rocclr/platform/commandqueue.cpp
+++ b/projects/clr/rocclr/platform/commandqueue.cpp
@@ -106,20 +106,26 @@ bool HostQueue::terminate() {
 
 void HostQueue::finish() {
   Command* command = nullptr;
+  bool isCacheFlushed = device().IsCacheFlushed(Device::CacheState::kCacheStateSystem);
   if (IS_HIP) {
     command = getLastQueuedCommand(true);
-    // Check if the queue has nothing to process and return
-    if (AMD_DIRECT_DISPATCH &&  command == nullptr) {
+    // Nothing is queued and the cache state is already system scope: nothing to wait for
+    if (AMD_DIRECT_DISPATCH && isCacheFlushed && command == nullptr) {
       return;
     }
   }
-  if (nullptr == command) {
+  if (nullptr == command || !isCacheFlushed) {
+    if (command != nullptr) {
+      // The marker below replaces the last queued command, so drop the reference taken on it
+      // first. NOTE(review): assumes getLastQueuedCommand(true) retains the command - confirm
+      command->release();
+    }
     // Send a finish to make sure we finished all commands
     command = new Marker(*this, false);
     if (command == NULL) {
       return;
     }
-    ClPrint(LOG_DEBUG, LOG_CMD, "marker is queued");
+    ClPrint(LOG_DEBUG, LOG_CMD, "Marker queued, Cache Flushed = %d", isCacheFlushed);
     command->enqueue();
   }
   // Check HW status of the ROCcrl event. Note: not all ROCclr modes support HW status
@@ -194,7 +200,7 @@ void HostQueue::loop(device::VirtualDevice* virtualDevice) {
       continue;
     }
 
-    ClPrint(LOG_DEBUG, LOG_CMD, "command (%s) is submitted: %p", getOclCommandKindString(command->type()), command);
+    ClPrint(LOG_DEBUG, LOG_CMD, "Command (%s) submitted: %p", getOclCommandKindString(command->type()), command);
 
     command->setStatus(CL_SUBMITTED);
 
diff --git a/projects/clr/rocclr/utils/flags.hpp b/projects/clr/rocclr/utils/flags.hpp
index 147349cc6b..0912657d2e 100644
--- a/projects/clr/rocclr/utils/flags.hpp
+++ b/projects/clr/rocclr/utils/flags.hpp
@@ -273,7 +273,9 @@ release(uint, ROC_AQL_QUEUE_SIZE, 4096,                                       \
 release(bool, ROC_SKIP_KERNEL_ARG_COPY, false,                                \
         "If true, then runtime can skip kernel arg copy")                     \
 release(bool, GPU_STREAMOPS_CP_WAIT, false,                                   \
-        "Force the stream wait memory operation to wait on CP.")
+        "Force the stream wait memory operation to wait on CP.")              \
+release(bool, ROC_EVENT_NO_FLUSH, false,                                      \
+        "Use NOP AQL packet for event records with no explicit flags.")
 
 namespace amd {