diff --git a/projects/clr/hipamd/src/hip_event.cpp b/projects/clr/hipamd/src/hip_event.cpp
index fb2d592f80..92bf3d32b5 100644
--- a/projects/clr/hipamd/src/hip_event.cpp
+++ b/projects/clr/hipamd/src/hip_event.cpp
@@ -175,7 +175,7 @@ hipError_t Event::streamWaitCommand(amd::Command*& command, hip::Stream* stream)
   command = new amd::Marker(*stream, kMarkerDisableFlush, eventWaitList);
   // Since we only need to have a dependency on an existing event,
   // we may not need to flush any caches.
-  command->setEventScope(amd::Device::kCacheStateIgnore);
+  command->setCommandEntryScope(amd::Device::kCacheStateIgnore);
 
   if (command == NULL) {
     return hipErrorOutOfMemory;
diff --git a/projects/clr/hipamd/src/hip_event.hpp b/projects/clr/hipamd/src/hip_event.hpp
index 84bb300891..7740b5ea37 100644
--- a/projects/clr/hipamd/src/hip_event.hpp
+++ b/projects/clr/hipamd/src/hip_event.hpp
@@ -87,7 +87,7 @@ class EventMarker : public amd::Marker {
     profilingInfo_.marker_ts_ = markerTs;
     profilingInfo_.batch_flush_ = batch_flush;
     profilingInfo_.clear();
-    setEventScope(scope);
+    setCommandEntryScope(scope);
   }
 };
 
diff --git a/projects/clr/hipamd/src/hip_graph_internal.cpp b/projects/clr/hipamd/src/hip_graph_internal.cpp
index 589430fc46..0ad1144b2f 100644
--- a/projects/clr/hipamd/src/hip_graph_internal.cpp
+++ b/projects/clr/hipamd/src/hip_graph_internal.cpp
@@ -739,7 +739,7 @@ hipError_t GraphExec::Run(hipStream_t graph_launch_stream) {
   this->retain();
   amd::Command* CallbackCommand = new amd::Marker(*launch_stream, kMarkerDisableFlush, {});
   // we may not need to flush any caches.
-  CallbackCommand->setEventScope(amd::Device::kCacheStateIgnore);
+  CallbackCommand->setCommandEntryScope(amd::Device::kCacheStateIgnore);
   amd::Event& event = CallbackCommand->event();
   constexpr bool kBlocking = false;
   if (!event.setCallback(CL_COMPLETE, GraphExec::DecrementRefCount, this, kBlocking)) {
diff --git a/projects/clr/hipamd/src/hip_memory.cpp b/projects/clr/hipamd/src/hip_memory.cpp
index 2f07a5227f..c0db1a2548 100644
--- a/projects/clr/hipamd/src/hip_memory.cpp
+++ b/projects/clr/hipamd/src/hip_memory.cpp
@@ -664,13 +664,9 @@ hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKin
     return hipSuccess;
   } else if (((srcMemory == nullptr) && (dstMemory != nullptr)) ||
              ((srcMemory != nullptr) && (dstMemory == nullptr))) {
-    // Don't wait for unpinned H2D copy if staging is used for copy. If dstMemory is not null, it
-    // can still be a pinned host memory, hence the check on dst memory type.
-    isHostAsync &=
-        ((srcMemory == nullptr) && (dstMemory != nullptr && dstMemoryType == hipMemoryTypeDevice) &&
-         AMD_DIRECT_DISPATCH && (sizeBytes <= stream.device().settings().stagedXferSize_))
-        ? true
-        : false;
+    // Unpinned copy wait behavior is enforced in the lower copy layers so skip
+    // wait at top level except for MT path
+    isHostAsync &= AMD_DIRECT_DISPATCH ? true : false;
   } else if (srcMemory->GetDeviceById() == dstMemory->GetDeviceById()) {
     // Device to Device copies do not need to host side synchronization.
     if ((srcMemoryType == hipMemoryTypeDevice) && (dstMemoryType == hipMemoryTypeDevice) &&
@@ -690,7 +686,7 @@ hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKin
   }
   command->enqueue();
   if (!isHostAsync) {
-    command->queue()->finish();
+    command->queue()->finishCommand(command);
   } else if (!isGPUAsync) {
     hip::Stream* pStream = hip::getNullStream(dstMemory->GetDeviceById()->context());
     amd::Command::EventWaitList waitList;
@@ -1826,7 +1822,7 @@ hipError_t ihipMemcpyDtoHCommand(amd::Command*& command, void* dstHost, amd::Coo
   amd::Memory* dstMemory = getMemoryObject(dstHost, dOffset);
 
   amd::Coord3D srcStart(srcRect.start_, 0, 0);
-  amd::CopyMetadata copyMetadata(isAsync, amd::CopyMetadata::CopyEnginePreference::SDMA);
+  amd::CopyMetadata copyMetadata(isAsync, amd::CopyMetadata::CopyEnginePreference::NONE);
   if (dstMemory) {
     amd::CopyMemoryCommand *copyCommand = new amd::CopyMemoryCommand(
       *stream, CL_COMMAND_COPY_BUFFER_RECT, amd::Command::EventWaitList{},
@@ -1874,7 +1870,7 @@ hipError_t ihipMemcpyHtoDCommand(amd::Command*& command, void* dstDevice, amd::C
   amd::Memory* srcMemory = getMemoryObject(srcHost, sOffset);
 
   amd::Coord3D dstStart(dstRect.start_, 0, 0);
-  amd::CopyMetadata copyMetadata(isAsync, amd::CopyMetadata::CopyEnginePreference::SDMA);
+  amd::CopyMetadata copyMetadata(isAsync, amd::CopyMetadata::CopyEnginePreference::NONE);
   if (srcMemory) {
     amd::CopyMemoryCommand *copyCommand = new amd::CopyMemoryCommand(
       *stream, CL_COMMAND_COPY_BUFFER_RECT, amd::Command::EventWaitList{},
@@ -1962,7 +1958,7 @@ hipError_t ihipMemcpyHtoACommand(amd::Command*& command, amd::Image* dstImage,
   size_t start = ihipGetbufferStart(static_cast<size_t*>(srcOrigin),
                                     static_cast<size_t*>(copyRegion), srcRowPitch, srcSlicePitch);
 
-  amd::CopyMetadata copyMetadata(isAsync, amd::CopyMetadata::CopyEnginePreference::SDMA);
+  amd::CopyMetadata copyMetadata(isAsync, amd::CopyMetadata::CopyEnginePreference::NONE);
   if (srcMemory) {
     amd::CopyMemoryCommand *copyCommand = new amd::CopyMemoryCommand(
       *stream, CL_COMMAND_COPY_BUFFER_TO_IMAGE, amd::Command::EventWaitList{},
@@ -2012,7 +2008,7 @@ hipError_t ihipMemcpyAtoHCommand(amd::Command*& command, void* dstHost, amd::Coo
   size_t start = ihipGetbufferStart(static_cast<size_t*>(dstOrigin),
                                     static_cast<size_t*>(copyRegion), dstRowPitch, dstSlicePitch);
 
-  amd::CopyMetadata copyMetadata(isAsync, amd::CopyMetadata::CopyEnginePreference::SDMA);
+  amd::CopyMetadata copyMetadata(isAsync, amd::CopyMetadata::CopyEnginePreference::NONE);
   if (dstMemory) {
     amd::CopyMemoryCommand *copyCommand = new amd::CopyMemoryCommand(
       *stream, CL_COMMAND_COPY_IMAGE_TO_BUFFER, amd::Command::EventWaitList{},
@@ -2334,7 +2330,7 @@ inline hipError_t ihipMemcpyCmdEnqueue(amd::Command* command, bool isAsync = fal
   }
   command->enqueue();
   if (!isAsync) {
-    command->queue()->finish();
+    command->queue()->finishCommand(command);
   } else if (stream != nullptr) {
     auto* newQueue = command->queue();
     if (newQueue != stream) {
diff --git a/projects/clr/hipamd/src/hip_module.cpp b/projects/clr/hipamd/src/hip_module.cpp
index 5e6e136755..22913231e1 100644
--- a/projects/clr/hipamd/src/hip_module.cpp
+++ b/projects/clr/hipamd/src/hip_module.cpp
@@ -472,9 +472,9 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
   if (stopEvent != nullptr) {
     hip::Event* eStop = reinterpret_cast<hip::Event*>(stopEvent);
     if (eStop->flags_ & hipEventDisableSystemFence) {
-      command->setEventScope(amd::Device::kCacheStateIgnore);
+      command->setCommandEntryScope(amd::Device::kCacheStateIgnore);
     } else {
-      command->setEventScope(amd::Device::kCacheStateSystem);
+      command->setCommandEntryScope(amd::Device::kCacheStateSystem);
     }
     // Enqueue Dispatch and bind the stop event
     command->enqueue();
diff --git a/projects/clr/rocclr/device/rocm/rocblit.cpp b/projects/clr/rocclr/device/rocm/rocblit.cpp
index a150bf260e..5ec63c2495 100644
--- a/projects/clr/rocclr/device/rocm/rocblit.cpp
+++ b/projects/clr/rocclr/device/rocm/rocblit.cpp
@@ -309,7 +309,7 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d
 
       // Copy memory line by line
       ClPrint(amd::LOG_DEBUG, amd::LOG_COPY,
-              "HSA Async Copy Rect dst=0x%zx, src=0x%zx, wait_event=0x%zx "
+              "HSA Async Copy Rect dst=0x%zx, src=0x%zx, wait_event=0x%zx, "
               "completion_signal=0x%zx", dstMem.base, srcMem.base,
               (wait_events.size() != 0) ? wait_events[0].handle : 0, active.handle);
 
@@ -1841,11 +1841,17 @@ bool KernelBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemo
         }
         ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Blit staging H2D copy dst=%p, stg buf=%p, "
                 "dstOrigin=0x%x, size=%zu", dstAddr, stagingBuffer, origin[0], copySize);
-        // No cache flush is needed here as we use a staging buffer, and the acquire logic
-        // ensures that the cacheline is different and re-used only when L2 is flushed
+        bool kAttachSignal = false;
+        if (copyMetadata.isAsync_ == false) {
+          // If its a blocking call, attach signal to the packet which we can track for
+          // completion. Also flush caches as we may not need another packet to flush caches.
+          kAttachSignal = true;
+          gpu().addSystemScope();
+        }
         result = shaderCopyBuffer(dstAddr, stagingBuffer,
                                   origin, srcOrigin, copySize,
-                                  entire, dev().settings().limit_blit_wg_, copyMetadata);
+                                  entire, dev().settings().limit_blit_wg_,
+                                  copyMetadata, kAttachSignal);
         if (!result) {
           break;
         }
@@ -2233,19 +2239,23 @@ bool KernelBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& ds
   }
 
   if (!result) {
-    // Flush caches for coherency as the MTYPE of the src buffer may be
-    // non-coherent which mean we need to read it again from memory.
-    // Also if its a device to device copy(intra device), we dont need flush
     // Check CL_MEM_SVM_ATOMICS flag to see if we used system_coarse_segment_
     auto memFlags = srcMemory.owner()->getMemFlags();
     bool srcSvmAtomics = (memFlags & CL_MEM_SVM_ATOMICS) != 0;
-    if (!srcSvmAtomics && srcMemory.isHostMemDirectAccess()) {
+    if ((!srcSvmAtomics && srcMemory.isHostMemDirectAccess()) ||
+        (!copyMetadata.isAsync_)) {
+      // Flush caches for coherency as the MTYPE of the src buffer is
+      // non-coherent(ie read it again from memory).
+      // For device to device copy(intra device), we dont need a flush.
+      // If the source is host memory and the copy is blocking(aka memory need
+      // to be coherent), then add system scope. For non blocking rely on the release
+      // scope issued by synchronization packet.
       gpu().addSystemScope();
     }
     result = shaderCopyBuffer(reinterpret_cast<address>(dstMemory.virtualAddress()),
                               reinterpret_cast<address>(srcMemory.virtualAddress()),
                               dstOrigin, srcOrigin, sizeIn,
-                              entire, blitWg, copyMetadata);
+                              entire, blitWg, copyMetadata, !copyMetadata.isAsync_);
   }
 
   synchronize();
diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp
index 6e50b09e9d..7f43aaef35 100644
--- a/projects/clr/rocclr/device/rocm/rocvirtual.cpp
+++ b/projects/clr/rocclr/device/rocm/rocvirtual.cpp
@@ -502,7 +502,19 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal(
   prof_signal->flags_.done_ = false;
   prof_signal->engine_ = engine_;
   prof_signal->flags_.isPacketDispatch_ = false;
-  if (ts != 0) {
+
+  // Store the HW event
+  amd::Command* cmd = gpu_.command();
+  if (nullptr != cmd) {
+    // Release any existing HwEvent before setting new one for the same command
+    if (cmd->HwEvent() != nullptr) {
+      reinterpret_cast<ProfilingSignal*>(cmd->HwEvent())->release();
+    }
+    cmd->SetHwEvent(prof_signal);
+    prof_signal->retain();
+  }
+
+  if (ts != nullptr) {
     // Save HSA signal earlier to make sure the possible callback will have a valid
     // value for processing
     ts->retain();
@@ -533,13 +545,6 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal(
           ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Set Handler: handle(0x%lx), timestamp(%p)",
             prof_signal->signal_.handle, prof_signal);
         }
-        // Update the current command/marker with HW event
-        prof_signal->retain();
-        ts->command().SetHwEvent(prof_signal);
-      } else if (ts->command().profilingInfo().marker_ts_) {
-        // Update the current command/marker with HW event
-        prof_signal->retain();
-        ts->command().SetHwEvent(prof_signal);
       }
     }
   }
@@ -1133,7 +1138,7 @@ inline bool VirtualGPU::dispatchAqlPacket(
   dispatchGenericAqlPacket(packet, packetHeader, packet->setup, false);
   packet->header = packetHeader;
 
-  profilingEnd(*vcmd);
+  profilingEnd();
 
   return true;
 }
@@ -1379,6 +1384,7 @@ VirtualGPU::VirtualGPU(Device& device, bool profiling, bool cooperative,
 
   // Initialize the last signal and dispatch flags
   timestamp_ = nullptr;
+  command_ = nullptr;
   hasPendingDispatch_ = false;
   profiling_ = profiling;
   cooperative_ = cooperative;
@@ -1631,6 +1637,9 @@ address VirtualGPU::allocKernelArguments(size_t size, size_t alignment) {
 * and then calls start() to get the current host timestamp.
 */
 void VirtualGPU::profilingBegin(amd::Command& command, bool sdmaProfiling) {
+  // Track the current command
+  command_ = &command;
+
   // Disable profiling when command is being captured to prevent memory leak from created timestamp_
   // which won't get freed, since the command is not being executed until graph launch
   if (!command.getPktCapturingState() && command.profilingInfo().enabled_) {
@@ -1669,9 +1678,6 @@ void VirtualGPU::profilingBegin(amd::Command& command, bool sdmaProfiling) {
       }
     }
   }
-  if (command.getPktCapturingState()) {
-    currCmd_ = &command;
-  }
 }
 
 // ================================================================================================
@@ -1679,8 +1685,8 @@ void VirtualGPU::profilingBegin(amd::Command& command, bool sdmaProfiling) {
 * created for whatever command we are running and calls end() to get the
 * current host timestamp if no signal is available.
 */
-void VirtualGPU::profilingEnd(amd::Command& command) {
-  if (!command.getPktCapturingState() && command.profilingInfo().enabled_) {
+void VirtualGPU::profilingEnd(bool clearHwEvent) {
+  if (!command_->getPktCapturingState() && command_->profilingInfo().enabled_) {
     if (timestamp_->HwProfiling() == false) {
       timestamp_->end();
     }
@@ -1689,7 +1695,19 @@ void VirtualGPU::profilingEnd(amd::Command& command) {
   if (AMD_DIRECT_DISPATCH) {
     assert(retainExternalSignals_ || Barriers().IsExternalSignalListEmpty());
   }
-  currCmd_ = nullptr;
+
+  // Certain commands like map/unmap memory may not need hw_events as its not a
+  // queue operation. In such cases clear already set events which may have been for sync
+  // before some memory map/unmap operation
+  if (clearHwEvent) {
+    if (command_->HwEvent() != nullptr) {
+      reinterpret_cast<ProfilingSignal*>(command_->HwEvent())->release();
+      command_->SetHwEvent(nullptr);
+    }
+  }
+
+  // Clear the command tracking
+  command_ = nullptr;
 }
 
 // ================================================================================================
@@ -1877,7 +1895,7 @@ void VirtualGPU::submitReadMemory(amd::ReadMemoryCommand& cmd) {
     cmd.setStatus(CL_OUT_OF_RESOURCES);
   }
 
-  profilingEnd(cmd);
+  profilingEnd();
 }
 
 void VirtualGPU::submitWriteMemory(amd::WriteMemoryCommand& cmd) {
@@ -1973,7 +1991,7 @@ void VirtualGPU::submitWriteMemory(amd::WriteMemoryCommand& cmd) {
     cmd.destination().signalWrite(&dev());
   }
 
-  profilingEnd(cmd);
+  profilingEnd();
 }
 
 // ================================================================================================
@@ -1995,7 +2013,7 @@ void VirtualGPU::submitSvmFreeMemory(amd::SvmFreeMemoryCommand& cmd) {
     cmd.pfnFreeFunc()(as_cl(cmd.queue()->asCommandQueue()), svmPointers.size(),
                       (void**)(&(svmPointers[0])), cmd.userData());
   }
-  profilingEnd(cmd);
+  profilingEnd();
 }
 
 // ================================================================================================
@@ -2018,9 +2036,12 @@ void VirtualGPU::submitSvmPrefetchAsync(amd::SvmPrefetchAsyncCommand& cmd) {
     hsa_status_t status = hsa_amd_svm_prefetch_async(
         const_cast<void*>(cmd.dev_ptr()), cmd.count(), agent,
         wait_events.size(), wait_events.data(), active);
+    ClPrint(amd::LOG_DEBUG, amd::LOG_COPY,
+      "HSA prefetch async dev_ptr=0x%zx, count=%d, wait_event=0x%zx, "
+      "completion_signal=0x%zx", const_cast<void*>(cmd.dev_ptr()), cmd.count(),
+      (wait_events.size() != 0) ? wait_events[0].handle : 0, active.handle);
 
-    // Wait for the prefetch. Should skip wait, but may require extra tracking for kernel execution
-    if ((status != HSA_STATUS_SUCCESS) || !Barriers().WaitCurrent()) {
+    if ((status != HSA_STATUS_SUCCESS)) {
       Barriers().ResetCurrentSignal();
       LogError("hsa_amd_svm_prefetch_async failed");
       cmd.setStatus(CL_INVALID_OPERATION);
@@ -2031,7 +2052,7 @@ void VirtualGPU::submitSvmPrefetchAsync(amd::SvmPrefetchAsyncCommand& cmd) {
   } else {
     LogWarning("hsa_amd_svm_prefetch_async is ignored, because no HMM support");
   }
-  profilingEnd(cmd);
+  profilingEnd();
 }
 
 // ================================================================================================
@@ -2145,7 +2166,7 @@ void VirtualGPU::submitCopyMemory(amd::CopyMemoryCommand& cmd) {
     cmd.OverrrideCommandType(copy_command_type_);
     copy_command_type_ = 0;
   }
-  profilingEnd(cmd);
+  profilingEnd();
 }
 
 // ================================================================================================
@@ -2227,7 +2248,7 @@ void VirtualGPU::submitSvmCopyMemory(amd::SvmCopyMemoryCommand& cmd) {
     // direct memcpy for FGS enabled system
     amd::SvmBuffer::memFill(cmd.dst(), cmd.src(), cmd.srcSize(), 1);
   }
-  profilingEnd(cmd);
+  profilingEnd();
 }
 
 // ================================================================================================
@@ -2385,7 +2406,7 @@ void VirtualGPU::submitCopyMemoryP2P(amd::CopyMemoryP2PCommand& cmd) {
 
   cmd.destination().signalWrite(&dstDevMem->dev());
 
-  profilingEnd(cmd);
+  profilingEnd();
 }
 
 // ================================================================================================
@@ -2423,7 +2444,7 @@ void VirtualGPU::submitSvmMapMemory(amd::SvmMapMemoryCommand& cmd) {
     }
   }
 
-  profilingEnd(cmd);
+  profilingEnd();
 }
 
 // ================================================================================================
@@ -2463,7 +2484,7 @@ void VirtualGPU::submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd) {
     memory->clearUnmapInfo(cmd.svmPtr());
   }
 
-  profilingEnd(cmd);
+  profilingEnd();
 }
 
 // ================================================================================================
@@ -2567,7 +2588,7 @@ void VirtualGPU::submitMapMemory(amd::MapMemoryCommand& cmd) {
     }
   }
 
-  profilingEnd(cmd);
+  profilingEnd();
 }
 
 // ================================================================================================
@@ -2659,7 +2680,7 @@ void VirtualGPU::submitUnmapMemory(amd::UnmapMemoryCommand& cmd) {
 
   devMemory->clearUnmapInfo(cmd.mapPtr());
 
-  profilingEnd(cmd);
+  profilingEnd();
 }
 
 // ================================================================================================
@@ -2735,18 +2756,15 @@ void VirtualGPU::submitFillMemory(amd::FillMemoryCommand& cmd) {
 
   bool force_blit = false;
   if (amd::IS_HIP) {
-    constexpr uint32_t kManagedAlloc = (CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_ALLOC_HOST_PTR);
-    // In case of HMM, use blit kernel instead of CPU memcpy
-    if ((cmd.memory().getMemFlags() & kManagedAlloc) == kManagedAlloc) {
-      force_blit = true;
-    }
+    // Always use blit for memset for HIP.
+    force_blit = true;
   }
 
   if (!fillMemory(cmd.type(), &cmd.memory(), cmd.pattern(), cmd.patternSize(),
     cmd.surface(), cmd.origin(), cmd.size(), force_blit)) {
     cmd.setStatus(CL_INVALID_OPERATION);
   }
-  profilingEnd(cmd);
+  profilingEnd();
 }
 
 // ================================================================================================
@@ -2825,7 +2843,7 @@ void VirtualGPU::submitStreamOperation(amd::StreamOperationCommand& cmd) {
   } else {
     ShouldNotReachHere();
   }
-  profilingEnd(cmd);
+  profilingEnd();
 }
 
 // ================================================================================================
@@ -2838,8 +2856,9 @@ void VirtualGPU::submitBatchMemoryOperation(amd::BatchMemoryOperationCommand& cm
   if (!result) {
     LogError("submitBatchMemoryOperation failed!");
   }
-  profilingEnd(cmd);
+  profilingEnd();
 }
+
 // ================================================================================================
 void VirtualGPU::submitVirtualMap(amd::VirtualMapCommand& vcmd) {
   // Make sure VirtualGPU has an exclusive access to the resources
@@ -2850,7 +2869,7 @@ void VirtualGPU::submitVirtualMap(amd::VirtualMapCommand& vcmd) {
   // Find the amd::Memory object for virtual ptr. vcmd.ptr() is vaddr.
   amd::Memory* vaddr_base_obj = amd::MemObjMap::FindVirtualMemObj(vcmd.ptr());
   if (vaddr_base_obj == nullptr || !(vaddr_base_obj->getMemFlags() & CL_MEM_VA_RANGE_AMD)) {
-    profilingEnd(vcmd);
+    profilingEnd();
     return;
   }
 
@@ -2906,7 +2925,10 @@ void VirtualGPU::submitVirtualMap(amd::VirtualMapCommand& vcmd) {
     }
   }
 
-  profilingEnd(vcmd);
+  // Since this is a memory operation, the HW event set for barrier packet
+  // may not encapsulate what the command wants to do. Hence clear the hw_event
+  constexpr bool kClearHwEvent = true;
+  profilingEnd(kClearHwEvent);
 }
 
 // ================================================================================================
@@ -2945,7 +2967,7 @@ void VirtualGPU::submitSvmFillMemory(amd::SvmFillMemoryCommand& cmd) {
     amd::SvmBuffer::memFill(cmd.dst(), cmd.pattern(), cmd.patternSize(), cmd.times());
   }
 
-  profilingEnd(cmd);
+  profilingEnd();
 }
 
 // ================================================================================================
@@ -2976,7 +2998,7 @@ void VirtualGPU::submitMigrateMemObjects(amd::MigrateMemObjectsCommand& vcmd) {
     }
   }
 
-  profilingEnd(vcmd);
+  profilingEnd();
 }
 
 // ================================================================================================
@@ -3264,7 +3286,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
 
   amd::Memory* const* memories =
       reinterpret_cast<amd::Memory* const*>(parameters + kernelParams.memoryObjOffset());
-  bool isGraphCapture = currCmd_ != nullptr && currCmd_->getPktCapturingState();
+  bool isGraphCapture = command_ != nullptr && command_->getPktCapturingState();
   for (int j = 0; j < iteration; j++) {
     // Reset global size for dimension dim if split is needed
     if (dim != -1) {
@@ -3485,9 +3507,9 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
     if (!kernel.parameters().deviceKernelArgs() || gpuKernel.isInternalKernel()) {
       // Allocate buffer to hold kernel arguments
       if (isGraphCapture) {
-        argBuffer = currCmd_->getKernArgOffset(gpuKernel.KernargSegmentByteSize(),
+        argBuffer = command_->getKernArgOffset(gpuKernel.KernargSegmentByteSize(),
                                                gpuKernel.KernargSegmentAlignment());
-        currCmd_->SetKernelName(gpuKernel.name());
+        command_->SetKernelName(gpuKernel.name());
       } else {
         ClPrint(amd::LOG_INFO, amd::LOG_KERN, "KernargSegmentByteSize = %lu "
                 "KernargSegmentAlignment = %lu", gpuKernel.KernargSegmentByteSize(),
@@ -3558,7 +3580,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
       aqlHeaderWithOrder &= kAqlHeaderMask;
     }
 
-    if (vcmd != nullptr && vcmd->getEventScope() == amd::Device::kCacheStateSystem) {
+    if (vcmd != nullptr && vcmd->getCommandEntryScope() == amd::Device::kCacheStateSystem) {
       addSystemScope_ = true;
     }
 
@@ -3576,8 +3598,8 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
       // Dispatch the packet
       if (!dispatchAqlPacket(&dispatchPacket, aqlHeaderWithOrder,
                              (sizes.dimensions() << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS),
-                             GPU_FLUSH_ON_EXECUTION, currCmd_->getPktCapturingState(),
-                             currCmd_->getAqlPacket())) {
+                             GPU_FLUSH_ON_EXECUTION, command_->getPktCapturingState(),
+                             command_->getAqlPacket())) {
         return false;
       }
     } else {
@@ -3676,7 +3698,7 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
     hasPendingDispatch_ = true;
     retainExternalSignals_ = true;
 
-    queue->profilingEnd(vcmd);
+    queue->profilingEnd();
   } else {
     // Make sure VirtualGPU has an exclusive access to the resources
     amd::ScopedLock lock(execution());
@@ -3690,7 +3712,7 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
       vcmd.setStatus(CL_INVALID_OPERATION);
     }
 
-    profilingEnd(vcmd);
+    profilingEnd();
   }
 }
 
@@ -3711,7 +3733,7 @@ void VirtualGPU::submitMarker(amd::Marker& vcmd) {
       profilingBegin(vcmd);
       if (timestamp_ != nullptr) {
         const Settings& settings = dev().settings();
-        int32_t releaseFlags = vcmd.getEventScope();
+        int32_t releaseFlags = vcmd.getCommandEntryScope();
         if (releaseFlags == Device::CacheState::kCacheStateIgnore) {
           if (settings.barrier_value_packet_ && vcmd.profilingInfo().marker_ts_) {
             dispatchBarrierValuePacket(kBarrierVendorPacketNopScopeHeader, true);
@@ -3728,7 +3750,7 @@ void VirtualGPU::submitMarker(amd::Marker& vcmd) {
           hasPendingDispatch_ = false;
         }
       }
-      profilingEnd(vcmd);
+      profilingEnd();
     }
 
   }
@@ -3747,7 +3769,7 @@ void VirtualGPU::submitAccumulate(amd::AccumulateCommand& vcmd) {
     dispatchBarrierPacket(kNopPacketHeader, false);
   }
 
-  profilingEnd(vcmd);
+  profilingEnd();
 }
 
 // ================================================================================================
@@ -3757,7 +3779,7 @@ void VirtualGPU::submitAcquireExtObjects(amd::AcquireExtObjectsCommand& vcmd) {
 
   profilingBegin(vcmd);
   addSystemScope();
-  profilingEnd(vcmd);
+  profilingEnd();
 }
 
 // ================================================================================================
@@ -3765,7 +3787,7 @@ void VirtualGPU::submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& vcmd) {
   // Make sure VirtualGPU has an exclusive access to the resources
   amd::ScopedLock lock(execution());
   profilingBegin(vcmd);
-  profilingEnd(vcmd);
+  profilingEnd();
 }
 
 // ================================================================================================
diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.hpp b/projects/clr/rocclr/device/rocm/rocvirtual.hpp
index 76f9b8ecb7..c243f4cb71 100644
--- a/projects/clr/rocclr/device/rocm/rocvirtual.hpp
+++ b/projects/clr/rocclr/device/rocm/rocvirtual.hpp
@@ -327,7 +327,7 @@ class VirtualGPU : public device::VirtualDevice {
   const Device& dev() const { return roc_device_; }
 
   void profilingBegin(amd::Command& command, bool sdmaProfiling = false);
-  void profilingEnd(amd::Command& command);
+  void profilingEnd(bool clearHwEvent = false);
 
   void updateCommandsState(amd::Command* list) const;
 
@@ -428,6 +428,7 @@ class VirtualGPU : public device::VirtualDevice {
   HwQueueTracker& Barriers() { return barriers_; }
 
   Timestamp* timestamp() const { return timestamp_; }
+  amd::Command* command() const { return command_; }
 
   void* allocKernArg(size_t size, size_t alignment);
   bool isFenceDirty() const { return fence_dirty_; }
@@ -528,6 +529,7 @@ class VirtualGPU : public device::VirtualDevice {
   };
 
   Timestamp* timestamp_;
+  amd::Command* command_;   //!< Current command
   hsa_agent_t gpu_device_;  //!< Physical device
   hsa_queue_t* gpu_queue_;  //!< Queue associated with a gpu
   hsa_barrier_and_packet_t barrier_packet_;
@@ -579,7 +581,5 @@ class VirtualGPU : public device::VirtualDevice {
   std::atomic<uint> lastUsedSdmaEngineMask_;     //!< Last Used SDMA Engine mask
 
   using KernelArgImpl = device::Settings::KernelArgImpl;
-
-  amd::Command* currCmd_ = nullptr;  //!< Current command under capture
 };
 }
diff --git a/projects/clr/rocclr/platform/command.cpp b/projects/clr/rocclr/platform/command.cpp
index c77bfa2ac0..ebf47b8eb4 100644
--- a/projects/clr/rocclr/platform/command.cpp
+++ b/projects/clr/rocclr/platform/command.cpp
@@ -42,8 +42,8 @@ Event::Event(HostQueue& queue, bool profilingEnabled)
       hw_event_(nullptr),
       notify_event_(nullptr),
       device_(&queue.device()),
-      profilingInfo_(profilingEnabled),
-      event_scope_(Device::kCacheStateInvalid) {
+      profilingInfo_(profilingEnabled) {
+  event_entry_scope_.store(Device::kCacheStateInvalid, std::memory_order_relaxed);
   notified_.clear();
 }
 
@@ -53,8 +53,8 @@ Event::Event()
       status_(CL_SUBMITTED),
       hw_event_(nullptr),
       notify_event_(nullptr),
-      device_(nullptr),
-      event_scope_(Device::kCacheStateInvalid) {
+      device_(nullptr) {
+  event_entry_scope_.store(Device::kCacheStateInvalid, std::memory_order_relaxed);
   notified_.clear();
 }
 
diff --git a/projects/clr/rocclr/platform/command.hpp b/projects/clr/rocclr/platform/command.hpp
index 6e5b5f6eda..68cad257f6 100644
--- a/projects/clr/rocclr/platform/command.hpp
+++ b/projects/clr/rocclr/platform/command.hpp
@@ -88,10 +88,13 @@ class Event : public RuntimeObject {
   std::atomic<CallBackEntry*> callbacks_;  //!< linked list of callback entries.
   std::atomic<int32_t> status_;            //!< current execution status.
   std::atomic_flag notified_;              //!< Command queue was notified
+
   void*  hw_event_;                        //!< HW event ID associated with SW event
   Event* notify_event_;                    //!< Notify event, which should contain HW signal
   const Device* device_;                   //!< Device, this event associated with
-  int32_t event_scope_;                    //!< 2 - system scope, 1 - device scope,
+
+  std::atomic<int32_t> event_entry_scope_; //!< Command entry scope
+                                           //!< 2 - system scope, 1 - device scope,
                                            //!< 0 - ignore, -1 - invalid
 
  protected:
@@ -219,11 +222,15 @@ class Event : public RuntimeObject {
   //! Returns notify even associated with the current command
   Event* NotifyEvent() const { return notify_event_; }
 
-  //! Get release scope of the event
-  int32_t getEventScope() const { return event_scope_; }
+  //! Get entry scope of the event
+  int32_t getCommandEntryScope() const {
+    return event_entry_scope_.load(std::memory_order_relaxed);
+  }
 
-  //! Set release scope for the event
-  void setEventScope(int32_t scope) { event_scope_ = scope; }
+  //! Set entry scope for the event
+  void setCommandEntryScope(int32_t scope) {
+    event_entry_scope_.store(scope, std::memory_order_relaxed);
+  }
 };
 
 union CopyMetadata {
diff --git a/projects/clr/rocclr/platform/commandqueue.cpp b/projects/clr/rocclr/platform/commandqueue.cpp
index ce5d7ae65d..a6d7077358 100644
--- a/projects/clr/rocclr/platform/commandqueue.cpp
+++ b/projects/clr/rocclr/platform/commandqueue.cpp
@@ -133,6 +133,24 @@ bool HostQueue::terminate() {
   return true;
 }
 
+void HostQueue::finishCommand(Command* command) {
+  if (command == nullptr) {
+    command = getLastQueuedCommand(true);
+    if (command != nullptr) {
+      ClPrint(LOG_DEBUG, LOG_CMD, "No command, awaiting complete status on host");
+      command->awaitCompletion();
+      command->release();
+    }
+    return;
+  }
+  // Check hardware event status for the specific command
+  static constexpr bool kWaitCompletion = true;
+  if (!device().IsHwEventReady(command->event(), kWaitCompletion)) {
+    ClPrint(LOG_DEBUG, LOG_CMD, "No HW event, awaiting complete status on host");
+    command->awaitCompletion();
+  }
+}
+
 void HostQueue::finish(bool cpu_wait) {
   Command* command = nullptr;
   if (IS_HIP) {
diff --git a/projects/clr/rocclr/platform/commandqueue.hpp b/projects/clr/rocclr/platform/commandqueue.hpp
index f3e4f45c26..94c63fca53 100644
--- a/projects/clr/rocclr/platform/commandqueue.hpp
+++ b/projects/clr/rocclr/platform/commandqueue.hpp
@@ -233,6 +233,9 @@ class HostQueue : public CommandQueue {
   //! Finish all queued commands
   void finish(bool cpu_wait = false);
 
+  //! Wait until finish of one command
+  void finishCommand(Command* command);
+
   //! Check if hostQueue empty snapshot
   bool isEmpty();