From c8f39ec2b0cbae7291849853ad3f0620f512fcc0 Mon Sep 17 00:00:00 2001
From: Saleel Kudchadker <Saleel.Kudchadker@amd.com>
Date: Tue, 3 Dec 2024 23:45:31 +0000
Subject: [PATCH] SWDEV-502365 - Track last used command

- This change tries to save extra synchronization packets we may insert
  as we didnt track the completion signals for every command. We track
the current enqueued command until it exits the enqueue stage. We also
record the exit scope to know if we flushed the caches
- Handle correct release scopes and store completion signal as HW events
- Use a new finishCommand implementation to only wait for the command
  passed as the argument

Change-Id: Ie4350c5dd24f5d48dfa6ccbabd892f0544caadcc


[ROCm/clr commit: e03e4f3b5d44a52318a85421b343a0c26fe6619a]
---
 projects/clr/hipamd/src/hip_event.cpp         |   2 +-
 projects/clr/hipamd/src/hip_event.hpp         |   2 +-
 .../clr/hipamd/src/hip_graph_internal.cpp     |   2 +-
 projects/clr/hipamd/src/hip_memory.cpp        |  22 ++-
 projects/clr/hipamd/src/hip_module.cpp        |   4 +-
 projects/clr/rocclr/device/rocm/rocblit.cpp   |  28 ++--
 .../clr/rocclr/device/rocm/rocvirtual.cpp     | 128 ++++++++++--------
 .../clr/rocclr/device/rocm/rocvirtual.hpp     |   6 +-
 projects/clr/rocclr/platform/command.cpp      |   8 +-
 projects/clr/rocclr/platform/command.hpp      |  17 ++-
 projects/clr/rocclr/platform/commandqueue.cpp |  18 +++
 projects/clr/rocclr/platform/commandqueue.hpp |   3 +
 12 files changed, 148 insertions(+), 92 deletions(-)

diff --git a/projects/clr/hipamd/src/hip_event.cpp b/projects/clr/hipamd/src/hip_event.cpp
index fb2d592f80..92bf3d32b5 100644
--- a/projects/clr/hipamd/src/hip_event.cpp
+++ b/projects/clr/hipamd/src/hip_event.cpp
@@ -175,7 +175,7 @@ hipError_t Event::streamWaitCommand(amd::Command*& command, hip::Stream* stream)
   command = new amd::Marker(*stream, kMarkerDisableFlush, eventWaitList);
   // Since we only need to have a dependency on an existing event,
   // we may not need to flush any caches.
-  command->setEventScope(amd::Device::kCacheStateIgnore);
+  command->setCommandEntryScope(amd::Device::kCacheStateIgnore);
 
   if (command == NULL) {
     return hipErrorOutOfMemory;
diff --git a/projects/clr/hipamd/src/hip_event.hpp b/projects/clr/hipamd/src/hip_event.hpp
index 84bb300891..7740b5ea37 100644
--- a/projects/clr/hipamd/src/hip_event.hpp
+++ b/projects/clr/hipamd/src/hip_event.hpp
@@ -87,7 +87,7 @@ class EventMarker : public amd::Marker {
     profilingInfo_.marker_ts_ = markerTs;
     profilingInfo_.batch_flush_ = batch_flush;
     profilingInfo_.clear();
-    setEventScope(scope);
+    setCommandEntryScope(scope);
   }
 };
 
diff --git a/projects/clr/hipamd/src/hip_graph_internal.cpp b/projects/clr/hipamd/src/hip_graph_internal.cpp
index 589430fc46..0ad1144b2f 100644
--- a/projects/clr/hipamd/src/hip_graph_internal.cpp
+++ b/projects/clr/hipamd/src/hip_graph_internal.cpp
@@ -739,7 +739,7 @@ hipError_t GraphExec::Run(hipStream_t graph_launch_stream) {
   this->retain();
   amd::Command* CallbackCommand = new amd::Marker(*launch_stream, kMarkerDisableFlush, {});
   // we may not need to flush any caches.
-  CallbackCommand->setEventScope(amd::Device::kCacheStateIgnore);
+  CallbackCommand->setCommandEntryScope(amd::Device::kCacheStateIgnore);
   amd::Event& event = CallbackCommand->event();
   constexpr bool kBlocking = false;
   if (!event.setCallback(CL_COMPLETE, GraphExec::DecrementRefCount, this, kBlocking)) {
diff --git a/projects/clr/hipamd/src/hip_memory.cpp b/projects/clr/hipamd/src/hip_memory.cpp
index 2f07a5227f..c0db1a2548 100644
--- a/projects/clr/hipamd/src/hip_memory.cpp
+++ b/projects/clr/hipamd/src/hip_memory.cpp
@@ -664,13 +664,9 @@ hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKin
     return hipSuccess;
   } else if (((srcMemory == nullptr) && (dstMemory != nullptr)) ||
              ((srcMemory != nullptr) && (dstMemory == nullptr))) {
-    // Don't wait for unpinned H2D copy if staging is used for copy. If dstMemory is not null, it
-    // can still be a pinned host memory, hence the check on dst memory type.
-    isHostAsync &=
-        ((srcMemory == nullptr) && (dstMemory != nullptr && dstMemoryType == hipMemoryTypeDevice) &&
-         AMD_DIRECT_DISPATCH && (sizeBytes <= stream.device().settings().stagedXferSize_))
-        ? true
-        : false;
+    // Unpinned copy wait behavior is enforced in the lower copy layers so skip
+    // wait at top level except for MT path
+    isHostAsync &= AMD_DIRECT_DISPATCH ? true : false;
   } else if (srcMemory->GetDeviceById() == dstMemory->GetDeviceById()) {
     // Device to Device copies do not need to host side synchronization.
     if ((srcMemoryType == hipMemoryTypeDevice) && (dstMemoryType == hipMemoryTypeDevice) &&
@@ -690,7 +686,7 @@ hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKin
   }
   command->enqueue();
   if (!isHostAsync) {
-    command->queue()->finish();
+    command->queue()->finishCommand(command);
   } else if (!isGPUAsync) {
     hip::Stream* pStream = hip::getNullStream(dstMemory->GetDeviceById()->context());
     amd::Command::EventWaitList waitList;
@@ -1826,7 +1822,7 @@ hipError_t ihipMemcpyDtoHCommand(amd::Command*& command, void* dstHost, amd::Coo
   amd::Memory* dstMemory = getMemoryObject(dstHost, dOffset);
 
   amd::Coord3D srcStart(srcRect.start_, 0, 0);
-  amd::CopyMetadata copyMetadata(isAsync, amd::CopyMetadata::CopyEnginePreference::SDMA);
+  amd::CopyMetadata copyMetadata(isAsync, amd::CopyMetadata::CopyEnginePreference::NONE);
   if (dstMemory) {
     amd::CopyMemoryCommand *copyCommand = new amd::CopyMemoryCommand(
       *stream, CL_COMMAND_COPY_BUFFER_RECT, amd::Command::EventWaitList{},
@@ -1874,7 +1870,7 @@ hipError_t ihipMemcpyHtoDCommand(amd::Command*& command, void* dstDevice, amd::C
   amd::Memory* srcMemory = getMemoryObject(srcHost, sOffset);
 
   amd::Coord3D dstStart(dstRect.start_, 0, 0);
-  amd::CopyMetadata copyMetadata(isAsync, amd::CopyMetadata::CopyEnginePreference::SDMA);
+  amd::CopyMetadata copyMetadata(isAsync, amd::CopyMetadata::CopyEnginePreference::NONE);
   if (srcMemory) {
     amd::CopyMemoryCommand *copyCommand = new amd::CopyMemoryCommand(
       *stream, CL_COMMAND_COPY_BUFFER_RECT, amd::Command::EventWaitList{},
@@ -1962,7 +1958,7 @@ hipError_t ihipMemcpyHtoACommand(amd::Command*& command, amd::Image* dstImage,
   size_t start = ihipGetbufferStart(static_cast<size_t*>(srcOrigin),
                                     static_cast<size_t*>(copyRegion), srcRowPitch, srcSlicePitch);
 
-  amd::CopyMetadata copyMetadata(isAsync, amd::CopyMetadata::CopyEnginePreference::SDMA);
+  amd::CopyMetadata copyMetadata(isAsync, amd::CopyMetadata::CopyEnginePreference::NONE);
   if (srcMemory) {
     amd::CopyMemoryCommand *copyCommand = new amd::CopyMemoryCommand(
       *stream, CL_COMMAND_COPY_BUFFER_TO_IMAGE, amd::Command::EventWaitList{},
@@ -2012,7 +2008,7 @@ hipError_t ihipMemcpyAtoHCommand(amd::Command*& command, void* dstHost, amd::Coo
   size_t start = ihipGetbufferStart(static_cast<size_t*>(dstOrigin),
                                     static_cast<size_t*>(copyRegion), dstRowPitch, dstSlicePitch);
 
-  amd::CopyMetadata copyMetadata(isAsync, amd::CopyMetadata::CopyEnginePreference::SDMA);
+  amd::CopyMetadata copyMetadata(isAsync, amd::CopyMetadata::CopyEnginePreference::NONE);
   if (dstMemory) {
     amd::CopyMemoryCommand *copyCommand = new amd::CopyMemoryCommand(
       *stream, CL_COMMAND_COPY_IMAGE_TO_BUFFER, amd::Command::EventWaitList{},
@@ -2334,7 +2330,7 @@ inline hipError_t ihipMemcpyCmdEnqueue(amd::Command* command, bool isAsync = fal
   }
   command->enqueue();
   if (!isAsync) {
-    command->queue()->finish();
+    command->queue()->finishCommand(command);
   } else if (stream != nullptr) {
     auto* newQueue = command->queue();
     if (newQueue != stream) {
diff --git a/projects/clr/hipamd/src/hip_module.cpp b/projects/clr/hipamd/src/hip_module.cpp
index 5e6e136755..22913231e1 100644
--- a/projects/clr/hipamd/src/hip_module.cpp
+++ b/projects/clr/hipamd/src/hip_module.cpp
@@ -472,9 +472,9 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
   if (stopEvent != nullptr) {
     hip::Event* eStop = reinterpret_cast<hip::Event*>(stopEvent);
     if (eStop->flags_ & hipEventDisableSystemFence) {
-      command->setEventScope(amd::Device::kCacheStateIgnore);
+      command->setCommandEntryScope(amd::Device::kCacheStateIgnore);
     } else {
-      command->setEventScope(amd::Device::kCacheStateSystem);
+      command->setCommandEntryScope(amd::Device::kCacheStateSystem);
     }
     // Enqueue Dispatch and bind the stop event
     command->enqueue();
diff --git a/projects/clr/rocclr/device/rocm/rocblit.cpp b/projects/clr/rocclr/device/rocm/rocblit.cpp
index a150bf260e..5ec63c2495 100644
--- a/projects/clr/rocclr/device/rocm/rocblit.cpp
+++ b/projects/clr/rocclr/device/rocm/rocblit.cpp
@@ -309,7 +309,7 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d
 
       // Copy memory line by line
       ClPrint(amd::LOG_DEBUG, amd::LOG_COPY,
-              "HSA Async Copy Rect dst=0x%zx, src=0x%zx, wait_event=0x%zx "
+              "HSA Async Copy Rect dst=0x%zx, src=0x%zx, wait_event=0x%zx, "
               "completion_signal=0x%zx", dstMem.base, srcMem.base,
               (wait_events.size() != 0) ? wait_events[0].handle : 0, active.handle);
 
@@ -1841,11 +1841,17 @@ bool KernelBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemo
         }
         ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Blit staging H2D copy dst=%p, stg buf=%p, "
                 "dstOrigin=0x%x, size=%zu", dstAddr, stagingBuffer, origin[0], copySize);
-        // No cache flush is needed here as we use a staging buffer, and the acquire logic
-        // ensures that the cacheline is different and re-used only when L2 is flushed
+        bool kAttachSignal = false;
+        if (copyMetadata.isAsync_ == false) {
+          // If its a blocking call, attach signal to the packet which we can track for
+          // completion. Also flush caches as we may not need another packet to flush caches.
+          kAttachSignal = true;
+          gpu().addSystemScope();
+        }
         result = shaderCopyBuffer(dstAddr, stagingBuffer,
                                   origin, srcOrigin, copySize,
-                                  entire, dev().settings().limit_blit_wg_, copyMetadata);
+                                  entire, dev().settings().limit_blit_wg_,
+                                  copyMetadata, kAttachSignal);
         if (!result) {
           break;
         }
@@ -2233,19 +2239,23 @@ bool KernelBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& ds
   }
 
   if (!result) {
-    // Flush caches for coherency as the MTYPE of the src buffer may be
-    // non-coherent which mean we need to read it again from memory.
-    // Also if its a device to device copy(intra device), we dont need flush
     // Check CL_MEM_SVM_ATOMICS flag to see if we used system_coarse_segment_
     auto memFlags = srcMemory.owner()->getMemFlags();
     bool srcSvmAtomics = (memFlags & CL_MEM_SVM_ATOMICS) != 0;
-    if (!srcSvmAtomics && srcMemory.isHostMemDirectAccess()) {
+    if ((!srcSvmAtomics && srcMemory.isHostMemDirectAccess()) ||
+        (!copyMetadata.isAsync_)) {
+      // Flush caches for coherency as the MTYPE of the src buffer is
+      // non-coherent(ie read it again from memory).
+      // For device to device copy(intra device), we dont need a flush.
+      // If the source is host memory and the copy is blocking(aka memory need
+      // to be coherent), then add system scope. For non blocking rely on the release
+      // scope issued by synchronization packet.
       gpu().addSystemScope();
     }
     result = shaderCopyBuffer(reinterpret_cast<address>(dstMemory.virtualAddress()),
                               reinterpret_cast<address>(srcMemory.virtualAddress()),
                               dstOrigin, srcOrigin, sizeIn,
-                              entire, blitWg, copyMetadata);
+                              entire, blitWg, copyMetadata, !copyMetadata.isAsync_);
   }
 
   synchronize();
diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp
index 6e50b09e9d..7f43aaef35 100644
--- a/projects/clr/rocclr/device/rocm/rocvirtual.cpp
+++ b/projects/clr/rocclr/device/rocm/rocvirtual.cpp
@@ -502,7 +502,19 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal(
   prof_signal->flags_.done_ = false;
   prof_signal->engine_ = engine_;
   prof_signal->flags_.isPacketDispatch_ = false;
-  if (ts != 0) {
+
+  // Store the HW event
+  amd::Command* cmd = gpu_.command();
+  if (nullptr != cmd) {
+    // Release any existing HwEvent before setting new one for the same command
+    if (cmd->HwEvent() != nullptr) {
+      reinterpret_cast<ProfilingSignal*>(cmd->HwEvent())->release();
+    }
+    cmd->SetHwEvent(prof_signal);
+    prof_signal->retain();
+  }
+
+  if (ts != nullptr) {
     // Save HSA signal earlier to make sure the possible callback will have a valid
     // value for processing
     ts->retain();
@@ -533,13 +545,6 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal(
           ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Set Handler: handle(0x%lx), timestamp(%p)",
             prof_signal->signal_.handle, prof_signal);
         }
-        // Update the current command/marker with HW event
-        prof_signal->retain();
-        ts->command().SetHwEvent(prof_signal);
-      } else if (ts->command().profilingInfo().marker_ts_) {
-        // Update the current command/marker with HW event
-        prof_signal->retain();
-        ts->command().SetHwEvent(prof_signal);
       }
     }
   }
@@ -1133,7 +1138,7 @@ inline bool VirtualGPU::dispatchAqlPacket(
   dispatchGenericAqlPacket(packet, packetHeader, packet->setup, false);
   packet->header = packetHeader;
 
-  profilingEnd(*vcmd);
+  profilingEnd();
 
   return true;
 }
@@ -1379,6 +1384,7 @@ VirtualGPU::VirtualGPU(Device& device, bool profiling, bool cooperative,
 
   // Initialize the last signal and dispatch flags
   timestamp_ = nullptr;
+  command_ = nullptr;
   hasPendingDispatch_ = false;
   profiling_ = profiling;
   cooperative_ = cooperative;
@@ -1631,6 +1637,9 @@ address VirtualGPU::allocKernelArguments(size_t size, size_t alignment) {
 * and then calls start() to get the current host timestamp.
 */
 void VirtualGPU::profilingBegin(amd::Command& command, bool sdmaProfiling) {
+  // Track the current command
+  command_ = &command;
+
   // Disable profiling when command is being captured to prevent memory leak from created timestamp_
   // which won't get freed, since the command is not being executed until graph launch
   if (!command.getPktCapturingState() && command.profilingInfo().enabled_) {
@@ -1669,9 +1678,6 @@ void VirtualGPU::profilingBegin(amd::Command& command, bool sdmaProfiling) {
       }
     }
   }
-  if (command.getPktCapturingState()) {
-    currCmd_ = &command;
-  }
 }
 
 // ================================================================================================
@@ -1679,8 +1685,8 @@ void VirtualGPU::profilingBegin(amd::Command& command, bool sdmaProfiling) {
 * created for whatever command we are running and calls end() to get the
 * current host timestamp if no signal is available.
 */
-void VirtualGPU::profilingEnd(amd::Command& command) {
-  if (!command.getPktCapturingState() && command.profilingInfo().enabled_) {
+void VirtualGPU::profilingEnd(bool clearHwEvent) {
+  if (!command_->getPktCapturingState() && command_->profilingInfo().enabled_) {
     if (timestamp_->HwProfiling() == false) {
       timestamp_->end();
     }
@@ -1689,7 +1695,19 @@ void VirtualGPU::profilingEnd(amd::Command& command) {
   if (AMD_DIRECT_DISPATCH) {
     assert(retainExternalSignals_ || Barriers().IsExternalSignalListEmpty());
   }
-  currCmd_ = nullptr;
+
+  // Certain commands like map/unmap memory may not need hw_events as its not a
+  // queue operation. In such cases clear already set events which may have been for sync
+  // before some memory map/unmap operation
+  if (clearHwEvent) {
+    if (command_->HwEvent() != nullptr) {
+      reinterpret_cast<ProfilingSignal*>(command_->HwEvent())->release();
+      command_->SetHwEvent(nullptr);
+    }
+  }
+
+  // Clear the command tracking
+  command_ = nullptr;
 }
 
 // ================================================================================================
@@ -1877,7 +1895,7 @@ void VirtualGPU::submitReadMemory(amd::ReadMemoryCommand& cmd) {
     cmd.setStatus(CL_OUT_OF_RESOURCES);
   }
 
-  profilingEnd(cmd);
+  profilingEnd();
 }
 
 void VirtualGPU::submitWriteMemory(amd::WriteMemoryCommand& cmd) {
@@ -1973,7 +1991,7 @@ void VirtualGPU::submitWriteMemory(amd::WriteMemoryCommand& cmd) {
     cmd.destination().signalWrite(&dev());
   }
 
-  profilingEnd(cmd);
+  profilingEnd();
 }
 
 // ================================================================================================
@@ -1995,7 +2013,7 @@ void VirtualGPU::submitSvmFreeMemory(amd::SvmFreeMemoryCommand& cmd) {
     cmd.pfnFreeFunc()(as_cl(cmd.queue()->asCommandQueue()), svmPointers.size(),
                       (void**)(&(svmPointers[0])), cmd.userData());
   }
-  profilingEnd(cmd);
+  profilingEnd();
 }
 
 // ================================================================================================
@@ -2018,9 +2036,12 @@ void VirtualGPU::submitSvmPrefetchAsync(amd::SvmPrefetchAsyncCommand& cmd) {
     hsa_status_t status = hsa_amd_svm_prefetch_async(
         const_cast<void*>(cmd.dev_ptr()), cmd.count(), agent,
         wait_events.size(), wait_events.data(), active);
+    ClPrint(amd::LOG_DEBUG, amd::LOG_COPY,
+      "HSA prefetch async dev_ptr=0x%zx, count=%d, wait_event=0x%zx, "
+      "completion_signal=0x%zx", const_cast<void*>(cmd.dev_ptr()), cmd.count(),
+      (wait_events.size() != 0) ? wait_events[0].handle : 0, active.handle);
 
-    // Wait for the prefetch. Should skip wait, but may require extra tracking for kernel execution
-    if ((status != HSA_STATUS_SUCCESS) || !Barriers().WaitCurrent()) {
+    if ((status != HSA_STATUS_SUCCESS)) {
       Barriers().ResetCurrentSignal();
       LogError("hsa_amd_svm_prefetch_async failed");
       cmd.setStatus(CL_INVALID_OPERATION);
@@ -2031,7 +2052,7 @@ void VirtualGPU::submitSvmPrefetchAsync(amd::SvmPrefetchAsyncCommand& cmd) {
   } else {
     LogWarning("hsa_amd_svm_prefetch_async is ignored, because no HMM support");
   }
-  profilingEnd(cmd);
+  profilingEnd();
 }
 
 // ================================================================================================
@@ -2145,7 +2166,7 @@ void VirtualGPU::submitCopyMemory(amd::CopyMemoryCommand& cmd) {
     cmd.OverrrideCommandType(copy_command_type_);
     copy_command_type_ = 0;
   }
-  profilingEnd(cmd);
+  profilingEnd();
 }
 
 // ================================================================================================
@@ -2227,7 +2248,7 @@ void VirtualGPU::submitSvmCopyMemory(amd::SvmCopyMemoryCommand& cmd) {
     // direct memcpy for FGS enabled system
     amd::SvmBuffer::memFill(cmd.dst(), cmd.src(), cmd.srcSize(), 1);
   }
-  profilingEnd(cmd);
+  profilingEnd();
 }
 
 // ================================================================================================
@@ -2385,7 +2406,7 @@ void VirtualGPU::submitCopyMemoryP2P(amd::CopyMemoryP2PCommand& cmd) {
 
   cmd.destination().signalWrite(&dstDevMem->dev());
 
-  profilingEnd(cmd);
+  profilingEnd();
 }
 
 // ================================================================================================
@@ -2423,7 +2444,7 @@ void VirtualGPU::submitSvmMapMemory(amd::SvmMapMemoryCommand& cmd) {
     }
   }
 
-  profilingEnd(cmd);
+  profilingEnd();
 }
 
 // ================================================================================================
@@ -2463,7 +2484,7 @@ void VirtualGPU::submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd) {
     memory->clearUnmapInfo(cmd.svmPtr());
   }
 
-  profilingEnd(cmd);
+  profilingEnd();
 }
 
 // ================================================================================================
@@ -2567,7 +2588,7 @@ void VirtualGPU::submitMapMemory(amd::MapMemoryCommand& cmd) {
     }
   }
 
-  profilingEnd(cmd);
+  profilingEnd();
 }
 
 // ================================================================================================
@@ -2659,7 +2680,7 @@ void VirtualGPU::submitUnmapMemory(amd::UnmapMemoryCommand& cmd) {
 
   devMemory->clearUnmapInfo(cmd.mapPtr());
 
-  profilingEnd(cmd);
+  profilingEnd();
 }
 
 // ================================================================================================
@@ -2735,18 +2756,15 @@ void VirtualGPU::submitFillMemory(amd::FillMemoryCommand& cmd) {
 
   bool force_blit = false;
   if (amd::IS_HIP) {
-    constexpr uint32_t kManagedAlloc = (CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_ALLOC_HOST_PTR);
-    // In case of HMM, use blit kernel instead of CPU memcpy
-    if ((cmd.memory().getMemFlags() & kManagedAlloc) == kManagedAlloc) {
-      force_blit = true;
-    }
+    // Always use blit for memset for HIP.
+    force_blit = true;
   }
 
   if (!fillMemory(cmd.type(), &cmd.memory(), cmd.pattern(), cmd.patternSize(),
     cmd.surface(), cmd.origin(), cmd.size(), force_blit)) {
     cmd.setStatus(CL_INVALID_OPERATION);
   }
-  profilingEnd(cmd);
+  profilingEnd();
 }
 
 // ================================================================================================
@@ -2825,7 +2843,7 @@ void VirtualGPU::submitStreamOperation(amd::StreamOperationCommand& cmd) {
   } else {
     ShouldNotReachHere();
   }
-  profilingEnd(cmd);
+  profilingEnd();
 }
 
 // ================================================================================================
@@ -2838,8 +2856,9 @@ void VirtualGPU::submitBatchMemoryOperation(amd::BatchMemoryOperationCommand& cm
   if (!result) {
     LogError("submitBatchMemoryOperation failed!");
   }
-  profilingEnd(cmd);
+  profilingEnd();
 }
+
 // ================================================================================================
 void VirtualGPU::submitVirtualMap(amd::VirtualMapCommand& vcmd) {
   // Make sure VirtualGPU has an exclusive access to the resources
@@ -2850,7 +2869,7 @@ void VirtualGPU::submitVirtualMap(amd::VirtualMapCommand& vcmd) {
   // Find the amd::Memory object for virtual ptr. vcmd.ptr() is vaddr.
   amd::Memory* vaddr_base_obj = amd::MemObjMap::FindVirtualMemObj(vcmd.ptr());
   if (vaddr_base_obj == nullptr || !(vaddr_base_obj->getMemFlags() & CL_MEM_VA_RANGE_AMD)) {
-    profilingEnd(vcmd);
+    profilingEnd();
     return;
   }
 
@@ -2906,7 +2925,10 @@ void VirtualGPU::submitVirtualMap(amd::VirtualMapCommand& vcmd) {
     }
   }
 
-  profilingEnd(vcmd);
+  // Since this is a memory operation, the HW event set for barrier packet
+  // may not encapsulate what the command wants to do. Hence clear the hw_event
+  constexpr bool kClearHwEvent = true;
+  profilingEnd(kClearHwEvent);
 }
 
 // ================================================================================================
@@ -2945,7 +2967,7 @@ void VirtualGPU::submitSvmFillMemory(amd::SvmFillMemoryCommand& cmd) {
     amd::SvmBuffer::memFill(cmd.dst(), cmd.pattern(), cmd.patternSize(), cmd.times());
   }
 
-  profilingEnd(cmd);
+  profilingEnd();
 }
 
 // ================================================================================================
@@ -2976,7 +2998,7 @@ void VirtualGPU::submitMigrateMemObjects(amd::MigrateMemObjectsCommand& vcmd) {
     }
   }
 
-  profilingEnd(vcmd);
+  profilingEnd();
 }
 
 // ================================================================================================
@@ -3264,7 +3286,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
 
   amd::Memory* const* memories =
       reinterpret_cast<amd::Memory* const*>(parameters + kernelParams.memoryObjOffset());
-  bool isGraphCapture = currCmd_ != nullptr && currCmd_->getPktCapturingState();
+  bool isGraphCapture = command_ != nullptr && command_->getPktCapturingState();
   for (int j = 0; j < iteration; j++) {
     // Reset global size for dimension dim if split is needed
     if (dim != -1) {
@@ -3485,9 +3507,9 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
     if (!kernel.parameters().deviceKernelArgs() || gpuKernel.isInternalKernel()) {
       // Allocate buffer to hold kernel arguments
       if (isGraphCapture) {
-        argBuffer = currCmd_->getKernArgOffset(gpuKernel.KernargSegmentByteSize(),
+        argBuffer = command_->getKernArgOffset(gpuKernel.KernargSegmentByteSize(),
                                                gpuKernel.KernargSegmentAlignment());
-        currCmd_->SetKernelName(gpuKernel.name());
+        command_->SetKernelName(gpuKernel.name());
       } else {
         ClPrint(amd::LOG_INFO, amd::LOG_KERN, "KernargSegmentByteSize = %lu "
                 "KernargSegmentAlignment = %lu", gpuKernel.KernargSegmentByteSize(),
@@ -3558,7 +3580,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
       aqlHeaderWithOrder &= kAqlHeaderMask;
     }
 
-    if (vcmd != nullptr && vcmd->getEventScope() == amd::Device::kCacheStateSystem) {
+    if (vcmd != nullptr && vcmd->getCommandEntryScope() == amd::Device::kCacheStateSystem) {
       addSystemScope_ = true;
     }
 
@@ -3576,8 +3598,8 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
       // Dispatch the packet
       if (!dispatchAqlPacket(&dispatchPacket, aqlHeaderWithOrder,
                              (sizes.dimensions() << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS),
-                             GPU_FLUSH_ON_EXECUTION, currCmd_->getPktCapturingState(),
-                             currCmd_->getAqlPacket())) {
+                             GPU_FLUSH_ON_EXECUTION, command_->getPktCapturingState(),
+                             command_->getAqlPacket())) {
         return false;
       }
     } else {
@@ -3676,7 +3698,7 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
     hasPendingDispatch_ = true;
     retainExternalSignals_ = true;
 
-    queue->profilingEnd(vcmd);
+    queue->profilingEnd();
   } else {
     // Make sure VirtualGPU has an exclusive access to the resources
     amd::ScopedLock lock(execution());
@@ -3690,7 +3712,7 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
       vcmd.setStatus(CL_INVALID_OPERATION);
     }
 
-    profilingEnd(vcmd);
+    profilingEnd();
   }
 }
 
@@ -3711,7 +3733,7 @@ void VirtualGPU::submitMarker(amd::Marker& vcmd) {
       profilingBegin(vcmd);
       if (timestamp_ != nullptr) {
         const Settings& settings = dev().settings();
-        int32_t releaseFlags = vcmd.getEventScope();
+        int32_t releaseFlags = vcmd.getCommandEntryScope();
         if (releaseFlags == Device::CacheState::kCacheStateIgnore) {
           if (settings.barrier_value_packet_ && vcmd.profilingInfo().marker_ts_) {
             dispatchBarrierValuePacket(kBarrierVendorPacketNopScopeHeader, true);
@@ -3728,7 +3750,7 @@ void VirtualGPU::submitMarker(amd::Marker& vcmd) {
           hasPendingDispatch_ = false;
         }
       }
-      profilingEnd(vcmd);
+      profilingEnd();
     }
 
   }
@@ -3747,7 +3769,7 @@ void VirtualGPU::submitAccumulate(amd::AccumulateCommand& vcmd) {
     dispatchBarrierPacket(kNopPacketHeader, false);
   }
 
-  profilingEnd(vcmd);
+  profilingEnd();
 }
 
 // ================================================================================================
@@ -3757,7 +3779,7 @@ void VirtualGPU::submitAcquireExtObjects(amd::AcquireExtObjectsCommand& vcmd) {
 
   profilingBegin(vcmd);
   addSystemScope();
-  profilingEnd(vcmd);
+  profilingEnd();
 }
 
 // ================================================================================================
@@ -3765,7 +3787,7 @@ void VirtualGPU::submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& vcmd) {
   // Make sure VirtualGPU has an exclusive access to the resources
   amd::ScopedLock lock(execution());
   profilingBegin(vcmd);
-  profilingEnd(vcmd);
+  profilingEnd();
 }
 
 // ================================================================================================
diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.hpp b/projects/clr/rocclr/device/rocm/rocvirtual.hpp
index 76f9b8ecb7..c243f4cb71 100644
--- a/projects/clr/rocclr/device/rocm/rocvirtual.hpp
+++ b/projects/clr/rocclr/device/rocm/rocvirtual.hpp
@@ -327,7 +327,7 @@ class VirtualGPU : public device::VirtualDevice {
   const Device& dev() const { return roc_device_; }
 
   void profilingBegin(amd::Command& command, bool sdmaProfiling = false);
-  void profilingEnd(amd::Command& command);
+  void profilingEnd(bool clearHwEvent = false);
 
   void updateCommandsState(amd::Command* list) const;
 
@@ -428,6 +428,7 @@ class VirtualGPU : public device::VirtualDevice {
   HwQueueTracker& Barriers() { return barriers_; }
 
   Timestamp* timestamp() const { return timestamp_; }
+  amd::Command* command() const { return command_; }
 
   void* allocKernArg(size_t size, size_t alignment);
   bool isFenceDirty() const { return fence_dirty_; }
@@ -528,6 +529,7 @@ class VirtualGPU : public device::VirtualDevice {
   };
 
   Timestamp* timestamp_;
+  amd::Command* command_;   //!< Current command
   hsa_agent_t gpu_device_;  //!< Physical device
   hsa_queue_t* gpu_queue_;  //!< Queue associated with a gpu
   hsa_barrier_and_packet_t barrier_packet_;
@@ -579,7 +581,5 @@ class VirtualGPU : public device::VirtualDevice {
   std::atomic<uint> lastUsedSdmaEngineMask_;     //!< Last Used SDMA Engine mask
 
   using KernelArgImpl = device::Settings::KernelArgImpl;
-
-  amd::Command* currCmd_ = nullptr;  //!< Current command under capture
 };
 }
diff --git a/projects/clr/rocclr/platform/command.cpp b/projects/clr/rocclr/platform/command.cpp
index c77bfa2ac0..ebf47b8eb4 100644
--- a/projects/clr/rocclr/platform/command.cpp
+++ b/projects/clr/rocclr/platform/command.cpp
@@ -42,8 +42,8 @@ Event::Event(HostQueue& queue, bool profilingEnabled)
       hw_event_(nullptr),
       notify_event_(nullptr),
       device_(&queue.device()),
-      profilingInfo_(profilingEnabled),
-      event_scope_(Device::kCacheStateInvalid) {
+      profilingInfo_(profilingEnabled) {
+  event_entry_scope_.store(Device::kCacheStateInvalid, std::memory_order_relaxed);
   notified_.clear();
 }
 
@@ -53,8 +53,8 @@ Event::Event()
       status_(CL_SUBMITTED),
       hw_event_(nullptr),
       notify_event_(nullptr),
-      device_(nullptr),
-      event_scope_(Device::kCacheStateInvalid) {
+      device_(nullptr) {
+  event_entry_scope_.store(Device::kCacheStateInvalid, std::memory_order_relaxed);
   notified_.clear();
 }
 
diff --git a/projects/clr/rocclr/platform/command.hpp b/projects/clr/rocclr/platform/command.hpp
index 6e5b5f6eda..68cad257f6 100644
--- a/projects/clr/rocclr/platform/command.hpp
+++ b/projects/clr/rocclr/platform/command.hpp
@@ -88,10 +88,13 @@ class Event : public RuntimeObject {
   std::atomic<CallBackEntry*> callbacks_;  //!< linked list of callback entries.
   std::atomic<int32_t> status_;            //!< current execution status.
   std::atomic_flag notified_;              //!< Command queue was notified
+
   void*  hw_event_;                        //!< HW event ID associated with SW event
   Event* notify_event_;                    //!< Notify event, which should contain HW signal
   const Device* device_;                   //!< Device, this event associated with
-  int32_t event_scope_;                    //!< 2 - system scope, 1 - device scope,
+
+  std::atomic<int32_t> event_entry_scope_; //!< Command entry scope
+                                           //!< 2 - system scope, 1 - device scope,
                                            //!< 0 - ignore, -1 - invalid
 
  protected:
@@ -219,11 +222,15 @@ class Event : public RuntimeObject {
   //! Returns notify even associated with the current command
   Event* NotifyEvent() const { return notify_event_; }
 
-  //! Get release scope of the event
-  int32_t getEventScope() const { return event_scope_; }
+  //! Get entry scope of the event
+  int32_t getCommandEntryScope() const {
+    return event_entry_scope_.load(std::memory_order_relaxed);
+  }
 
-  //! Set release scope for the event
-  void setEventScope(int32_t scope) { event_scope_ = scope; }
+  //! Set entry scope for the event
+  void setCommandEntryScope(int32_t scope) {
+    event_entry_scope_.store(scope, std::memory_order_relaxed);
+  }
 };
 
 union CopyMetadata {
diff --git a/projects/clr/rocclr/platform/commandqueue.cpp b/projects/clr/rocclr/platform/commandqueue.cpp
index ce5d7ae65d..a6d7077358 100644
--- a/projects/clr/rocclr/platform/commandqueue.cpp
+++ b/projects/clr/rocclr/platform/commandqueue.cpp
@@ -133,6 +133,24 @@ bool HostQueue::terminate() {
   return true;
 }
 
+void HostQueue::finishCommand(Command* command) {
+  if (command == nullptr) {
+    command = getLastQueuedCommand(true);
+    if (command != nullptr) {
+      ClPrint(LOG_DEBUG, LOG_CMD, "No command, awaiting complete status on host");
+      command->awaitCompletion();
+      command->release();
+    }
+    return;
+  }
+  // Check hardware event status for the specific command
+  static constexpr bool kWaitCompletion = true;
+  if (!device().IsHwEventReady(command->event(), kWaitCompletion)) {
+    ClPrint(LOG_DEBUG, LOG_CMD, "No HW event, awaiting complete status on host");
+    command->awaitCompletion();
+  }
+}
+
 void HostQueue::finish(bool cpu_wait) {
   Command* command = nullptr;
   if (IS_HIP) {
diff --git a/projects/clr/rocclr/platform/commandqueue.hpp b/projects/clr/rocclr/platform/commandqueue.hpp
index f3e4f45c26..94c63fca53 100644
--- a/projects/clr/rocclr/platform/commandqueue.hpp
+++ b/projects/clr/rocclr/platform/commandqueue.hpp
@@ -233,6 +233,9 @@ class HostQueue : public CommandQueue {
   //! Finish all queued commands
   void finish(bool cpu_wait = false);
 
+  //! Wait until finish of one command
+  void finishCommand(Command* command);
+
   //! Check if hostQueue empty snapshot
   bool isEmpty();