SWDEV-502365 - Track last used command

- This change tries to save extra synchronization packets we may insert as we didnt track the completion signals for every command. We track the current enqueued command until it exits the enqueue stage. We also record the exit scope to know if we flushed the caches - Handle correct release scopes and store completion signal as HW events - Use a new finishCommand implementation to only wait for the command passed as the argument Change-Id: Ie4350c5dd24f5d48dfa6ccbabd892f0544caadcc [ROCm/clr commit: e03e4f3b5d]
2024-12-03 23:45:31 +00:00
@@ -175,7 +175,7 @@ hipError_t Event::streamWaitCommand(amd::Command*& command, hip::Stream* stream)
  command = new amd::Marker(*stream, kMarkerDisableFlush, eventWaitList);
  // Since we only need to have a dependency on an existing event,
  // we may not need to flush any caches.
-  command->setEventScope(amd::Device::kCacheStateIgnore);
+  command->setCommandEntryScope(amd::Device::kCacheStateIgnore);

  if (command == NULL) {
    return hipErrorOutOfMemory;
@@ -87,7 +87,7 @@ class EventMarker : public amd::Marker {
    profilingInfo_.marker_ts_ = markerTs;
    profilingInfo_.batch_flush_ = batch_flush;
    profilingInfo_.clear();
-    setEventScope(scope);
+    setCommandEntryScope(scope);
  }
 };

@@ -739,7 +739,7 @@ hipError_t GraphExec::Run(hipStream_t graph_launch_stream) {
  this->retain();
  amd::Command* CallbackCommand = new amd::Marker(*launch_stream, kMarkerDisableFlush, {});
  // we may not need to flush any caches.
-  CallbackCommand->setEventScope(amd::Device::kCacheStateIgnore);
+  CallbackCommand->setCommandEntryScope(amd::Device::kCacheStateIgnore);
  amd::Event& event = CallbackCommand->event();
  constexpr bool kBlocking = false;
  if (!event.setCallback(CL_COMPLETE, GraphExec::DecrementRefCount, this, kBlocking)) {
@@ -664,13 +664,9 @@ hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKin
    return hipSuccess;
  } else if (((srcMemory == nullptr) && (dstMemory != nullptr)) ||
             ((srcMemory != nullptr) && (dstMemory == nullptr))) {
-    // Don't wait for unpinned H2D copy if staging is used for copy. If dstMemory is not null, it
-    // can still be a pinned host memory, hence the check on dst memory type.
-    isHostAsync &=
-        ((srcMemory == nullptr) && (dstMemory != nullptr && dstMemoryType == hipMemoryTypeDevice) &&
-         AMD_DIRECT_DISPATCH && (sizeBytes <= stream.device().settings().stagedXferSize_))
-        ? true
-        : false;
+    // Unpinned copy wait behavior is enforced in the lower copy layers so skip
+    // wait at top level except for MT path
+    isHostAsync &= AMD_DIRECT_DISPATCH ? true : false;
  } else if (srcMemory->GetDeviceById() == dstMemory->GetDeviceById()) {
    // Device to Device copies do not need to host side synchronization.
    if ((srcMemoryType == hipMemoryTypeDevice) && (dstMemoryType == hipMemoryTypeDevice) &&
@@ -690,7 +686,7 @@ hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKin
  }
  command->enqueue();
  if (!isHostAsync) {
-    command->queue()->finish();
+    command->queue()->finishCommand(command);
  } else if (!isGPUAsync) {
    hip::Stream* pStream = hip::getNullStream(dstMemory->GetDeviceById()->context());
    amd::Command::EventWaitList waitList;
@@ -1826,7 +1822,7 @@ hipError_t ihipMemcpyDtoHCommand(amd::Command*& command, void* dstHost, amd::Coo
  amd::Memory* dstMemory = getMemoryObject(dstHost, dOffset);

  amd::Coord3D srcStart(srcRect.start_, 0, 0);
-  amd::CopyMetadata copyMetadata(isAsync, amd::CopyMetadata::CopyEnginePreference::SDMA);
+  amd::CopyMetadata copyMetadata(isAsync, amd::CopyMetadata::CopyEnginePreference::NONE);
  if (dstMemory) {
    amd::CopyMemoryCommand *copyCommand = new amd::CopyMemoryCommand(
      *stream, CL_COMMAND_COPY_BUFFER_RECT, amd::Command::EventWaitList{},
@@ -1874,7 +1870,7 @@ hipError_t ihipMemcpyHtoDCommand(amd::Command*& command, void* dstDevice, amd::C
  amd::Memory* srcMemory = getMemoryObject(srcHost, sOffset);

  amd::Coord3D dstStart(dstRect.start_, 0, 0);
-  amd::CopyMetadata copyMetadata(isAsync, amd::CopyMetadata::CopyEnginePreference::SDMA);
+  amd::CopyMetadata copyMetadata(isAsync, amd::CopyMetadata::CopyEnginePreference::NONE);
  if (srcMemory) {
    amd::CopyMemoryCommand *copyCommand = new amd::CopyMemoryCommand(
      *stream, CL_COMMAND_COPY_BUFFER_RECT, amd::Command::EventWaitList{},
@@ -1962,7 +1958,7 @@ hipError_t ihipMemcpyHtoACommand(amd::Command*& command, amd::Image* dstImage,
  size_t start = ihipGetbufferStart(static_cast<size_t*>(srcOrigin),
                                    static_cast<size_t*>(copyRegion), srcRowPitch, srcSlicePitch);

-  amd::CopyMetadata copyMetadata(isAsync, amd::CopyMetadata::CopyEnginePreference::SDMA);
+  amd::CopyMetadata copyMetadata(isAsync, amd::CopyMetadata::CopyEnginePreference::NONE);
  if (srcMemory) {
    amd::CopyMemoryCommand *copyCommand = new amd::CopyMemoryCommand(
      *stream, CL_COMMAND_COPY_BUFFER_TO_IMAGE, amd::Command::EventWaitList{},
@@ -2012,7 +2008,7 @@ hipError_t ihipMemcpyAtoHCommand(amd::Command*& command, void* dstHost, amd::Coo
  size_t start = ihipGetbufferStart(static_cast<size_t*>(dstOrigin),
                                    static_cast<size_t*>(copyRegion), dstRowPitch, dstSlicePitch);

-  amd::CopyMetadata copyMetadata(isAsync, amd::CopyMetadata::CopyEnginePreference::SDMA);
+  amd::CopyMetadata copyMetadata(isAsync, amd::CopyMetadata::CopyEnginePreference::NONE);
  if (dstMemory) {
    amd::CopyMemoryCommand *copyCommand = new amd::CopyMemoryCommand(
      *stream, CL_COMMAND_COPY_IMAGE_TO_BUFFER, amd::Command::EventWaitList{},
@@ -2334,7 +2330,7 @@ inline hipError_t ihipMemcpyCmdEnqueue(amd::Command* command, bool isAsync = fal
  }
  command->enqueue();
  if (!isAsync) {
-    command->queue()->finish();
+    command->queue()->finishCommand(command);
  } else if (stream != nullptr) {
    auto* newQueue = command->queue();
    if (newQueue != stream) {
@@ -472,9 +472,9 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
  if (stopEvent != nullptr) {
    hip::Event* eStop = reinterpret_cast<hip::Event*>(stopEvent);
    if (eStop->flags_ & hipEventDisableSystemFence) {
-      command->setEventScope(amd::Device::kCacheStateIgnore);
+      command->setCommandEntryScope(amd::Device::kCacheStateIgnore);
    } else {
-      command->setEventScope(amd::Device::kCacheStateSystem);
+      command->setCommandEntryScope(amd::Device::kCacheStateSystem);
    }
    // Enqueue Dispatch and bind the stop event
    command->enqueue();
@@ -309,7 +309,7 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d

      // Copy memory line by line
      ClPrint(amd::LOG_DEBUG, amd::LOG_COPY,
-              "HSA Async Copy Rect dst=0x%zx, src=0x%zx, wait_event=0x%zx "
+              "HSA Async Copy Rect dst=0x%zx, src=0x%zx, wait_event=0x%zx, "
              "completion_signal=0x%zx", dstMem.base, srcMem.base,
              (wait_events.size() != 0) ? wait_events[0].handle : 0, active.handle);

@@ -1841,11 +1841,17 @@ bool KernelBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemo
        }
        ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Blit staging H2D copy dst=%p, stg buf=%p, "
                "dstOrigin=0x%x, size=%zu", dstAddr, stagingBuffer, origin[0], copySize);
-        // No cache flush is needed here as we use a staging buffer, and the acquire logic
-        // ensures that the cacheline is different and re-used only when L2 is flushed
+        bool kAttachSignal = false;
+        if (copyMetadata.isAsync_ == false) {
+          // If its a blocking call, attach signal to the packet which we can track for
+          // completion. Also flush caches as we may not need another packet to flush caches.
+          kAttachSignal = true;
+          gpu().addSystemScope();
+        }
        result = shaderCopyBuffer(dstAddr, stagingBuffer,
                                  origin, srcOrigin, copySize,
-                                  entire, dev().settings().limit_blit_wg_, copyMetadata);
+                                  entire, dev().settings().limit_blit_wg_,
+                                  copyMetadata, kAttachSignal);
        if (!result) {
          break;
        }
@@ -2233,19 +2239,23 @@ bool KernelBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& ds
  }

  if (!result) {
-    // Flush caches for coherency as the MTYPE of the src buffer may be
-    // non-coherent which mean we need to read it again from memory.
-    // Also if its a device to device copy(intra device), we dont need flush
    // Check CL_MEM_SVM_ATOMICS flag to see if we used system_coarse_segment_
    auto memFlags = srcMemory.owner()->getMemFlags();
    bool srcSvmAtomics = (memFlags & CL_MEM_SVM_ATOMICS) != 0;
-    if (!srcSvmAtomics && srcMemory.isHostMemDirectAccess()) {
+    if ((!srcSvmAtomics && srcMemory.isHostMemDirectAccess()) ||
+        (!copyMetadata.isAsync_)) {
+      // Flush caches for coherency as the MTYPE of the src buffer is
+      // non-coherent(ie read it again from memory).
+      // For device to device copy(intra device), we dont need a flush.
+      // If the source is host memory and the copy is blocking(aka memory need
+      // to be coherent), then add system scope. For non blocking rely on the release
+      // scope issued by synchronization packet.
      gpu().addSystemScope();
    }
    result = shaderCopyBuffer(reinterpret_cast<address>(dstMemory.virtualAddress()),
                              reinterpret_cast<address>(srcMemory.virtualAddress()),
                              dstOrigin, srcOrigin, sizeIn,
-                              entire, blitWg, copyMetadata);
+                              entire, blitWg, copyMetadata, !copyMetadata.isAsync_);
  }

  synchronize();
@@ -502,7 +502,19 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal(
  prof_signal->flags_.done_ = false;
  prof_signal->engine_ = engine_;
  prof_signal->flags_.isPacketDispatch_ = false;
-  if (ts != 0) {
+
+  // Store the HW event
+  amd::Command* cmd = gpu_.command();
+  if (nullptr != cmd) {
+    // Release any existing HwEvent before setting new one for the same command
+    if (cmd->HwEvent() != nullptr) {
+      reinterpret_cast<ProfilingSignal*>(cmd->HwEvent())->release();
+    }
+    cmd->SetHwEvent(prof_signal);
+    prof_signal->retain();
+  }
+
+  if (ts != nullptr) {
    // Save HSA signal earlier to make sure the possible callback will have a valid
    // value for processing
    ts->retain();
@@ -533,13 +545,6 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal(
          ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Set Handler: handle(0x%lx), timestamp(%p)",
            prof_signal->signal_.handle, prof_signal);
        }
-        // Update the current command/marker with HW event
-        prof_signal->retain();
-        ts->command().SetHwEvent(prof_signal);
-      } else if (ts->command().profilingInfo().marker_ts_) {
-        // Update the current command/marker with HW event
-        prof_signal->retain();
-        ts->command().SetHwEvent(prof_signal);
      }
    }
  }
@@ -1133,7 +1138,7 @@ inline bool VirtualGPU::dispatchAqlPacket(
  dispatchGenericAqlPacket(packet, packetHeader, packet->setup, false);
  packet->header = packetHeader;

-  profilingEnd(*vcmd);
+  profilingEnd();

  return true;
 }
@@ -1379,6 +1384,7 @@ VirtualGPU::VirtualGPU(Device& device, bool profiling, bool cooperative,

  // Initialize the last signal and dispatch flags
  timestamp_ = nullptr;
+  command_ = nullptr;
  hasPendingDispatch_ = false;
  profiling_ = profiling;
  cooperative_ = cooperative;
@@ -1631,6 +1637,9 @@ address VirtualGPU::allocKernelArguments(size_t size, size_t alignment) {
 * and then calls start() to get the current host timestamp.
 */
 void VirtualGPU::profilingBegin(amd::Command& command, bool sdmaProfiling) {
+  // Track the current command
+  command_ = &command;
+
  // Disable profiling when command is being captured to prevent memory leak from created timestamp_
  // which won't get freed, since the command is not being executed until graph launch
  if (!command.getPktCapturingState() && command.profilingInfo().enabled_) {
@@ -1669,9 +1678,6 @@ void VirtualGPU::profilingBegin(amd::Command& command, bool sdmaProfiling) {
      }
    }
  }
-  if (command.getPktCapturingState()) {
-    currCmd_ = &command;
-  }
 }

 // ================================================================================================
@@ -1679,8 +1685,8 @@ void VirtualGPU::profilingBegin(amd::Command& command, bool sdmaProfiling) {
 * created for whatever command we are running and calls end() to get the
 * current host timestamp if no signal is available.
 */
-void VirtualGPU::profilingEnd(amd::Command& command) {
-  if (!command.getPktCapturingState() && command.profilingInfo().enabled_) {
+void VirtualGPU::profilingEnd(bool clearHwEvent) {
+  if (!command_->getPktCapturingState() && command_->profilingInfo().enabled_) {
    if (timestamp_->HwProfiling() == false) {
      timestamp_->end();
    }
@@ -1689,7 +1695,19 @@ void VirtualGPU::profilingEnd(amd::Command& command) {
  if (AMD_DIRECT_DISPATCH) {
    assert(retainExternalSignals_ || Barriers().IsExternalSignalListEmpty());
  }
-  currCmd_ = nullptr;
+
+  // Certain commands like map/unmap memory may not need hw_events as its not a
+  // queue operation. In such cases clear already set events which may have been for sync
+  // before some memory map/unmap operation
+  if (clearHwEvent) {
+    if (command_->HwEvent() != nullptr) {
+      reinterpret_cast<ProfilingSignal*>(command_->HwEvent())->release();
+      command_->SetHwEvent(nullptr);
+    }
+  }
+
+  // Clear the command tracking
+  command_ = nullptr;
 }

 // ================================================================================================
@@ -1877,7 +1895,7 @@ void VirtualGPU::submitReadMemory(amd::ReadMemoryCommand& cmd) {
    cmd.setStatus(CL_OUT_OF_RESOURCES);
  }

-  profilingEnd(cmd);
+  profilingEnd();
 }

 void VirtualGPU::submitWriteMemory(amd::WriteMemoryCommand& cmd) {
@@ -1973,7 +1991,7 @@ void VirtualGPU::submitWriteMemory(amd::WriteMemoryCommand& cmd) {
    cmd.destination().signalWrite(&dev());
  }

-  profilingEnd(cmd);
+  profilingEnd();
 }

 // ================================================================================================
@@ -1995,7 +2013,7 @@ void VirtualGPU::submitSvmFreeMemory(amd::SvmFreeMemoryCommand& cmd) {
    cmd.pfnFreeFunc()(as_cl(cmd.queue()->asCommandQueue()), svmPointers.size(),
                      (void**)(&(svmPointers[0])), cmd.userData());
  }
-  profilingEnd(cmd);
+  profilingEnd();
 }

 // ================================================================================================
@@ -2018,9 +2036,12 @@ void VirtualGPU::submitSvmPrefetchAsync(amd::SvmPrefetchAsyncCommand& cmd) {
    hsa_status_t status = hsa_amd_svm_prefetch_async(
        const_cast<void*>(cmd.dev_ptr()), cmd.count(), agent,
        wait_events.size(), wait_events.data(), active);
+    ClPrint(amd::LOG_DEBUG, amd::LOG_COPY,
+      "HSA prefetch async dev_ptr=0x%zx, count=%d, wait_event=0x%zx, "
+      "completion_signal=0x%zx", const_cast<void*>(cmd.dev_ptr()), cmd.count(),
+      (wait_events.size() != 0) ? wait_events[0].handle : 0, active.handle);

-    // Wait for the prefetch. Should skip wait, but may require extra tracking for kernel execution
-    if ((status != HSA_STATUS_SUCCESS) || !Barriers().WaitCurrent()) {
+    if ((status != HSA_STATUS_SUCCESS)) {
      Barriers().ResetCurrentSignal();
      LogError("hsa_amd_svm_prefetch_async failed");
      cmd.setStatus(CL_INVALID_OPERATION);
@@ -2031,7 +2052,7 @@ void VirtualGPU::submitSvmPrefetchAsync(amd::SvmPrefetchAsyncCommand& cmd) {
  } else {
    LogWarning("hsa_amd_svm_prefetch_async is ignored, because no HMM support");
  }
-  profilingEnd(cmd);
+  profilingEnd();
 }

 // ================================================================================================
@@ -2145,7 +2166,7 @@ void VirtualGPU::submitCopyMemory(amd::CopyMemoryCommand& cmd) {
    cmd.OverrrideCommandType(copy_command_type_);
    copy_command_type_ = 0;
  }
-  profilingEnd(cmd);
+  profilingEnd();
 }

 // ================================================================================================
@@ -2227,7 +2248,7 @@ void VirtualGPU::submitSvmCopyMemory(amd::SvmCopyMemoryCommand& cmd) {
    // direct memcpy for FGS enabled system
    amd::SvmBuffer::memFill(cmd.dst(), cmd.src(), cmd.srcSize(), 1);
  }
-  profilingEnd(cmd);
+  profilingEnd();
 }

 // ================================================================================================
@@ -2385,7 +2406,7 @@ void VirtualGPU::submitCopyMemoryP2P(amd::CopyMemoryP2PCommand& cmd) {

  cmd.destination().signalWrite(&dstDevMem->dev());

-  profilingEnd(cmd);
+  profilingEnd();
 }

 // ================================================================================================
@@ -2423,7 +2444,7 @@ void VirtualGPU::submitSvmMapMemory(amd::SvmMapMemoryCommand& cmd) {
    }
  }

-  profilingEnd(cmd);
+  profilingEnd();
 }

 // ================================================================================================
@@ -2463,7 +2484,7 @@ void VirtualGPU::submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd) {
    memory->clearUnmapInfo(cmd.svmPtr());
  }

-  profilingEnd(cmd);
+  profilingEnd();
 }

 // ================================================================================================
@@ -2567,7 +2588,7 @@ void VirtualGPU::submitMapMemory(amd::MapMemoryCommand& cmd) {
    }
  }

-  profilingEnd(cmd);
+  profilingEnd();
 }

 // ================================================================================================
@@ -2659,7 +2680,7 @@ void VirtualGPU::submitUnmapMemory(amd::UnmapMemoryCommand& cmd) {

  devMemory->clearUnmapInfo(cmd.mapPtr());

-  profilingEnd(cmd);
+  profilingEnd();
 }

 // ================================================================================================
@@ -2735,18 +2756,15 @@ void VirtualGPU::submitFillMemory(amd::FillMemoryCommand& cmd) {

  bool force_blit = false;
  if (amd::IS_HIP) {
-    constexpr uint32_t kManagedAlloc = (CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_ALLOC_HOST_PTR);
-    // In case of HMM, use blit kernel instead of CPU memcpy
-    if ((cmd.memory().getMemFlags() & kManagedAlloc) == kManagedAlloc) {
-      force_blit = true;
-    }
+    // Always use blit for memset for HIP.
+    force_blit = true;
  }

  if (!fillMemory(cmd.type(), &cmd.memory(), cmd.pattern(), cmd.patternSize(),
    cmd.surface(), cmd.origin(), cmd.size(), force_blit)) {
    cmd.setStatus(CL_INVALID_OPERATION);
  }
-  profilingEnd(cmd);
+  profilingEnd();
 }

 // ================================================================================================
@@ -2825,7 +2843,7 @@ void VirtualGPU::submitStreamOperation(amd::StreamOperationCommand& cmd) {
  } else {
    ShouldNotReachHere();
  }
-  profilingEnd(cmd);
+  profilingEnd();
 }

 // ================================================================================================
@@ -2838,8 +2856,9 @@ void VirtualGPU::submitBatchMemoryOperation(amd::BatchMemoryOperationCommand& cm
  if (!result) {
    LogError("submitBatchMemoryOperation failed!");
  }
-  profilingEnd(cmd);
+  profilingEnd();
 }
+
 // ================================================================================================
 void VirtualGPU::submitVirtualMap(amd::VirtualMapCommand& vcmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
@@ -2850,7 +2869,7 @@ void VirtualGPU::submitVirtualMap(amd::VirtualMapCommand& vcmd) {
  // Find the amd::Memory object for virtual ptr. vcmd.ptr() is vaddr.
  amd::Memory* vaddr_base_obj = amd::MemObjMap::FindVirtualMemObj(vcmd.ptr());
  if (vaddr_base_obj == nullptr || !(vaddr_base_obj->getMemFlags() & CL_MEM_VA_RANGE_AMD)) {
-    profilingEnd(vcmd);
+    profilingEnd();
    return;
  }

@@ -2906,7 +2925,10 @@ void VirtualGPU::submitVirtualMap(amd::VirtualMapCommand& vcmd) {
    }
  }

-  profilingEnd(vcmd);
+  // Since this is a memory operation, the HW event set for barrier packet
+  // may not encapsulate what the command wants to do. Hence clear the hw_event
+  constexpr bool kClearHwEvent = true;
+  profilingEnd(kClearHwEvent);
 }

 // ================================================================================================
@@ -2945,7 +2967,7 @@ void VirtualGPU::submitSvmFillMemory(amd::SvmFillMemoryCommand& cmd) {
    amd::SvmBuffer::memFill(cmd.dst(), cmd.pattern(), cmd.patternSize(), cmd.times());
  }

-  profilingEnd(cmd);
+  profilingEnd();
 }

 // ================================================================================================
@@ -2976,7 +2998,7 @@ void VirtualGPU::submitMigrateMemObjects(amd::MigrateMemObjectsCommand& vcmd) {
    }
  }

-  profilingEnd(vcmd);
+  profilingEnd();
 }

 // ================================================================================================
@@ -3264,7 +3286,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,

  amd::Memory* const* memories =
      reinterpret_cast<amd::Memory* const*>(parameters + kernelParams.memoryObjOffset());
-  bool isGraphCapture = currCmd_ != nullptr && currCmd_->getPktCapturingState();
+  bool isGraphCapture = command_ != nullptr && command_->getPktCapturingState();
  for (int j = 0; j < iteration; j++) {
    // Reset global size for dimension dim if split is needed
    if (dim != -1) {
@@ -3485,9 +3507,9 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
    if (!kernel.parameters().deviceKernelArgs() || gpuKernel.isInternalKernel()) {
      // Allocate buffer to hold kernel arguments
      if (isGraphCapture) {
-        argBuffer = currCmd_->getKernArgOffset(gpuKernel.KernargSegmentByteSize(),
+        argBuffer = command_->getKernArgOffset(gpuKernel.KernargSegmentByteSize(),
                                               gpuKernel.KernargSegmentAlignment());
-        currCmd_->SetKernelName(gpuKernel.name());
+        command_->SetKernelName(gpuKernel.name());
      } else {
        ClPrint(amd::LOG_INFO, amd::LOG_KERN, "KernargSegmentByteSize = %lu "
                "KernargSegmentAlignment = %lu", gpuKernel.KernargSegmentByteSize(),
@@ -3558,7 +3580,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
      aqlHeaderWithOrder &= kAqlHeaderMask;
    }

-    if (vcmd != nullptr && vcmd->getEventScope() == amd::Device::kCacheStateSystem) {
+    if (vcmd != nullptr && vcmd->getCommandEntryScope() == amd::Device::kCacheStateSystem) {
      addSystemScope_ = true;
    }

@@ -3576,8 +3598,8 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
      // Dispatch the packet
      if (!dispatchAqlPacket(&dispatchPacket, aqlHeaderWithOrder,
                             (sizes.dimensions() << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS),
-                             GPU_FLUSH_ON_EXECUTION, currCmd_->getPktCapturingState(),
-                             currCmd_->getAqlPacket())) {
+                             GPU_FLUSH_ON_EXECUTION, command_->getPktCapturingState(),
+                             command_->getAqlPacket())) {
        return false;
      }
    } else {
@@ -3676,7 +3698,7 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
    hasPendingDispatch_ = true;
    retainExternalSignals_ = true;

-    queue->profilingEnd(vcmd);
+    queue->profilingEnd();
  } else {
    // Make sure VirtualGPU has an exclusive access to the resources
    amd::ScopedLock lock(execution());
@@ -3690,7 +3712,7 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
      vcmd.setStatus(CL_INVALID_OPERATION);
    }

-    profilingEnd(vcmd);
+    profilingEnd();
  }
 }

@@ -3711,7 +3733,7 @@ void VirtualGPU::submitMarker(amd::Marker& vcmd) {
      profilingBegin(vcmd);
      if (timestamp_ != nullptr) {
        const Settings& settings = dev().settings();
-        int32_t releaseFlags = vcmd.getEventScope();
+        int32_t releaseFlags = vcmd.getCommandEntryScope();
        if (releaseFlags == Device::CacheState::kCacheStateIgnore) {
          if (settings.barrier_value_packet_ && vcmd.profilingInfo().marker_ts_) {
            dispatchBarrierValuePacket(kBarrierVendorPacketNopScopeHeader, true);
@@ -3728,7 +3750,7 @@ void VirtualGPU::submitMarker(amd::Marker& vcmd) {
          hasPendingDispatch_ = false;
        }
      }
-      profilingEnd(vcmd);
+      profilingEnd();
    }

  }
@@ -3747,7 +3769,7 @@ void VirtualGPU::submitAccumulate(amd::AccumulateCommand& vcmd) {
    dispatchBarrierPacket(kNopPacketHeader, false);
  }

-  profilingEnd(vcmd);
+  profilingEnd();
 }

 // ================================================================================================
@@ -3757,7 +3779,7 @@ void VirtualGPU::submitAcquireExtObjects(amd::AcquireExtObjectsCommand& vcmd) {

  profilingBegin(vcmd);
  addSystemScope();
-  profilingEnd(vcmd);
+  profilingEnd();
 }

 // ================================================================================================
@@ -3765,7 +3787,7 @@ void VirtualGPU::submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& vcmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());
  profilingBegin(vcmd);
-  profilingEnd(vcmd);
+  profilingEnd();
 }

 // ================================================================================================
@@ -327,7 +327,7 @@ class VirtualGPU : public device::VirtualDevice {
  const Device& dev() const { return roc_device_; }

  void profilingBegin(amd::Command& command, bool sdmaProfiling = false);
-  void profilingEnd(amd::Command& command);
+  void profilingEnd(bool clearHwEvent = false);

  void updateCommandsState(amd::Command* list) const;

@@ -428,6 +428,7 @@ class VirtualGPU : public device::VirtualDevice {
  HwQueueTracker& Barriers() { return barriers_; }

  Timestamp* timestamp() const { return timestamp_; }
+  amd::Command* command() const { return command_; }

  void* allocKernArg(size_t size, size_t alignment);
  bool isFenceDirty() const { return fence_dirty_; }
@@ -528,6 +529,7 @@ class VirtualGPU : public device::VirtualDevice {
  };

  Timestamp* timestamp_;
+  amd::Command* command_;   //!< Current command
  hsa_agent_t gpu_device_;  //!< Physical device
  hsa_queue_t* gpu_queue_;  //!< Queue associated with a gpu
  hsa_barrier_and_packet_t barrier_packet_;
@@ -579,7 +581,5 @@ class VirtualGPU : public device::VirtualDevice {
  std::atomic<uint> lastUsedSdmaEngineMask_;     //!< Last Used SDMA Engine mask

  using KernelArgImpl = device::Settings::KernelArgImpl;
-
-  amd::Command* currCmd_ = nullptr;  //!< Current command under capture
 };
 }
@@ -42,8 +42,8 @@ Event::Event(HostQueue& queue, bool profilingEnabled)
      hw_event_(nullptr),
      notify_event_(nullptr),
      device_(&queue.device()),
-      profilingInfo_(profilingEnabled),
-      event_scope_(Device::kCacheStateInvalid) {
+      profilingInfo_(profilingEnabled) {
+  event_entry_scope_.store(Device::kCacheStateInvalid, std::memory_order_relaxed);
  notified_.clear();
 }

@@ -53,8 +53,8 @@ Event::Event()
      status_(CL_SUBMITTED),
      hw_event_(nullptr),
      notify_event_(nullptr),
-      device_(nullptr),
-      event_scope_(Device::kCacheStateInvalid) {
+      device_(nullptr) {
+  event_entry_scope_.store(Device::kCacheStateInvalid, std::memory_order_relaxed);
  notified_.clear();
 }

@@ -88,10 +88,13 @@ class Event : public RuntimeObject {
  std::atomic<CallBackEntry*> callbacks_;  //!< linked list of callback entries.
  std::atomic<int32_t> status_;            //!< current execution status.
  std::atomic_flag notified_;              //!< Command queue was notified
+
  void*  hw_event_;                        //!< HW event ID associated with SW event
  Event* notify_event_;                    //!< Notify event, which should contain HW signal
  const Device* device_;                   //!< Device, this event associated with
-  int32_t event_scope_;                    //!< 2 - system scope, 1 - device scope,
+
+  std::atomic<int32_t> event_entry_scope_; //!< Command entry scope
+                                           //!< 2 - system scope, 1 - device scope,
                                           //!< 0 - ignore, -1 - invalid

 protected:
@@ -219,11 +222,15 @@ class Event : public RuntimeObject {
  //! Returns notify even associated with the current command
  Event* NotifyEvent() const { return notify_event_; }

-  //! Get release scope of the event
-  int32_t getEventScope() const { return event_scope_; }
+  //! Get entry scope of the event
+  int32_t getCommandEntryScope() const {
+    return event_entry_scope_.load(std::memory_order_relaxed);
+  }

-  //! Set release scope for the event
-  void setEventScope(int32_t scope) { event_scope_ = scope; }
+  //! Set entry scope for the event
+  void setCommandEntryScope(int32_t scope) {
+    event_entry_scope_.store(scope, std::memory_order_relaxed);
+  }
 };

 union CopyMetadata {
@@ -133,6 +133,24 @@ bool HostQueue::terminate() {
  return true;
 }

+void HostQueue::finishCommand(Command* command) {
+  if (command == nullptr) {
+    command = getLastQueuedCommand(true);
+    if (command != nullptr) {
+      ClPrint(LOG_DEBUG, LOG_CMD, "No command, awaiting complete status on host");
+      command->awaitCompletion();
+      command->release();
+    }
+    return;
+  }
+  // Check hardware event status for the specific command
+  static constexpr bool kWaitCompletion = true;
+  if (!device().IsHwEventReady(command->event(), kWaitCompletion)) {
+    ClPrint(LOG_DEBUG, LOG_CMD, "No HW event, awaiting complete status on host");
+    command->awaitCompletion();
+  }
+}
+
 void HostQueue::finish(bool cpu_wait) {
  Command* command = nullptr;
  if (IS_HIP) {
@@ -233,6 +233,9 @@ class HostQueue : public CommandQueue {
  //! Finish all queued commands
  void finish(bool cpu_wait = false);

+  //! Wait until finish of one command
+  void finishCommand(Command* command);
+
  //! Check if hostQueue empty snapshot
  bool isEmpty();