SWDEV-465041 - Add support for user events with DD (#321)

* SWDEV-465041 - Add support for user events with DD User events can be replaced with HSA signals. Add the interface to allocate HSA signal for user events and update the status on CL_COMPLETE. Force pinned path with DD to avoid blocking calls. Pinned memory can be released only when the command is complete. Simplify device enqueue path to use generic kernel arg buffer and signals * Fix notifyCmdQueue() logic for OCL * Avoid blocking calls in OCL with DD * Add event destruciton in a case of the failure. [ROCm/clr commit: 2305f8ae56]
2025-08-12 19:04:36 -04:00
@@ -251,15 +251,16 @@ RUNTIME_ENTRY_RET(cl_event, clCreateUserEvent, (cl_context context, cl_int* errc
    return (cl_event)0;
  }

-  amd::Event* event = new amd::UserEvent(*as_amd(context));
-  if (event == NULL) {
+  auto event = new amd::UserEvent(*as_amd(context));
+  if (event == nullptr || !event->Create()) {
+    delete event;
    *not_null(errcode_ret) = CL_OUT_OF_HOST_MEMORY;
    return (cl_event)0;
  }

  event->retain();
  *not_null(errcode_ret) = CL_SUCCESS;
-  return as_cl(event);
+  return as_cl(reinterpret_cast<amd::Event*>(event));
 }
 RUNTIME_EXIT

@@ -288,8 +289,8 @@ RUNTIME_ENTRY(cl_int, clSetUserEventStatus, (cl_event event, cl_int execution_st
  if (execution_status > CL_COMPLETE) {
    return CL_INVALID_VALUE;
  }
-
-  if (!as_amd(event)->setStatus(execution_status)) {
+  auto user_event = reinterpret_cast<amd::UserEvent*>(as_amd(event));
+  if (!user_event->SetExecutionStatus(execution_status)) {
    return CL_INVALID_OPERATION;
  }
  return CL_SUCCESS;
@@ -95,6 +95,7 @@ class StreamOperationCommand;
 class BatchMemoryOperationCommand;
 class VirtualMapCommand;
 class ExternalSemaphoreCmd;
+class UserEvent;
 class Isa;
 class Device;
 struct KernelParameterDescriptor;
@@ -1329,6 +1330,7 @@ class VirtualDevice : public amd::HeapObject {
    ShouldNotReachHere();
  }
  virtual void submitVirtualMap(amd::VirtualMapCommand& cmd) { ShouldNotReachHere(); }
+  virtual void submitUserEvent(amd::UserEvent& vcmd) { ShouldNotReachHere(); }

  virtual address allocKernelArguments(size_t size, size_t alignment) { return nullptr; }
  virtual void ReleaseAllHwQueues() {}
@@ -2037,6 +2039,13 @@ class Device : public RuntimeObject {
    return (info().svmCapabilities_ & CL_DEVICE_SVM_ATOMICS) != 0 ? true : false;
  }

+  /// @brief  Creates HW user event for OpenCL implementation
+  /// @return The pointer to a HW event structure, known to the HW backend
+  virtual bool CreateUserEvent(amd::UserEvent* event) const { return false; }
+
+  /// @brief  Sets HW user event to the complete status
+  virtual void SetUserEvent(amd::UserEvent* event) const {}
+
  //! Returns TRUE if the device is available for computations
  bool isOnline() const { return online_; }

@@ -54,8 +54,8 @@ bool DmaBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
                                const amd::Coord3D& origin, const amd::Coord3D& size,
                                bool entire, amd::CopyMetadata copyMetadata) const {
  // Use host copy if memory has direct access
-  if (setup_.disableReadBuffer_ ||
-      (srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached())) {
+  if (dev().settings().blocking_blit_ && (setup_.disableReadBuffer_ ||
+      (srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached()))) {
    // Stall GPU before CPU access
    gpu().releaseGpuMemoryFence();
    return HostBlitManager::readBuffer(srcMemory, dstHost, origin, size, entire, copyMetadata);
@@ -138,8 +138,9 @@ bool DmaBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemory,
                                 const amd::Coord3D& origin, const amd::Coord3D& size,
                                 bool entire, amd::CopyMetadata copyMetadata) const {
  // Use host copy if memory has direct access
-  if (setup_.disableWriteBuffer_ || dstMemory.isHostMemDirectAccess() ||
-      gpuMem(dstMemory).IsPersistentDirectMap()) {
+  if (dev().settings().blocking_blit_ &&
+      (setup_.disableWriteBuffer_ || dstMemory.isHostMemDirectAccess() ||
+       gpuMem(dstMemory).IsPersistentDirectMap())) {
    // Stall GPU before CPU access
    gpu().releaseGpuMemoryFence();
    return HostBlitManager::writeBuffer(srcHost, dstMemory, origin, size, entire, copyMetadata);
@@ -685,6 +686,7 @@ void DmaBlitManager::getBuffer(const_address hostMem, size_t size,
  buffState.buffer_ = gpu().Staging().Acquire(std::min(xferSize, StagingXferSize));
 }

+// ================================================================================================
 void DmaBlitManager::releaseBuffer(BufferState &buffer) const {
  if (buffer.pinnedMem_) {
    gpu().addPinnedMem(buffer.pinnedMem_);
@@ -696,7 +698,7 @@ bool DmaBlitManager::hsaCopyStagedOrPinned(const_address hostSrc, address hostDs
                size_t size, bool hostToDev, amd::CopyMetadata& copyMetadata,
                bool enablePin) const {
  // Do not skip wait here for D2H. Resolving dependent signals for SDMA engine is slow
-  gpu().releaseGpuMemoryFence(hostToDev);
+  gpu().releaseGpuMemoryFence(hostToDev || !dev().settings().blocking_blit_);
  // If Pinning is enabled, Pin host Memory for copy size > MinSizeForPinnedTransfer
  // For 16KB < size <= MinSizeForPinnedTransfer Use staging buffer without pinning
  bool status = true;
@@ -740,9 +742,9 @@ bool DmaBlitManager::hsaCopyStagedOrPinned(const_address hostSrc, address hostDs
              copyMetadata.isAsync_);
      const_address src = static_cast<const_address>(hostSrc) + copyOffset;
      status = rocrCopyBuffer(stagingBuffer, dstAgent, src , srcAgent, copysize, copyMetadata);
-      if (status ) {
-        // Wait for current signal of previous rocr copy if its not pinned mem
+      if (status) {
        if (outBuffer.pinnedMem_ == nullptr) {
+          // Wait for current signal of previous rocr copy if its not pinned mem
          gpu().Barriers().WaitCurrent();
          ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "memcpy host dst=%p, stg buf=%p, size=%zu",
                  hostDst + copyOffset, stagingBuffer, copysize);
@@ -752,6 +754,7 @@ bool DmaBlitManager::hsaCopyStagedOrPinned(const_address hostSrc, address hostDs
        break;
      }
    }
+
    // Release Pinned Memory back to pool if any
    releaseBuffer(outBuffer);
    // Update Offset and Transfer Size
@@ -760,12 +763,18 @@ bool DmaBlitManager::hsaCopyStagedOrPinned(const_address hostSrc, address hostDs
    firstTx = false;
  }

+  // @note: HIP requires a blocking wait on D2H with the pageable system memory
+  if (amd::IS_HIP && !hostToDev) {
+    gpu().Barriers().WaitCurrent();
+  }
+
  if(!status) {
    return false;
  }

  return true;
 }
+
 // ================================================================================================
 KernelBlitManager::KernelBlitManager(VirtualGPU& gpu, Setup setup)
    : DmaBlitManager(gpu, setup),
@@ -1718,8 +1727,9 @@ bool KernelBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
  bool result = false;

  // Use host copy if memory has direct access
-  if (setup_.disableReadBuffer_ || (srcMemory.isHostMemDirectAccess() &&
-      !srcMemory.isCpuUncached())) {
+  if (dev().settings().blocking_blit_ &&
+      (setup_.disableReadBuffer_ || (srcMemory.isHostMemDirectAccess() &&
+      !srcMemory.isCpuUncached()))) {
    // Stall GPU before CPU access
    gpu().releaseGpuMemoryFence();
    result = HostBlitManager::readBuffer(srcMemory, dstHost, origin, size, entire, copyMetadata);
@@ -1854,8 +1864,9 @@ bool KernelBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemo
  bool result = false;

  // Use host copy if memory has direct access
-  if (setup_.disableWriteBuffer_ || dstMemory.isHostMemDirectAccess() ||
-      gpuMem(dstMemory).IsPersistentDirectMap()) {
+  if (dev().settings().blocking_blit_ &&
+      (setup_.disableWriteBuffer_ || dstMemory.isHostMemDirectAccess() ||
+       gpuMem(dstMemory).IsPersistentDirectMap())) {
    // Stall GPU before CPU access
    gpu().releaseGpuMemoryFence();
    result = HostBlitManager::writeBuffer(srcHost, dstMemory, origin, size, entire, copyMetadata);
@@ -2718,10 +2729,9 @@ void KernelBlitManager::releaseArguments(address args) const {
 }

 // ================================================================================================
-bool KernelBlitManager::runScheduler(uint64_t vqVM, amd::Memory* schedulerParam,
+bool KernelBlitManager::runScheduler(uint64_t vqVM,
                                     hsa_queue_t* schedulerQueue,
-                                     hsa_signal_t& schedulerSignal,
-                                     uint threads) {
+                                     uint threads, uint64_t aql_wrap) {
  size_t globalWorkOffset[1] = {0};
  size_t globalWorkSize[1] = {threads};
  size_t localWorkSize[1] = {1};
@@ -2731,21 +2741,16 @@ bool KernelBlitManager::runScheduler(uint64_t vqVM, amd::Memory* schedulerParam,
  device::Kernel* devKernel = const_cast<device::Kernel*>(kernels_[Scheduler]->getDeviceKernel(dev()));
  Kernel& gpuKernel = static_cast<Kernel&>(*devKernel);

-  SchedulerParam* sp = reinterpret_cast<SchedulerParam*>(schedulerParam->getHostMem());
+  auto* sp = reinterpret_cast<SchedulerParam*>(
+    gpu().allocKernArg(sizeof(SchedulerParam), kCBAlignment));
  memset(sp, 0, sizeof(SchedulerParam));

-  Memory* schedulerMem = dev().getRocMemory(schedulerParam);
-  sp->kernarg_address = reinterpret_cast<uint64_t>(schedulerMem->getDeviceMemory());
+  sp->kernarg_address = reinterpret_cast<uint64_t>(sp);
  sp->thread_counter = 0;
  sp->child_queue = reinterpret_cast<uint64_t>(schedulerQueue);
-  sp->complete_signal = schedulerSignal;
-
-  hsa_signal_store_relaxed(schedulerSignal, kInitSignalValueOne);
-
-
+  sp->complete_signal = gpu().Barriers().ActiveSignal(kInitSignalValueOne, nullptr);
  sp->vqueue_header = vqVM;
-
-  sp->parentAQL = sp->kernarg_address + sizeof(SchedulerParam);
+  sp->parentAQL = reinterpret_cast<uint64_t>(aql_wrap);

  if (dev().info().maxEngineClockFrequency_ > 0) {
    sp->eng_clk = (1000 * 1024) / dev().info().maxEngineClockFrequency_;
@@ -2754,8 +2759,8 @@ bool KernelBlitManager::runScheduler(uint64_t vqVM, amd::Memory* schedulerParam,
  // Use a device side global atomics to workaround the reliance of PCIe 3 atomics
  sp->write_index = hsa_queue_load_write_index_relaxed(schedulerQueue);

-  cl_mem mem = as_cl<amd::Memory>(schedulerParam);
-  setArgument(kernels_[Scheduler], 0, sizeof(cl_mem), &mem);
+  constexpr bool kDirectVa = true;
+  setArgument(kernels_[Scheduler], 0, sizeof(cl_mem), sp, 0, nullptr, kDirectVa);

  address parameters = captureArguments(kernels_[Scheduler]);

@@ -2764,12 +2769,17 @@ bool KernelBlitManager::runScheduler(uint64_t vqVM, amd::Memory* schedulerParam,
    return false;
  }
  releaseArguments(parameters);
-
-  if (!WaitForSignal(schedulerSignal)) {
+  // Wait for the scheduler to finish all operations
+  gpu().WaitCompleteSignal(sp->complete_signal);
+  // @note: A wait shouldn't be really necessary, but the queue write_index may not get a proper
+  // value without the wait for all previous commands (see the PCIE3 atomics workaround above).
+  // The scheduler can enqueue extra commands, but the real queue write index didn't have any
+  // progress. That leads to hangs and requires blocking. Then the wait causes problems in DD mode
+  // with device enqueue and user events, because device enqueue is blocking below
+  if (!WaitForSignal(sp->complete_signal)) {
    LogWarning("Failed schedulerSignal wait");
    return false;
  }
-
  return true;
 }

@@ -492,10 +492,8 @@ class KernelBlitManager : public DmaBlitManager {
                         ) const;

  bool runScheduler(uint64_t vqVM,
-                    amd::Memory* schedulerParam,
                    hsa_queue_t* schedulerQueue,
-                    hsa_signal_t& schedulerSignal,
-                    uint threads);
+                    uint threads, uint64_t aql_wrap);

  //! Runs a blit kernel for GWS init
  bool RunGwsInit(uint32_t value             //!< Initial value for GWS resource
@@ -3464,6 +3464,25 @@ void Device::ReleaseGlobalSignal(void* signal) const {
  }
 }

+// ================================================================================================
+bool Device::CreateUserEvent(amd::UserEvent* event) const {
+  std::unique_ptr<ProfilingSignal> signal(new ProfilingSignal());
+  if ((signal == nullptr) ||
+      (HSA_STATUS_SUCCESS != hsa_signal_create(0, 0, nullptr, &signal->signal_))) {
+    return false;
+  }
+  hsa_signal_silent_store_relaxed(signal->signal_, kInitSignalValueOne);
+  event->SetHwEvent(signal.release());
+  return true;
+}
+
+// ================================================================================================
+void Device::SetUserEvent(amd::UserEvent* event) const {
+  auto signal = reinterpret_cast<ProfilingSignal*>(event->HwEvent());
+  assert(signal != nullptr && "Can't have user event without hw event!");
+  hsa_signal_silent_store_relaxed(signal->signal_, 0);
+}
+
 // ================================================================================================
 bool Device::IsValidAllocation(const void* dev_ptr, size_t size, hsa_amd_pointer_info_t* ptr_info) {
  // Query ptr type to see if it's a HMM allocation
@@ -450,6 +450,8 @@ class Device : public NullDevice {
                              uint32_t hip_event_flags = 0) const;
  virtual void getHwEventTime(const amd::Event& event, uint64_t* start, uint64_t* end) const;
  virtual void ReleaseGlobalSignal(void* signal) const;
+  virtual bool CreateUserEvent(amd::UserEvent* event) const;
+  virtual void SetUserEvent(amd::UserEvent* event) const;

  //! Allocate host memory in terms of numa policy set by user
  void* hostNumaAlloc(size_t size, size_t alignment, MemorySegment mem_seg) const;
@@ -97,6 +97,8 @@ Settings::Settings() {
  limit_blit_wg_ = 16;

  dynamic_queues_ = amd::IS_HIP ? DEBUG_HIP_DYNAMIC_QUEUES : false;
+  // note: OCL user events don't allow CPU blocking calls in DD mode
+  blocking_blit_ = amd::IS_HIP || !AMD_DIRECT_DISPATCH;
 }

 // ================================================================================================
@@ -51,7 +51,8 @@ class Settings : public device::Settings {
      uint fgs_kernel_arg_ : 1;         //!< Use fine grain kernel arg segment
      uint barrier_value_packet_ : 1;   //!< Barrier value packet functionality
      uint dynamic_queues_ : 1;         //!< Dynamic queues management
-      uint reserved_ : 22;
+      uint blocking_blit_ : 1;          //!< Blit ops can be blocking on CPU
+      uint reserved_ : 21;
    };
    uint value_;
  };
@@ -1209,6 +1209,12 @@ bool VirtualGPU::dispatchCounterAqlPacket(hsa_ext_amd_aql_pm4_packet_t* packet,
  return false;
 }

+// ================================================================================================
+void VirtualGPU::WaitCompleteSignal(hsa_signal_t signal) {
+  barrier_packet_.dep_signal[0] = signal;
+  dispatchBarrierPacket(kBarrierPacketHeader, false);
+}
+
 // ================================================================================================
 void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal,
                                       hsa_signal_t signal) {
@@ -1413,9 +1419,7 @@ VirtualGPU::VirtualGPU(Device& device, bool profiling, bool cooperative,
      deviceQueueSize_(0),
      maskGroups_(0),
      schedulerThreads_(0),
-      schedulerParam_(nullptr),
      schedulerQueue_(nullptr),
-      schedulerSignal_({0}),
      barriers_(*this),
      managed_buffer_(*this, ManagedBuffer::kPoolNumSignals * device.settings().stagedXferSize_),
      managed_kernarg_buffer_(*this, device.settings().kernargPoolSize_),
@@ -1490,18 +1494,10 @@ VirtualGPU::~VirtualGPU() {

  delete printfdbg_;

-  if (0 != schedulerSignal_.handle) {
-    hsa_signal_destroy(schedulerSignal_);
-  }
-
  if (nullptr != schedulerQueue_) {
    hsa_queue_destroy(schedulerQueue_);
  }

-  if (nullptr != schedulerParam_) {
-    schedulerParam_->release();
-  }
-
  if (nullptr != virtualQueue_) {
    virtualQueue_->release();
  }
@@ -2604,7 +2600,7 @@ void VirtualGPU::submitMapMemory(amd::MapMemoryCommand& cmd) {
  // If we have host memory, use it
  if ((devMemory->owner()->getHostMem() != nullptr) &&
      (devMemory->owner()->getSvmPtr() == nullptr)) {
-    if (!devMemory->isHostMemDirectAccess()) {
+    if (!AMD_DIRECT_DISPATCH && !devMemory->isHostMemDirectAccess()) {
      // Make sure GPU finished operation before synchronization with the backing store
      releaseGpuMemoryFence();
    }
@@ -3081,18 +3077,11 @@ void VirtualGPU::submitMigrateMemObjects(amd::MigrateMemObjectsCommand& vcmd) {
 // ================================================================================================
 bool VirtualGPU::createSchedulerParam()
 {
-  if (nullptr != schedulerParam_) {
+  if (nullptr != schedulerQueue_) {
    return true;
  }

-  while(true) {
-    schedulerParam_ = new (dev().context()) amd::Buffer(dev().context(),
-      CL_MEM_ALLOC_HOST_PTR, sizeof(SchedulerParam) + sizeof(AmdAqlWrap));
-
-    if ((nullptr != schedulerParam_) && !schedulerParam_->create(nullptr)) {
-      break;
-    }
-
+  while (true) {
    // The queue is written by multiple threads of the scheduler kernel
    if (HSA_STATUS_SUCCESS != hsa_queue_create(gpu_device(), 2048, HSA_QUEUE_TYPE_MULTI,
        callbackQueue, &roc_device_, std::numeric_limits<uint>::max(),
@@ -3100,39 +3089,14 @@ bool VirtualGPU::createSchedulerParam()
      break;
    }

-    hsa_signal_t  signal0 = {0};
-
-    if (HSA_STATUS_SUCCESS != hsa_signal_create(0, 0, nullptr, &signal0)) {
-      break;
-    }
-
-    schedulerSignal_ = signal0;
-
-    Memory* schedulerMem = dev().getRocMemory(schedulerParam_);
-
-    if (nullptr == schedulerMem) {
-      break;
-    }
-
-    schedulerParam_->setVirtualDevice(this);
    return true;
  }

-  if (0 != schedulerSignal_.handle) {
-    hsa_signal_destroy(schedulerSignal_);
-    schedulerSignal_.handle = 0;
-  }
-
  if (nullptr != schedulerQueue_) {
    hsa_queue_destroy(schedulerQueue_);
    schedulerQueue_ = nullptr;
  }

-  if (nullptr != schedulerParam_) {
-    schedulerParam_->release();
-    schedulerParam_ = nullptr;
-  }
-
  return false;
 }

@@ -3355,7 +3319,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
    global[i] = static_cast<uint32_t>(sizes.global()[i]);
    local[i] = static_cast<uint16_t>(local_size[i]);
  }
-
+  uint64_t spVA = 0;
  // Check if runtime has to setup hidden arguments
  for (uint32_t i = signature.numParameters(); i < signature.numParametersAll(); ++i) {
    const auto& it = signature.at(i);
@@ -3415,14 +3379,12 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
        break;
      }
      case amd::KernelParameterDescriptor::HiddenCompletionAction: {
-        uint64_t spVA = 0;
-        if (nullptr != schedulerParam_ && devKernel->dynamicParallelism()) {
-          Memory* schedulerMem = dev().getRocMemory(schedulerParam_);
-          AmdAqlWrap* wrap = reinterpret_cast<AmdAqlWrap*>(
-                             reinterpret_cast<uint64_t>(schedulerParam_->getHostMem()) + sizeof(SchedulerParam));
+        if (devKernel->dynamicParallelism()) {
+          auto params = allocKernArg(sizeof(AmdAqlWrap), 64);
+          AmdAqlWrap* wrap = reinterpret_cast<AmdAqlWrap*>(params);
          memset(wrap, 0, sizeof(AmdAqlWrap));
          wrap->state = AQL_WRAP_DONE;
-          spVA = reinterpret_cast<uint64_t>(schedulerMem->getDeviceMemory()) + sizeof(SchedulerParam);
+          spVA = reinterpret_cast<uint64_t>(wrap);
        }
        WriteAqlArgAt(hidden_arguments, spVA, it.size_, it.offset_);
        break;
@@ -3651,8 +3613,8 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
    dispatchBarrierPacket(kBarrierPacketHeader, true);
    if (virtualQueue_ != nullptr) {
      static_cast<KernelBlitManager&>(blitMgr()).runScheduler(
-          getVQVirtualAddress(), schedulerParam_, schedulerQueue_,
-          schedulerSignal_, schedulerThreads_);
+          getVQVirtualAddress(), schedulerQueue_,
+          schedulerThreads_, spVA);
    }
  }

@@ -3833,10 +3795,10 @@ void VirtualGPU::flush(amd::Command* list, bool wait) {

 // ================================================================================================
 void VirtualGPU::addPinnedMem(amd::Memory* mem) {
-  //! @note: ROCr backend doesn't have per resource busy tracking, hence runtime has to wait
-  //!        unconditionally, before it can release pinned memory
-  releaseGpuMemoryFence();
  if (!AMD_DIRECT_DISPATCH) {
+    //! @note: ROCr backend doesn't have per resource busy tracking, hence runtime has to wait
+    //!        unconditionally, before it can release pinned memory
+    releaseGpuMemoryFence();
    if (nullptr == findPinnedMem(mem->getHostMem(), mem->getSize())) {
      if (pinnedMems_.size() > 7) {
        pinnedMems_.front()->release();
@@ -3847,7 +3809,14 @@ void VirtualGPU::addPinnedMem(amd::Memory* mem) {
      pinnedMems_.push_back(mem);
    }
  } else {
-    mem->release();
+    if (command_ != nullptr) {
+      command_->AddPinnedMemory(mem);
+    } else {
+      //! @note: ROCr backend doesn't have per resource busy tracking, hence runtime has to wait
+      //!        unconditionally, before it can release pinned memory
+      releaseGpuMemoryFence();
+      mem->release();
+    }
  }
 }

@@ -450,8 +450,9 @@ class VirtualGPU : public device::VirtualDevice {
  void* allocKernArg(size_t size, size_t alignment);
  bool isFenceDirty() const { return fence_dirty_; }
  void setFenceDirty(bool state) { fence_dirty_ = state; }
-  void HiddenHeapInit();
+  void WaitCompleteSignal(hsa_signal_t signal);

+  void HiddenHeapInit();
  void setLastUsedSdmaEngine(uint32_t mask) { lastUsedSdmaEngineMask_ = mask; }
  uint32_t getLastUsedSdmaEngine() const { return lastUsedSdmaEngineMask_.load(); }
  uint64_t getQueueID();
@@ -592,9 +593,7 @@ class VirtualGPU : public device::VirtualDevice {
                                  //!< one thread
  uint schedulerThreads_;         //!< The number of scheduler threads

-  amd::Memory* schedulerParam_;
  hsa_queue_t* schedulerQueue_;
-  hsa_signal_t schedulerSignal_;

  HwQueueTracker  barriers_;      //!< Tracks active barriers in ROCr

@@ -269,7 +269,9 @@ bool Event::notifyCmdQueue(bool cpu_wait) {
    ScopedLock l(notify_lock_);
    if ((status() > CL_COMPLETE) && (nullptr != queue) &&
        // If HW event was assigned, then notification can be ignored, since a barrier was issued
-        (HwEvent() == nullptr) &&
+        // @note: Force the marker always in OCL for now, since OCL events require precise
+        // sequence of the status update
+        ((HwEvent() == nullptr) || !amd::IS_HIP) &&
        !notified_.test_and_set()) {
      // Make sure the queue is draining the enqueued commands.
      amd::Command* command = new amd::Marker(*queue, false, nullWaitList, this, cpu_wait);
@@ -364,8 +366,17 @@ void Command::enqueue() {

    // Notify all commands about the waiter. Barrier will be sent in order to obtain
    // HSA signal for a wait on the current queue
-    for (const auto& event : eventWaitList()) {
-      event->notifyCmdQueue(!kCpuWait);
+    for (const auto &event: eventWaitList()) {
+      if (!amd::IS_HIP && event->command().type() == CL_COMMAND_USER) {
+        if (event->status() >= CL_COMPLETE) {
+          reinterpret_cast<amd::UserEvent*>(event)->AddDependent(this);
+        } else {
+          setStatus(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST);
+          return;
+        }
+      } else {
+        event->notifyCmdQueue(!kCpuWait);
+      }
    }

    // The batch update must be lock protected to avoid a race condition
@@ -397,6 +397,9 @@ class Command : public Event {
  //! Release the resources associated with this event.
  virtual void releaseResources();

+  //! Empty function for adding pinned memory
+  virtual void AddPinnedMemory(Memory* pinned) {}
+
  //! Set the next GPU command
  void setNext(Command* next) { next_ = next; }

@@ -424,16 +427,46 @@ class Command : public Event {
 };

 class UserEvent : public Command {
-  const Context& context_;
+  const Context&        context_;     //!< OCL context associated with the event
+  std::vector<Command*> dependents_;  //!< Commands, which depends on this user event

 public:
  UserEvent(Context& context) : Command(CL_COMMAND_USER), context_(context) {
    setStatus(CL_SUBMITTED);
  }

+  //! Creates a user event in the backend layer
+  bool Create() {
+    if (AMD_DIRECT_DISPATCH) {
+      return context_.devices()[0]->CreateUserEvent(this);
+    } else {
+      return true;
+    }
+  }
+
+  //! Sets the execution status of the user event
+  bool SetExecutionStatus(cl_int status) {
+    if (AMD_DIRECT_DISPATCH) {
+      // If it's invalid status, then mark dependent commands as invalid
+      if (status < CL_COMPLETE) {
+        for (auto it : dependents_) {
+          it->setStatus(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST);
+        }
+      }
+      dependents_.clear();
+      context_.devices()[0]->SetUserEvent(this);
+    }
+    return setStatus(status);
+  }
+
+  //! Adds dependent commands for the user event
+  void AddDependent(Command* command) {
+    dependents_.push_back(command);
+  }
+
  virtual void submit(device::VirtualDevice& device) { ShouldNotCallThis(); }

-  virtual const Context& context() const { return context_; }
+  const Context& context() const { return context_; }
 };

 class ClGlEvent : public Command {
@@ -450,7 +483,7 @@ class ClGlEvent : public Command {

  bool awaitCompletion() { return waitForFence(); }

-  virtual const Context& context() const { return context_; }
+  const Context& context() const { return context_; }
 };

 inline Command& Event::command() { return *static_cast<Command*>(this); }
@@ -465,6 +498,7 @@ class NDRangeContainer;
 class OneMemoryArgCommand : public Command {
 protected:
  Memory* memory_;
+  std::vector<Memory*>  pinned_memory_;   //!< Pinned memory object

 public:
  OneMemoryArgCommand(HostQueue& queue, cl_command_type type, const EventWaitList& eventWaitList,
@@ -477,8 +511,13 @@ class OneMemoryArgCommand : public Command {
    memory_->release();
    DEBUG_ONLY(memory_ = NULL);
    Command::releaseResources();
+    for (auto it : pinned_memory_) {
+      it->release();
+    }
  }

+  //! Adds pinned memory, used in this command for later release
+  virtual void AddPinnedMemory(Memory* pinned) override { pinned_memory_.push_back(pinned); }
  bool validateMemory();
  bool validatePeerMemory();
 };
@@ -173,6 +173,9 @@ void HostQueue::finish(bool cpu_wait) {
        (vdev()->QueuedAsyncHandlers().load() > DEBUG_HIP_BLOCK_SYNC)) {
      cpu_wait = true;
    }
+  } else {
+    // Force CPU wait for OpenCL, since the tests may check OCL command status after finish
+    cpu_wait = true;
  }

  size_t batchSize = GetSubmissionBatchSize();