diff --git a/opencl/amdocl/cl_event.cpp b/opencl/amdocl/cl_event.cpp
index d4f2b04e26..4dfc9c0b94 100644
--- a/opencl/amdocl/cl_event.cpp
+++ b/opencl/amdocl/cl_event.cpp
@@ -251,15 +251,16 @@ RUNTIME_ENTRY_RET(cl_event, clCreateUserEvent, (cl_context context, cl_int* errc
     return (cl_event)0;
   }
 
-  amd::Event* event = new amd::UserEvent(*as_amd(context));
-  if (event == NULL) {
+  auto event = new amd::UserEvent(*as_amd(context));
+  if (event == nullptr || !event->Create()) {
+    delete event;
     *not_null(errcode_ret) = CL_OUT_OF_HOST_MEMORY;
     return (cl_event)0;
   }
 
   event->retain();
   *not_null(errcode_ret) = CL_SUCCESS;
-  return as_cl(event);
+  return as_cl(reinterpret_cast<amd::Event*>(event));
 }
 RUNTIME_EXIT
 
@@ -288,8 +289,8 @@ RUNTIME_ENTRY(cl_int, clSetUserEventStatus, (cl_event event, cl_int execution_st
   if (execution_status > CL_COMPLETE) {
     return CL_INVALID_VALUE;
   }
-
-  if (!as_amd(event)->setStatus(execution_status)) {
+  auto user_event = reinterpret_cast<amd::UserEvent*>(as_amd(event));
+  if (!user_event->SetExecutionStatus(execution_status)) {
     return CL_INVALID_OPERATION;
   }
   return CL_SUCCESS;
diff --git a/rocclr/device/device.hpp b/rocclr/device/device.hpp
index e3b3f736df..4ecbd04e56 100644
--- a/rocclr/device/device.hpp
+++ b/rocclr/device/device.hpp
@@ -95,6 +95,7 @@ class StreamOperationCommand;
 class BatchMemoryOperationCommand;
 class VirtualMapCommand;
 class ExternalSemaphoreCmd;
+class UserEvent;
 class Isa;
 class Device;
 struct KernelParameterDescriptor;
@@ -1329,6 +1330,7 @@ class VirtualDevice : public amd::HeapObject {
     ShouldNotReachHere();
   }
   virtual void submitVirtualMap(amd::VirtualMapCommand& cmd) { ShouldNotReachHere(); }
+  virtual void submitUserEvent(amd::UserEvent& vcmd) { ShouldNotReachHere(); }
 
   virtual address allocKernelArguments(size_t size, size_t alignment) { return nullptr; }
   virtual void ReleaseAllHwQueues() {}
@@ -2037,6 +2039,13 @@ class Device : public RuntimeObject {
     return (info().svmCapabilities_ & CL_DEVICE_SVM_ATOMICS) != 0 ? true : false;
   }
 
+  /// @brief  Creates HW user event for OpenCL implementation
+  /// @return The pointer to a HW event structure, known to the HW backend
+  virtual bool CreateUserEvent(amd::UserEvent* event) const { return false; }
+
+  /// @brief  Sets HW user event to the complete status
+  virtual void SetUserEvent(amd::UserEvent* event) const {}
+
   //! Returns TRUE if the device is available for computations
   bool isOnline() const { return online_; }
 
diff --git a/rocclr/device/rocm/rocblit.cpp b/rocclr/device/rocm/rocblit.cpp
index 40ff3454ae..3ab0b06549 100644
--- a/rocclr/device/rocm/rocblit.cpp
+++ b/rocclr/device/rocm/rocblit.cpp
@@ -54,8 +54,8 @@ bool DmaBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
                                 const amd::Coord3D& origin, const amd::Coord3D& size,
                                 bool entire, amd::CopyMetadata copyMetadata) const {
   // Use host copy if memory has direct access
-  if (setup_.disableReadBuffer_ ||
-      (srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached())) {
+  if (dev().settings().blocking_blit_ && (setup_.disableReadBuffer_ ||
+      (srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached()))) {
     // Stall GPU before CPU access
     gpu().releaseGpuMemoryFence();
     return HostBlitManager::readBuffer(srcMemory, dstHost, origin, size, entire, copyMetadata);
@@ -138,8 +138,9 @@ bool DmaBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemory,
                                  const amd::Coord3D& origin, const amd::Coord3D& size,
                                  bool entire, amd::CopyMetadata copyMetadata) const {
   // Use host copy if memory has direct access
-  if (setup_.disableWriteBuffer_ || dstMemory.isHostMemDirectAccess() ||
-      gpuMem(dstMemory).IsPersistentDirectMap()) {
+  if (dev().settings().blocking_blit_ &&
+      (setup_.disableWriteBuffer_ || dstMemory.isHostMemDirectAccess() ||
+       gpuMem(dstMemory).IsPersistentDirectMap())) {
     // Stall GPU before CPU access
     gpu().releaseGpuMemoryFence();
     return HostBlitManager::writeBuffer(srcHost, dstMemory, origin, size, entire, copyMetadata);
@@ -685,6 +686,7 @@ void DmaBlitManager::getBuffer(const_address hostMem, size_t size,
   buffState.buffer_ = gpu().Staging().Acquire(std::min(xferSize, StagingXferSize));
 }
 
+// ================================================================================================
 void DmaBlitManager::releaseBuffer(BufferState &buffer) const {
   if (buffer.pinnedMem_) {
     gpu().addPinnedMem(buffer.pinnedMem_);
@@ -696,7 +698,7 @@ bool DmaBlitManager::hsaCopyStagedOrPinned(const_address hostSrc, address hostDs
                 size_t size, bool hostToDev, amd::CopyMetadata& copyMetadata,
                 bool enablePin) const {
   // Do not skip wait here for D2H. Resolving dependent signals for SDMA engine is slow
-  gpu().releaseGpuMemoryFence(hostToDev);
+  gpu().releaseGpuMemoryFence(hostToDev || !dev().settings().blocking_blit_);
   // If Pinning is enabled, Pin host Memory for copy size > MinSizeForPinnedTransfer
   // For 16KB < size <= MinSizeForPinnedTransfer Use staging buffer without pinning
   bool status = true;
@@ -740,9 +742,9 @@ bool DmaBlitManager::hsaCopyStagedOrPinned(const_address hostSrc, address hostDs
               copyMetadata.isAsync_);
       const_address src = static_cast<const_address>(hostSrc) + copyOffset;
       status = rocrCopyBuffer(stagingBuffer, dstAgent, src , srcAgent, copysize, copyMetadata);
-      if (status ) {
-        // Wait for current signal of previous rocr copy if its not pinned mem
+      if (status) {
         if (outBuffer.pinnedMem_ == nullptr) {
+          // Wait for current signal of previous rocr copy if its not pinned mem
           gpu().Barriers().WaitCurrent();
           ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "memcpy host dst=%p, stg buf=%p, size=%zu",
                   hostDst + copyOffset, stagingBuffer, copysize);
@@ -752,6 +754,7 @@ bool DmaBlitManager::hsaCopyStagedOrPinned(const_address hostSrc, address hostDs
         break;
       }
     }
+
     // Release Pinned Memory back to pool if any
     releaseBuffer(outBuffer);
     // Update Offset and Transfer Size
@@ -760,12 +763,18 @@ bool DmaBlitManager::hsaCopyStagedOrPinned(const_address hostSrc, address hostDs
     firstTx = false;
   }
 
+  // @note: HIP requires a blocking wait on D2H with the pageable system memory
+  if (amd::IS_HIP && !hostToDev) {
+    gpu().Barriers().WaitCurrent();
+  }
+
   if(!status) {
     return false;
   }
 
   return true;
 }
+
 // ================================================================================================
 KernelBlitManager::KernelBlitManager(VirtualGPU& gpu, Setup setup)
     : DmaBlitManager(gpu, setup),
@@ -1718,8 +1727,9 @@ bool KernelBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
   bool result = false;
 
   // Use host copy if memory has direct access
-  if (setup_.disableReadBuffer_ || (srcMemory.isHostMemDirectAccess() &&
-      !srcMemory.isCpuUncached())) {
+  if (dev().settings().blocking_blit_ &&
+      (setup_.disableReadBuffer_ || (srcMemory.isHostMemDirectAccess() &&
+      !srcMemory.isCpuUncached()))) {
     // Stall GPU before CPU access
     gpu().releaseGpuMemoryFence();
     result = HostBlitManager::readBuffer(srcMemory, dstHost, origin, size, entire, copyMetadata);
@@ -1854,8 +1864,9 @@ bool KernelBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemo
   bool result = false;
 
   // Use host copy if memory has direct access
-  if (setup_.disableWriteBuffer_ || dstMemory.isHostMemDirectAccess() ||
-      gpuMem(dstMemory).IsPersistentDirectMap()) {
+  if (dev().settings().blocking_blit_ &&
+      (setup_.disableWriteBuffer_ || dstMemory.isHostMemDirectAccess() ||
+       gpuMem(dstMemory).IsPersistentDirectMap())) {
     // Stall GPU before CPU access
     gpu().releaseGpuMemoryFence();
     result = HostBlitManager::writeBuffer(srcHost, dstMemory, origin, size, entire, copyMetadata);
@@ -2718,10 +2729,9 @@ void KernelBlitManager::releaseArguments(address args) const {
 }
 
 // ================================================================================================
-bool KernelBlitManager::runScheduler(uint64_t vqVM, amd::Memory* schedulerParam,
+bool KernelBlitManager::runScheduler(uint64_t vqVM,
                                      hsa_queue_t* schedulerQueue,
-                                     hsa_signal_t& schedulerSignal,
-                                     uint threads) {
+                                     uint threads, uint64_t aql_wrap) {
   size_t globalWorkOffset[1] = {0};
   size_t globalWorkSize[1] = {threads};
   size_t localWorkSize[1] = {1};
@@ -2731,21 +2741,16 @@ bool KernelBlitManager::runScheduler(uint64_t vqVM, amd::Memory* schedulerParam,
   device::Kernel* devKernel = const_cast<device::Kernel*>(kernels_[Scheduler]->getDeviceKernel(dev()));
   Kernel& gpuKernel = static_cast<Kernel&>(*devKernel);
 
-  SchedulerParam* sp = reinterpret_cast<SchedulerParam*>(schedulerParam->getHostMem());
+  auto* sp = reinterpret_cast<SchedulerParam*>(
+    gpu().allocKernArg(sizeof(SchedulerParam), kCBAlignment));
   memset(sp, 0, sizeof(SchedulerParam));
 
-  Memory* schedulerMem = dev().getRocMemory(schedulerParam);
-  sp->kernarg_address = reinterpret_cast<uint64_t>(schedulerMem->getDeviceMemory());
+  sp->kernarg_address = reinterpret_cast<uint64_t>(sp);
   sp->thread_counter = 0;
   sp->child_queue = reinterpret_cast<uint64_t>(schedulerQueue);
-  sp->complete_signal = schedulerSignal;
-
-  hsa_signal_store_relaxed(schedulerSignal, kInitSignalValueOne);
-
-
+  sp->complete_signal = gpu().Barriers().ActiveSignal(kInitSignalValueOne, nullptr);
   sp->vqueue_header = vqVM;
-
-  sp->parentAQL = sp->kernarg_address + sizeof(SchedulerParam);
+  sp->parentAQL = reinterpret_cast<uint64_t>(aql_wrap);
 
   if (dev().info().maxEngineClockFrequency_ > 0) {
     sp->eng_clk = (1000 * 1024) / dev().info().maxEngineClockFrequency_;
@@ -2754,8 +2759,8 @@ bool KernelBlitManager::runScheduler(uint64_t vqVM, amd::Memory* schedulerParam,
   // Use a device side global atomics to workaround the reliance of PCIe 3 atomics
   sp->write_index = hsa_queue_load_write_index_relaxed(schedulerQueue);
 
-  cl_mem mem = as_cl<amd::Memory>(schedulerParam);
-  setArgument(kernels_[Scheduler], 0, sizeof(cl_mem), &mem);
+  constexpr bool kDirectVa = true;
+  setArgument(kernels_[Scheduler], 0, sizeof(cl_mem), sp, 0, nullptr, kDirectVa);
 
   address parameters = captureArguments(kernels_[Scheduler]);
 
@@ -2764,12 +2769,17 @@ bool KernelBlitManager::runScheduler(uint64_t vqVM, amd::Memory* schedulerParam,
     return false;
   }
   releaseArguments(parameters);
-
-  if (!WaitForSignal(schedulerSignal)) {
+  // Wait for the scheduler to finish all operations
+  gpu().WaitCompleteSignal(sp->complete_signal);
+  // @note: A wait shouldn't be really necessary, but the queue write_index may not get a proper
+  // value without the wait for all previous commands (see the PCIE3 atomics workaround above).
+  // The scheduler can enqueue extra commands, but the real queue write index didn't have any
+  // progress. That leads to hangs and requires blocking. Then the wait causes problems in DD mode
+  // with device enqueue and user events, because device enqueue is blocking below
+  if (!WaitForSignal(sp->complete_signal)) {
     LogWarning("Failed schedulerSignal wait");
     return false;
   }
-
   return true;
 }
 
diff --git a/rocclr/device/rocm/rocblit.hpp b/rocclr/device/rocm/rocblit.hpp
index 8a7f03f454..9c4b16d580 100644
--- a/rocclr/device/rocm/rocblit.hpp
+++ b/rocclr/device/rocm/rocblit.hpp
@@ -492,10 +492,8 @@ class KernelBlitManager : public DmaBlitManager {
                          ) const;
 
   bool runScheduler(uint64_t vqVM,
-                    amd::Memory* schedulerParam,
                     hsa_queue_t* schedulerQueue,
-                    hsa_signal_t& schedulerSignal,
-                    uint threads);
+                    uint threads, uint64_t aql_wrap);
 
   //! Runs a blit kernel for GWS init
   bool RunGwsInit(uint32_t value             //!< Initial value for GWS resource
diff --git a/rocclr/device/rocm/rocdevice.cpp b/rocclr/device/rocm/rocdevice.cpp
index 44f3822d33..3274cb22f3 100644
--- a/rocclr/device/rocm/rocdevice.cpp
+++ b/rocclr/device/rocm/rocdevice.cpp
@@ -3464,6 +3464,25 @@ void Device::ReleaseGlobalSignal(void* signal) const {
   }
 }
 
+// ================================================================================================
+bool Device::CreateUserEvent(amd::UserEvent* event) const {
+  std::unique_ptr<ProfilingSignal> signal(new ProfilingSignal());
+  if ((signal == nullptr) ||
+      (HSA_STATUS_SUCCESS != hsa_signal_create(0, 0, nullptr, &signal->signal_))) {
+    return false;
+  }
+  hsa_signal_silent_store_relaxed(signal->signal_, kInitSignalValueOne);
+  event->SetHwEvent(signal.release());
+  return true;
+}
+
+// ================================================================================================
+void Device::SetUserEvent(amd::UserEvent* event) const {
+  auto signal = reinterpret_cast<ProfilingSignal*>(event->HwEvent());
+  assert(signal != nullptr && "Can't have user event without hw event!");
+  hsa_signal_silent_store_relaxed(signal->signal_, 0);
+}
+
 // ================================================================================================
 bool Device::IsValidAllocation(const void* dev_ptr, size_t size, hsa_amd_pointer_info_t* ptr_info) {
   // Query ptr type to see if it's a HMM allocation
diff --git a/rocclr/device/rocm/rocdevice.hpp b/rocclr/device/rocm/rocdevice.hpp
index 32dc66726c..0c7ce24805 100644
--- a/rocclr/device/rocm/rocdevice.hpp
+++ b/rocclr/device/rocm/rocdevice.hpp
@@ -450,6 +450,8 @@ class Device : public NullDevice {
                               uint32_t hip_event_flags = 0) const;
   virtual void getHwEventTime(const amd::Event& event, uint64_t* start, uint64_t* end) const;
   virtual void ReleaseGlobalSignal(void* signal) const;
+  virtual bool CreateUserEvent(amd::UserEvent* event) const;
+  virtual void SetUserEvent(amd::UserEvent* event) const;
 
   //! Allocate host memory in terms of numa policy set by user
   void* hostNumaAlloc(size_t size, size_t alignment, MemorySegment mem_seg) const;
diff --git a/rocclr/device/rocm/rocsettings.cpp b/rocclr/device/rocm/rocsettings.cpp
index d53231fdd9..9d89124cdb 100644
--- a/rocclr/device/rocm/rocsettings.cpp
+++ b/rocclr/device/rocm/rocsettings.cpp
@@ -97,6 +97,8 @@ Settings::Settings() {
   limit_blit_wg_ = 16;
 
   dynamic_queues_ = amd::IS_HIP ? DEBUG_HIP_DYNAMIC_QUEUES : false;
+  // note: OCL user events don't allow CPU blocking calls in DD mode
+  blocking_blit_ = amd::IS_HIP || !AMD_DIRECT_DISPATCH;
 }
 
 // ================================================================================================
diff --git a/rocclr/device/rocm/rocsettings.hpp b/rocclr/device/rocm/rocsettings.hpp
index dd323b86ab..b6f741db8e 100644
--- a/rocclr/device/rocm/rocsettings.hpp
+++ b/rocclr/device/rocm/rocsettings.hpp
@@ -51,7 +51,8 @@ class Settings : public device::Settings {
       uint fgs_kernel_arg_ : 1;         //!< Use fine grain kernel arg segment
       uint barrier_value_packet_ : 1;   //!< Barrier value packet functionality
       uint dynamic_queues_ : 1;         //!< Dynamic queues management
-      uint reserved_ : 22;
+      uint blocking_blit_ : 1;          //!< Blit ops can be blocking on CPU
+      uint reserved_ : 21;
     };
     uint value_;
   };
diff --git a/rocclr/device/rocm/rocvirtual.cpp b/rocclr/device/rocm/rocvirtual.cpp
index d5247ad82c..91b8e5146e 100644
--- a/rocclr/device/rocm/rocvirtual.cpp
+++ b/rocclr/device/rocm/rocvirtual.cpp
@@ -1209,6 +1209,12 @@ bool VirtualGPU::dispatchCounterAqlPacket(hsa_ext_amd_aql_pm4_packet_t* packet,
   return false;
 }
 
+// ================================================================================================
+void VirtualGPU::WaitCompleteSignal(hsa_signal_t signal) {
+  barrier_packet_.dep_signal[0] = signal;
+  dispatchBarrierPacket(kBarrierPacketHeader, false);
+}
+
 // ================================================================================================
 void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal,
                                        hsa_signal_t signal) {
@@ -1413,9 +1419,7 @@ VirtualGPU::VirtualGPU(Device& device, bool profiling, bool cooperative,
       deviceQueueSize_(0),
       maskGroups_(0),
       schedulerThreads_(0),
-      schedulerParam_(nullptr),
       schedulerQueue_(nullptr),
-      schedulerSignal_({0}),
       barriers_(*this),
       managed_buffer_(*this, ManagedBuffer::kPoolNumSignals * device.settings().stagedXferSize_),
       managed_kernarg_buffer_(*this, device.settings().kernargPoolSize_),
@@ -1490,18 +1494,10 @@ VirtualGPU::~VirtualGPU() {
 
   delete printfdbg_;
 
-  if (0 != schedulerSignal_.handle) {
-    hsa_signal_destroy(schedulerSignal_);
-  }
-
   if (nullptr != schedulerQueue_) {
     hsa_queue_destroy(schedulerQueue_);
   }
 
-  if (nullptr != schedulerParam_) {
-    schedulerParam_->release();
-  }
-
   if (nullptr != virtualQueue_) {
     virtualQueue_->release();
   }
@@ -2604,7 +2600,7 @@ void VirtualGPU::submitMapMemory(amd::MapMemoryCommand& cmd) {
   // If we have host memory, use it
   if ((devMemory->owner()->getHostMem() != nullptr) &&
       (devMemory->owner()->getSvmPtr() == nullptr)) {
-    if (!devMemory->isHostMemDirectAccess()) {
+    if (!AMD_DIRECT_DISPATCH && !devMemory->isHostMemDirectAccess()) {
       // Make sure GPU finished operation before synchronization with the backing store
       releaseGpuMemoryFence();
     }
@@ -3081,18 +3077,11 @@ void VirtualGPU::submitMigrateMemObjects(amd::MigrateMemObjectsCommand& vcmd) {
 // ================================================================================================
 bool VirtualGPU::createSchedulerParam()
 {
-  if (nullptr != schedulerParam_) {
+  if (nullptr != schedulerQueue_) {
     return true;
   }
 
-  while(true) {
-    schedulerParam_ = new (dev().context()) amd::Buffer(dev().context(),
-      CL_MEM_ALLOC_HOST_PTR, sizeof(SchedulerParam) + sizeof(AmdAqlWrap));
-
-    if ((nullptr != schedulerParam_) && !schedulerParam_->create(nullptr)) {
-      break;
-    }
-
+  while (true) {
     // The queue is written by multiple threads of the scheduler kernel
     if (HSA_STATUS_SUCCESS != hsa_queue_create(gpu_device(), 2048, HSA_QUEUE_TYPE_MULTI,
         callbackQueue, &roc_device_, std::numeric_limits<uint>::max(),
@@ -3100,39 +3089,14 @@ bool VirtualGPU::createSchedulerParam()
       break;
     }
 
-    hsa_signal_t  signal0 = {0};
-
-    if (HSA_STATUS_SUCCESS != hsa_signal_create(0, 0, nullptr, &signal0)) {
-      break;
-    }
-
-    schedulerSignal_ = signal0;
-
-    Memory* schedulerMem = dev().getRocMemory(schedulerParam_);
-
-    if (nullptr == schedulerMem) {
-      break;
-    }
-
-    schedulerParam_->setVirtualDevice(this);
     return true;
   }
 
-  if (0 != schedulerSignal_.handle) {
-    hsa_signal_destroy(schedulerSignal_);
-    schedulerSignal_.handle = 0;
-  }
-
   if (nullptr != schedulerQueue_) {
     hsa_queue_destroy(schedulerQueue_);
     schedulerQueue_ = nullptr;
   }
 
-  if (nullptr != schedulerParam_) {
-    schedulerParam_->release();
-    schedulerParam_ = nullptr;
-  }
-
   return false;
 }
 
@@ -3355,7 +3319,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
     global[i] = static_cast<uint32_t>(sizes.global()[i]);
     local[i] = static_cast<uint16_t>(local_size[i]);
   }
-
+  uint64_t spVA = 0;
   // Check if runtime has to setup hidden arguments
   for (uint32_t i = signature.numParameters(); i < signature.numParametersAll(); ++i) {
     const auto& it = signature.at(i);
@@ -3415,14 +3379,12 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
         break;
       }
       case amd::KernelParameterDescriptor::HiddenCompletionAction: {
-        uint64_t spVA = 0;
-        if (nullptr != schedulerParam_ && devKernel->dynamicParallelism()) {
-          Memory* schedulerMem = dev().getRocMemory(schedulerParam_);
-          AmdAqlWrap* wrap = reinterpret_cast<AmdAqlWrap*>(
-                             reinterpret_cast<uint64_t>(schedulerParam_->getHostMem()) + sizeof(SchedulerParam));
+        if (devKernel->dynamicParallelism()) {
+          auto params = allocKernArg(sizeof(AmdAqlWrap), 64);
+          AmdAqlWrap* wrap = reinterpret_cast<AmdAqlWrap*>(params);
           memset(wrap, 0, sizeof(AmdAqlWrap));
           wrap->state = AQL_WRAP_DONE;
-          spVA = reinterpret_cast<uint64_t>(schedulerMem->getDeviceMemory()) + sizeof(SchedulerParam);
+          spVA = reinterpret_cast<uint64_t>(wrap);
         }
         WriteAqlArgAt(hidden_arguments, spVA, it.size_, it.offset_);
         break;
@@ -3651,8 +3613,8 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
     dispatchBarrierPacket(kBarrierPacketHeader, true);
     if (virtualQueue_ != nullptr) {
       static_cast<KernelBlitManager&>(blitMgr()).runScheduler(
-          getVQVirtualAddress(), schedulerParam_, schedulerQueue_,
-          schedulerSignal_, schedulerThreads_);
+          getVQVirtualAddress(), schedulerQueue_,
+          schedulerThreads_, spVA);
     }
   }
 
@@ -3833,10 +3795,10 @@ void VirtualGPU::flush(amd::Command* list, bool wait) {
 
 // ================================================================================================
 void VirtualGPU::addPinnedMem(amd::Memory* mem) {
-  //! @note: ROCr backend doesn't have per resource busy tracking, hence runtime has to wait
-  //!        unconditionally, before it can release pinned memory
-  releaseGpuMemoryFence();
   if (!AMD_DIRECT_DISPATCH) {
+    //! @note: ROCr backend doesn't have per resource busy tracking, hence runtime has to wait
+    //!        unconditionally, before it can release pinned memory
+    releaseGpuMemoryFence();
     if (nullptr == findPinnedMem(mem->getHostMem(), mem->getSize())) {
       if (pinnedMems_.size() > 7) {
         pinnedMems_.front()->release();
@@ -3847,7 +3809,14 @@ void VirtualGPU::addPinnedMem(amd::Memory* mem) {
       pinnedMems_.push_back(mem);
     }
   } else {
-    mem->release();
+    if (command_ != nullptr) {
+      command_->AddPinnedMemory(mem);
+    } else {
+      //! @note: ROCr backend doesn't have per resource busy tracking, hence runtime has to wait
+      //!        unconditionally, before it can release pinned memory
+      releaseGpuMemoryFence();
+      mem->release();
+    }
   }
 }
 
diff --git a/rocclr/device/rocm/rocvirtual.hpp b/rocclr/device/rocm/rocvirtual.hpp
index 1d9c8b756b..1c54f653c3 100644
--- a/rocclr/device/rocm/rocvirtual.hpp
+++ b/rocclr/device/rocm/rocvirtual.hpp
@@ -450,8 +450,9 @@ class VirtualGPU : public device::VirtualDevice {
   void* allocKernArg(size_t size, size_t alignment);
   bool isFenceDirty() const { return fence_dirty_; }
   void setFenceDirty(bool state) { fence_dirty_ = state; }
-  void HiddenHeapInit();
+  void WaitCompleteSignal(hsa_signal_t signal);
 
+  void HiddenHeapInit();
   void setLastUsedSdmaEngine(uint32_t mask) { lastUsedSdmaEngineMask_ = mask; }
   uint32_t getLastUsedSdmaEngine() const { return lastUsedSdmaEngineMask_.load(); }
   uint64_t getQueueID();
@@ -592,9 +593,7 @@ class VirtualGPU : public device::VirtualDevice {
                                   //!< one thread
   uint schedulerThreads_;         //!< The number of scheduler threads
 
-  amd::Memory* schedulerParam_;
   hsa_queue_t* schedulerQueue_;
-  hsa_signal_t schedulerSignal_;
 
   HwQueueTracker  barriers_;      //!< Tracks active barriers in ROCr
 
diff --git a/rocclr/platform/command.cpp b/rocclr/platform/command.cpp
index 6a20fe9e26..d9361703e7 100644
--- a/rocclr/platform/command.cpp
+++ b/rocclr/platform/command.cpp
@@ -269,7 +269,9 @@ bool Event::notifyCmdQueue(bool cpu_wait) {
     ScopedLock l(notify_lock_);
     if ((status() > CL_COMPLETE) && (nullptr != queue) &&
         // If HW event was assigned, then notification can be ignored, since a barrier was issued
-        (HwEvent() == nullptr) &&
+        // @note: Force the marker always in OCL for now, since OCL events require precise
+        // sequence of the status update
+        ((HwEvent() == nullptr) || !amd::IS_HIP) &&
         !notified_.test_and_set()) {
       // Make sure the queue is draining the enqueued commands.
       amd::Command* command = new amd::Marker(*queue, false, nullWaitList, this, cpu_wait);
@@ -364,8 +366,17 @@ void Command::enqueue() {
 
     // Notify all commands about the waiter. Barrier will be sent in order to obtain
     // HSA signal for a wait on the current queue
-    for (const auto& event : eventWaitList()) {
-      event->notifyCmdQueue(!kCpuWait);
+    for (const auto &event: eventWaitList()) {
+      if (!amd::IS_HIP && event->command().type() == CL_COMMAND_USER) {
+        if (event->status() >= CL_COMPLETE) {
+          reinterpret_cast<amd::UserEvent*>(event)->AddDependent(this);
+        } else {
+          setStatus(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST);
+          return;
+        }
+      } else {
+        event->notifyCmdQueue(!kCpuWait);
+      }
     }
 
     // The batch update must be lock protected to avoid a race condition
diff --git a/rocclr/platform/command.hpp b/rocclr/platform/command.hpp
index 3dbc814c54..977014f69d 100644
--- a/rocclr/platform/command.hpp
+++ b/rocclr/platform/command.hpp
@@ -397,6 +397,9 @@ class Command : public Event {
   //! Release the resources associated with this event.
   virtual void releaseResources();
 
+  //! Empty function for adding pinned memory
+  virtual void AddPinnedMemory(Memory* pinned) {}
+
   //! Set the next GPU command
   void setNext(Command* next) { next_ = next; }
 
@@ -424,16 +427,46 @@ class Command : public Event {
 };
 
 class UserEvent : public Command {
-  const Context& context_;
+  const Context&        context_;     //!< OCL context associated with the event
+  std::vector<Command*> dependents_;  //!< Commands, which depends on this user event
 
  public:
   UserEvent(Context& context) : Command(CL_COMMAND_USER), context_(context) {
     setStatus(CL_SUBMITTED);
   }
 
+  //! Creates a user event in the backend layer
+  bool Create() {
+    if (AMD_DIRECT_DISPATCH) {
+      return context_.devices()[0]->CreateUserEvent(this);
+    } else {
+      return true;
+    }
+  }
+
+  //! Sets the execution status of the user event
+  bool SetExecutionStatus(cl_int status) {
+    if (AMD_DIRECT_DISPATCH) {
+      // If it's invalid status, then mark dependent commands as invalid
+      if (status < CL_COMPLETE) {
+        for (auto it : dependents_) {
+          it->setStatus(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST);
+        }
+      }
+      dependents_.clear();
+      context_.devices()[0]->SetUserEvent(this);
+    }
+    return setStatus(status);
+  }
+
+  //! Adds dependent commands for the user event
+  void AddDependent(Command* command) {
+    dependents_.push_back(command);
+  }
+
   virtual void submit(device::VirtualDevice& device) { ShouldNotCallThis(); }
 
-  virtual const Context& context() const { return context_; }
+  const Context& context() const { return context_; }
 };
 
 class ClGlEvent : public Command {
@@ -450,7 +483,7 @@ class ClGlEvent : public Command {
 
   bool awaitCompletion() { return waitForFence(); }
 
-  virtual const Context& context() const { return context_; }
+  const Context& context() const { return context_; }
 };
 
 inline Command& Event::command() { return *static_cast<Command*>(this); }
@@ -465,6 +498,7 @@ class NDRangeContainer;
 class OneMemoryArgCommand : public Command {
  protected:
   Memory* memory_;
+  std::vector<Memory*>  pinned_memory_;   //!< Pinned memory object
 
  public:
   OneMemoryArgCommand(HostQueue& queue, cl_command_type type, const EventWaitList& eventWaitList,
@@ -477,8 +511,13 @@ class OneMemoryArgCommand : public Command {
     memory_->release();
     DEBUG_ONLY(memory_ = NULL);
     Command::releaseResources();
+    for (auto it : pinned_memory_) {
+      it->release();
+    }
   }
 
+  //! Adds pinned memory, used in this command for later release
+  virtual void AddPinnedMemory(Memory* pinned) override { pinned_memory_.push_back(pinned); }
   bool validateMemory();
   bool validatePeerMemory();
 };
diff --git a/rocclr/platform/commandqueue.cpp b/rocclr/platform/commandqueue.cpp
index 665e5313f8..6e8d5a96ad 100644
--- a/rocclr/platform/commandqueue.cpp
+++ b/rocclr/platform/commandqueue.cpp
@@ -173,6 +173,9 @@ void HostQueue::finish(bool cpu_wait) {
         (vdev()->QueuedAsyncHandlers().load() > DEBUG_HIP_BLOCK_SYNC)) {
       cpu_wait = true;
     }
+  } else {
+    // Force CPU wait for OpenCL, since the tests may check OCL command status after finish
+    cpu_wait = true;
   }
 
   size_t batchSize = GetSubmissionBatchSize();