Merge commit '2305f8ae565389cfc348a747c467c004909d7a90' into develop

2025-08-13 00:24:58 +00:00
parent 0ba726f8fc 2305f8ae56
commit fff63d428e
@@ -781,6 +781,16 @@ __BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hadd(const __hip_bfloat16 a, const
  return (__bf16)a + (__bf16)b;
 }

+/**
+ * \ingroup HIP_INTRINSIC_BFLOAT16_ARITH
+ * \brief Adds two bfloat16 values, will not fuse into fma
+ */
+__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hadd_rn(const __hip_bfloat16 a,
+                                                     const __hip_bfloat16 b) {
+#pragma clang fp contract(off)
+  return (__bf16)a + (__bf16)b;
+}
+
 /**
 * \ingroup HIP_INTRINSIC_BFLOAT16_ARITH
 * \brief Subtracts two bfloat16 values
@@ -789,6 +799,16 @@ __BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hsub(const __hip_bfloat16 a, const
  return (__bf16)a - (__bf16)b;
 }

+/**
+ * \ingroup HIP_INTRINSIC_BFLOAT16_ARITH
+ * \brief Subtracts two bfloat16 values, will not fuse into fma
+ */
+__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hsub_rn(const __hip_bfloat16 a,
+                                                     const __hip_bfloat16 b) {
+#pragma clang fp contract(off)
+  return (__bf16)a - (__bf16)b;
+}
+
 /**
 * \ingroup HIP_INTRINSIC_BFLOAT16_ARITH
 * \brief Divides two bfloat16 values
@@ -815,6 +835,16 @@ __BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hmul(const __hip_bfloat16 a, const
  return (__bf16)a * (__bf16)b;
 }

+/**
+ * \ingroup HIP_INTRINSIC_BFLOAT16_ARITH
+ * \brief Multiplies two bfloat16 values, will not fuse into fma
+ */
+__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hmul_rn(const __hip_bfloat16 a,
+                                                     const __hip_bfloat16 b) {
+#pragma clang fp contract(off)
+  return (__bf16)a * (__bf16)b;
+}
+
 /**
 * \ingroup HIP_INTRINSIC_BFLOAT16_ARITH
 * \brief Negate a bfloat16 value
@@ -861,6 +891,16 @@ __BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hadd2(const __hip_bfloat162 a,
  return __hip_bfloat162{__bf16_2(a) + __bf16_2(b)};
 }

+/**
+ * \ingroup HIP_INTRINSIC_BFLOAT162_ARITH
+ * \brief Adds two bfloat162 values, will not fuse into fma
+ */
+__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hadd2_rn(const __hip_bfloat162 a,
+                                                       const __hip_bfloat162 b) {
+#pragma clang fp contract(off)
+  return __hip_bfloat162{__bf16_2(a) + __bf16_2(b)};
+}
+
 /**
 * \ingroup HIP_INTRINSIC_BFLOAT162_ARITH
 * \brief Performs FMA of given bfloat162 values
@@ -879,6 +919,16 @@ __BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hmul2(const __hip_bfloat162 a,
  return __hip_bfloat162{__bf16_2(a) * __bf16_2(b)};
 }

+/**
+ * \ingroup HIP_INTRINSIC_BFLOAT162_ARITH
+ * \brief Multiplies two bfloat162 values, will not fuse into fma
+ */
+__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hmul2_rn(const __hip_bfloat162 a,
+                                                    const __hip_bfloat162 b) {
+#pragma clang fp contract(off)
+  return __hip_bfloat162{__bf16_2(a) * __bf16_2(b)};
+}
+
 /**
 * \ingroup HIP_INTRINSIC_BFLOAT162_ARITH
 * \brief Converts a bfloat162 into negative
@@ -896,6 +946,16 @@ __BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hsub2(const __hip_bfloat162 a,
  return __hip_bfloat162{__bf16_2(a) - __bf16_2(b)};
 }

+/**
+ * \ingroup HIP_INTRINSIC_BFLOAT162_ARITH
+ * \brief Subtracts two bfloat162 values, will not fuse into fma
+ */
+__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hsub2_rn(const __hip_bfloat162 a,
+                                                    const __hip_bfloat162 b) {
+#pragma clang fp contract(off)
+  return __hip_bfloat162{__bf16_2(a) - __bf16_2(b)};
+}
+
 /**
 * \ingroup HIP_INTRINSIC_BFLOAT16_ARITH
 * \brief Operator to multiply two __hip_bfloat16 numbers
@@ -1367,6 +1367,15 @@ THE SOFTWARE.
                return __half_raw{
                    static_cast<__half_raw>(x).data +
                    static_cast<__half_raw>(y).data};
+            }
+            inline
+            __HOST_DEVICE__
+            __half __hadd_rn(__half x, __half y)
+            {
+#pragma clang fp contract(off)
+                return __half_raw{
+                    static_cast<__half_raw>(x).data +
+                    static_cast<__half_raw>(y).data};
            }
 	        inline
 	        __HOST_DEVICE__
@@ -1390,6 +1399,15 @@ THE SOFTWARE.
            }
            inline
            __HOST_DEVICE__
+            __half __hsub_rn(__half x, __half y)
+            {
+#pragma clang fp contract(off)
+                return __half_raw{
+                    static_cast<__half_raw>(x).data -
+                    static_cast<__half_raw>(y).data};
+            }
+            inline
+            __HOST_DEVICE__
            __half __hmul(__half x, __half y)
            {
                return __half_raw{
@@ -1398,6 +1416,15 @@ THE SOFTWARE.
            }
            inline
            __HOST_DEVICE__
+            __half __hmul_rn(__half x, __half y)
+            {
+#pragma clang fp contract(off)
+                return __half_raw{
+                    static_cast<__half_raw>(x).data *
+                    static_cast<__half_raw>(y).data};
+            }
+            inline
+            __HOST_DEVICE__
            __half __hadd_sat(__half x, __half y)
            {
                return __clamp_01(__hadd(x, y));
@@ -1446,6 +1473,16 @@ THE SOFTWARE.
                    static_cast<__half2_raw>(x).data +
                    static_cast<__half2_raw>(y).data};
            }
+            inline
+            __HOST_DEVICE__
+            __half2 __hadd2_rn(__half2 x, __half2 y)
+            {
+#pragma clang fp contract(off)
+                return __half2{
+                    static_cast<__half2_raw>(x).data +
+                    static_cast<__half2_raw>(y).data};
+            }
+
 	        inline
 	        __HOST_DEVICE__
 	        __half2 __habs2(__half2 x)
@@ -1462,6 +1499,15 @@ THE SOFTWARE.
            }
            inline
            __HOST_DEVICE__
+            __half2 __hsub2_rn(__half2 x, __half2 y)
+            {
+#pragma clang fp contract(off)
+                return __half2{
+                    static_cast<__half2_raw>(x).data -
+                    static_cast<__half2_raw>(y).data};
+            }
+            inline
+            __HOST_DEVICE__
            __half2 __hmul2(__half2 x, __half2 y)
            {
                return __half2{
@@ -1470,6 +1516,15 @@ THE SOFTWARE.
            }
            inline
            __HOST_DEVICE__
+            __half2 __hmul2_rn(__half2 x, __half2 y)
+            {
+#pragma clang fp contract(off)
+                return __half2{
+                    static_cast<__half2_raw>(x).data *
+                    static_cast<__half2_raw>(y).data};
+            }
+            inline
+            __HOST_DEVICE__
            __half2 __hadd2_sat(__half2 x, __half2 y)
            {
                auto r = static_cast<__half2_raw>(__hadd2(x, y));
@@ -51,8 +51,8 @@ THE SOFTWARE.
    #include <type_traits>
 #endif // defined(__HIPCC_RTC__)

-template <class T, int N> constexpr __hip_internal::size_t __hip_vec_align_v() {
-  return (N == 4 && alignof(T) == 8) ? 16 : N * alignof(T);
+template <class T, int _hip_N> constexpr __hip_internal::size_t __hip_vec_align_v() {
+  return (_hip_N == 4 && alignof(T) == 8) ? 16 : _hip_N * alignof(T);
 }

 template <typename T, unsigned int n> struct HIP_vector_base;
@@ -199,16 +199,16 @@ struct integer_sequence {
 template<size_t... Ints>
 using index_sequence = integer_sequence<size_t, Ints...>;

-template<size_t N, size_t... Ints>
-struct make_index_sequence_impl : make_index_sequence_impl<N - 1, N - 1, Ints...> {};
+template <size_t _hip_N, size_t... Ints>
+struct make_index_sequence_impl : make_index_sequence_impl<_hip_N - 1, _hip_N - 1, Ints...> {};

 template<size_t... Ints>
 struct make_index_sequence_impl<0, Ints...> {
    using type = index_sequence<Ints...>;
 };

-template<size_t N>
-using make_index_sequence = typename make_index_sequence_impl<N>::type;
+template <size_t _hip_N>
+using make_index_sequence = typename make_index_sequence_impl<_hip_N>::type;

 template <size_t... Ints>
 constexpr index_sequence<Ints...> make_index_sequence_value(index_sequence<Ints...>) {
@@ -284,9 +284,7 @@ hipError_t StatCO::removeFatBinary(FatBinaryInfo** module) {
    for (auto& hostVar : hostVarsIter->second) {
      auto varIter = vars_.find(hostVar);
      if (varIter == vars_.end()) {
-        LogPrintfError(
-          "removeFatBinary: Unable to find module 0x%x hostVar 0x%x",
-          module, hostVar);
+        LogPrintfError("removeFatBinary: Unable to find module 0x%x hostVar 0x%x", module, hostVar);
      } else {
        delete varIter->second;
        vars_.erase(varIter);
@@ -325,8 +323,8 @@ hipError_t StatCO::removeFatBinary(FatBinaryInfo** module) {
    for (auto& hostFunc : hostFuncsIter->second) {
      auto funcIter = functions_.find(hostFunc);
      if (funcIter == functions_.end()) {
-        LogPrintfError("removeFatBinary: Unable to find module 0x%x hostFunc 0x%x",
-                       module, hostFunc);
+        LogPrintfError("removeFatBinary: Unable to find module 0x%x hostFunc 0x%x", module,
+                       hostFunc);
      } else {
        delete funcIter->second;
        functions_.erase(funcIter);
@@ -343,8 +341,8 @@ hipError_t StatCO::removeFatBinary(FatBinaryInfo** module) {
      delete moduleIter->second;
      modules_.erase(moduleIter);
    } else {
-      LogPrintfError("removeFatBinary: Unable to find module 0x%x via hostModule 0x%x",
-                    module, hostModule);
+      LogPrintfError("removeFatBinary: Unable to find module 0x%x via hostModule 0x%x", module,
+                     hostModule);
    }
    module_to_hostModule_.erase(hostModuleIter);
  }
@@ -383,7 +381,7 @@ hipError_t StatCO::getStatFunc(hipFunction_t* hfunc, const void* hostFunction, i
  }

  // Lazy load
-  FatBinaryInfo **module = it->second->moduleInfo();
+  FatBinaryInfo** module = it->second->moduleInfo();
  if (*(module) == nullptr) {
    amd::ScopedLock lock(sclock_);
    if (*(module) == nullptr) {
@@ -405,7 +403,7 @@ hipError_t StatCO::getStatFuncAttr(hipFuncAttributes* func_attr, const void* hos
  }

  // Lazy load
-  FatBinaryInfo **module = it->second->moduleInfo();
+  FatBinaryInfo** module = it->second->moduleInfo();
  if (*(module) == nullptr) {
    hipError_t err = digestFatBinary(module_to_hostModule_[module], *module);
    assert(err == hipSuccess);
@@ -437,7 +435,7 @@ hipError_t StatCO::getStatGlobalVar(const void* hostVar, int deviceId, hipDevice
  }

  // Lazy load
-  FatBinaryInfo **module = it->second->moduleInfo();
+  FatBinaryInfo** module = it->second->moduleInfo();
  if (*(module) == nullptr) {
    hipError_t err = digestFatBinary(module_to_hostModule_[module], *module);
    assert(err == hipSuccess);
@@ -464,7 +462,7 @@ hipError_t StatCO::initStatManagedVarDevicePtr(int deviceId) {
    for (auto& vecIter : managedVars_) {
      for (auto& var : vecIter.second) {
        // Lazy load
-        FatBinaryInfo **module = var->moduleInfo();
+        FatBinaryInfo** module = var->moduleInfo();
        if (*(module) == nullptr) {
          err = digestFatBinary(module_to_hostModule_[module], *module);
          assert(err == hipSuccess);
@@ -200,6 +200,9 @@ class IPCEvent : public Event {
      if (!amd::Os::MemoryUnmapFile(ipc_evt_.ipc_shmem_, sizeof(hip::ihipIpcEventShmem_t))) {
        // print hipErrorInvalidHandle;
      }
+      if (owners == 0) {
+        amd::Os::shm_unlink(ipc_evt_.ipc_name_);
+      }
    }
  }
  IPCEvent() : Event(hipEventInterprocess) {}
@@ -251,15 +251,16 @@ RUNTIME_ENTRY_RET(cl_event, clCreateUserEvent, (cl_context context, cl_int* errc
    return (cl_event)0;
  }

-  amd::Event* event = new amd::UserEvent(*as_amd(context));
-  if (event == NULL) {
+  auto event = new amd::UserEvent(*as_amd(context));
+  if (event == nullptr || !event->Create()) {
+    delete event;
    *not_null(errcode_ret) = CL_OUT_OF_HOST_MEMORY;
    return (cl_event)0;
  }

  event->retain();
  *not_null(errcode_ret) = CL_SUCCESS;
-  return as_cl(event);
+  return as_cl(reinterpret_cast<amd::Event*>(event));
 }
 RUNTIME_EXIT

@@ -288,8 +289,8 @@ RUNTIME_ENTRY(cl_int, clSetUserEventStatus, (cl_event event, cl_int execution_st
  if (execution_status > CL_COMPLETE) {
    return CL_INVALID_VALUE;
  }
-
-  if (!as_amd(event)->setStatus(execution_status)) {
+  auto user_event = reinterpret_cast<amd::UserEvent*>(as_amd(event));
+  if (!user_event->SetExecutionStatus(execution_status)) {
    return CL_INVALID_OPERATION;
  }
  return CL_SUCCESS;
@@ -95,6 +95,7 @@ class StreamOperationCommand;
 class BatchMemoryOperationCommand;
 class VirtualMapCommand;
 class ExternalSemaphoreCmd;
+class UserEvent;
 class Isa;
 class Device;
 struct KernelParameterDescriptor;
@@ -1329,6 +1330,7 @@ class VirtualDevice : public amd::HeapObject {
    ShouldNotReachHere();
  }
  virtual void submitVirtualMap(amd::VirtualMapCommand& cmd) { ShouldNotReachHere(); }
+  virtual void submitUserEvent(amd::UserEvent& vcmd) { ShouldNotReachHere(); }

  virtual address allocKernelArguments(size_t size, size_t alignment) { return nullptr; }
  virtual void ReleaseAllHwQueues() {}
@@ -2037,6 +2039,13 @@ class Device : public RuntimeObject {
    return (info().svmCapabilities_ & CL_DEVICE_SVM_ATOMICS) != 0 ? true : false;
  }

+  /// @brief  Creates HW user event for OpenCL implementation
+  /// @return The pointer to a HW event structure, known to the HW backend
+  virtual bool CreateUserEvent(amd::UserEvent* event) const { return false; }
+
+  /// @brief  Sets HW user event to the complete status
+  virtual void SetUserEvent(amd::UserEvent* event) const {}
+
  //! Returns TRUE if the device is available for computations
  bool isOnline() const { return online_; }

@@ -54,8 +54,8 @@ bool DmaBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
                                const amd::Coord3D& origin, const amd::Coord3D& size,
                                bool entire, amd::CopyMetadata copyMetadata) const {
  // Use host copy if memory has direct access
-  if (setup_.disableReadBuffer_ ||
-      (srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached())) {
+  if (dev().settings().blocking_blit_ && (setup_.disableReadBuffer_ ||
+      (srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached()))) {
    // Stall GPU before CPU access
    gpu().releaseGpuMemoryFence();
    return HostBlitManager::readBuffer(srcMemory, dstHost, origin, size, entire, copyMetadata);
@@ -138,8 +138,9 @@ bool DmaBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemory,
                                 const amd::Coord3D& origin, const amd::Coord3D& size,
                                 bool entire, amd::CopyMetadata copyMetadata) const {
  // Use host copy if memory has direct access
-  if (setup_.disableWriteBuffer_ || dstMemory.isHostMemDirectAccess() ||
-      gpuMem(dstMemory).IsPersistentDirectMap()) {
+  if (dev().settings().blocking_blit_ &&
+      (setup_.disableWriteBuffer_ || dstMemory.isHostMemDirectAccess() ||
+       gpuMem(dstMemory).IsPersistentDirectMap())) {
    // Stall GPU before CPU access
    gpu().releaseGpuMemoryFence();
    return HostBlitManager::writeBuffer(srcHost, dstMemory, origin, size, entire, copyMetadata);
@@ -685,6 +686,7 @@ void DmaBlitManager::getBuffer(const_address hostMem, size_t size,
  buffState.buffer_ = gpu().Staging().Acquire(std::min(xferSize, StagingXferSize));
 }

+// ================================================================================================
 void DmaBlitManager::releaseBuffer(BufferState &buffer) const {
  if (buffer.pinnedMem_) {
    gpu().addPinnedMem(buffer.pinnedMem_);
@@ -696,7 +698,7 @@ bool DmaBlitManager::hsaCopyStagedOrPinned(const_address hostSrc, address hostDs
                size_t size, bool hostToDev, amd::CopyMetadata& copyMetadata,
                bool enablePin) const {
  // Do not skip wait here for D2H. Resolving dependent signals for SDMA engine is slow
-  gpu().releaseGpuMemoryFence(hostToDev);
+  gpu().releaseGpuMemoryFence(hostToDev || !dev().settings().blocking_blit_);
  // If Pinning is enabled, Pin host Memory for copy size > MinSizeForPinnedTransfer
  // For 16KB < size <= MinSizeForPinnedTransfer Use staging buffer without pinning
  bool status = true;
@@ -740,9 +742,9 @@ bool DmaBlitManager::hsaCopyStagedOrPinned(const_address hostSrc, address hostDs
              copyMetadata.isAsync_);
      const_address src = static_cast<const_address>(hostSrc) + copyOffset;
      status = rocrCopyBuffer(stagingBuffer, dstAgent, src , srcAgent, copysize, copyMetadata);
-      if (status ) {
-        // Wait for current signal of previous rocr copy if its not pinned mem
+      if (status) {
        if (outBuffer.pinnedMem_ == nullptr) {
+          // Wait for current signal of previous rocr copy if its not pinned mem
          gpu().Barriers().WaitCurrent();
          ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "memcpy host dst=%p, stg buf=%p, size=%zu",
                  hostDst + copyOffset, stagingBuffer, copysize);
@@ -752,6 +754,7 @@ bool DmaBlitManager::hsaCopyStagedOrPinned(const_address hostSrc, address hostDs
        break;
      }
    }
+
    // Release Pinned Memory back to pool if any
    releaseBuffer(outBuffer);
    // Update Offset and Transfer Size
@@ -760,12 +763,18 @@ bool DmaBlitManager::hsaCopyStagedOrPinned(const_address hostSrc, address hostDs
    firstTx = false;
  }

+  // @note: HIP requires a blocking wait on D2H with the pageable system memory
+  if (amd::IS_HIP && !hostToDev) {
+    gpu().Barriers().WaitCurrent();
+  }
+
  if(!status) {
    return false;
  }

  return true;
 }
+
 // ================================================================================================
 KernelBlitManager::KernelBlitManager(VirtualGPU& gpu, Setup setup)
    : DmaBlitManager(gpu, setup),
@@ -1718,8 +1727,9 @@ bool KernelBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
  bool result = false;

  // Use host copy if memory has direct access
-  if (setup_.disableReadBuffer_ || (srcMemory.isHostMemDirectAccess() &&
-      !srcMemory.isCpuUncached())) {
+  if (dev().settings().blocking_blit_ &&
+      (setup_.disableReadBuffer_ || (srcMemory.isHostMemDirectAccess() &&
+      !srcMemory.isCpuUncached()))) {
    // Stall GPU before CPU access
    gpu().releaseGpuMemoryFence();
    result = HostBlitManager::readBuffer(srcMemory, dstHost, origin, size, entire, copyMetadata);
@@ -1854,8 +1864,9 @@ bool KernelBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemo
  bool result = false;

  // Use host copy if memory has direct access
-  if (setup_.disableWriteBuffer_ || dstMemory.isHostMemDirectAccess() ||
-      gpuMem(dstMemory).IsPersistentDirectMap()) {
+  if (dev().settings().blocking_blit_ &&
+      (setup_.disableWriteBuffer_ || dstMemory.isHostMemDirectAccess() ||
+       gpuMem(dstMemory).IsPersistentDirectMap())) {
    // Stall GPU before CPU access
    gpu().releaseGpuMemoryFence();
    result = HostBlitManager::writeBuffer(srcHost, dstMemory, origin, size, entire, copyMetadata);
@@ -2718,10 +2729,9 @@ void KernelBlitManager::releaseArguments(address args) const {
 }

 // ================================================================================================
-bool KernelBlitManager::runScheduler(uint64_t vqVM, amd::Memory* schedulerParam,
+bool KernelBlitManager::runScheduler(uint64_t vqVM,
                                     hsa_queue_t* schedulerQueue,
-                                     hsa_signal_t& schedulerSignal,
-                                     uint threads) {
+                                     uint threads, uint64_t aql_wrap) {
  size_t globalWorkOffset[1] = {0};
  size_t globalWorkSize[1] = {threads};
  size_t localWorkSize[1] = {1};
@@ -2731,21 +2741,16 @@ bool KernelBlitManager::runScheduler(uint64_t vqVM, amd::Memory* schedulerParam,
  device::Kernel* devKernel = const_cast<device::Kernel*>(kernels_[Scheduler]->getDeviceKernel(dev()));
  Kernel& gpuKernel = static_cast<Kernel&>(*devKernel);

-  SchedulerParam* sp = reinterpret_cast<SchedulerParam*>(schedulerParam->getHostMem());
+  auto* sp = reinterpret_cast<SchedulerParam*>(
+    gpu().allocKernArg(sizeof(SchedulerParam), kCBAlignment));
  memset(sp, 0, sizeof(SchedulerParam));

-  Memory* schedulerMem = dev().getRocMemory(schedulerParam);
-  sp->kernarg_address = reinterpret_cast<uint64_t>(schedulerMem->getDeviceMemory());
+  sp->kernarg_address = reinterpret_cast<uint64_t>(sp);
  sp->thread_counter = 0;
  sp->child_queue = reinterpret_cast<uint64_t>(schedulerQueue);
-  sp->complete_signal = schedulerSignal;
-
-  hsa_signal_store_relaxed(schedulerSignal, kInitSignalValueOne);
-
-
+  sp->complete_signal = gpu().Barriers().ActiveSignal(kInitSignalValueOne, nullptr);
  sp->vqueue_header = vqVM;
-
-  sp->parentAQL = sp->kernarg_address + sizeof(SchedulerParam);
+  sp->parentAQL = reinterpret_cast<uint64_t>(aql_wrap);

  if (dev().info().maxEngineClockFrequency_ > 0) {
    sp->eng_clk = (1000 * 1024) / dev().info().maxEngineClockFrequency_;
@@ -2754,8 +2759,8 @@ bool KernelBlitManager::runScheduler(uint64_t vqVM, amd::Memory* schedulerParam,
  // Use a device side global atomics to workaround the reliance of PCIe 3 atomics
  sp->write_index = hsa_queue_load_write_index_relaxed(schedulerQueue);

-  cl_mem mem = as_cl<amd::Memory>(schedulerParam);
-  setArgument(kernels_[Scheduler], 0, sizeof(cl_mem), &mem);
+  constexpr bool kDirectVa = true;
+  setArgument(kernels_[Scheduler], 0, sizeof(cl_mem), sp, 0, nullptr, kDirectVa);

  address parameters = captureArguments(kernels_[Scheduler]);

@@ -2764,12 +2769,17 @@ bool KernelBlitManager::runScheduler(uint64_t vqVM, amd::Memory* schedulerParam,
    return false;
  }
  releaseArguments(parameters);
-
-  if (!WaitForSignal(schedulerSignal)) {
+  // Wait for the scheduler to finish all operations
+  gpu().WaitCompleteSignal(sp->complete_signal);
+  // @note: A wait shouldn't be really necessary, but the queue write_index may not get a proper
+  // value without the wait for all previous commands (see the PCIE3 atomics workaround above).
+  // The scheduler can enqueue extra commands, but the real queue write index didn't have any
+  // progress. That leads to hangs and requires blocking. Then the wait causes problems in DD mode
+  // with device enqueue and user events, because device enqueue is blocking below
+  if (!WaitForSignal(sp->complete_signal)) {
    LogWarning("Failed schedulerSignal wait");
    return false;
  }
-
  return true;
 }

@@ -492,10 +492,8 @@ class KernelBlitManager : public DmaBlitManager {
                         ) const;

  bool runScheduler(uint64_t vqVM,
-                    amd::Memory* schedulerParam,
                    hsa_queue_t* schedulerQueue,
-                    hsa_signal_t& schedulerSignal,
-                    uint threads);
+                    uint threads, uint64_t aql_wrap);

  //! Runs a blit kernel for GWS init
  bool RunGwsInit(uint32_t value             //!< Initial value for GWS resource
@@ -3464,6 +3464,25 @@ void Device::ReleaseGlobalSignal(void* signal) const {
  }
 }

+// ================================================================================================
+bool Device::CreateUserEvent(amd::UserEvent* event) const {
+  std::unique_ptr<ProfilingSignal> signal(new ProfilingSignal());
+  if ((signal == nullptr) ||
+      (HSA_STATUS_SUCCESS != hsa_signal_create(0, 0, nullptr, &signal->signal_))) {
+    return false;
+  }
+  hsa_signal_silent_store_relaxed(signal->signal_, kInitSignalValueOne);
+  event->SetHwEvent(signal.release());
+  return true;
+}
+
+// ================================================================================================
+void Device::SetUserEvent(amd::UserEvent* event) const {
+  auto signal = reinterpret_cast<ProfilingSignal*>(event->HwEvent());
+  assert(signal != nullptr && "Can't have user event without hw event!");
+  hsa_signal_silent_store_relaxed(signal->signal_, 0);
+}
+
 // ================================================================================================
 bool Device::IsValidAllocation(const void* dev_ptr, size_t size, hsa_amd_pointer_info_t* ptr_info) {
  // Query ptr type to see if it's a HMM allocation
@@ -450,6 +450,8 @@ class Device : public NullDevice {
                              uint32_t hip_event_flags = 0) const;
  virtual void getHwEventTime(const amd::Event& event, uint64_t* start, uint64_t* end) const;
  virtual void ReleaseGlobalSignal(void* signal) const;
+  virtual bool CreateUserEvent(amd::UserEvent* event) const;
+  virtual void SetUserEvent(amd::UserEvent* event) const;

  //! Allocate host memory in terms of numa policy set by user
  void* hostNumaAlloc(size_t size, size_t alignment, MemorySegment mem_seg) const;
@@ -97,6 +97,8 @@ Settings::Settings() {
  limit_blit_wg_ = 16;

  dynamic_queues_ = amd::IS_HIP ? DEBUG_HIP_DYNAMIC_QUEUES : false;
+  // note: OCL user events don't allow CPU blocking calls in DD mode
+  blocking_blit_ = amd::IS_HIP || !AMD_DIRECT_DISPATCH;
 }

 // ================================================================================================
@@ -51,7 +51,8 @@ class Settings : public device::Settings {
      uint fgs_kernel_arg_ : 1;         //!< Use fine grain kernel arg segment
      uint barrier_value_packet_ : 1;   //!< Barrier value packet functionality
      uint dynamic_queues_ : 1;         //!< Dynamic queues management
-      uint reserved_ : 22;
+      uint blocking_blit_ : 1;          //!< Blit ops can be blocking on CPU
+      uint reserved_ : 21;
    };
    uint value_;
  };
@@ -1209,6 +1209,12 @@ bool VirtualGPU::dispatchCounterAqlPacket(hsa_ext_amd_aql_pm4_packet_t* packet,
  return false;
 }

+// ================================================================================================
+void VirtualGPU::WaitCompleteSignal(hsa_signal_t signal) {
+  barrier_packet_.dep_signal[0] = signal;
+  dispatchBarrierPacket(kBarrierPacketHeader, false);
+}
+
 // ================================================================================================
 void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal,
                                       hsa_signal_t signal) {
@@ -1413,9 +1419,7 @@ VirtualGPU::VirtualGPU(Device& device, bool profiling, bool cooperative,
      deviceQueueSize_(0),
      maskGroups_(0),
      schedulerThreads_(0),
-      schedulerParam_(nullptr),
      schedulerQueue_(nullptr),
-      schedulerSignal_({0}),
      barriers_(*this),
      managed_buffer_(*this, ManagedBuffer::kPoolNumSignals * device.settings().stagedXferSize_),
      managed_kernarg_buffer_(*this, device.settings().kernargPoolSize_),
@@ -1490,18 +1494,10 @@ VirtualGPU::~VirtualGPU() {

  delete printfdbg_;

-  if (0 != schedulerSignal_.handle) {
-    hsa_signal_destroy(schedulerSignal_);
-  }
-
  if (nullptr != schedulerQueue_) {
    hsa_queue_destroy(schedulerQueue_);
  }

-  if (nullptr != schedulerParam_) {
-    schedulerParam_->release();
-  }
-
  if (nullptr != virtualQueue_) {
    virtualQueue_->release();
  }
@@ -2604,7 +2600,7 @@ void VirtualGPU::submitMapMemory(amd::MapMemoryCommand& cmd) {
  // If we have host memory, use it
  if ((devMemory->owner()->getHostMem() != nullptr) &&
      (devMemory->owner()->getSvmPtr() == nullptr)) {
-    if (!devMemory->isHostMemDirectAccess()) {
+    if (!AMD_DIRECT_DISPATCH && !devMemory->isHostMemDirectAccess()) {
      // Make sure GPU finished operation before synchronization with the backing store
      releaseGpuMemoryFence();
    }
@@ -3081,18 +3077,11 @@ void VirtualGPU::submitMigrateMemObjects(amd::MigrateMemObjectsCommand& vcmd) {
 // ================================================================================================
 bool VirtualGPU::createSchedulerParam()
 {
-  if (nullptr != schedulerParam_) {
+  if (nullptr != schedulerQueue_) {
    return true;
  }

-  while(true) {
-    schedulerParam_ = new (dev().context()) amd::Buffer(dev().context(),
-      CL_MEM_ALLOC_HOST_PTR, sizeof(SchedulerParam) + sizeof(AmdAqlWrap));
-
-    if ((nullptr != schedulerParam_) && !schedulerParam_->create(nullptr)) {
-      break;
-    }
-
+  while (true) {
    // The queue is written by multiple threads of the scheduler kernel
    if (HSA_STATUS_SUCCESS != hsa_queue_create(gpu_device(), 2048, HSA_QUEUE_TYPE_MULTI,
        callbackQueue, &roc_device_, std::numeric_limits<uint>::max(),
@@ -3100,39 +3089,14 @@ bool VirtualGPU::createSchedulerParam()
      break;
    }

-    hsa_signal_t  signal0 = {0};
-
-    if (HSA_STATUS_SUCCESS != hsa_signal_create(0, 0, nullptr, &signal0)) {
-      break;
-    }
-
-    schedulerSignal_ = signal0;
-
-    Memory* schedulerMem = dev().getRocMemory(schedulerParam_);
-
-    if (nullptr == schedulerMem) {
-      break;
-    }
-
-    schedulerParam_->setVirtualDevice(this);
    return true;
  }

-  if (0 != schedulerSignal_.handle) {
-    hsa_signal_destroy(schedulerSignal_);
-    schedulerSignal_.handle = 0;
-  }
-
  if (nullptr != schedulerQueue_) {
    hsa_queue_destroy(schedulerQueue_);
    schedulerQueue_ = nullptr;
  }

-  if (nullptr != schedulerParam_) {
-    schedulerParam_->release();
-    schedulerParam_ = nullptr;
-  }
-
  return false;
 }

@@ -3355,7 +3319,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
    global[i] = static_cast<uint32_t>(sizes.global()[i]);
    local[i] = static_cast<uint16_t>(local_size[i]);
  }
-
+  uint64_t spVA = 0;
  // Check if runtime has to setup hidden arguments
  for (uint32_t i = signature.numParameters(); i < signature.numParametersAll(); ++i) {
    const auto& it = signature.at(i);
@@ -3415,14 +3379,12 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
        break;
      }
      case amd::KernelParameterDescriptor::HiddenCompletionAction: {
-        uint64_t spVA = 0;
-        if (nullptr != schedulerParam_ && devKernel->dynamicParallelism()) {
-          Memory* schedulerMem = dev().getRocMemory(schedulerParam_);
-          AmdAqlWrap* wrap = reinterpret_cast<AmdAqlWrap*>(
-                             reinterpret_cast<uint64_t>(schedulerParam_->getHostMem()) + sizeof(SchedulerParam));
+        if (devKernel->dynamicParallelism()) {
+          auto params = allocKernArg(sizeof(AmdAqlWrap), 64);
+          AmdAqlWrap* wrap = reinterpret_cast<AmdAqlWrap*>(params);
          memset(wrap, 0, sizeof(AmdAqlWrap));
          wrap->state = AQL_WRAP_DONE;
-          spVA = reinterpret_cast<uint64_t>(schedulerMem->getDeviceMemory()) + sizeof(SchedulerParam);
+          spVA = reinterpret_cast<uint64_t>(wrap);
        }
        WriteAqlArgAt(hidden_arguments, spVA, it.size_, it.offset_);
        break;
@@ -3651,8 +3613,8 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
    dispatchBarrierPacket(kBarrierPacketHeader, true);
    if (virtualQueue_ != nullptr) {
      static_cast<KernelBlitManager&>(blitMgr()).runScheduler(
-          getVQVirtualAddress(), schedulerParam_, schedulerQueue_,
-          schedulerSignal_, schedulerThreads_);
+          getVQVirtualAddress(), schedulerQueue_,
+          schedulerThreads_, spVA);
    }
  }

@@ -3833,10 +3795,10 @@ void VirtualGPU::flush(amd::Command* list, bool wait) {

 // ================================================================================================
 void VirtualGPU::addPinnedMem(amd::Memory* mem) {
-  //! @note: ROCr backend doesn't have per resource busy tracking, hence runtime has to wait
-  //!        unconditionally, before it can release pinned memory
-  releaseGpuMemoryFence();
  if (!AMD_DIRECT_DISPATCH) {
+    //! @note: ROCr backend doesn't have per resource busy tracking, hence runtime has to wait
+    //!        unconditionally, before it can release pinned memory
+    releaseGpuMemoryFence();
    if (nullptr == findPinnedMem(mem->getHostMem(), mem->getSize())) {
      if (pinnedMems_.size() > 7) {
        pinnedMems_.front()->release();
@@ -3847,7 +3809,14 @@ void VirtualGPU::addPinnedMem(amd::Memory* mem) {
      pinnedMems_.push_back(mem);
    }
  } else {
-    mem->release();
+    if (command_ != nullptr) {
+      command_->AddPinnedMemory(mem);
+    } else {
+      //! @note: ROCr backend doesn't have per resource busy tracking, hence runtime has to wait
+      //!        unconditionally, before it can release pinned memory
+      releaseGpuMemoryFence();
+      mem->release();
+    }
  }
 }

@@ -450,8 +450,9 @@ class VirtualGPU : public device::VirtualDevice {
  void* allocKernArg(size_t size, size_t alignment);
  bool isFenceDirty() const { return fence_dirty_; }
  void setFenceDirty(bool state) { fence_dirty_ = state; }
-  void HiddenHeapInit();
+  void WaitCompleteSignal(hsa_signal_t signal);

+  void HiddenHeapInit();
  void setLastUsedSdmaEngine(uint32_t mask) { lastUsedSdmaEngineMask_ = mask; }
  uint32_t getLastUsedSdmaEngine() const { return lastUsedSdmaEngineMask_.load(); }
  uint64_t getQueueID();
@@ -592,9 +593,7 @@ class VirtualGPU : public device::VirtualDevice {
                                  //!< one thread
  uint schedulerThreads_;         //!< The number of scheduler threads

-  amd::Memory* schedulerParam_;
  hsa_queue_t* schedulerQueue_;
-  hsa_signal_t schedulerSignal_;

  HwQueueTracker  barriers_;      //!< Tracks active barriers in ROCr

@@ -285,6 +285,9 @@ class Os : AllStatic {
  //! Deletes file
  static int unlink(const std::string& path);

+  //! Removes the shared memory object name
+  static int shm_unlink(const std::string& path);
+
  // Library routines:
  //
  typedef bool (*SymbolCallback)(std::string, const void*, void*);
@@ -650,6 +650,7 @@ std::string Os::getTempFileName() {
 }

 int Os::unlink(const std::string& path) { return ::unlink(path.c_str()); }
+int Os::shm_unlink(const std::string& path) { return ::shm_unlink(path.c_str()); }

 #if defined(ATI_ARCH_X86)
 void Os::cpuid(int regs[4], int info) {
@@ -505,6 +505,9 @@ std::string Os::getTempFileName() {

 int Os::unlink(const std::string& path) { return ::_unlink(path.c_str()); }

+// shm_unlink is a nop on windows
+int Os::shm_unlink(const std::string& path) { return 0; }
+
 void Os::cpuid(int regs[4], int info) { return __cpuid(regs, info); }

 uint64_t Os::xgetbv(uint32_t ecx) { return (uint64_t)_xgetbv(ecx); }
@@ -269,7 +269,9 @@ bool Event::notifyCmdQueue(bool cpu_wait) {
    ScopedLock l(notify_lock_);
    if ((status() > CL_COMPLETE) && (nullptr != queue) &&
        // If HW event was assigned, then notification can be ignored, since a barrier was issued
-        (HwEvent() == nullptr) &&
+        // @note: Force the marker always in OCL for now, since OCL events require precise
+        // sequence of the status update
+        ((HwEvent() == nullptr) || !amd::IS_HIP) &&
        !notified_.test_and_set()) {
      // Make sure the queue is draining the enqueued commands.
      amd::Command* command = new amd::Marker(*queue, false, nullWaitList, this, cpu_wait);
@@ -364,8 +366,17 @@ void Command::enqueue() {

    // Notify all commands about the waiter. Barrier will be sent in order to obtain
    // HSA signal for a wait on the current queue
-    for (const auto& event : eventWaitList()) {
-      event->notifyCmdQueue(!kCpuWait);
+    for (const auto &event: eventWaitList()) {
+      if (!amd::IS_HIP && event->command().type() == CL_COMMAND_USER) {
+        if (event->status() >= CL_COMPLETE) {
+          reinterpret_cast<amd::UserEvent*>(event)->AddDependent(this);
+        } else {
+          setStatus(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST);
+          return;
+        }
+      } else {
+        event->notifyCmdQueue(!kCpuWait);
+      }
    }

    // The batch update must be lock protected to avoid a race condition
@@ -397,6 +397,9 @@ class Command : public Event {
  //! Release the resources associated with this event.
  virtual void releaseResources();

+  //! Empty function for adding pinned memory
+  virtual void AddPinnedMemory(Memory* pinned) {}
+
  //! Set the next GPU command
  void setNext(Command* next) { next_ = next; }

@@ -424,16 +427,46 @@ class Command : public Event {
 };

 class UserEvent : public Command {
-  const Context& context_;
+  const Context&        context_;     //!< OCL context associated with the event
+  std::vector<Command*> dependents_;  //!< Commands, which depends on this user event

 public:
  UserEvent(Context& context) : Command(CL_COMMAND_USER), context_(context) {
    setStatus(CL_SUBMITTED);
  }

+  //! Creates a user event in the backend layer
+  bool Create() {
+    if (AMD_DIRECT_DISPATCH) {
+      return context_.devices()[0]->CreateUserEvent(this);
+    } else {
+      return true;
+    }
+  }
+
+  //! Sets the execution status of the user event
+  bool SetExecutionStatus(cl_int status) {
+    if (AMD_DIRECT_DISPATCH) {
+      // If it's invalid status, then mark dependent commands as invalid
+      if (status < CL_COMPLETE) {
+        for (auto it : dependents_) {
+          it->setStatus(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST);
+        }
+      }
+      dependents_.clear();
+      context_.devices()[0]->SetUserEvent(this);
+    }
+    return setStatus(status);
+  }
+
+  //! Adds dependent commands for the user event
+  void AddDependent(Command* command) {
+    dependents_.push_back(command);
+  }
+
  virtual void submit(device::VirtualDevice& device) { ShouldNotCallThis(); }

-  virtual const Context& context() const { return context_; }
+  const Context& context() const { return context_; }
 };

 class ClGlEvent : public Command {
@@ -450,7 +483,7 @@ class ClGlEvent : public Command {

  bool awaitCompletion() { return waitForFence(); }

-  virtual const Context& context() const { return context_; }
+  const Context& context() const { return context_; }
 };

 inline Command& Event::command() { return *static_cast<Command*>(this); }
@@ -465,6 +498,7 @@ class NDRangeContainer;
 class OneMemoryArgCommand : public Command {
 protected:
  Memory* memory_;
+  std::vector<Memory*>  pinned_memory_;   //!< Pinned memory object

 public:
  OneMemoryArgCommand(HostQueue& queue, cl_command_type type, const EventWaitList& eventWaitList,
@@ -477,8 +511,13 @@ class OneMemoryArgCommand : public Command {
    memory_->release();
    DEBUG_ONLY(memory_ = NULL);
    Command::releaseResources();
+    for (auto it : pinned_memory_) {
+      it->release();
+    }
  }

+  //! Adds pinned memory, used in this command for later release
+  virtual void AddPinnedMemory(Memory* pinned) override { pinned_memory_.push_back(pinned); }
  bool validateMemory();
  bool validatePeerMemory();
 };
@@ -173,6 +173,9 @@ void HostQueue::finish(bool cpu_wait) {
        (vdev()->QueuedAsyncHandlers().load() > DEBUG_HIP_BLOCK_SYNC)) {
      cpu_wait = true;
    }
+  } else {
+    // Force CPU wait for OpenCL, since the tests may check OCL command status after finish
+    cpu_wait = true;
  }

  size_t batchSize = GetSubmissionBatchSize();