From 4d5fe2206d7ba09ffe7fd524007a6a897f8c1b64 Mon Sep 17 00:00:00 2001
From: "Chaudhary, Jatin Jaikishan" <JatinJaikishan.Chaudhary@amd.com>
Date: Tue, 12 Aug 2025 17:28:38 +0100
Subject: [PATCH 1/4] SWDEV-539481 - Add _rn variants of fp16 APIs (#582)

* Add _rn variants of fp16 APIs
* cover bf16 as well
---
 hipamd/include/hip/amd_detail/amd_hip_bf16.h | 60 ++++++++++++++++++++
 hipamd/include/hip/amd_detail/amd_hip_fp16.h | 55 ++++++++++++++++++
 2 files changed, 115 insertions(+)

diff --git a/hipamd/include/hip/amd_detail/amd_hip_bf16.h b/hipamd/include/hip/amd_detail/amd_hip_bf16.h
index a34b75d1a6..db6c161c42 100644
--- a/hipamd/include/hip/amd_detail/amd_hip_bf16.h
+++ b/hipamd/include/hip/amd_detail/amd_hip_bf16.h
@@ -781,6 +781,16 @@ __BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hadd(const __hip_bfloat16 a, const
   return (__bf16)a + (__bf16)b;
 }
 
+/**
+ * \ingroup HIP_INTRINSIC_BFLOAT16_ARITH
+ * \brief Adds two bfloat16 values, will not fuse into fma
+ */
+__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hadd_rn(const __hip_bfloat16 a,
+                                                     const __hip_bfloat16 b) {
+#pragma clang fp contract(off)
+  return (__bf16)a + (__bf16)b;
+}
+
 /**
  * \ingroup HIP_INTRINSIC_BFLOAT16_ARITH
  * \brief Subtracts two bfloat16 values
@@ -789,6 +799,16 @@ __BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hsub(const __hip_bfloat16 a, const
   return (__bf16)a - (__bf16)b;
 }
 
+/**
+ * \ingroup HIP_INTRINSIC_BFLOAT16_ARITH
+ * \brief Subtracts two bfloat16 values, will not fuse into fma
+ */
+__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hsub_rn(const __hip_bfloat16 a,
+                                                     const __hip_bfloat16 b) {
+#pragma clang fp contract(off)
+  return (__bf16)a - (__bf16)b;
+}
+
 /**
  * \ingroup HIP_INTRINSIC_BFLOAT16_ARITH
  * \brief Divides two bfloat16 values
@@ -815,6 +835,16 @@ __BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hmul(const __hip_bfloat16 a, const
   return (__bf16)a * (__bf16)b;
 }
 
+/**
+ * \ingroup HIP_INTRINSIC_BFLOAT16_ARITH
+ * \brief Multiplies two bfloat16 values, will not fuse into fma
+ */
+__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hmul_rn(const __hip_bfloat16 a,
+                                                     const __hip_bfloat16 b) {
+#pragma clang fp contract(off)
+  return (__bf16)a * (__bf16)b;
+}
+
 /**
  * \ingroup HIP_INTRINSIC_BFLOAT16_ARITH
  * \brief Negate a bfloat16 value
@@ -861,6 +891,16 @@ __BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hadd2(const __hip_bfloat162 a,
   return __hip_bfloat162{__bf16_2(a) + __bf16_2(b)};
 }
 
+/**
+ * \ingroup HIP_INTRINSIC_BFLOAT162_ARITH
+ * \brief Adds two bfloat162 values, will not fuse into fma
+ */
+__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hadd2_rn(const __hip_bfloat162 a,
+                                                       const __hip_bfloat162 b) {
+#pragma clang fp contract(off)
+  return __hip_bfloat162{__bf16_2(a) + __bf16_2(b)};
+}
+
 /**
  * \ingroup HIP_INTRINSIC_BFLOAT162_ARITH
  * \brief Performs FMA of given bfloat162 values
@@ -879,6 +919,16 @@ __BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hmul2(const __hip_bfloat162 a,
   return __hip_bfloat162{__bf16_2(a) * __bf16_2(b)};
 }
 
+/**
+ * \ingroup HIP_INTRINSIC_BFLOAT162_ARITH
+ * \brief Multiplies two bfloat162 values, will not fuse into fma
+ */
+__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hmul2_rn(const __hip_bfloat162 a,
+                                                    const __hip_bfloat162 b) {
+#pragma clang fp contract(off)
+  return __hip_bfloat162{__bf16_2(a) * __bf16_2(b)};
+}
+
 /**
  * \ingroup HIP_INTRINSIC_BFLOAT162_ARITH
  * \brief Converts a bfloat162 into negative
@@ -896,6 +946,16 @@ __BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hsub2(const __hip_bfloat162 a,
   return __hip_bfloat162{__bf16_2(a) - __bf16_2(b)};
 }
 
+/**
+ * \ingroup HIP_INTRINSIC_BFLOAT162_ARITH
+ * \brief Subtracts two bfloat162 values, will not fuse into fma
+ */
+__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hsub2_rn(const __hip_bfloat162 a,
+                                                    const __hip_bfloat162 b) {
+#pragma clang fp contract(off)
+  return __hip_bfloat162{__bf16_2(a) - __bf16_2(b)};
+}
+
 /**
  * \ingroup HIP_INTRINSIC_BFLOAT16_ARITH
  * \brief Operator to multiply two __hip_bfloat16 numbers
diff --git a/hipamd/include/hip/amd_detail/amd_hip_fp16.h b/hipamd/include/hip/amd_detail/amd_hip_fp16.h
index 814575ea1e..f53c22cb83 100644
--- a/hipamd/include/hip/amd_detail/amd_hip_fp16.h
+++ b/hipamd/include/hip/amd_detail/amd_hip_fp16.h
@@ -1367,6 +1367,15 @@ THE SOFTWARE.
                 return __half_raw{
                     static_cast<__half_raw>(x).data +
                     static_cast<__half_raw>(y).data};
+            }
+            inline
+            __HOST_DEVICE__
+            __half __hadd_rn(__half x, __half y)
+            {
+#pragma clang fp contract(off)
+                return __half_raw{
+                    static_cast<__half_raw>(x).data +
+                    static_cast<__half_raw>(y).data};
             }
 	        inline
 	        __HOST_DEVICE__
@@ -1390,6 +1399,15 @@ THE SOFTWARE.
             }
             inline
             __HOST_DEVICE__
+            __half __hsub_rn(__half x, __half y)
+            {
+#pragma clang fp contract(off)
+                return __half_raw{
+                    static_cast<__half_raw>(x).data -
+                    static_cast<__half_raw>(y).data};
+            }
+            inline
+            __HOST_DEVICE__
             __half __hmul(__half x, __half y)
             {
                 return __half_raw{
@@ -1398,6 +1416,15 @@ THE SOFTWARE.
             }
             inline
             __HOST_DEVICE__
+            __half __hmul_rn(__half x, __half y)
+            {
+#pragma clang fp contract(off)
+                return __half_raw{
+                    static_cast<__half_raw>(x).data *
+                    static_cast<__half_raw>(y).data};
+            }
+            inline
+            __HOST_DEVICE__
             __half __hadd_sat(__half x, __half y)
             {
                 return __clamp_01(__hadd(x, y));
@@ -1446,6 +1473,16 @@ THE SOFTWARE.
                     static_cast<__half2_raw>(x).data +
                     static_cast<__half2_raw>(y).data};
             }
+            inline
+            __HOST_DEVICE__
+            __half2 __hadd2_rn(__half2 x, __half2 y)
+            {
+#pragma clang fp contract(off)
+                return __half2{
+                    static_cast<__half2_raw>(x).data +
+                    static_cast<__half2_raw>(y).data};
+            }
+
 	        inline
 	        __HOST_DEVICE__
 	        __half2 __habs2(__half2 x)
@@ -1462,6 +1499,15 @@ THE SOFTWARE.
             }
             inline
             __HOST_DEVICE__
+            __half2 __hsub2_rn(__half2 x, __half2 y)
+            {
+#pragma clang fp contract(off)
+                return __half2{
+                    static_cast<__half2_raw>(x).data -
+                    static_cast<__half2_raw>(y).data};
+            }
+            inline
+            __HOST_DEVICE__
             __half2 __hmul2(__half2 x, __half2 y)
             {
                 return __half2{
@@ -1470,6 +1516,15 @@ THE SOFTWARE.
             }
             inline
             __HOST_DEVICE__
+            __half2 __hmul2_rn(__half2 x, __half2 y)
+            {
+#pragma clang fp contract(off)
+                return __half2{
+                    static_cast<__half2_raw>(x).data *
+                    static_cast<__half2_raw>(y).data};
+            }
+            inline
+            __HOST_DEVICE__
             __half2 __hadd2_sat(__half2 x, __half2 y)
             {
                 auto r = static_cast<__half2_raw>(__hadd2(x, y));

From 42730527c6439ce9a8d0af1384d4d172ef37f6ed Mon Sep 17 00:00:00 2001
From: "Chaudhary, Jatin Jaikishan" <JatinJaikishan.Chaudhary@amd.com>
Date: Tue, 12 Aug 2025 17:28:55 +0100
Subject: [PATCH 2/4] SWDEV-545100 - rename the template variable (#798)

---
 hipamd/include/hip/amd_detail/amd_hip_vector_types.h | 4 ++--
 hipamd/include/hip/amd_detail/host_defines.h         | 8 ++++----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/hipamd/include/hip/amd_detail/amd_hip_vector_types.h b/hipamd/include/hip/amd_detail/amd_hip_vector_types.h
index 01849c0b16..702c3f85a6 100644
--- a/hipamd/include/hip/amd_detail/amd_hip_vector_types.h
+++ b/hipamd/include/hip/amd_detail/amd_hip_vector_types.h
@@ -51,8 +51,8 @@ THE SOFTWARE.
     #include <type_traits>
 #endif // defined(__HIPCC_RTC__)
 
-template <class T, int N> constexpr __hip_internal::size_t __hip_vec_align_v() {
-  return (N == 4 && alignof(T) == 8) ? 16 : N * alignof(T);
+template <class T, int _hip_N> constexpr __hip_internal::size_t __hip_vec_align_v() {
+  return (_hip_N == 4 && alignof(T) == 8) ? 16 : _hip_N * alignof(T);
 }
 
 template <typename T, unsigned int n> struct HIP_vector_base;
diff --git a/hipamd/include/hip/amd_detail/host_defines.h b/hipamd/include/hip/amd_detail/host_defines.h
index 5aa38661f2..59e1673dcb 100644
--- a/hipamd/include/hip/amd_detail/host_defines.h
+++ b/hipamd/include/hip/amd_detail/host_defines.h
@@ -199,16 +199,16 @@ struct integer_sequence {
 template<size_t... Ints>
 using index_sequence = integer_sequence<size_t, Ints...>;
 
-template<size_t N, size_t... Ints>
-struct make_index_sequence_impl : make_index_sequence_impl<N - 1, N - 1, Ints...> {};
+template <size_t _hip_N, size_t... Ints>
+struct make_index_sequence_impl : make_index_sequence_impl<_hip_N - 1, _hip_N - 1, Ints...> {};
 
 template<size_t... Ints>
 struct make_index_sequence_impl<0, Ints...> {
     using type = index_sequence<Ints...>;
 };
 
-template<size_t N>
-using make_index_sequence = typename make_index_sequence_impl<N>::type;
+template <size_t _hip_N>
+using make_index_sequence = typename make_index_sequence_impl<_hip_N>::type;
 
 template <size_t... Ints>
 constexpr index_sequence<Ints...> make_index_sequence_value(index_sequence<Ints...>) {

From dc34af61d77681282da4418a6efbb112337f90be Mon Sep 17 00:00:00 2001
From: "Assiouras, Ioannis" <Ioannis.Assiouras@amd.com>
Date: Tue, 12 Aug 2025 21:23:09 +0100
Subject: [PATCH 3/4] SWDEV-543340 - Remove shared memory objects after IPC
 event cleanup (#745)

This change ensures that shared memory objects (e.g., files in /dev/shm)
are unlinked once all related IPC events have been destroyed.
---
 hipamd/src/hip_code_object.cpp | 20 +++++++++-----------
 hipamd/src/hip_event.hpp       |  3 +++
 rocclr/os/os.hpp               |  3 +++
 rocclr/os/os_posix.cpp         |  1 +
 rocclr/os/os_win32.cpp         |  3 +++
 5 files changed, 19 insertions(+), 11 deletions(-)

diff --git a/hipamd/src/hip_code_object.cpp b/hipamd/src/hip_code_object.cpp
index 4336fdf0ea..846ca299a7 100644
--- a/hipamd/src/hip_code_object.cpp
+++ b/hipamd/src/hip_code_object.cpp
@@ -284,9 +284,7 @@ hipError_t StatCO::removeFatBinary(FatBinaryInfo** module) {
     for (auto& hostVar : hostVarsIter->second) {
       auto varIter = vars_.find(hostVar);
       if (varIter == vars_.end()) {
-        LogPrintfError(
-          "removeFatBinary: Unable to find module 0x%x hostVar 0x%x",
-          module, hostVar);
+        LogPrintfError("removeFatBinary: Unable to find module 0x%x hostVar 0x%x", module, hostVar);
       } else {
         delete varIter->second;
         vars_.erase(varIter);
@@ -325,8 +323,8 @@ hipError_t StatCO::removeFatBinary(FatBinaryInfo** module) {
     for (auto& hostFunc : hostFuncsIter->second) {
       auto funcIter = functions_.find(hostFunc);
       if (funcIter == functions_.end()) {
-        LogPrintfError("removeFatBinary: Unable to find module 0x%x hostFunc 0x%x",
-                       module, hostFunc);
+        LogPrintfError("removeFatBinary: Unable to find module 0x%x hostFunc 0x%x", module,
+                       hostFunc);
       } else {
         delete funcIter->second;
         functions_.erase(funcIter);
@@ -343,8 +341,8 @@ hipError_t StatCO::removeFatBinary(FatBinaryInfo** module) {
       delete moduleIter->second;
       modules_.erase(moduleIter);
     } else {
-      LogPrintfError("removeFatBinary: Unable to find module 0x%x via hostModule 0x%x",
-                    module, hostModule);
+      LogPrintfError("removeFatBinary: Unable to find module 0x%x via hostModule 0x%x", module,
+                     hostModule);
     }
     module_to_hostModule_.erase(hostModuleIter);
   }
@@ -383,7 +381,7 @@ hipError_t StatCO::getStatFunc(hipFunction_t* hfunc, const void* hostFunction, i
   }
 
   // Lazy load
-  FatBinaryInfo **module = it->second->moduleInfo();
+  FatBinaryInfo** module = it->second->moduleInfo();
   if (*(module) == nullptr) {
     amd::ScopedLock lock(sclock_);
     if (*(module) == nullptr) {
@@ -405,7 +403,7 @@ hipError_t StatCO::getStatFuncAttr(hipFuncAttributes* func_attr, const void* hos
   }
 
   // Lazy load
-  FatBinaryInfo **module = it->second->moduleInfo();
+  FatBinaryInfo** module = it->second->moduleInfo();
   if (*(module) == nullptr) {
     hipError_t err = digestFatBinary(module_to_hostModule_[module], *module);
     assert(err == hipSuccess);
@@ -437,7 +435,7 @@ hipError_t StatCO::getStatGlobalVar(const void* hostVar, int deviceId, hipDevice
   }
 
   // Lazy load
-  FatBinaryInfo **module = it->second->moduleInfo();
+  FatBinaryInfo** module = it->second->moduleInfo();
   if (*(module) == nullptr) {
     hipError_t err = digestFatBinary(module_to_hostModule_[module], *module);
     assert(err == hipSuccess);
@@ -464,7 +462,7 @@ hipError_t StatCO::initStatManagedVarDevicePtr(int deviceId) {
     for (auto& vecIter : managedVars_) {
       for (auto& var : vecIter.second) {
         // Lazy load
-        FatBinaryInfo **module = var->moduleInfo();
+        FatBinaryInfo** module = var->moduleInfo();
         if (*(module) == nullptr) {
           err = digestFatBinary(module_to_hostModule_[module], *module);
           assert(err == hipSuccess);
diff --git a/hipamd/src/hip_event.hpp b/hipamd/src/hip_event.hpp
index 456bf3d0a6..33d80dfbf2 100644
--- a/hipamd/src/hip_event.hpp
+++ b/hipamd/src/hip_event.hpp
@@ -200,6 +200,9 @@ class IPCEvent : public Event {
       if (!amd::Os::MemoryUnmapFile(ipc_evt_.ipc_shmem_, sizeof(hip::ihipIpcEventShmem_t))) {
         // print hipErrorInvalidHandle;
       }
+      if (owners == 0) {
+        amd::Os::shm_unlink(ipc_evt_.ipc_name_);
+      }
     }
   }
   IPCEvent() : Event(hipEventInterprocess) {}
diff --git a/rocclr/os/os.hpp b/rocclr/os/os.hpp
index e8b7c0d2ad..d830361229 100644
--- a/rocclr/os/os.hpp
+++ b/rocclr/os/os.hpp
@@ -285,6 +285,9 @@ class Os : AllStatic {
   //! Deletes file
   static int unlink(const std::string& path);
 
+  //! Removes the shared memory object name
+  static int shm_unlink(const std::string& path);
+
   // Library routines:
   //
   typedef bool (*SymbolCallback)(std::string, const void*, void*);
diff --git a/rocclr/os/os_posix.cpp b/rocclr/os/os_posix.cpp
index 6ee367842c..8837c575b5 100644
--- a/rocclr/os/os_posix.cpp
+++ b/rocclr/os/os_posix.cpp
@@ -650,6 +650,7 @@ std::string Os::getTempFileName() {
 }
 
 int Os::unlink(const std::string& path) { return ::unlink(path.c_str()); }
+int Os::shm_unlink(const std::string& path) { return ::shm_unlink(path.c_str()); }
 
 #if defined(ATI_ARCH_X86)
 void Os::cpuid(int regs[4], int info) {
diff --git a/rocclr/os/os_win32.cpp b/rocclr/os/os_win32.cpp
index 68a87f15f7..b933dfd452 100644
--- a/rocclr/os/os_win32.cpp
+++ b/rocclr/os/os_win32.cpp
@@ -505,6 +505,9 @@ std::string Os::getTempFileName() {
 
 int Os::unlink(const std::string& path) { return ::_unlink(path.c_str()); }
 
+// shm_unlink is a nop on windows
+int Os::shm_unlink(const std::string& path) { return 0; }
+
 void Os::cpuid(int regs[4], int info) { return __cpuid(regs, info); }
 
 uint64_t Os::xgetbv(uint32_t ecx) { return (uint64_t)_xgetbv(ecx); }

From 2305f8ae565389cfc348a747c467c004909d7a90 Mon Sep 17 00:00:00 2001
From: "Andryeyev, German" <German.Andryeyev@amd.com>
Date: Tue, 12 Aug 2025 19:04:36 -0400
Subject: [PATCH 4/4] SWDEV-465041 - Add support for user events with DD (#321)

* SWDEV-465041 - Add support for user events with DD

User events can be replaced with HSA signals. Add the interface
to allocate HSA signal for user events and update the status on
CL_COMPLETE.
Force pinned path with DD to avoid blocking calls. Pinned memory
can be released only when the command is complete.
Simplify device enqueue path to use generic kernel arg buffer and
signals

* Fix notifyCmdQueue() logic for OCL

* Avoid blocking calls in OCL with DD

* Add event  destruciton in a case of the failure.
---
 opencl/amdocl/cl_event.cpp         | 11 ++--
 rocclr/device/device.hpp           |  9 ++++
 rocclr/device/rocm/rocblit.cpp     | 68 ++++++++++++++----------
 rocclr/device/rocm/rocblit.hpp     |  4 +-
 rocclr/device/rocm/rocdevice.cpp   | 19 +++++++
 rocclr/device/rocm/rocdevice.hpp   |  2 +
 rocclr/device/rocm/rocsettings.cpp |  2 +
 rocclr/device/rocm/rocsettings.hpp |  3 +-
 rocclr/device/rocm/rocvirtual.cpp  | 85 ++++++++++--------------------
 rocclr/device/rocm/rocvirtual.hpp  |  5 +-
 rocclr/platform/command.cpp        | 17 ++++--
 rocclr/platform/command.hpp        | 45 ++++++++++++++--
 rocclr/platform/commandqueue.cpp   |  3 ++
 13 files changed, 168 insertions(+), 105 deletions(-)

diff --git a/opencl/amdocl/cl_event.cpp b/opencl/amdocl/cl_event.cpp
index d4f2b04e26..4dfc9c0b94 100644
--- a/opencl/amdocl/cl_event.cpp
+++ b/opencl/amdocl/cl_event.cpp
@@ -251,15 +251,16 @@ RUNTIME_ENTRY_RET(cl_event, clCreateUserEvent, (cl_context context, cl_int* errc
     return (cl_event)0;
   }
 
-  amd::Event* event = new amd::UserEvent(*as_amd(context));
-  if (event == NULL) {
+  auto event = new amd::UserEvent(*as_amd(context));
+  if (event == nullptr || !event->Create()) {
+    delete event;
     *not_null(errcode_ret) = CL_OUT_OF_HOST_MEMORY;
     return (cl_event)0;
   }
 
   event->retain();
   *not_null(errcode_ret) = CL_SUCCESS;
-  return as_cl(event);
+  return as_cl(reinterpret_cast<amd::Event*>(event));
 }
 RUNTIME_EXIT
 
@@ -288,8 +289,8 @@ RUNTIME_ENTRY(cl_int, clSetUserEventStatus, (cl_event event, cl_int execution_st
   if (execution_status > CL_COMPLETE) {
     return CL_INVALID_VALUE;
   }
-
-  if (!as_amd(event)->setStatus(execution_status)) {
+  auto user_event = reinterpret_cast<amd::UserEvent*>(as_amd(event));
+  if (!user_event->SetExecutionStatus(execution_status)) {
     return CL_INVALID_OPERATION;
   }
   return CL_SUCCESS;
diff --git a/rocclr/device/device.hpp b/rocclr/device/device.hpp
index e3b3f736df..4ecbd04e56 100644
--- a/rocclr/device/device.hpp
+++ b/rocclr/device/device.hpp
@@ -95,6 +95,7 @@ class StreamOperationCommand;
 class BatchMemoryOperationCommand;
 class VirtualMapCommand;
 class ExternalSemaphoreCmd;
+class UserEvent;
 class Isa;
 class Device;
 struct KernelParameterDescriptor;
@@ -1329,6 +1330,7 @@ class VirtualDevice : public amd::HeapObject {
     ShouldNotReachHere();
   }
   virtual void submitVirtualMap(amd::VirtualMapCommand& cmd) { ShouldNotReachHere(); }
+  virtual void submitUserEvent(amd::UserEvent& vcmd) { ShouldNotReachHere(); }
 
   virtual address allocKernelArguments(size_t size, size_t alignment) { return nullptr; }
   virtual void ReleaseAllHwQueues() {}
@@ -2037,6 +2039,13 @@ class Device : public RuntimeObject {
     return (info().svmCapabilities_ & CL_DEVICE_SVM_ATOMICS) != 0 ? true : false;
   }
 
+  /// @brief  Creates HW user event for OpenCL implementation
+  /// @return The pointer to a HW event structure, known to the HW backend
+  virtual bool CreateUserEvent(amd::UserEvent* event) const { return false; }
+
+  /// @brief  Sets HW user event to the complete status
+  virtual void SetUserEvent(amd::UserEvent* event) const {}
+
   //! Returns TRUE if the device is available for computations
   bool isOnline() const { return online_; }
 
diff --git a/rocclr/device/rocm/rocblit.cpp b/rocclr/device/rocm/rocblit.cpp
index 40ff3454ae..3ab0b06549 100644
--- a/rocclr/device/rocm/rocblit.cpp
+++ b/rocclr/device/rocm/rocblit.cpp
@@ -54,8 +54,8 @@ bool DmaBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
                                 const amd::Coord3D& origin, const amd::Coord3D& size,
                                 bool entire, amd::CopyMetadata copyMetadata) const {
   // Use host copy if memory has direct access
-  if (setup_.disableReadBuffer_ ||
-      (srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached())) {
+  if (dev().settings().blocking_blit_ && (setup_.disableReadBuffer_ ||
+      (srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached()))) {
     // Stall GPU before CPU access
     gpu().releaseGpuMemoryFence();
     return HostBlitManager::readBuffer(srcMemory, dstHost, origin, size, entire, copyMetadata);
@@ -138,8 +138,9 @@ bool DmaBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemory,
                                  const amd::Coord3D& origin, const amd::Coord3D& size,
                                  bool entire, amd::CopyMetadata copyMetadata) const {
   // Use host copy if memory has direct access
-  if (setup_.disableWriteBuffer_ || dstMemory.isHostMemDirectAccess() ||
-      gpuMem(dstMemory).IsPersistentDirectMap()) {
+  if (dev().settings().blocking_blit_ &&
+      (setup_.disableWriteBuffer_ || dstMemory.isHostMemDirectAccess() ||
+       gpuMem(dstMemory).IsPersistentDirectMap())) {
     // Stall GPU before CPU access
     gpu().releaseGpuMemoryFence();
     return HostBlitManager::writeBuffer(srcHost, dstMemory, origin, size, entire, copyMetadata);
@@ -685,6 +686,7 @@ void DmaBlitManager::getBuffer(const_address hostMem, size_t size,
   buffState.buffer_ = gpu().Staging().Acquire(std::min(xferSize, StagingXferSize));
 }
 
+// ================================================================================================
 void DmaBlitManager::releaseBuffer(BufferState &buffer) const {
   if (buffer.pinnedMem_) {
     gpu().addPinnedMem(buffer.pinnedMem_);
@@ -696,7 +698,7 @@ bool DmaBlitManager::hsaCopyStagedOrPinned(const_address hostSrc, address hostDs
                 size_t size, bool hostToDev, amd::CopyMetadata& copyMetadata,
                 bool enablePin) const {
   // Do not skip wait here for D2H. Resolving dependent signals for SDMA engine is slow
-  gpu().releaseGpuMemoryFence(hostToDev);
+  gpu().releaseGpuMemoryFence(hostToDev || !dev().settings().blocking_blit_);
   // If Pinning is enabled, Pin host Memory for copy size > MinSizeForPinnedTransfer
   // For 16KB < size <= MinSizeForPinnedTransfer Use staging buffer without pinning
   bool status = true;
@@ -740,9 +742,9 @@ bool DmaBlitManager::hsaCopyStagedOrPinned(const_address hostSrc, address hostDs
               copyMetadata.isAsync_);
       const_address src = static_cast<const_address>(hostSrc) + copyOffset;
       status = rocrCopyBuffer(stagingBuffer, dstAgent, src , srcAgent, copysize, copyMetadata);
-      if (status ) {
-        // Wait for current signal of previous rocr copy if its not pinned mem
+      if (status) {
         if (outBuffer.pinnedMem_ == nullptr) {
+          // Wait for current signal of previous rocr copy if its not pinned mem
           gpu().Barriers().WaitCurrent();
           ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "memcpy host dst=%p, stg buf=%p, size=%zu",
                   hostDst + copyOffset, stagingBuffer, copysize);
@@ -752,6 +754,7 @@ bool DmaBlitManager::hsaCopyStagedOrPinned(const_address hostSrc, address hostDs
         break;
       }
     }
+
     // Release Pinned Memory back to pool if any
     releaseBuffer(outBuffer);
     // Update Offset and Transfer Size
@@ -760,12 +763,18 @@ bool DmaBlitManager::hsaCopyStagedOrPinned(const_address hostSrc, address hostDs
     firstTx = false;
   }
 
+  // @note: HIP requires a blocking wait on D2H with the pageable system memory
+  if (amd::IS_HIP && !hostToDev) {
+    gpu().Barriers().WaitCurrent();
+  }
+
   if(!status) {
     return false;
   }
 
   return true;
 }
+
 // ================================================================================================
 KernelBlitManager::KernelBlitManager(VirtualGPU& gpu, Setup setup)
     : DmaBlitManager(gpu, setup),
@@ -1718,8 +1727,9 @@ bool KernelBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
   bool result = false;
 
   // Use host copy if memory has direct access
-  if (setup_.disableReadBuffer_ || (srcMemory.isHostMemDirectAccess() &&
-      !srcMemory.isCpuUncached())) {
+  if (dev().settings().blocking_blit_ &&
+      (setup_.disableReadBuffer_ || (srcMemory.isHostMemDirectAccess() &&
+      !srcMemory.isCpuUncached()))) {
     // Stall GPU before CPU access
     gpu().releaseGpuMemoryFence();
     result = HostBlitManager::readBuffer(srcMemory, dstHost, origin, size, entire, copyMetadata);
@@ -1854,8 +1864,9 @@ bool KernelBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemo
   bool result = false;
 
   // Use host copy if memory has direct access
-  if (setup_.disableWriteBuffer_ || dstMemory.isHostMemDirectAccess() ||
-      gpuMem(dstMemory).IsPersistentDirectMap()) {
+  if (dev().settings().blocking_blit_ &&
+      (setup_.disableWriteBuffer_ || dstMemory.isHostMemDirectAccess() ||
+       gpuMem(dstMemory).IsPersistentDirectMap())) {
     // Stall GPU before CPU access
     gpu().releaseGpuMemoryFence();
     result = HostBlitManager::writeBuffer(srcHost, dstMemory, origin, size, entire, copyMetadata);
@@ -2718,10 +2729,9 @@ void KernelBlitManager::releaseArguments(address args) const {
 }
 
 // ================================================================================================
-bool KernelBlitManager::runScheduler(uint64_t vqVM, amd::Memory* schedulerParam,
+bool KernelBlitManager::runScheduler(uint64_t vqVM,
                                      hsa_queue_t* schedulerQueue,
-                                     hsa_signal_t& schedulerSignal,
-                                     uint threads) {
+                                     uint threads, uint64_t aql_wrap) {
   size_t globalWorkOffset[1] = {0};
   size_t globalWorkSize[1] = {threads};
   size_t localWorkSize[1] = {1};
@@ -2731,21 +2741,16 @@ bool KernelBlitManager::runScheduler(uint64_t vqVM, amd::Memory* schedulerParam,
   device::Kernel* devKernel = const_cast<device::Kernel*>(kernels_[Scheduler]->getDeviceKernel(dev()));
   Kernel& gpuKernel = static_cast<Kernel&>(*devKernel);
 
-  SchedulerParam* sp = reinterpret_cast<SchedulerParam*>(schedulerParam->getHostMem());
+  auto* sp = reinterpret_cast<SchedulerParam*>(
+    gpu().allocKernArg(sizeof(SchedulerParam), kCBAlignment));
   memset(sp, 0, sizeof(SchedulerParam));
 
-  Memory* schedulerMem = dev().getRocMemory(schedulerParam);
-  sp->kernarg_address = reinterpret_cast<uint64_t>(schedulerMem->getDeviceMemory());
+  sp->kernarg_address = reinterpret_cast<uint64_t>(sp);
   sp->thread_counter = 0;
   sp->child_queue = reinterpret_cast<uint64_t>(schedulerQueue);
-  sp->complete_signal = schedulerSignal;
-
-  hsa_signal_store_relaxed(schedulerSignal, kInitSignalValueOne);
-
-
+  sp->complete_signal = gpu().Barriers().ActiveSignal(kInitSignalValueOne, nullptr);
   sp->vqueue_header = vqVM;
-
-  sp->parentAQL = sp->kernarg_address + sizeof(SchedulerParam);
+  sp->parentAQL = reinterpret_cast<uint64_t>(aql_wrap);
 
   if (dev().info().maxEngineClockFrequency_ > 0) {
     sp->eng_clk = (1000 * 1024) / dev().info().maxEngineClockFrequency_;
@@ -2754,8 +2759,8 @@ bool KernelBlitManager::runScheduler(uint64_t vqVM, amd::Memory* schedulerParam,
   // Use a device side global atomics to workaround the reliance of PCIe 3 atomics
   sp->write_index = hsa_queue_load_write_index_relaxed(schedulerQueue);
 
-  cl_mem mem = as_cl<amd::Memory>(schedulerParam);
-  setArgument(kernels_[Scheduler], 0, sizeof(cl_mem), &mem);
+  constexpr bool kDirectVa = true;
+  setArgument(kernels_[Scheduler], 0, sizeof(cl_mem), sp, 0, nullptr, kDirectVa);
 
   address parameters = captureArguments(kernels_[Scheduler]);
 
@@ -2764,12 +2769,17 @@ bool KernelBlitManager::runScheduler(uint64_t vqVM, amd::Memory* schedulerParam,
     return false;
   }
   releaseArguments(parameters);
-
-  if (!WaitForSignal(schedulerSignal)) {
+  // Wait for the scheduler to finish all operations
+  gpu().WaitCompleteSignal(sp->complete_signal);
+  // @note: A wait shouldn't be really necessary, but the queue write_index may not get a proper
+  // value without the wait for all previous commands (see the PCIE3 atomics workaround above).
+  // The scheduler can enqueue extra commands, but the real queue write index didn't have any
+  // progress. That leads to hangs and requires blocking. Then the wait causes problems in DD mode
+  // with device enqueue and user events, because device enqueue is blocking below
+  if (!WaitForSignal(sp->complete_signal)) {
     LogWarning("Failed schedulerSignal wait");
     return false;
   }
-
   return true;
 }
 
diff --git a/rocclr/device/rocm/rocblit.hpp b/rocclr/device/rocm/rocblit.hpp
index 8a7f03f454..9c4b16d580 100644
--- a/rocclr/device/rocm/rocblit.hpp
+++ b/rocclr/device/rocm/rocblit.hpp
@@ -492,10 +492,8 @@ class KernelBlitManager : public DmaBlitManager {
                          ) const;
 
   bool runScheduler(uint64_t vqVM,
-                    amd::Memory* schedulerParam,
                     hsa_queue_t* schedulerQueue,
-                    hsa_signal_t& schedulerSignal,
-                    uint threads);
+                    uint threads, uint64_t aql_wrap);
 
   //! Runs a blit kernel for GWS init
   bool RunGwsInit(uint32_t value             //!< Initial value for GWS resource
diff --git a/rocclr/device/rocm/rocdevice.cpp b/rocclr/device/rocm/rocdevice.cpp
index 44f3822d33..3274cb22f3 100644
--- a/rocclr/device/rocm/rocdevice.cpp
+++ b/rocclr/device/rocm/rocdevice.cpp
@@ -3464,6 +3464,25 @@ void Device::ReleaseGlobalSignal(void* signal) const {
   }
 }
 
+// ================================================================================================
+bool Device::CreateUserEvent(amd::UserEvent* event) const {
+  std::unique_ptr<ProfilingSignal> signal(new ProfilingSignal());
+  if ((signal == nullptr) ||
+      (HSA_STATUS_SUCCESS != hsa_signal_create(0, 0, nullptr, &signal->signal_))) {
+    return false;
+  }
+  hsa_signal_silent_store_relaxed(signal->signal_, kInitSignalValueOne);
+  event->SetHwEvent(signal.release());
+  return true;
+}
+
+// ================================================================================================
+void Device::SetUserEvent(amd::UserEvent* event) const {
+  auto signal = reinterpret_cast<ProfilingSignal*>(event->HwEvent());
+  assert(signal != nullptr && "Can't have user event without hw event!");
+  hsa_signal_silent_store_relaxed(signal->signal_, 0);
+}
+
 // ================================================================================================
 bool Device::IsValidAllocation(const void* dev_ptr, size_t size, hsa_amd_pointer_info_t* ptr_info) {
   // Query ptr type to see if it's a HMM allocation
diff --git a/rocclr/device/rocm/rocdevice.hpp b/rocclr/device/rocm/rocdevice.hpp
index 32dc66726c..0c7ce24805 100644
--- a/rocclr/device/rocm/rocdevice.hpp
+++ b/rocclr/device/rocm/rocdevice.hpp
@@ -450,6 +450,8 @@ class Device : public NullDevice {
                               uint32_t hip_event_flags = 0) const;
   virtual void getHwEventTime(const amd::Event& event, uint64_t* start, uint64_t* end) const;
   virtual void ReleaseGlobalSignal(void* signal) const;
+  virtual bool CreateUserEvent(amd::UserEvent* event) const;
+  virtual void SetUserEvent(amd::UserEvent* event) const;
 
   //! Allocate host memory in terms of numa policy set by user
   void* hostNumaAlloc(size_t size, size_t alignment, MemorySegment mem_seg) const;
diff --git a/rocclr/device/rocm/rocsettings.cpp b/rocclr/device/rocm/rocsettings.cpp
index d53231fdd9..9d89124cdb 100644
--- a/rocclr/device/rocm/rocsettings.cpp
+++ b/rocclr/device/rocm/rocsettings.cpp
@@ -97,6 +97,8 @@ Settings::Settings() {
   limit_blit_wg_ = 16;
 
   dynamic_queues_ = amd::IS_HIP ? DEBUG_HIP_DYNAMIC_QUEUES : false;
+  // note: OCL user events don't allow CPU blocking calls in DD mode
+  blocking_blit_ = amd::IS_HIP || !AMD_DIRECT_DISPATCH;
 }
 
 // ================================================================================================
diff --git a/rocclr/device/rocm/rocsettings.hpp b/rocclr/device/rocm/rocsettings.hpp
index dd323b86ab..b6f741db8e 100644
--- a/rocclr/device/rocm/rocsettings.hpp
+++ b/rocclr/device/rocm/rocsettings.hpp
@@ -51,7 +51,8 @@ class Settings : public device::Settings {
       uint fgs_kernel_arg_ : 1;         //!< Use fine grain kernel arg segment
       uint barrier_value_packet_ : 1;   //!< Barrier value packet functionality
       uint dynamic_queues_ : 1;         //!< Dynamic queues management
-      uint reserved_ : 22;
+      uint blocking_blit_ : 1;          //!< Blit ops can be blocking on CPU
+      uint reserved_ : 21;
     };
     uint value_;
   };
diff --git a/rocclr/device/rocm/rocvirtual.cpp b/rocclr/device/rocm/rocvirtual.cpp
index d5247ad82c..91b8e5146e 100644
--- a/rocclr/device/rocm/rocvirtual.cpp
+++ b/rocclr/device/rocm/rocvirtual.cpp
@@ -1209,6 +1209,12 @@ bool VirtualGPU::dispatchCounterAqlPacket(hsa_ext_amd_aql_pm4_packet_t* packet,
   return false;
 }
 
+// ================================================================================================
+void VirtualGPU::WaitCompleteSignal(hsa_signal_t signal) {
+  barrier_packet_.dep_signal[0] = signal;
+  dispatchBarrierPacket(kBarrierPacketHeader, false);
+}
+
 // ================================================================================================
 void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal,
                                        hsa_signal_t signal) {
@@ -1413,9 +1419,7 @@ VirtualGPU::VirtualGPU(Device& device, bool profiling, bool cooperative,
       deviceQueueSize_(0),
       maskGroups_(0),
       schedulerThreads_(0),
-      schedulerParam_(nullptr),
       schedulerQueue_(nullptr),
-      schedulerSignal_({0}),
       barriers_(*this),
       managed_buffer_(*this, ManagedBuffer::kPoolNumSignals * device.settings().stagedXferSize_),
       managed_kernarg_buffer_(*this, device.settings().kernargPoolSize_),
@@ -1490,18 +1494,10 @@ VirtualGPU::~VirtualGPU() {
 
   delete printfdbg_;
 
-  if (0 != schedulerSignal_.handle) {
-    hsa_signal_destroy(schedulerSignal_);
-  }
-
   if (nullptr != schedulerQueue_) {
     hsa_queue_destroy(schedulerQueue_);
   }
 
-  if (nullptr != schedulerParam_) {
-    schedulerParam_->release();
-  }
-
   if (nullptr != virtualQueue_) {
     virtualQueue_->release();
   }
@@ -2604,7 +2600,7 @@ void VirtualGPU::submitMapMemory(amd::MapMemoryCommand& cmd) {
   // If we have host memory, use it
   if ((devMemory->owner()->getHostMem() != nullptr) &&
       (devMemory->owner()->getSvmPtr() == nullptr)) {
-    if (!devMemory->isHostMemDirectAccess()) {
+    if (!AMD_DIRECT_DISPATCH && !devMemory->isHostMemDirectAccess()) {
       // Make sure GPU finished operation before synchronization with the backing store
       releaseGpuMemoryFence();
     }
@@ -3081,18 +3077,11 @@ void VirtualGPU::submitMigrateMemObjects(amd::MigrateMemObjectsCommand& vcmd) {
 // ================================================================================================
 bool VirtualGPU::createSchedulerParam()
 {
-  if (nullptr != schedulerParam_) {
+  if (nullptr != schedulerQueue_) {
     return true;
   }
 
-  while(true) {
-    schedulerParam_ = new (dev().context()) amd::Buffer(dev().context(),
-      CL_MEM_ALLOC_HOST_PTR, sizeof(SchedulerParam) + sizeof(AmdAqlWrap));
-
-    if ((nullptr != schedulerParam_) && !schedulerParam_->create(nullptr)) {
-      break;
-    }
-
+  while (true) {
     // The queue is written by multiple threads of the scheduler kernel
     if (HSA_STATUS_SUCCESS != hsa_queue_create(gpu_device(), 2048, HSA_QUEUE_TYPE_MULTI,
         callbackQueue, &roc_device_, std::numeric_limits<uint>::max(),
@@ -3100,39 +3089,14 @@ bool VirtualGPU::createSchedulerParam()
       break;
     }
 
-    hsa_signal_t  signal0 = {0};
-
-    if (HSA_STATUS_SUCCESS != hsa_signal_create(0, 0, nullptr, &signal0)) {
-      break;
-    }
-
-    schedulerSignal_ = signal0;
-
-    Memory* schedulerMem = dev().getRocMemory(schedulerParam_);
-
-    if (nullptr == schedulerMem) {
-      break;
-    }
-
-    schedulerParam_->setVirtualDevice(this);
     return true;
   }
 
-  if (0 != schedulerSignal_.handle) {
-    hsa_signal_destroy(schedulerSignal_);
-    schedulerSignal_.handle = 0;
-  }
-
   if (nullptr != schedulerQueue_) {
     hsa_queue_destroy(schedulerQueue_);
     schedulerQueue_ = nullptr;
   }
 
-  if (nullptr != schedulerParam_) {
-    schedulerParam_->release();
-    schedulerParam_ = nullptr;
-  }
-
   return false;
 }
 
@@ -3355,7 +3319,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
     global[i] = static_cast<uint32_t>(sizes.global()[i]);
     local[i] = static_cast<uint16_t>(local_size[i]);
   }
-
+  uint64_t spVA = 0;
   // Check if runtime has to setup hidden arguments
   for (uint32_t i = signature.numParameters(); i < signature.numParametersAll(); ++i) {
     const auto& it = signature.at(i);
@@ -3415,14 +3379,12 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
         break;
       }
       case amd::KernelParameterDescriptor::HiddenCompletionAction: {
-        uint64_t spVA = 0;
-        if (nullptr != schedulerParam_ && devKernel->dynamicParallelism()) {
-          Memory* schedulerMem = dev().getRocMemory(schedulerParam_);
-          AmdAqlWrap* wrap = reinterpret_cast<AmdAqlWrap*>(
-                             reinterpret_cast<uint64_t>(schedulerParam_->getHostMem()) + sizeof(SchedulerParam));
+        if (devKernel->dynamicParallelism()) {
+          auto params = allocKernArg(sizeof(AmdAqlWrap), 64);
+          AmdAqlWrap* wrap = reinterpret_cast<AmdAqlWrap*>(params);
           memset(wrap, 0, sizeof(AmdAqlWrap));
           wrap->state = AQL_WRAP_DONE;
-          spVA = reinterpret_cast<uint64_t>(schedulerMem->getDeviceMemory()) + sizeof(SchedulerParam);
+          spVA = reinterpret_cast<uint64_t>(wrap);
         }
         WriteAqlArgAt(hidden_arguments, spVA, it.size_, it.offset_);
         break;
@@ -3651,8 +3613,8 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
     dispatchBarrierPacket(kBarrierPacketHeader, true);
     if (virtualQueue_ != nullptr) {
       static_cast<KernelBlitManager&>(blitMgr()).runScheduler(
-          getVQVirtualAddress(), schedulerParam_, schedulerQueue_,
-          schedulerSignal_, schedulerThreads_);
+          getVQVirtualAddress(), schedulerQueue_,
+          schedulerThreads_, spVA);
     }
   }
 
@@ -3833,10 +3795,10 @@ void VirtualGPU::flush(amd::Command* list, bool wait) {
 
 // ================================================================================================
 void VirtualGPU::addPinnedMem(amd::Memory* mem) {
-  //! @note: ROCr backend doesn't have per resource busy tracking, hence runtime has to wait
-  //!        unconditionally, before it can release pinned memory
-  releaseGpuMemoryFence();
   if (!AMD_DIRECT_DISPATCH) {
+    //! @note: ROCr backend doesn't have per resource busy tracking, hence runtime has to wait
+    //!        unconditionally, before it can release pinned memory
+    releaseGpuMemoryFence();
     if (nullptr == findPinnedMem(mem->getHostMem(), mem->getSize())) {
       if (pinnedMems_.size() > 7) {
         pinnedMems_.front()->release();
@@ -3847,7 +3809,14 @@ void VirtualGPU::addPinnedMem(amd::Memory* mem) {
       pinnedMems_.push_back(mem);
     }
   } else {
-    mem->release();
+    if (command_ != nullptr) {
+      command_->AddPinnedMemory(mem);
+    } else {
+      //! @note: ROCr backend doesn't have per resource busy tracking, hence runtime has to wait
+      //!        unconditionally, before it can release pinned memory
+      releaseGpuMemoryFence();
+      mem->release();
+    }
   }
 }
 
diff --git a/rocclr/device/rocm/rocvirtual.hpp b/rocclr/device/rocm/rocvirtual.hpp
index 1d9c8b756b..1c54f653c3 100644
--- a/rocclr/device/rocm/rocvirtual.hpp
+++ b/rocclr/device/rocm/rocvirtual.hpp
@@ -450,8 +450,9 @@ class VirtualGPU : public device::VirtualDevice {
   void* allocKernArg(size_t size, size_t alignment);
   bool isFenceDirty() const { return fence_dirty_; }
   void setFenceDirty(bool state) { fence_dirty_ = state; }
-  void HiddenHeapInit();
+  void WaitCompleteSignal(hsa_signal_t signal);
 
+  void HiddenHeapInit();
   void setLastUsedSdmaEngine(uint32_t mask) { lastUsedSdmaEngineMask_ = mask; }
   uint32_t getLastUsedSdmaEngine() const { return lastUsedSdmaEngineMask_.load(); }
   uint64_t getQueueID();
@@ -592,9 +593,7 @@ class VirtualGPU : public device::VirtualDevice {
                                   //!< one thread
   uint schedulerThreads_;         //!< The number of scheduler threads
 
-  amd::Memory* schedulerParam_;
   hsa_queue_t* schedulerQueue_;
-  hsa_signal_t schedulerSignal_;
 
   HwQueueTracker  barriers_;      //!< Tracks active barriers in ROCr
 
diff --git a/rocclr/platform/command.cpp b/rocclr/platform/command.cpp
index 6a20fe9e26..d9361703e7 100644
--- a/rocclr/platform/command.cpp
+++ b/rocclr/platform/command.cpp
@@ -269,7 +269,9 @@ bool Event::notifyCmdQueue(bool cpu_wait) {
     ScopedLock l(notify_lock_);
     if ((status() > CL_COMPLETE) && (nullptr != queue) &&
         // If HW event was assigned, then notification can be ignored, since a barrier was issued
-        (HwEvent() == nullptr) &&
+        // @note: Force the marker always in OCL for now, since OCL events require precise
+        // sequence of the status update
+        ((HwEvent() == nullptr) || !amd::IS_HIP) &&
         !notified_.test_and_set()) {
       // Make sure the queue is draining the enqueued commands.
       amd::Command* command = new amd::Marker(*queue, false, nullWaitList, this, cpu_wait);
@@ -364,8 +366,17 @@ void Command::enqueue() {
 
     // Notify all commands about the waiter. Barrier will be sent in order to obtain
     // HSA signal for a wait on the current queue
-    for (const auto& event : eventWaitList()) {
-      event->notifyCmdQueue(!kCpuWait);
+    for (const auto &event: eventWaitList()) {
+      if (!amd::IS_HIP && event->command().type() == CL_COMMAND_USER) {
+        if (event->status() >= CL_COMPLETE) {
+          reinterpret_cast<amd::UserEvent*>(event)->AddDependent(this);
+        } else {
+          setStatus(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST);
+          return;
+        }
+      } else {
+        event->notifyCmdQueue(!kCpuWait);
+      }
     }
 
     // The batch update must be lock protected to avoid a race condition
diff --git a/rocclr/platform/command.hpp b/rocclr/platform/command.hpp
index 3dbc814c54..977014f69d 100644
--- a/rocclr/platform/command.hpp
+++ b/rocclr/platform/command.hpp
@@ -397,6 +397,9 @@ class Command : public Event {
   //! Release the resources associated with this event.
   virtual void releaseResources();
 
+  //! Empty function for adding pinned memory
+  virtual void AddPinnedMemory(Memory* pinned) {}
+
   //! Set the next GPU command
   void setNext(Command* next) { next_ = next; }
 
@@ -424,16 +427,46 @@ class Command : public Event {
 };
 
 class UserEvent : public Command {
-  const Context& context_;
+  const Context&        context_;     //!< OCL context associated with the event
+  std::vector<Command*> dependents_;  //!< Commands, which depends on this user event
 
  public:
   UserEvent(Context& context) : Command(CL_COMMAND_USER), context_(context) {
     setStatus(CL_SUBMITTED);
   }
 
+  //! Creates a user event in the backend layer
+  bool Create() {
+    if (AMD_DIRECT_DISPATCH) {
+      return context_.devices()[0]->CreateUserEvent(this);
+    } else {
+      return true;
+    }
+  }
+
+  //! Sets the execution status of the user event
+  bool SetExecutionStatus(cl_int status) {
+    if (AMD_DIRECT_DISPATCH) {
+      // If it's invalid status, then mark dependent commands as invalid
+      if (status < CL_COMPLETE) {
+        for (auto it : dependents_) {
+          it->setStatus(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST);
+        }
+      }
+      dependents_.clear();
+      context_.devices()[0]->SetUserEvent(this);
+    }
+    return setStatus(status);
+  }
+
+  //! Adds dependent commands for the user event
+  void AddDependent(Command* command) {
+    dependents_.push_back(command);
+  }
+
   virtual void submit(device::VirtualDevice& device) { ShouldNotCallThis(); }
 
-  virtual const Context& context() const { return context_; }
+  const Context& context() const { return context_; }
 };
 
 class ClGlEvent : public Command {
@@ -450,7 +483,7 @@ class ClGlEvent : public Command {
 
   bool awaitCompletion() { return waitForFence(); }
 
-  virtual const Context& context() const { return context_; }
+  const Context& context() const { return context_; }
 };
 
 inline Command& Event::command() { return *static_cast<Command*>(this); }
@@ -465,6 +498,7 @@ class NDRangeContainer;
 class OneMemoryArgCommand : public Command {
  protected:
   Memory* memory_;
+  std::vector<Memory*>  pinned_memory_;   //!< Pinned memory object
 
  public:
   OneMemoryArgCommand(HostQueue& queue, cl_command_type type, const EventWaitList& eventWaitList,
@@ -477,8 +511,13 @@ class OneMemoryArgCommand : public Command {
     memory_->release();
     DEBUG_ONLY(memory_ = NULL);
     Command::releaseResources();
+    for (auto it : pinned_memory_) {
+      it->release();
+    }
   }
 
+  //! Adds pinned memory, used in this command for later release
+  virtual void AddPinnedMemory(Memory* pinned) override { pinned_memory_.push_back(pinned); }
   bool validateMemory();
   bool validatePeerMemory();
 };
diff --git a/rocclr/platform/commandqueue.cpp b/rocclr/platform/commandqueue.cpp
index 665e5313f8..6e8d5a96ad 100644
--- a/rocclr/platform/commandqueue.cpp
+++ b/rocclr/platform/commandqueue.cpp
@@ -173,6 +173,9 @@ void HostQueue::finish(bool cpu_wait) {
         (vdev()->QueuedAsyncHandlers().load() > DEBUG_HIP_BLOCK_SYNC)) {
       cpu_wait = true;
     }
+  } else {
+    // Force CPU wait for OpenCL, since the tests may check OCL command status after finish
+    cpu_wait = true;
   }
 
   size_t batchSize = GetSubmissionBatchSize();