diff --git a/rocclr/runtime/platform/command.cpp b/rocclr/runtime/platform/command.cpp
index 83bbb16fd1..b540d6b8fc 100644
--- a/rocclr/runtime/platform/command.cpp
+++ b/rocclr/runtime/platform/command.cpp
@@ -187,14 +187,16 @@ bool Event::notifyCmdQueue() {
 
 const Event::EventWaitList Event::nullWaitList(0);
 
-Command::Command(HostQueue& queue, cl_command_type type, const EventWaitList& eventWaitList)
+Command::Command(HostQueue& queue, cl_command_type type,
+                 const EventWaitList& eventWaitList, uint32_t commandWaitBits)
     : Event(queue),
       queue_(&queue),
       next_(NULL),
       type_(type),
       exception_(0),
       data_(NULL),
-      eventWaitList_(eventWaitList) {
+      eventWaitList_(eventWaitList),
+      commandWaitBits_(commandWaitBits) {
   // Retain the commands from the event wait list.
   std::for_each(eventWaitList.begin(), eventWaitList.end(), std::mem_fun(&Command::retain));
 }
@@ -218,7 +220,8 @@ void Command::enqueue() {
   }
   queue_->append(*this);
   queue_->flush();
-  if (queue_->device().settings().waitCommand_ && (type_ != 0)) {
+  if ((queue_->device().settings().waitCommand_ && (type_ != 0)) ||
+      ((commandWaitBits_ & 0x2) != 0)) {
     awaitCompletion();
   }
 }
@@ -228,7 +231,7 @@ const Context& Command::context() const { return queue_->context(); }
 NDRangeKernelCommand::NDRangeKernelCommand(HostQueue& queue, const EventWaitList& eventWaitList,
                                            Kernel& kernel, const NDRangeContainer& sizes,
                                            uint32_t sharedMemBytes, uint32_t extraParam)
-    : Command(queue, CL_COMMAND_NDRANGE_KERNEL, eventWaitList)
+    : Command(queue, CL_COMMAND_NDRANGE_KERNEL, eventWaitList, AMD_SERIALIZE_KERNEL)
     , kernel_(kernel)
     , sizes_(sizes)
     , sharedMemBytes_(sharedMemBytes)
diff --git a/rocclr/runtime/platform/command.hpp b/rocclr/runtime/platform/command.hpp
index cf8b2ed507..2a52954d5f 100644
--- a/rocclr/runtime/platform/command.hpp
+++ b/rocclr/runtime/platform/command.hpp
@@ -91,8 +91,8 @@ class Event : public RuntimeObject {
     uint64_t submitted_;
     uint64_t start_;
     uint64_t end_;
-    bool enabled_;      //!< Profiling enabled for the wave limiter
-    uint32_t waves_;    //!< The number of waves used in a dispatch
+    bool enabled_;    //!< Profiling enabled for the wave limiter
+    uint32_t waves_;  //!< The number of waves used in a dispatch
     ProfilingCallback* callback_;
     void clear() {
       queued_ = 0ULL;
@@ -202,9 +202,13 @@ class Command : public Event {
   //! The Events that need to complete before this command is submitted.
   EventWaitList eventWaitList_;
 
+  //! Serialization bits: 0x1 - finish the queue before this command is appended,
+  //! 0x2 - await completion of this command after it is enqueued, 0x3 - both.
+  uint32_t commandWaitBits_;
+
   //! Construct a new command of the given OpenCL type.
-  Command(HostQueue& queue, cl_command_type type,
-          const EventWaitList& eventWaitList = nullWaitList);
+  Command(HostQueue& queue, cl_command_type type, const EventWaitList& eventWaitList = nullWaitList,
+          uint32_t commandWaitBits = 0);
 
   //! Construct a new command of the given OpenCL type.
   Command(cl_command_type type)
@@ -214,7 +218,8 @@ class Command : public Event {
         type_(type),
         exception_(0),
         data_(NULL),
-        eventWaitList_(nullWaitList) {}
+        eventWaitList_(nullWaitList),
+        commandWaitBits_(0) {}
 
   bool terminate() {
     if (Agent::shouldPostEventEvents() && type() != 0) {
@@ -271,6 +276,9 @@ class Command : public Event {
 
   //! Return the context for this event.
   virtual const Context& context() const;
+
+  //! Return the command wait bits (0x1 - wait before enqueue, 0x2 - wait after enqueue)
+  uint32_t getWaitBits() const { return commandWaitBits_; }
 };
 
 class UserEvent : public Command {
@@ -319,7 +327,7 @@ class OneMemoryArgCommand : public Command {
  public:
   OneMemoryArgCommand(HostQueue& queue, cl_command_type type, const EventWaitList& eventWaitList,
                       Memory& memory)
-      : Command(queue, type, eventWaitList), memory_(&memory) {
+      : Command(queue, type, eventWaitList, AMD_SERIALIZE_COPY), memory_(&memory) {
     memory_->retain();
   }
 
@@ -342,7 +350,9 @@ class TwoMemoryArgsCommand : public Command {
  public:
   TwoMemoryArgsCommand(HostQueue& queue, cl_command_type type, const EventWaitList& eventWaitList,
                        Memory& memory1, Memory& memory2)
-      : Command(queue, type, eventWaitList), memory1_(&memory1), memory2_(&memory2) {
+      : Command(queue, type, eventWaitList, AMD_SERIALIZE_COPY),
+        memory1_(&memory1),
+        memory2_(&memory2) {
     memory1_->retain();
     memory2_->retain();
   }
@@ -788,7 +798,8 @@ class NDRangeKernelCommand : public Command {
 
   //! Return the cooperative multi device groups mode
   bool cooperativeMultiDeviceGroups() const {
-    return (extraParam_ & CooperativeMultiDeviceGroups) ? true : false; }
+    return (extraParam_ & CooperativeMultiDeviceGroups) ? true : false;
+  }
 
   //! Set the local work size.
   void setLocalWorkSize(const NDRange& local) { sizes_.local() = local; }
@@ -995,7 +1006,7 @@ class ThreadTraceMemObjectsCommand : public Command {
  private:
   std::vector<amd::Memory*> memObjects_;  //!< The list of memory objects,bound to the thread trace
   size_t sizeMemObjects_;     //!< The size of each memory object from memObjects_ list (all memory
-                              //!objects have the smae size)
+                              //! objects have the same size)
   ThreadTrace& threadTrace_;  //!< The Thread Trace object
 };
 
@@ -1196,7 +1207,7 @@ class SvmMapMemoryCommand : public Command {
   Memory* svmMem_;  //!< the pointer to the amd::Memory object corresponding the svm pointer mapped
   Coord3D size_;    //!< the map size
   Coord3D origin_;  //!< the origin of the mapped svm pointer shift from the beginning of svm space
-                    //!allocated
+                    //! allocated
   cl_map_flags flags_;  //!< map flags
   void* svmPtr_;
 
@@ -1315,12 +1326,11 @@ class TransferBufferFileCommand : public OneMemoryArgCommand {
 
 class CopyMemoryP2PCommand : public CopyMemoryCommand {
  public:
-  CopyMemoryP2PCommand(HostQueue& queue, cl_command_type cmdType, const EventWaitList& eventWaitList,
-                    Memory& srcMemory, Memory& dstMemory, Coord3D srcOrigin, Coord3D dstOrigin,
-                    Coord3D size)
-      : CopyMemoryCommand(queue, cmdType, eventWaitList, srcMemory, dstMemory, srcOrigin, dstOrigin, size)
-  {
-  }
+  CopyMemoryP2PCommand(HostQueue& queue, cl_command_type cmdType,
+                       const EventWaitList& eventWaitList, Memory& srcMemory, Memory& dstMemory,
+                       Coord3D srcOrigin, Coord3D dstOrigin, Coord3D size)
+      : CopyMemoryCommand(queue, cmdType, eventWaitList, srcMemory, dstMemory, srcOrigin, dstOrigin,
+                          size) {}
 
   virtual void submit(device::VirtualDevice& device) { device.submitCopyMemoryP2P(*this); }
 
diff --git a/rocclr/runtime/platform/commandqueue.cpp b/rocclr/runtime/platform/commandqueue.cpp
index 95ed258551..e559d23ffb 100644
--- a/rocclr/runtime/platform/commandqueue.cpp
+++ b/rocclr/runtime/platform/commandqueue.cpp
@@ -35,7 +35,8 @@ bool HostQueue::terminate() {
     Command* marker = nullptr;
 
     // Send a finish if the queue is still accepting commands.
-    { ScopedLock sl(queueLock_);
+    {
+      ScopedLock sl(queueLock_);
       if (thread_.acceptingCommands_) {
         marker = new Marker(*this, false);
         if (marker != nullptr) {
@@ -50,7 +51,8 @@ bool HostQueue::terminate() {
     }
 
     // Wake-up the command loop, so it can exit
-    { ScopedLock sl(queueLock_);
+    {
+      ScopedLock sl(queueLock_);
       thread_.acceptingCommands_ = false;
       queueLock_.notify();
     }
@@ -159,6 +161,9 @@ void HostQueue::loop(device::VirtualDevice* virtualDevice) {
 void HostQueue::append(Command& command) {
+  if ((command.getWaitBits() & 0x1) != 0) {  // 0x1 = wait for completion before enqueue
+    finish();
+  }
   // We retain the command here. It will be released when its status
   // changes to CL_COMPLETE
   command.retain();
   command.setStatus(CL_QUEUED);
   queue_.enqueue(&command);
@@ -214,4 +219,4 @@ bool DeviceQueue::create() {
   return result;
 }
 
-}  // namespace amd {
+}  // namespace amd
diff --git a/rocclr/runtime/utils/flags.hpp b/rocclr/runtime/utils/flags.hpp
index 81a7bcb3af..e77ac4d8f2 100644
--- a/rocclr/runtime/utils/flags.hpp
+++ b/rocclr/runtime/utils/flags.hpp
@@ -191,7 +191,13 @@ release(bool, GPU_DUMP_CODE_OBJECT, false,                                    \
         "Enable dump code object")                                            \
 release(uint, GPU_MAX_USWC_ALLOC_SIZE, 2048,                                  \
         "Set a limit in Mb on the maximum USWC allocation size"               \
-        "-1 = No limit")
+        "-1 = No limit")                                                      \
+release(uint, AMD_SERIALIZE_KERNEL, 0,                                        \
+        "Serialize kernel enqueue, 0x1 = Wait for completion before enqueue"  \
+        ", 0x2 = Wait for completion after enqueue, 0x3 = both")              \
+release(uint, AMD_SERIALIZE_COPY, 0,                                          \
+        "Serialize copies, 0x1 = Wait for completion before enqueue"          \
+        ", 0x2 = Wait for completion after enqueue, 0x3 = both")
 
 namespace amd {