diff --git a/rocclr/runtime/device/gpu/gpusched.hpp b/rocclr/runtime/device/gpu/gpusched.hpp
index 4939fc68d1..f0bcd2f4c0 100644
--- a/rocclr/runtime/device/gpu/gpusched.hpp
+++ b/rocclr/runtime/device/gpu/gpusched.hpp
@@ -65,7 +65,7 @@ struct SchedulerParam {
     uint32_t    scratchSize;    //!< Scratch buffer size
     uint64_t    scratch;        //!< GPU address to the scratch buffer
     uint32_t    numMaxWaves;    //!< The max number of possible waves
-    uint32_t    reserved;       //!< reserved
+    uint32_t    releaseHostCP;  //!< Releases CP on the host queue
 };
 
 } // namespace gpu
diff --git a/rocclr/runtime/device/gpu/gpuschedcl.cpp b/rocclr/runtime/device/gpu/gpuschedcl.cpp
index db43d6b0be..8b2811ab57 100644
--- a/rocclr/runtime/device/gpu/gpuschedcl.cpp
+++ b/rocclr/runtime/device/gpu/gpuschedcl.cpp
@@ -8,7 +8,6 @@ namespace gpu {
 
 const char* SchedulerSourceCode = SCHEDULER_KERNEL(
 \n
-
 //! AmdAqlWrap slot state
 enum AqlWrapState {
     AQL_WRAP_FREE = 0,
@@ -90,7 +89,7 @@ typedef struct _SchedulerParam {
     uint    scratchSize;    //!< Scratch buffer size
     ulong   scratch;        //!< GPU address to the scratch buffer
     uint    numMaxWaves;    //!< Num max waves on the asic
-    uint    reserved;       //!< Reserved
+    uint    releaseHostCP;  //!< Releases CP on the host queue
 } SchedulerParam;
 
 typedef struct _HwDispatch {
@@ -353,11 +352,14 @@ scheduler(
     __global AmdAqlWrap*    wraps = (__global AmdAqlWrap*)&queue[1];
     __global uint*          amask = (__global uint *)queue->aql_slot_mask;
 
-    uint launch;
-    do {
-        launch = atomic_load_explicit((__global atomic_uint*)&param->launch,
-            memory_order_acquire, memory_scope_device);
-    } while (launch != 0);
+    //! @todo Unexplained behavior: the scheduler can be launched one more
+    //! time after termination, so exit early if releaseHostCP is already set.
+    if (1 == atomic_load_explicit((__global atomic_uint*)&param->releaseHostCP,
+        memory_order_acquire, memory_scope_device)) {
+        return;
+    }
+
+    uint launch = 0;
     uint loop;
 
     do {
@@ -494,7 +496,18 @@ scheduler(
     } while ((launch == 0) && (loop == 1));
 
     if (loop == 0) {
-        atomic_or(&hwDisp->startExe, ResumeExecution);
+        //! \todo The template is overwritten with 0xdeadc0de here, yet the
+        //! scheduler is somehow still launched one more time afterwards.
+        hwDisp->packet0 = 0xdeadc0de;
+        hwDisp[1].condExe0 = 0xdeadc0de;
+        hwDisp[1].condExe1 = 0xdeadc0de;
+        hwDisp[1].condExe2 = 0xdeadc0de;
+        hwDisp[1].condExe3 = 0xdeadc0de;
+        barrier(CLK_GLOBAL_MEM_FENCE);
+        atomic_store_explicit((__global atomic_uint*)&hwDisp->startExe,
+            ResumeExecution, memory_order_release, memory_scope_device);
+        atomic_store_explicit((__global atomic_uint*)&param->releaseHostCP,
+            1, memory_order_release, memory_scope_device);
     }
 }
 \n
diff --git a/rocclr/runtime/device/gpu/gpuvirtual.cpp b/rocclr/runtime/device/gpu/gpuvirtual.cpp
index dcb73b18a8..414046eb5b 100644
--- a/rocclr/runtime/device/gpu/gpuvirtual.cpp
+++ b/rocclr/runtime/device/gpu/gpuvirtual.cpp
@@ -1792,7 +1792,7 @@ VirtualGPU::submitKernelInternalHSA(
                     address argum = gpuDefQueue->virtualQueue_->data() + offsArg;
                     print << "Kernel: " << child->name() << "\n";
                     static const char* Names[HSAILKernel::ExtraArguments] = {
-                    "Offset0: ", "Offset1: ","Offset2: ","PrintfBuf: ", "VqueuePtr: ", "AqlWarap: "};
+                    "Offset0: ", "Offset1: ","Offset2: ","PrintfBuf: ", "VqueuePtr: ", "AqlWrap: "};
                     for (j = 0; j < HSAILKernel::ExtraArguments; ++j) {
                         print << "\t" << Names[j] << *(size_t*)argum;
                         print << "\n";
@@ -1832,6 +1832,7 @@ VirtualGPU::submitKernelInternalHSA(
         param->hw_queue = patchStart + sizeof(uint32_t)/* Rewind packet*/;
         param->hsa_queue = gpuDefQueue->hsaQueueMem()->vmAddress();
         param->launch = 0;
+        param->releaseHostCP = 0;
         // Fill the scratch buffer information
         if (hsaKernel.prog().maxScratchRegs() > 0) {
             gpu::Memory* scratchBuf = dev().scratch(gpuDefQueue->hwRing())->memObjs_[0];