diff --git a/rocclr/runtime/device/gpu/gpusched.hpp b/rocclr/runtime/device/gpu/gpusched.hpp index 4939fc68d1..f0bcd2f4c0 100644 --- a/rocclr/runtime/device/gpu/gpusched.hpp +++ b/rocclr/runtime/device/gpu/gpusched.hpp @@ -65,7 +65,7 @@ struct SchedulerParam { uint32_t scratchSize; //!< Scratch buffer size uint64_t scratch; //!< GPU address to the scratch buffer uint32_t numMaxWaves; //!< The max number of possible waves - uint32_t reserved; //!< reserved + uint32_t releaseHostCP; //!< Releases CP on the host queue }; } // namespace gpu diff --git a/rocclr/runtime/device/gpu/gpuschedcl.cpp b/rocclr/runtime/device/gpu/gpuschedcl.cpp index db43d6b0be..8b2811ab57 100644 --- a/rocclr/runtime/device/gpu/gpuschedcl.cpp +++ b/rocclr/runtime/device/gpu/gpuschedcl.cpp @@ -8,7 +8,6 @@ namespace gpu { const char* SchedulerSourceCode = SCHEDULER_KERNEL( \n - //! AmdAqlWrap slot state enum AqlWrapState { AQL_WRAP_FREE = 0, @@ -90,7 +89,7 @@ typedef struct _SchedulerParam { uint scratchSize; //!< Scratch buffer size ulong scratch; //!< GPU address to the scratch buffer uint numMaxWaves; //!< Num max waves on the asic - uint reserved; //!< Reserved + uint releaseHostCP; //!< Releases CP on the host queue } SchedulerParam; typedef struct _HwDispatch { @@ -353,11 +352,14 @@ scheduler( __global AmdAqlWrap* wraps = (__global AmdAqlWrap*)&queue[1]; __global uint* amask = (__global uint *)queue->aql_slot_mask; - uint launch; - do { - launch = atomic_load_explicit((__global atomic_uint*)&param->launch, - memory_order_acquire, memory_scope_device); - } while (launch != 0); + //! @todo This is an unexplained behavior. + //! The scheduler can be launched one more time after termination. 
+ if (1 == atomic_load_explicit((__global atomic_uint*)&param->releaseHostCP, + memory_order_acquire, memory_scope_device)) { + return; + } + + uint launch = 0; uint loop; do { @@ -494,7 +496,18 @@ scheduler( } while ((launch == 0) && (loop == 1)); if (loop == 0) { - atomic_or(&hwDisp->startExe, ResumeExecution); + //! \todo Write deadcode to the template, but somehow + //! the scheduler will be launched one more time. + hwDisp->packet0 = 0xdeadc0de; + hwDisp[1].condExe0 = 0xdeadc0de; + hwDisp[1].condExe1 = 0xdeadc0de; + hwDisp[1].condExe2 = 0xdeadc0de; + hwDisp[1].condExe3 = 0xdeadc0de; + barrier(CLK_GLOBAL_MEM_FENCE); + atomic_store_explicit((__global atomic_uint*)&hwDisp->startExe, + ResumeExecution, memory_order_release, memory_scope_device); + atomic_store_explicit((__global atomic_uint*)&param->releaseHostCP, + 1, memory_order_release, memory_scope_device); } } \n diff --git a/rocclr/runtime/device/gpu/gpuvirtual.cpp b/rocclr/runtime/device/gpu/gpuvirtual.cpp index dcb73b18a8..414046eb5b 100644 --- a/rocclr/runtime/device/gpu/gpuvirtual.cpp +++ b/rocclr/runtime/device/gpu/gpuvirtual.cpp @@ -1792,7 +1792,7 @@ VirtualGPU::submitKernelInternalHSA( address argum = gpuDefQueue->virtualQueue_->data() + offsArg; print << "Kernel: " << child->name() << "\n"; static const char* Names[HSAILKernel::ExtraArguments] = { - "Offset0: ", "Offset1: ","Offset2: ","PrintfBuf: ", "VqueuePtr: ", "AqlWarap: "}; + "Offset0: ", "Offset1: ","Offset2: ","PrintfBuf: ", "VqueuePtr: ", "AqlWrap: "}; for (j = 0; j < HSAILKernel::ExtraArguments; ++j) { print << "\t" << Names[j] << *(size_t*)argum; print << "\n"; @@ -1832,6 +1832,7 @@ VirtualGPU::submitKernelInternalHSA( param->hw_queue = patchStart + sizeof(uint32_t)/* Rewind packet*/; param->hsa_queue = gpuDefQueue->hsaQueueMem()->vmAddress(); param->launch = 0; + param->releaseHostCP = 0; // Fill the scratch buffer information if (hsaKernel.prog().maxScratchRegs() > 0) { gpu::Memory* scratchBuf = dev().scratch(gpuDefQueue->hwRing())->memObjs_[0];