P4 to Git Change 1056280 by gandryey@gera-dev-w7 on 2014/07/17 11:55:30

ECR #304775 - Device enqueuing - Add extra sync point for the scheduler exit.

Affected files ...

... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpusched.hpp#9 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuschedcl.cpp#21 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.cpp#323 edit
2014-07-17 12:20:49 -04:00
@@ -65,7 +65,7 @@ struct SchedulerParam {
    uint32_t    scratchSize;    //!< Scratch buffer size
    uint64_t    scratch;        //!< GPU address to the scratch buffer
    uint32_t    numMaxWaves;    //!< The max number of possible waves
-    uint32_t    reserved;       //!< reserved
+    uint32_t    releaseHostCP;  //!< Releases CP on the host queue
 };

 } // namespace gpu
@@ -8,7 +8,6 @@ namespace gpu {

 const char* SchedulerSourceCode = SCHEDULER_KERNEL(
 \n
-
 //! AmdAqlWrap slot state
 enum AqlWrapState {
    AQL_WRAP_FREE = 0,
@@ -90,7 +89,7 @@ typedef struct _SchedulerParam {
    uint    scratchSize;    //!< Scratch buffer size
    ulong   scratch;        //!< GPU address to the scratch buffer
    uint    numMaxWaves;    //!< Num max waves on the asic
-    uint    reserved;       //!< Reserved
+    uint    releaseHostCP;  //!< Releases CP on the host queue
 } SchedulerParam;

 typedef struct _HwDispatch {
@@ -353,11 +352,14 @@ scheduler(
    __global AmdAqlWrap*    wraps = (__global AmdAqlWrap*)&queue[1];
    __global uint*          amask = (__global uint *)queue->aql_slot_mask;

-    uint launch;
-    do {
-        launch = atomic_load_explicit((__global atomic_uint*)&param->launch,
-            memory_order_acquire, memory_scope_device);
-    } while (launch != 0);
+    //! @todo Unexplained behavior: the scheduler can be launched one more
+    //! time after termination, so exit early if releaseHostCP is already set.
+    if (1 == atomic_load_explicit((__global atomic_uint*)&param->releaseHostCP,
+        memory_order_acquire, memory_scope_device)) {
+        return;
+    }
+
+    uint launch = 0;
    uint loop;

    do {
@@ -494,7 +496,18 @@ scheduler(
    } while ((launch == 0) && (loop == 1));

    if (loop == 0) {
-        atomic_or(&hwDisp->startExe, ResumeExecution);
+        //! \todo 0xdeadc0de is written to the dispatch template, yet
+        //! the scheduler is somehow still launched one more time.
+        hwDisp->packet0 = 0xdeadc0de;
+        hwDisp[1].condExe0 = 0xdeadc0de;
+        hwDisp[1].condExe1 = 0xdeadc0de;
+        hwDisp[1].condExe2 = 0xdeadc0de;
+        hwDisp[1].condExe3 = 0xdeadc0de;
+        barrier(CLK_GLOBAL_MEM_FENCE);
+        atomic_store_explicit((__global atomic_uint*)&hwDisp->startExe,
+            ResumeExecution, memory_order_release, memory_scope_device);
+        atomic_store_explicit((__global atomic_uint*)&param->releaseHostCP,
+            1, memory_order_release, memory_scope_device);
    }
 }
 \n
@@ -1792,7 +1792,7 @@ VirtualGPU::submitKernelInternalHSA(
                    address argum = gpuDefQueue->virtualQueue_->data() + offsArg;
                    print << "Kernel: " << child->name() << "\n";
                    static const char* Names[HSAILKernel::ExtraArguments] = {
-                    "Offset0: ", "Offset1: ","Offset2: ","PrintfBuf: ", "VqueuePtr: ", "AqlWarap: "};
+                    "Offset0: ", "Offset1: ","Offset2: ","PrintfBuf: ", "VqueuePtr: ", "AqlWrap: "};
                    for (j = 0; j < HSAILKernel::ExtraArguments; ++j) {
                        print << "\t" << Names[j] << *(size_t*)argum;
                        print << "\n";
@@ -1832,6 +1832,7 @@ VirtualGPU::submitKernelInternalHSA(
        param->hw_queue = patchStart + sizeof(uint32_t)/* Rewind packet*/;
        param->hsa_queue = gpuDefQueue->hsaQueueMem()->vmAddress();
        param->launch = 0;
+        param->releaseHostCP = 0;
        // Fill the scratch buffer information
        if (hsaKernel.prog().maxScratchRegs() > 0) {
            gpu::Memory* scratchBuf = dev().scratch(gpuDefQueue->hwRing())->memObjs_[0];