P4 to Git Change 1055054 by gandryey@gera-dev-w7 on 2014/07/14 20:18:53

ECR #304775 - Device enqueuing - Switch to the single thread scheduler for now(the current version isn't friendly for single thread). Hopefully it's a temporary solution until synchronization issue with multithreaded scheduler will be identified. Affected files ... ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpublit.cpp#104 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuschedcl.cpp#20 edit [ROCm/clr commit: 6314b334ba]
2014-07-14 20:24:58 -04:00
@@ -2730,7 +2730,7 @@ KernelBlitManager::runScheduler(

    size_t  dim = 1;
    size_t  globalWorkOffset[1] = { 0 };
-    size_t  globalWorkSize[1] = { numSlots / 32 };
+    size_t  globalWorkSize[1] = { 1 };
    size_t  localWorkSize[1] = { 1 };

    // Program kernels arguments
@@ -353,16 +353,23 @@ scheduler(
    __global AmdAqlWrap*    wraps = (__global AmdAqlWrap*)&queue[1];
    __global uint*          amask = (__global uint *)queue->aql_slot_mask;

-    uint launch = 0;
-    uint loop;  
+    uint launch;
+    do {
+        launch = atomic_load_explicit((__global atomic_uint*)&param->launch,
+            memory_order_acquire, memory_scope_device);
+    } while (launch != 0);
+    uint loop;

    do {
-        uint mask = atomic_load_explicit((__global atomic_uint*)(&amask[get_group_id(0)]),
-              memory_order_acquire, memory_scope_device);
+        for (uint m = 0; m < (queue->aql_slot_num >> 5); ++m) {
+            uint mask = atomic_load_explicit((__global atomic_uint*)(&amask[m]),
+                  memory_order_acquire, memory_scope_device);

-        if (mask != 0) {
-            int baseIdx = get_group_id(0) * 32;
-            for (int idx = baseIdx + 31 - clz(mask); (idx >= baseIdx) && (launch == 0); --idx) {
+            int baseIdx = m * 32;
+            while (mask != 0) {
+                uint sIdx = ctz(mask);
+                uint idx = baseIdx + sIdx;
+                mask &= ~(1 << sIdx);
                __global AmdAqlWrap* disp = (__global AmdAqlWrap*)&wraps[idx];
                uint slotState = atomic_load_explicit((__global atomic_uint*)(&disp->state),
                    memory_order_acquire, memory_scope_device);
@@ -435,7 +442,7 @@ scheduler(
                        // Decrement the child execution counter on the parent
                        atomic_fetch_sub_explicit(
                            (__global atomic_uint*)&parent->child_counter,
-                             1, memory_order_acq_rel, memory_scope_device);
+                            1, memory_order_acq_rel, memory_scope_device);
                        event->state = CL_COMPLETE;
                        disp->state = AQL_WRAP_FREE;
                        release_slot(amask, idx);
@@ -465,7 +472,7 @@ scheduler(
                        // Decrement the child execution counter on the parent
                        atomic_fetch_sub_explicit(
                            (__global atomic_uint*)&parent->child_counter,
-                             1, memory_order_acq_rel, memory_scope_device);
+                            1, memory_order_acq_rel, memory_scope_device);
                        disp->state = AQL_WRAP_FREE;
                        release_slot(amask, idx);
                    }
@@ -474,6 +481,7 @@ scheduler(
                    disp->state = AQL_WRAP_DONE;
                }
            }
+            if (launch == 1) break;
        }
        barrier(CLK_GLOBAL_MEM_FENCE);