From 69ebcb55836ebbd9dff51d7fc3eecb28a82971f2 Mon Sep 17 00:00:00 2001 From: foreman Date: Mon, 14 Jul 2014 20:24:58 -0400 Subject: [PATCH] P4 to Git Change 1055054 by gandryey@gera-dev-w7 on 2014/07/14 20:18:53 ECR #304775 - Device enqueuing - Switch to the single thread scheduler for now (the current version isn't friendly for single thread). Hopefully it's a temporary solution until the synchronization issue with the multithreaded scheduler is identified. Affected files ... ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpublit.cpp#104 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuschedcl.cpp#20 edit [ROCm/clr commit: 6314b334bade159bbff6427dbb8960762714602b] --- .../clr/rocclr/runtime/device/gpu/gpublit.cpp | 2 +- .../rocclr/runtime/device/gpu/gpuschedcl.cpp | 26 ++++++++++++------- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/projects/clr/rocclr/runtime/device/gpu/gpublit.cpp b/projects/clr/rocclr/runtime/device/gpu/gpublit.cpp index 42ea2d6753..544dc25bc1 100644 --- a/projects/clr/rocclr/runtime/device/gpu/gpublit.cpp +++ b/projects/clr/rocclr/runtime/device/gpu/gpublit.cpp @@ -2730,7 +2730,7 @@ KernelBlitManager::runScheduler( size_t dim = 1; size_t globalWorkOffset[1] = { 0 }; - size_t globalWorkSize[1] = { numSlots / 32 }; + size_t globalWorkSize[1] = { 1 }; size_t localWorkSize[1] = { 1 }; // Program kernels arguments diff --git a/projects/clr/rocclr/runtime/device/gpu/gpuschedcl.cpp b/projects/clr/rocclr/runtime/device/gpu/gpuschedcl.cpp index 0708bf975e..db43d6b0be 100644 --- a/projects/clr/rocclr/runtime/device/gpu/gpuschedcl.cpp +++ b/projects/clr/rocclr/runtime/device/gpu/gpuschedcl.cpp @@ -353,16 +353,23 @@ scheduler( __global AmdAqlWrap* wraps = (__global AmdAqlWrap*)&queue[1]; __global uint* amask = (__global uint *)queue->aql_slot_mask; - uint launch = 0; - uint loop; + uint launch; + do { + launch = atomic_load_explicit((__global atomic_uint*)&param->launch, + memory_order_acquire, memory_scope_device); 
} while (launch != 0); + uint loop; do { - uint mask = atomic_load_explicit((__global atomic_uint*)(&amask[get_group_id(0)]), - memory_order_acquire, memory_scope_device); + for (uint m = 0; m < (queue->aql_slot_num >> 5); ++m) { + uint mask = atomic_load_explicit((__global atomic_uint*)(&amask[m]), + memory_order_acquire, memory_scope_device); - if (mask != 0) { - int baseIdx = get_group_id(0) * 32; - for (int idx = baseIdx + 31 - clz(mask); (idx >= baseIdx) && (launch == 0); --idx) { + int baseIdx = m * 32; + while (mask != 0) { + uint sIdx = ctz(mask); + uint idx = baseIdx + sIdx; + mask &= ~(1 << sIdx); __global AmdAqlWrap* disp = (__global AmdAqlWrap*)&wraps[idx]; uint slotState = atomic_load_explicit((__global atomic_uint*)(&disp->state), memory_order_acquire, memory_scope_device); @@ -435,7 +442,7 @@ scheduler( // Decrement the child execution counter on the parent atomic_fetch_sub_explicit( (__global atomic_uint*)&parent->child_counter, - 1, memory_order_acq_rel, memory_scope_device); + 1, memory_order_acq_rel, memory_scope_device); event->state = CL_COMPLETE; disp->state = AQL_WRAP_FREE; release_slot(amask, idx); @@ -465,7 +472,7 @@ scheduler( // Decrement the child execution counter on the parent atomic_fetch_sub_explicit( (__global atomic_uint*)&parent->child_counter, - 1, memory_order_acq_rel, memory_scope_device); + 1, memory_order_acq_rel, memory_scope_device); disp->state = AQL_WRAP_FREE; release_slot(amask, idx); } @@ -474,6 +481,7 @@ scheduler( disp->state = AQL_WRAP_DONE; } } + if (launch == 1) break; } barrier(CLK_GLOBAL_MEM_FENCE);