From 69ebcb55836ebbd9dff51d7fc3eecb28a82971f2 Mon Sep 17 00:00:00 2001
From: foreman
Date: Mon, 14 Jul 2014 20:24:58 -0400
Subject: [PATCH] P4 to Git Change 1055054 by gandryey@gera-dev-w7 on
2014/07/14 20:18:53
ECR #304775 - Device enqueuing
- Switch to the single-threaded scheduler for now (the current version isn't friendly to single-threaded execution). Hopefully this is a temporary solution until the synchronization issue with the multithreaded scheduler is identified.
Affected files ...
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpublit.cpp#104 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuschedcl.cpp#20 edit
[ROCm/clr commit: 6314b334bade159bbff6427dbb8960762714602b]
---
.../clr/rocclr/runtime/device/gpu/gpublit.cpp | 2 +-
.../rocclr/runtime/device/gpu/gpuschedcl.cpp | 26 ++++++++++++-------
2 files changed, 18 insertions(+), 10 deletions(-)
diff --git a/projects/clr/rocclr/runtime/device/gpu/gpublit.cpp b/projects/clr/rocclr/runtime/device/gpu/gpublit.cpp
index 42ea2d6753..544dc25bc1 100644
--- a/projects/clr/rocclr/runtime/device/gpu/gpublit.cpp
+++ b/projects/clr/rocclr/runtime/device/gpu/gpublit.cpp
@@ -2730,7 +2730,7 @@ KernelBlitManager::runScheduler(
size_t dim = 1;
size_t globalWorkOffset[1] = { 0 };
- size_t globalWorkSize[1] = { numSlots / 32 };
+ size_t globalWorkSize[1] = { 1 };
size_t localWorkSize[1] = { 1 };
// Program kernels arguments
diff --git a/projects/clr/rocclr/runtime/device/gpu/gpuschedcl.cpp b/projects/clr/rocclr/runtime/device/gpu/gpuschedcl.cpp
index 0708bf975e..db43d6b0be 100644
--- a/projects/clr/rocclr/runtime/device/gpu/gpuschedcl.cpp
+++ b/projects/clr/rocclr/runtime/device/gpu/gpuschedcl.cpp
@@ -353,16 +353,23 @@ scheduler(
__global AmdAqlWrap* wraps = (__global AmdAqlWrap*)&queue[1];
__global uint* amask = (__global uint *)queue->aql_slot_mask;
- uint launch = 0;
- uint loop;
+ uint launch;
+ do {
+ launch = atomic_load_explicit((__global atomic_uint*)&param->launch,
+ memory_order_acquire, memory_scope_device);
+ } while (launch != 0);
+ uint loop;
do {
- uint mask = atomic_load_explicit((__global atomic_uint*)(&amask[get_group_id(0)]),
- memory_order_acquire, memory_scope_device);
+ for (uint m = 0; m < (queue->aql_slot_num >> 5); ++m) {
+ uint mask = atomic_load_explicit((__global atomic_uint*)(&amask[m]),
+ memory_order_acquire, memory_scope_device);
- if (mask != 0) {
- int baseIdx = get_group_id(0) * 32;
- for (int idx = baseIdx + 31 - clz(mask); (idx >= baseIdx) && (launch == 0); --idx) {
+ int baseIdx = m * 32;
+ while (mask != 0) {
+ uint sIdx = ctz(mask);
+ uint idx = baseIdx + sIdx;
+ mask &= ~(1 << sIdx);
__global AmdAqlWrap* disp = (__global AmdAqlWrap*)&wraps[idx];
uint slotState = atomic_load_explicit((__global atomic_uint*)(&disp->state),
memory_order_acquire, memory_scope_device);
@@ -435,7 +442,7 @@ scheduler(
// Decrement the child execution counter on the parent
atomic_fetch_sub_explicit(
(__global atomic_uint*)&parent->child_counter,
- 1, memory_order_acq_rel, memory_scope_device);
+ 1, memory_order_acq_rel, memory_scope_device);
event->state = CL_COMPLETE;
disp->state = AQL_WRAP_FREE;
release_slot(amask, idx);
@@ -465,7 +472,7 @@ scheduler(
// Decrement the child execution counter on the parent
atomic_fetch_sub_explicit(
(__global atomic_uint*)&parent->child_counter,
- 1, memory_order_acq_rel, memory_scope_device);
+ 1, memory_order_acq_rel, memory_scope_device);
disp->state = AQL_WRAP_FREE;
release_slot(amask, idx);
}
@@ -474,6 +481,7 @@ scheduler(
disp->state = AQL_WRAP_DONE;
}
}
+ if (launch == 1) break;
}
barrier(CLK_GLOBAL_MEM_FENCE);