P4 to Git Change 1055054 by gandryey@gera-dev-w7 on 2014/07/14 20:18:53

ECR #304775 - Device enqueuing
	- Switch to the single-threaded scheduler for now (the current version isn't friendly to single-threaded execution). Hopefully this is a temporary solution until the synchronization issue with the multithreaded scheduler is identified.

Affected files ...

... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpublit.cpp#104 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuschedcl.cpp#20 edit
This commit is contained in:
foreman
2014-07-14 20:24:58 -04:00
szülő 7ca4c75217
commit 6314b334ba
2 fájl változott, egészen pontosan 18 új sor hozzáadva és 10 régi sor törölve
@@ -2730,7 +2730,7 @@ KernelBlitManager::runScheduler(
size_t dim = 1;
size_t globalWorkOffset[1] = { 0 };
size_t globalWorkSize[1] = { numSlots / 32 };
size_t globalWorkSize[1] = { 1 };
size_t localWorkSize[1] = { 1 };
// Program kernels arguments
@@ -353,16 +353,23 @@ scheduler(
__global AmdAqlWrap* wraps = (__global AmdAqlWrap*)&queue[1];
__global uint* amask = (__global uint *)queue->aql_slot_mask;
uint launch = 0;
uint loop;
uint launch;
do {
launch = atomic_load_explicit((__global atomic_uint*)&param->launch,
memory_order_acquire, memory_scope_device);
} while (launch != 0);
uint loop;
do {
uint mask = atomic_load_explicit((__global atomic_uint*)(&amask[get_group_id(0)]),
memory_order_acquire, memory_scope_device);
for (uint m = 0; m < (queue->aql_slot_num >> 5); ++m) {
uint mask = atomic_load_explicit((__global atomic_uint*)(&amask[m]),
memory_order_acquire, memory_scope_device);
if (mask != 0) {
int baseIdx = get_group_id(0) * 32;
for (int idx = baseIdx + 31 - clz(mask); (idx >= baseIdx) && (launch == 0); --idx) {
int baseIdx = m * 32;
while (mask != 0) {
uint sIdx = ctz(mask);
uint idx = baseIdx + sIdx;
mask &= ~(1 << sIdx);
__global AmdAqlWrap* disp = (__global AmdAqlWrap*)&wraps[idx];
uint slotState = atomic_load_explicit((__global atomic_uint*)(&disp->state),
memory_order_acquire, memory_scope_device);
@@ -435,7 +442,7 @@ scheduler(
// Decrement the child execution counter on the parent
atomic_fetch_sub_explicit(
(__global atomic_uint*)&parent->child_counter,
1, memory_order_acq_rel, memory_scope_device);
1, memory_order_acq_rel, memory_scope_device);
event->state = CL_COMPLETE;
disp->state = AQL_WRAP_FREE;
release_slot(amask, idx);
@@ -465,7 +472,7 @@ scheduler(
// Decrement the child execution counter on the parent
atomic_fetch_sub_explicit(
(__global atomic_uint*)&parent->child_counter,
1, memory_order_acq_rel, memory_scope_device);
1, memory_order_acq_rel, memory_scope_device);
disp->state = AQL_WRAP_FREE;
release_slot(amask, idx);
}
@@ -474,6 +481,7 @@ scheduler(
disp->state = AQL_WRAP_DONE;
}
}
if (launch == 1) break;
}
barrier(CLK_GLOBAL_MEM_FENCE);