From 2738b30287e4580ea002ac2a02979ef226e449d8 Mon Sep 17 00:00:00 2001 From: foreman Date: Tue, 2 Sep 2014 17:15:39 -0400 Subject: [PATCH] P4 to Git Change 1072528 by gandryey@gera-dev-w7 on 2014/09/02 17:00:16 ECR #304775 - Add batching to the device enqueue for possible asynchronous execution - Increase the max device queue size to 512KB. That will allow to pass conformance tests that enqueue more jobs than the queue size. Affected files ... ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudevice.cpp#459 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpusched.hpp#13 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuschedcl.cpp#28 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.cpp#333 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp#65 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gslbe/src/rt/GSLContext.h#39 edit --- rocclr/runtime/device/gpu/gpudevice.cpp | 2 +- rocclr/runtime/device/gpu/gpusched.hpp | 2 +- rocclr/runtime/device/gpu/gpuschedcl.cpp | 285 +++++++++--------- rocclr/runtime/device/gpu/gpuvirtual.cpp | 3 +- .../device/gpu/gslbe/src/rt/GSLContext.cpp | 4 +- .../device/gpu/gslbe/src/rt/GSLContext.h | 3 +- 6 files changed, 143 insertions(+), 156 deletions(-) diff --git a/rocclr/runtime/device/gpu/gpudevice.cpp b/rocclr/runtime/device/gpu/gpudevice.cpp index d36305e885..8e15245c1c 100644 --- a/rocclr/runtime/device/gpu/gpudevice.cpp +++ b/rocclr/runtime/device/gpu/gpudevice.cpp @@ -722,7 +722,7 @@ void Device::fillDeviceInfo( info_.queueOnDeviceProperties_ = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_PROFILING_ENABLE; info_.queueOnDevicePreferredSize_ = 16 * Ki; - info_.queueOnDeviceMaxSize_ = 256 * Ki; + info_.queueOnDeviceMaxSize_ = 512 * Ki; info_.maxOnDeviceQueues_ = 1; info_.maxOnDeviceEvents_ = settings().numDeviceEvents_; info_.globalVariablePreferredTotalSize_ = static_cast(info_.globalMemSize_); diff --git a/rocclr/runtime/device/gpu/gpusched.hpp b/rocclr/runtime/device/gpu/gpusched.hpp index 1fa72efdc7..242580dd48 100644 --- a/rocclr/runtime/device/gpu/gpusched.hpp +++ b/rocclr/runtime/device/gpu/gpusched.hpp @@ -62,7 +62,7 @@ struct SchedulerParam { uint32_t eng_clk; //!< Engine clock in Mhz uint64_t hw_queue; //!< Address to HW queue uint64_t hsa_queue; //!< Address to HSA dummy queue - uint32_t launch; //!< Launch semaphore for the scheduler threads + uint32_t reserved; //!< Reserved uint32_t scratchSize; //!< Scratch buffer size uint64_t scratch; //!< GPU address to the scratch buffer uint32_t numMaxWaves; //!< The max number of possible waves diff --git a/rocclr/runtime/device/gpu/gpuschedcl.cpp b/rocclr/runtime/device/gpu/gpuschedcl.cpp index 221b52bc49..c94589e127 100644 --- a/rocclr/runtime/device/gpu/gpuschedcl.cpp +++ b/rocclr/runtime/device/gpu/gpuschedcl.cpp @@ -86,7 +86,7 @@ typedef struct _SchedulerParam { uint eng_clk; //!< Engine clock in Mhz ulong hw_queue; //!< Address to HW queue ulong hsa_queue; //!< Address to HSA dummy queue - uint launch; //!< Child launch status + uint reserved; //!< reserved uint scratchSize; //!< Scratch buffer size ulong scratch; //!< GPU address to the scratch buffer uint numMaxWaves; //!< Num max waves on the asic @@ -96,13 +96,25 @@ typedef struct _SchedulerParam { uint scratchOffset; //!< Scratch buffer offset } SchedulerParam; -typedef struct _HwDispatch { - uint startExe; // REWIND execution +typedef struct _HwDispatchHeader { + uint pad00; + uint pad01; + uint pad02; + uint pad03; + uint pad04; + uint pad05; + uint pad06; + uint pad07; + uint rewind; // REWIND execution + uint startExe; // valid bit uint condExe0; // 0xC0032200 -- TYPE 3, COND_EXEC uint condExe1; // 0x00000204 ---- uint condExe2; // 0x00000000 ---- uint condExe3; // 0x00000000 ---- uint condExe4; // 0x00000000 ---- +} HwDispatchHeader; + +typedef struct _HwDispatch { uint packet0; // 0xC0067602 -- TYPE 3, SET_SH_REG, TYPE:COMPUTE (6 values) uint offset0; // 0x00000204 ---- OFFSET uint startX; // 0x00000000 ---- COMPUTE_START_X: START = 0x0 @@ -182,9 +194,6 @@ dispatch( const uint Pm4UserRegs = 0xC0007602; const uint Pm4CopyReg = 0xC0044000; - // Wait for CP idle isn't necessary if CP waits for child - // while (atomic_and(&dispatch->startExe, 0xffffffff) != StallExecution) {} - uint usrRegCnt = 0; dispatch->wrkGrpSizeX = aqlPkt->workgroup_size[0]; @@ -272,10 +281,6 @@ dispatch( dispatch->glbSizeX = aqlPkt->grid_size[0]; dispatch->glbSizeY = aqlPkt->grid_size[1]; dispatch->glbSizeZ = aqlPkt->grid_size[2]; - barrier(CLK_GLOBAL_MEM_FENCE); - - // Resume the execution - dispatch->startExe = ResumeExecution; } static inline bool @@ -354,8 +359,8 @@ scheduler( uint paramIdx) { __global SchedulerParam* param = ¶ms[paramIdx]; - volatile __global HwDispatch* hwDisp = - (volatile __global HwDispatch*)param->hw_queue; + volatile __global HwDispatch* hwDisp = (volatile __global HwDispatch*) + &((__global HwDispatchHeader*)param->hw_queue)[1]; __global AmdAqlWrap* hostParent = (__global AmdAqlWrap*)(param->parentAQL); __global uint* counter = (__global uint*)(&hostParent->child_counter); __global uint* signal = (__global uint*)(¶m->signal); @@ -372,166 +377,148 @@ scheduler( uint launch = 0; uint loop = 1; - do { - uint mask = atomic_load_explicit((__global atomic_uint*)(&amask[get_group_id(0)]), - memory_order_acquire, memory_scope_device); + uint mask = atomic_load_explicit((__global atomic_uint*)(&amask[get_group_id(0)]), + memory_order_acquire, memory_scope_device); - int baseIdx = get_group_id(0) * 32; - while (mask != 0) { - uint sIdx = ctz(mask); - uint idx = baseIdx + sIdx; - mask &= ~(1 << sIdx); - __global AmdAqlWrap* disp = (__global AmdAqlWrap*)&wraps[idx]; - uint slotState = atomic_load_explicit((__global atomic_uint*)(&disp->state), - memory_order_acquire, memory_scope_device); - __global AmdAqlWrap* parent = (__global AmdAqlWrap*)(disp->parent_wrap); - __global AmdEvent* event = (__global AmdEvent*)(disp->completion); + int baseIdx = get_group_id(0) * 32; + while (mask != 0) { + uint sIdx = ctz(mask); + uint idx = baseIdx + sIdx; + mask &= ~(1 << sIdx); + __global AmdAqlWrap* disp = (__global AmdAqlWrap*)&wraps[idx]; + uint slotState = atomic_load_explicit((__global atomic_uint*)(&disp->state), + memory_order_acquire, memory_scope_device); + __global AmdAqlWrap* parent = (__global AmdAqlWrap*)(disp->parent_wrap); + __global AmdEvent* event = (__global AmdEvent*)(disp->completion); - // Check if the current slot is ready for processing - if (slotState == AQL_WRAP_READY) { - if (launch == 0) { - launch = atomic_load_explicit((__global atomic_uint*)¶m->launch, - memory_order_acquire, memory_scope_device); + // Check if the current slot is ready for processing + if (slotState == AQL_WRAP_READY) { + if (launch == 0) { + // Attempt to find a new disaptch if nothing was launched yet + uint parentState = atomic_load_explicit( + (__global atomic_uint*)(&parent->state), + memory_order_acquire, memory_scope_device); + uint enqueueFlags = atomic_load_explicit( + (__global atomic_uint*)(&disp->enqueue_flags), + memory_order_acquire, memory_scope_device); + + // Check the launch flags + if (((enqueueFlags == CLK_ENQUEUE_FLAGS_WAIT_KERNEL) || + (enqueueFlags == CLK_ENQUEUE_FLAGS_WAIT_WORK_GROUP)) && + (parentState != AQL_WRAP_DONE)) { + continue; } - if (launch == 0) { - // Attempt to find a new disaptch if nothing was launched yet - uint parentState = atomic_load_explicit( - (__global atomic_uint*)(&parent->state), - memory_order_acquire, memory_scope_device); - uint enqueueFlags = atomic_load_explicit( - (__global atomic_uint*)(&disp->enqueue_flags), - memory_order_acquire, memory_scope_device); - // Check the launch flags - if (((enqueueFlags == CLK_ENQUEUE_FLAGS_WAIT_KERNEL) || - (enqueueFlags == CLK_ENQUEUE_FLAGS_WAIT_WORK_GROUP)) && - (parentState != AQL_WRAP_DONE)) { - continue; - } - - // Check if the wait list is COMPLETE - launch = checkWaitEvents( - (__global AmdEvent**)(disp->wait_list), disp->wait_num); - - if (launch == 0) continue; - - uint tmp = 0; - if (atomic_compare_exchange_strong_explicit( - (__global atomic_uint*)¶m->launch, &tmp, launch, - memory_order_acq_rel, memory_order_acquire, memory_scope_device)) { - if (event != 0) { - event->timer[PROFILING_COMMAND_START] = - (__hsail_get_clock() * (ulong)param->eng_clk) >> 10; - } - // Launch child kernel .... - dispatch(hwDisp, &disp->aql, param->scratch, param->hsa_queue, - param->scratchSize, param->scratchOffset, param->numMaxWaves); - disp->state = AQL_WRAP_BUSY; - releaseWaitEvents((__global AmdEvent**)(disp->wait_list), - disp->wait_num, (__global uint*)queue->event_slot_mask, - (__global AmdEvent*)queue->event_slots); - break; + // Check if the wait list is COMPLETE + launch = checkWaitEvents( + (__global AmdEvent**)(disp->wait_list), disp->wait_num); + + if (launch) { + if (event != 0) { + event->timer[PROFILING_COMMAND_START] = + (__hsail_get_clock() * (ulong)param->eng_clk) >> 10; } + // Launch child kernel .... + dispatch(&hwDisp[get_group_id(0)], &disp->aql, param->scratch, param->hsa_queue, + param->scratchSize, param->scratchOffset, param->numMaxWaves); + disp->state = AQL_WRAP_BUSY; + releaseWaitEvents((__global AmdEvent**)(disp->wait_list), + disp->wait_num, (__global uint*)queue->event_slot_mask, + (__global AmdEvent*)queue->event_slots); + break; } } - else if (slotState == AQL_WRAP_MARKER) { - bool complete = false; - if (disp->wait_num == 0) { - uint minCommand = min_command(queue->aql_slot_num, wraps); - if (disp->command_id == minCommand) { - complete = true; - } + } + else if (slotState == AQL_WRAP_MARKER) { + bool complete = false; + if (disp->wait_num == 0) { + uint minCommand = min_command(queue->aql_slot_num, wraps); + if (disp->command_id == minCommand) { + complete = true; } - else { - // Check if the wait list is COMPLETE - if (checkWaitEvents( - (__global AmdEvent**)(disp->wait_list), disp->wait_num)) { - complete = true; - releaseWaitEvents((__global AmdEvent**)(disp->wait_list), - disp->wait_num, (__global uint*)queue->event_slot_mask, - (__global AmdEvent*)queue->event_slots); - } - } - if (complete) { - // Decrement the child execution counter on the parent - atomic_fetch_sub_explicit( - (__global atomic_uint*)&parent->child_counter, - 1, memory_order_acq_rel, memory_scope_device); - event->state = CL_COMPLETE; - disp->state = AQL_WRAP_FREE; - release_slot(amask, idx); - releaseEvent(event, (__global uint*)queue->event_slot_mask, + } + else { + // Check if the wait list is COMPLETE + if (checkWaitEvents( + (__global AmdEvent**)(disp->wait_list), disp->wait_num)) { + complete = true; + releaseWaitEvents((__global AmdEvent**)(disp->wait_list), + disp->wait_num, (__global uint*)queue->event_slot_mask, (__global AmdEvent*)queue->event_slots); } } - else if ((slotState == AQL_WRAP_BUSY) || - (slotState == AQL_WRAP_DONE)) { - if (slotState == AQL_WRAP_BUSY) { - disp->state = AQL_WRAP_DONE; - if (event != 0) { - event->timer[PROFILING_COMMAND_END] = - (__hsail_get_clock() * (ulong)param->eng_clk) >> 10; - } - } - // Was CL_EVENT requested? - if (event != 0) { - // The current dispatch doesn't have any outstanding children - if (disp->child_counter == 0) { - event->timer[PROFILING_COMMAND_COMPLETE] = - (__hsail_get_clock() * (ulong)param->eng_clk) >> 10; - event->state = CL_COMPLETE; - if (event->capture_info != 0) { - __global ulong* values = (__global ulong*)event->capture_info; - values[0] = event->timer[PROFILING_COMMAND_END] - - event->timer[PROFILING_COMMAND_START]; - values[1] = event->timer[PROFILING_COMMAND_COMPLETE] - - event->timer[PROFILING_COMMAND_START]; - } - releaseEvent(event, (__global uint *)queue->event_slot_mask, - (__global AmdEvent *)queue->event_slots); - } - } - // The current dispatch doesn't have any outstanding children - if (disp->child_counter == 0) { - // Decrement the child execution counter on the parent - atomic_fetch_sub_explicit( - (__global atomic_uint*)&parent->child_counter, - 1, memory_order_acq_rel, memory_scope_device); - disp->state = AQL_WRAP_FREE; - release_slot(amask, idx); - } + if (complete) { + // Decrement the child execution counter on the parent + atomic_fetch_sub_explicit( + (__global atomic_uint*)&parent->child_counter, + 1, memory_order_acq_rel, memory_scope_device); + event->state = CL_COMPLETE; + disp->state = AQL_WRAP_FREE; + release_slot(amask, idx); + releaseEvent(event, (__global uint*)queue->event_slot_mask, + (__global AmdEvent*)queue->event_slots); } } + else if ((slotState == AQL_WRAP_BUSY) || + (slotState == AQL_WRAP_DONE)) { + if (slotState == AQL_WRAP_BUSY) { + disp->state = AQL_WRAP_DONE; + if (event != 0) { + event->timer[PROFILING_COMMAND_END] = + (__hsail_get_clock() * (ulong)param->eng_clk) >> 10; + } + } + // Was CL_EVENT requested? + if (event != 0) { + // The current dispatch doesn't have any outstanding children + if (disp->child_counter == 0) { + event->timer[PROFILING_COMMAND_COMPLETE] = + (__hsail_get_clock() * (ulong)param->eng_clk) >> 10; + event->state = CL_COMPLETE; + if (event->capture_info != 0) { + __global ulong* values = (__global ulong*)event->capture_info; + values[0] = event->timer[PROFILING_COMMAND_END] - + event->timer[PROFILING_COMMAND_START]; + values[1] = event->timer[PROFILING_COMMAND_COMPLETE] - + event->timer[PROFILING_COMMAND_START]; + } + releaseEvent(event, (__global uint *)queue->event_slot_mask, + (__global AmdEvent *)queue->event_slots); + } + } + // The current dispatch doesn't have any outstanding children + if (disp->child_counter == 0) { + // Decrement the child execution counter on the parent + atomic_fetch_sub_explicit( + (__global atomic_uint*)&parent->child_counter, + 1, memory_order_acq_rel, memory_scope_device); + disp->state = AQL_WRAP_FREE; + release_slot(amask, idx); + } + } + } - barrier(CLK_GLOBAL_MEM_FENCE); + if (launch == 0) { + hwDisp[get_group_id(0)].glbSizeX = 0; + hwDisp[get_group_id(0)].glbSizeY = 0; + hwDisp[get_group_id(0)].glbSizeZ = 0; + } - launch = atomic_load_explicit((__global atomic_uint*)¶m->launch, + if (param->dedicatedQueue) { + loop = atomic_load_explicit((__global atomic_uint*)signal, memory_order_acquire, memory_scope_device); - - if (param->dedicatedQueue) { - loop = atomic_load_explicit((__global atomic_uint*)signal, - memory_order_acquire, memory_scope_device); - } - else { - loop = atomic_load_explicit((__global atomic_uint*)counter, - memory_order_acquire, memory_scope_device); - } - - } while ((launch == 0) && (loop != 0)); + } + else { + loop = atomic_load_explicit((__global atomic_uint*)counter, + memory_order_acquire, memory_scope_device); + } if (loop == 0) { //! \todo Write deadcode to the template, but somehow //! the scheduler will be launched one more time. hwDisp->packet0 = 0xdeadc0de; - hwDisp[1].condExe0 = 0xdeadc0de; - hwDisp[1].condExe1 = 0xdeadc0de; - hwDisp[1].condExe2 = 0xdeadc0de; - hwDisp[1].condExe3 = 0xdeadc0de; atomic_store_explicit((__global atomic_uint*)signal, 0, memory_order_release, memory_scope_device); - barrier(CLK_GLOBAL_MEM_FENCE); - atomic_store_explicit((__global atomic_uint*)&hwDisp->startExe, - ResumeExecution, memory_order_release, memory_scope_device); atomic_store_explicit((__global atomic_uint*)¶m->releaseHostCP, 1, memory_order_release, memory_scope_device); } diff --git a/rocclr/runtime/device/gpu/gpuvirtual.cpp b/rocclr/runtime/device/gpu/gpuvirtual.cpp index 7713f294c9..1dadd25b4e 100644 --- a/rocclr/runtime/device/gpu/gpuvirtual.cpp +++ b/rocclr/runtime/device/gpu/gpuvirtual.cpp @@ -1874,7 +1874,6 @@ VirtualGPU::submitKernelInternalHSA( param->eng_clk = (1000 * 1024) / dev().info().maxClockFrequency_; param->hw_queue = patchStart + sizeof(uint32_t)/* Rewind packet*/; param->hsa_queue = gpuDefQueue->hsaQueueMem()->vmAddress(); - param->launch = 0; param->releaseHostCP = 0; param->parentAQL = vmParentWrap; param->dedicatedQueue = dev().settings().useDeviceQueue_; @@ -1908,7 +1907,7 @@ VirtualGPU::submitKernelInternalHSA( gpuDefQueue->schedParamIdx_ * sizeof(SchedulerParam); gpuDefQueue->virtualQueueDispatcherEnd(gpuEvent, gpuDefQueue->vmMems(), gpuDefQueue->cal_.memCount_, - signalAddr, loopStart); + signalAddr, loopStart, gpuDefQueue->vqHeader_->aql_slot_num / 32); // Set GPU event for the used resources for (uint i = 0; i < memList.size(); ++i) { diff --git a/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp b/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp index dfd1f7dc84..59a5c2da2e 100644 --- a/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp +++ b/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp @@ -1283,10 +1283,10 @@ CALGSLContext::virtualQueueDispatcherStart() void CALGSLContext::virtualQueueDispatcherEnd(GpuEvent& event, const gslMemObject* mems, - uint32 numMems, mcaddr signal, mcaddr loopStart) + uint32 numMems, mcaddr signal, mcaddr loopStart, uint32 numTemplates) { eventBegin(MainEngine); - m_cs->VirtualQueueDispatcherEnd(mems, numMems, signal, loopStart); + m_cs->VirtualQueueDispatcherEnd(mems, numMems, signal, loopStart, numTemplates); eventEnd(MainEngine, event); } diff --git a/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.h b/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.h index ecd04024d9..7310fd6266 100644 --- a/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.h +++ b/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.h @@ -46,7 +46,8 @@ public: void runAqlDispatch(GpuEvent& event, const void* aqlPacket, const gslMemObject* mems, uint32 numMems, gslMemObject scratch, uint32 scratchOffset, const void* cpuKernelCode, uint64 hsaQueueVA); mcaddr virtualQueueDispatcherStart(); - void virtualQueueDispatcherEnd(GpuEvent& event, const gslMemObject* mems, uint32 numMems, mcaddr signal, mcaddr loopStart); + void virtualQueueDispatcherEnd(GpuEvent& event, const gslMemObject* mems, uint32 numMems, + mcaddr signal, mcaddr loopStart, uint32 numTemplates); void virtualQueueHandshake(GpuEvent& event, const gslMemObject mem, mcaddr parentState, uint32 newStateValue, mcaddr parentChildCounter, mcaddr signal, bool dedicatedQueue); bool isDone(GpuEvent* event);