From cd3fefb00db00b000228d1ae66460e27e4550436 Mon Sep 17 00:00:00 2001 From: foreman Date: Mon, 7 Jul 2014 18:58:52 -0400 Subject: [PATCH] P4 to Git Change 1052832 by gandryey@gera-dev-w7 on 2014/07/07 18:44:29 ECR #304775 - Device enqueuing - Update the scheduler to handle event mask Affected files ... ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuschedcl.cpp#18 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.cpp#320 edit --- rocclr/runtime/device/gpu/gpuschedcl.cpp | 52 +++++++++++++++++++----- rocclr/runtime/device/gpu/gpuvirtual.cpp | 11 +++-- 2 files changed, 50 insertions(+), 13 deletions(-) diff --git a/rocclr/runtime/device/gpu/gpuschedcl.cpp b/rocclr/runtime/device/gpu/gpuschedcl.cpp index 50e0d61db2..7503532e01 100644 --- a/rocclr/runtime/device/gpu/gpuschedcl.cpp +++ b/rocclr/runtime/device/gpu/gpuschedcl.cpp @@ -164,7 +164,8 @@ const uint StallExecution = 0x00000000; // 0x01000000 const uint WavefrontSize = 64; const uint MaxWaveSize = 0x400; -void dispatch( +static inline void +dispatch( volatile __global HwDispatch* dispatch, __global HsaAqlDispatchPacket* aqlPkt, uint scratchSize, @@ -272,20 +273,21 @@ void dispatch( dispatch->startExe = ResumeExecution; } -bool -checkWaitEvents(__global AmdEvent* events, uint numEvents) +static inline bool +checkWaitEvents(__global AmdEvent** events, uint numEvents) { for (uint i = 0; i < numEvents; ++i) { - if (atomic_and(&events[i].state, 0xffffffff) != CL_COMPLETE) { + if (atomic_and(&events[i]->state, 0xffffffff) != CL_COMPLETE) { return false; } } return true; } + // release slot in a bitmask controlled resource i is the slot number static inline void -release_slot(__global uint * restrict mask, uint i) +release_slot(__global uint* restrict mask, uint i) { /* uint b = ~(1UL << (i & 0x1f)); */ uint b = ~amd_bfm(1U, i); @@ -301,6 +303,26 @@ release_slot(__global uint * restrict mask, uint i) } } +static inline void +releaseEvent(__global AmdEvent* ev, __global uint* emask, __global AmdEvent* eb) +{ + uint c = atomic_fetch_sub_explicit((__global atomic_uint *)&ev->counter, 1U, + memory_order_acq_rel, memory_scope_device); + if (c == 1U) { + uint i = ev - eb; + release_slot(emask, i); + } +} + +static inline void +releaseWaitEvents(__global AmdEvent** events, uint numEvents, + __global uint* emask, __global AmdEvent* eb) +{ + for (uint i = 0; i < numEvents; ++i) { + releaseEvent(events[i], emask, eb); + } +} + static inline uint min_command(uint slot_num, __global AmdAqlWrap* wraps) { @@ -370,7 +392,7 @@ scheduler( if (disp->wait_num != 0) { // Check if the wait list is COMPLETE launch = checkWaitEvents( - (__global AmdEvent*)(disp->wait_list), disp->wait_num); + (__global AmdEvent**)(disp->wait_list), disp->wait_num); } else { launch = 1; @@ -381,12 +403,15 @@ scheduler( memory_order_acq_rel, memory_order_acq_rel, memory_scope_device)) { if (event != 0) { event->timer[PROFILING_COMMAND_START] = - (__hsail_get_clock() * 1000) / (ulong)param->eng_clk; + (__hsail_get_clock() * (ulong)param->eng_clk) >> 10; } // Launch child kernel .... dispatch(hwDisp, &disp->aql, param->scratchSize, param->numMaxWaves, param->scratch, param->hsa_queue); disp->state = AQL_WRAP_BUSY; + releaseWaitEvents((__global AmdEvent**)(disp->wait_list), + disp->wait_num, (__global uint*)queue->event_slot_mask, + (__global AmdEvent*)queue->event_slots); break; } } @@ -402,8 +427,11 @@ scheduler( else { // Check if the wait list is COMPLETE if (checkWaitEvents( - (__global AmdEvent*)(disp->wait_list), disp->wait_num)) { + (__global AmdEvent**)(disp->wait_list), disp->wait_num)) { complete = true; + releaseWaitEvents((__global AmdEvent**)(disp->wait_list), + disp->wait_num, (__global uint*)queue->event_slot_mask, + (__global AmdEvent*)queue->event_slots); } } if (complete) { @@ -414,6 +442,8 @@ scheduler( event->state = CL_COMPLETE; disp->state = AQL_WRAP_FREE; release_slot(amask, idx); + releaseEvent(event, (__global uint*)queue->event_slot_mask, + (__global AmdEvent*)queue->event_slots); } } else if (slotState == AQL_WRAP_DONE) { @@ -424,12 +454,14 @@ scheduler( event->state = CL_COMPLETE; event->timer[PROFILING_COMMAND_END] = event->timer[PROFILING_COMMAND_COMPLETE] = - (__hsail_get_clock() * 1000) / (ulong)param->eng_clk; + (__hsail_get_clock() * (ulong)param->eng_clk) >> 10; } else { event->timer[PROFILING_COMMAND_END] = - (__hsail_get_clock() * 1000) / (ulong)param->eng_clk; + (__hsail_get_clock() * (ulong)param->eng_clk) >> 10; } + releaseEvent(event, (__global uint *)queue->event_slot_mask, + (__global AmdEvent *)queue->event_slots); } // The current dispatch doesn't have any outstanding children if (disp->child_counter == 0) { diff --git a/rocclr/runtime/device/gpu/gpuvirtual.cpp b/rocclr/runtime/device/gpu/gpuvirtual.cpp index cba81a9e4c..0e6b55cd65 100644 --- a/rocclr/runtime/device/gpu/gpuvirtual.cpp +++ b/rocclr/runtime/device/gpu/gpuvirtual.cpp @@ -284,11 +284,11 @@ VirtualGPU::createVirtualQueue(uint deviceQueueSize) uint eventMaskOffs = allocSize; // Add mask array for events - allocSize += amd::alignUp(dev().settings().numDeviceEvents_, 32) / 32; + allocSize += amd::alignUp(dev().settings().numDeviceEvents_, 32) / 8; uint slotMaskOffs = allocSize; // Add mask array for AmdAqlWrap slots - allocSize += amd::alignUp(numSlots, 32) / 32; + allocSize += amd::alignUp(numSlots, 32) / 8; virtualQueue_ = new Memory(dev(), allocSize); Resource::MemoryType type = (GPU_PRINT_CHILD_KERNEL == 0) ? @@ -1680,6 +1680,10 @@ VirtualGPU::submitKernelInternalHSA( gpuDefQueue = static_cast(defQueue->vDev()); } vmDefQueue = gpuDefQueue->virtualQueue_->vmAddress(); + if (gpuDefQueue->hwRing() == hwRing()) { + LogError("Can't submit the child kernels to the same HW ring as the host queue!"); + return false; + } // Add memory handles before the actual dispatch memList.push_back(gpuDefQueue->virtualQueue_); @@ -1811,7 +1815,8 @@ VirtualGPU::submitKernelInternalHSA( SchedulerParam* param = &reinterpret_cast (gpuDefQueue->schedParams_->data())[gpuDefQueue->schedParamIdx_]; param->signal = 1; - param->eng_clk = dev().info().maxClockFrequency_; + // Scale clock to 1024 to avoid 64 bit div in the scheduler + param->eng_clk = (1000 * 1024) / dev().info().maxClockFrequency_; param->hw_queue = patchStart + sizeof(uint32_t)/* Rewind packet*/; param->hsa_queue = gpuDefQueue->hsaQueueMem()->vmAddress(); param->launch = 0;