From 2738b30287e4580ea002ac2a02979ef226e449d8 Mon Sep 17 00:00:00 2001
From: foreman
Date: Tue, 2 Sep 2014 17:15:39 -0400
Subject: [PATCH] P4 to Git Change 1072528 by gandryey@gera-dev-w7 on
2014/09/02 17:00:16
ECR #304775 - Add batching to the device enqueue for possible asynchronous execution
- Increase the max device queue size to 512KB. That will allow to pass conformance tests that enqueue more jobs than the queue size.
Affected files ...
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudevice.cpp#459 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpusched.hpp#13 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuschedcl.cpp#28 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.cpp#333 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp#65 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gslbe/src/rt/GSLContext.h#39 edit
---
rocclr/runtime/device/gpu/gpudevice.cpp | 2 +-
rocclr/runtime/device/gpu/gpusched.hpp | 2 +-
rocclr/runtime/device/gpu/gpuschedcl.cpp | 285 +++++++++---------
rocclr/runtime/device/gpu/gpuvirtual.cpp | 3 +-
.../device/gpu/gslbe/src/rt/GSLContext.cpp | 4 +-
.../device/gpu/gslbe/src/rt/GSLContext.h | 3 +-
6 files changed, 143 insertions(+), 156 deletions(-)
diff --git a/rocclr/runtime/device/gpu/gpudevice.cpp b/rocclr/runtime/device/gpu/gpudevice.cpp
index d36305e885..8e15245c1c 100644
--- a/rocclr/runtime/device/gpu/gpudevice.cpp
+++ b/rocclr/runtime/device/gpu/gpudevice.cpp
@@ -722,7 +722,7 @@ void Device::fillDeviceInfo(
info_.queueOnDeviceProperties_ =
CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_PROFILING_ENABLE;
info_.queueOnDevicePreferredSize_ = 16 * Ki;
- info_.queueOnDeviceMaxSize_ = 256 * Ki;
+ info_.queueOnDeviceMaxSize_ = 512 * Ki;
info_.maxOnDeviceQueues_ = 1;
info_.maxOnDeviceEvents_ = settings().numDeviceEvents_;
info_.globalVariablePreferredTotalSize_ = static_cast(info_.globalMemSize_);
diff --git a/rocclr/runtime/device/gpu/gpusched.hpp b/rocclr/runtime/device/gpu/gpusched.hpp
index 1fa72efdc7..242580dd48 100644
--- a/rocclr/runtime/device/gpu/gpusched.hpp
+++ b/rocclr/runtime/device/gpu/gpusched.hpp
@@ -62,7 +62,7 @@ struct SchedulerParam {
uint32_t eng_clk; //!< Engine clock in Mhz
uint64_t hw_queue; //!< Address to HW queue
uint64_t hsa_queue; //!< Address to HSA dummy queue
- uint32_t launch; //!< Launch semaphore for the scheduler threads
+ uint32_t reserved; //!< Reserved
uint32_t scratchSize; //!< Scratch buffer size
uint64_t scratch; //!< GPU address to the scratch buffer
uint32_t numMaxWaves; //!< The max number of possible waves
diff --git a/rocclr/runtime/device/gpu/gpuschedcl.cpp b/rocclr/runtime/device/gpu/gpuschedcl.cpp
index 221b52bc49..c94589e127 100644
--- a/rocclr/runtime/device/gpu/gpuschedcl.cpp
+++ b/rocclr/runtime/device/gpu/gpuschedcl.cpp
@@ -86,7 +86,7 @@ typedef struct _SchedulerParam {
uint eng_clk; //!< Engine clock in Mhz
ulong hw_queue; //!< Address to HW queue
ulong hsa_queue; //!< Address to HSA dummy queue
- uint launch; //!< Child launch status
+ uint reserved; //!< reserved
uint scratchSize; //!< Scratch buffer size
ulong scratch; //!< GPU address to the scratch buffer
uint numMaxWaves; //!< Num max waves on the asic
@@ -96,13 +96,25 @@ typedef struct _SchedulerParam {
uint scratchOffset; //!< Scratch buffer offset
} SchedulerParam;
-typedef struct _HwDispatch {
- uint startExe; // REWIND execution
+typedef struct _HwDispatchHeader {
+ uint pad00;
+ uint pad01;
+ uint pad02;
+ uint pad03;
+ uint pad04;
+ uint pad05;
+ uint pad06;
+ uint pad07;
+ uint rewind; // REWIND execution
+ uint startExe; // valid bit
uint condExe0; // 0xC0032200 -- TYPE 3, COND_EXEC
uint condExe1; // 0x00000204 ----
uint condExe2; // 0x00000000 ----
uint condExe3; // 0x00000000 ----
uint condExe4; // 0x00000000 ----
+} HwDispatchHeader;
+
+typedef struct _HwDispatch {
uint packet0; // 0xC0067602 -- TYPE 3, SET_SH_REG, TYPE:COMPUTE (6 values)
uint offset0; // 0x00000204 ---- OFFSET
uint startX; // 0x00000000 ---- COMPUTE_START_X: START = 0x0
@@ -182,9 +194,6 @@ dispatch(
const uint Pm4UserRegs = 0xC0007602;
const uint Pm4CopyReg = 0xC0044000;
- // Wait for CP idle isn't necessary if CP waits for child
- // while (atomic_and(&dispatch->startExe, 0xffffffff) != StallExecution) {}
-
uint usrRegCnt = 0;
dispatch->wrkGrpSizeX = aqlPkt->workgroup_size[0];
@@ -272,10 +281,6 @@ dispatch(
dispatch->glbSizeX = aqlPkt->grid_size[0];
dispatch->glbSizeY = aqlPkt->grid_size[1];
dispatch->glbSizeZ = aqlPkt->grid_size[2];
- barrier(CLK_GLOBAL_MEM_FENCE);
-
- // Resume the execution
- dispatch->startExe = ResumeExecution;
}
static inline bool
@@ -354,8 +359,8 @@ scheduler(
uint paramIdx)
{
__global SchedulerParam* param = ¶ms[paramIdx];
- volatile __global HwDispatch* hwDisp =
- (volatile __global HwDispatch*)param->hw_queue;
+ volatile __global HwDispatch* hwDisp = (volatile __global HwDispatch*)
+ &((__global HwDispatchHeader*)param->hw_queue)[1];
__global AmdAqlWrap* hostParent = (__global AmdAqlWrap*)(param->parentAQL);
__global uint* counter = (__global uint*)(&hostParent->child_counter);
__global uint* signal = (__global uint*)(¶m->signal);
@@ -372,166 +377,148 @@ scheduler(
uint launch = 0;
uint loop = 1;
- do {
- uint mask = atomic_load_explicit((__global atomic_uint*)(&amask[get_group_id(0)]),
- memory_order_acquire, memory_scope_device);
+ uint mask = atomic_load_explicit((__global atomic_uint*)(&amask[get_group_id(0)]),
+ memory_order_acquire, memory_scope_device);
- int baseIdx = get_group_id(0) * 32;
- while (mask != 0) {
- uint sIdx = ctz(mask);
- uint idx = baseIdx + sIdx;
- mask &= ~(1 << sIdx);
- __global AmdAqlWrap* disp = (__global AmdAqlWrap*)&wraps[idx];
- uint slotState = atomic_load_explicit((__global atomic_uint*)(&disp->state),
- memory_order_acquire, memory_scope_device);
- __global AmdAqlWrap* parent = (__global AmdAqlWrap*)(disp->parent_wrap);
- __global AmdEvent* event = (__global AmdEvent*)(disp->completion);
+ int baseIdx = get_group_id(0) * 32;
+ while (mask != 0) {
+ uint sIdx = ctz(mask);
+ uint idx = baseIdx + sIdx;
+ mask &= ~(1 << sIdx);
+ __global AmdAqlWrap* disp = (__global AmdAqlWrap*)&wraps[idx];
+ uint slotState = atomic_load_explicit((__global atomic_uint*)(&disp->state),
+ memory_order_acquire, memory_scope_device);
+ __global AmdAqlWrap* parent = (__global AmdAqlWrap*)(disp->parent_wrap);
+ __global AmdEvent* event = (__global AmdEvent*)(disp->completion);
- // Check if the current slot is ready for processing
- if (slotState == AQL_WRAP_READY) {
- if (launch == 0) {
- launch = atomic_load_explicit((__global atomic_uint*)¶m->launch,
- memory_order_acquire, memory_scope_device);
+ // Check if the current slot is ready for processing
+ if (slotState == AQL_WRAP_READY) {
+ if (launch == 0) {
+ // Attempt to find a new disaptch if nothing was launched yet
+ uint parentState = atomic_load_explicit(
+ (__global atomic_uint*)(&parent->state),
+ memory_order_acquire, memory_scope_device);
+ uint enqueueFlags = atomic_load_explicit(
+ (__global atomic_uint*)(&disp->enqueue_flags),
+ memory_order_acquire, memory_scope_device);
+
+ // Check the launch flags
+ if (((enqueueFlags == CLK_ENQUEUE_FLAGS_WAIT_KERNEL) ||
+ (enqueueFlags == CLK_ENQUEUE_FLAGS_WAIT_WORK_GROUP)) &&
+ (parentState != AQL_WRAP_DONE)) {
+ continue;
}
- if (launch == 0) {
- // Attempt to find a new disaptch if nothing was launched yet
- uint parentState = atomic_load_explicit(
- (__global atomic_uint*)(&parent->state),
- memory_order_acquire, memory_scope_device);
- uint enqueueFlags = atomic_load_explicit(
- (__global atomic_uint*)(&disp->enqueue_flags),
- memory_order_acquire, memory_scope_device);
- // Check the launch flags
- if (((enqueueFlags == CLK_ENQUEUE_FLAGS_WAIT_KERNEL) ||
- (enqueueFlags == CLK_ENQUEUE_FLAGS_WAIT_WORK_GROUP)) &&
- (parentState != AQL_WRAP_DONE)) {
- continue;
- }
-
- // Check if the wait list is COMPLETE
- launch = checkWaitEvents(
- (__global AmdEvent**)(disp->wait_list), disp->wait_num);
-
- if (launch == 0) continue;
-
- uint tmp = 0;
- if (atomic_compare_exchange_strong_explicit(
- (__global atomic_uint*)¶m->launch, &tmp, launch,
- memory_order_acq_rel, memory_order_acquire, memory_scope_device)) {
- if (event != 0) {
- event->timer[PROFILING_COMMAND_START] =
- (__hsail_get_clock() * (ulong)param->eng_clk) >> 10;
- }
- // Launch child kernel ....
- dispatch(hwDisp, &disp->aql, param->scratch, param->hsa_queue,
- param->scratchSize, param->scratchOffset, param->numMaxWaves);
- disp->state = AQL_WRAP_BUSY;
- releaseWaitEvents((__global AmdEvent**)(disp->wait_list),
- disp->wait_num, (__global uint*)queue->event_slot_mask,
- (__global AmdEvent*)queue->event_slots);
- break;
+ // Check if the wait list is COMPLETE
+ launch = checkWaitEvents(
+ (__global AmdEvent**)(disp->wait_list), disp->wait_num);
+
+ if (launch) {
+ if (event != 0) {
+ event->timer[PROFILING_COMMAND_START] =
+ (__hsail_get_clock() * (ulong)param->eng_clk) >> 10;
}
+ // Launch child kernel ....
+ dispatch(&hwDisp[get_group_id(0)], &disp->aql, param->scratch, param->hsa_queue,
+ param->scratchSize, param->scratchOffset, param->numMaxWaves);
+ disp->state = AQL_WRAP_BUSY;
+ releaseWaitEvents((__global AmdEvent**)(disp->wait_list),
+ disp->wait_num, (__global uint*)queue->event_slot_mask,
+ (__global AmdEvent*)queue->event_slots);
+ break;
}
}
- else if (slotState == AQL_WRAP_MARKER) {
- bool complete = false;
- if (disp->wait_num == 0) {
- uint minCommand = min_command(queue->aql_slot_num, wraps);
- if (disp->command_id == minCommand) {
- complete = true;
- }
+ }
+ else if (slotState == AQL_WRAP_MARKER) {
+ bool complete = false;
+ if (disp->wait_num == 0) {
+ uint minCommand = min_command(queue->aql_slot_num, wraps);
+ if (disp->command_id == minCommand) {
+ complete = true;
}
- else {
- // Check if the wait list is COMPLETE
- if (checkWaitEvents(
- (__global AmdEvent**)(disp->wait_list), disp->wait_num)) {
- complete = true;
- releaseWaitEvents((__global AmdEvent**)(disp->wait_list),
- disp->wait_num, (__global uint*)queue->event_slot_mask,
- (__global AmdEvent*)queue->event_slots);
- }
- }
- if (complete) {
- // Decrement the child execution counter on the parent
- atomic_fetch_sub_explicit(
- (__global atomic_uint*)&parent->child_counter,
- 1, memory_order_acq_rel, memory_scope_device);
- event->state = CL_COMPLETE;
- disp->state = AQL_WRAP_FREE;
- release_slot(amask, idx);
- releaseEvent(event, (__global uint*)queue->event_slot_mask,
+ }
+ else {
+ // Check if the wait list is COMPLETE
+ if (checkWaitEvents(
+ (__global AmdEvent**)(disp->wait_list), disp->wait_num)) {
+ complete = true;
+ releaseWaitEvents((__global AmdEvent**)(disp->wait_list),
+ disp->wait_num, (__global uint*)queue->event_slot_mask,
(__global AmdEvent*)queue->event_slots);
}
}
- else if ((slotState == AQL_WRAP_BUSY) ||
- (slotState == AQL_WRAP_DONE)) {
- if (slotState == AQL_WRAP_BUSY) {
- disp->state = AQL_WRAP_DONE;
- if (event != 0) {
- event->timer[PROFILING_COMMAND_END] =
- (__hsail_get_clock() * (ulong)param->eng_clk) >> 10;
- }
- }
- // Was CL_EVENT requested?
- if (event != 0) {
- // The current dispatch doesn't have any outstanding children
- if (disp->child_counter == 0) {
- event->timer[PROFILING_COMMAND_COMPLETE] =
- (__hsail_get_clock() * (ulong)param->eng_clk) >> 10;
- event->state = CL_COMPLETE;
- if (event->capture_info != 0) {
- __global ulong* values = (__global ulong*)event->capture_info;
- values[0] = event->timer[PROFILING_COMMAND_END] -
- event->timer[PROFILING_COMMAND_START];
- values[1] = event->timer[PROFILING_COMMAND_COMPLETE] -
- event->timer[PROFILING_COMMAND_START];
- }
- releaseEvent(event, (__global uint *)queue->event_slot_mask,
- (__global AmdEvent *)queue->event_slots);
- }
- }
- // The current dispatch doesn't have any outstanding children
- if (disp->child_counter == 0) {
- // Decrement the child execution counter on the parent
- atomic_fetch_sub_explicit(
- (__global atomic_uint*)&parent->child_counter,
- 1, memory_order_acq_rel, memory_scope_device);
- disp->state = AQL_WRAP_FREE;
- release_slot(amask, idx);
- }
+ if (complete) {
+ // Decrement the child execution counter on the parent
+ atomic_fetch_sub_explicit(
+ (__global atomic_uint*)&parent->child_counter,
+ 1, memory_order_acq_rel, memory_scope_device);
+ event->state = CL_COMPLETE;
+ disp->state = AQL_WRAP_FREE;
+ release_slot(amask, idx);
+ releaseEvent(event, (__global uint*)queue->event_slot_mask,
+ (__global AmdEvent*)queue->event_slots);
}
}
+ else if ((slotState == AQL_WRAP_BUSY) ||
+ (slotState == AQL_WRAP_DONE)) {
+ if (slotState == AQL_WRAP_BUSY) {
+ disp->state = AQL_WRAP_DONE;
+ if (event != 0) {
+ event->timer[PROFILING_COMMAND_END] =
+ (__hsail_get_clock() * (ulong)param->eng_clk) >> 10;
+ }
+ }
+ // Was CL_EVENT requested?
+ if (event != 0) {
+ // The current dispatch doesn't have any outstanding children
+ if (disp->child_counter == 0) {
+ event->timer[PROFILING_COMMAND_COMPLETE] =
+ (__hsail_get_clock() * (ulong)param->eng_clk) >> 10;
+ event->state = CL_COMPLETE;
+ if (event->capture_info != 0) {
+ __global ulong* values = (__global ulong*)event->capture_info;
+ values[0] = event->timer[PROFILING_COMMAND_END] -
+ event->timer[PROFILING_COMMAND_START];
+ values[1] = event->timer[PROFILING_COMMAND_COMPLETE] -
+ event->timer[PROFILING_COMMAND_START];
+ }
+ releaseEvent(event, (__global uint *)queue->event_slot_mask,
+ (__global AmdEvent *)queue->event_slots);
+ }
+ }
+ // The current dispatch doesn't have any outstanding children
+ if (disp->child_counter == 0) {
+ // Decrement the child execution counter on the parent
+ atomic_fetch_sub_explicit(
+ (__global atomic_uint*)&parent->child_counter,
+ 1, memory_order_acq_rel, memory_scope_device);
+ disp->state = AQL_WRAP_FREE;
+ release_slot(amask, idx);
+ }
+ }
+ }
- barrier(CLK_GLOBAL_MEM_FENCE);
+ if (launch == 0) {
+ hwDisp[get_group_id(0)].glbSizeX = 0;
+ hwDisp[get_group_id(0)].glbSizeY = 0;
+ hwDisp[get_group_id(0)].glbSizeZ = 0;
+ }
- launch = atomic_load_explicit((__global atomic_uint*)¶m->launch,
+ if (param->dedicatedQueue) {
+ loop = atomic_load_explicit((__global atomic_uint*)signal,
memory_order_acquire, memory_scope_device);
-
- if (param->dedicatedQueue) {
- loop = atomic_load_explicit((__global atomic_uint*)signal,
- memory_order_acquire, memory_scope_device);
- }
- else {
- loop = atomic_load_explicit((__global atomic_uint*)counter,
- memory_order_acquire, memory_scope_device);
- }
-
- } while ((launch == 0) && (loop != 0));
+ }
+ else {
+ loop = atomic_load_explicit((__global atomic_uint*)counter,
+ memory_order_acquire, memory_scope_device);
+ }
if (loop == 0) {
//! \todo Write deadcode to the template, but somehow
//! the scheduler will be launched one more time.
hwDisp->packet0 = 0xdeadc0de;
- hwDisp[1].condExe0 = 0xdeadc0de;
- hwDisp[1].condExe1 = 0xdeadc0de;
- hwDisp[1].condExe2 = 0xdeadc0de;
- hwDisp[1].condExe3 = 0xdeadc0de;
atomic_store_explicit((__global atomic_uint*)signal,
0, memory_order_release, memory_scope_device);
- barrier(CLK_GLOBAL_MEM_FENCE);
- atomic_store_explicit((__global atomic_uint*)&hwDisp->startExe,
- ResumeExecution, memory_order_release, memory_scope_device);
atomic_store_explicit((__global atomic_uint*)¶m->releaseHostCP,
1, memory_order_release, memory_scope_device);
}
diff --git a/rocclr/runtime/device/gpu/gpuvirtual.cpp b/rocclr/runtime/device/gpu/gpuvirtual.cpp
index 7713f294c9..1dadd25b4e 100644
--- a/rocclr/runtime/device/gpu/gpuvirtual.cpp
+++ b/rocclr/runtime/device/gpu/gpuvirtual.cpp
@@ -1874,7 +1874,6 @@ VirtualGPU::submitKernelInternalHSA(
param->eng_clk = (1000 * 1024) / dev().info().maxClockFrequency_;
param->hw_queue = patchStart + sizeof(uint32_t)/* Rewind packet*/;
param->hsa_queue = gpuDefQueue->hsaQueueMem()->vmAddress();
- param->launch = 0;
param->releaseHostCP = 0;
param->parentAQL = vmParentWrap;
param->dedicatedQueue = dev().settings().useDeviceQueue_;
@@ -1908,7 +1907,7 @@ VirtualGPU::submitKernelInternalHSA(
gpuDefQueue->schedParamIdx_ * sizeof(SchedulerParam);
gpuDefQueue->virtualQueueDispatcherEnd(gpuEvent,
gpuDefQueue->vmMems(), gpuDefQueue->cal_.memCount_,
- signalAddr, loopStart);
+ signalAddr, loopStart, gpuDefQueue->vqHeader_->aql_slot_num / 32);
// Set GPU event for the used resources
for (uint i = 0; i < memList.size(); ++i) {
diff --git a/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp b/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp
index dfd1f7dc84..59a5c2da2e 100644
--- a/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp
+++ b/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp
@@ -1283,10 +1283,10 @@ CALGSLContext::virtualQueueDispatcherStart()
void
CALGSLContext::virtualQueueDispatcherEnd(GpuEvent& event, const gslMemObject* mems,
- uint32 numMems, mcaddr signal, mcaddr loopStart)
+ uint32 numMems, mcaddr signal, mcaddr loopStart, uint32 numTemplates)
{
eventBegin(MainEngine);
- m_cs->VirtualQueueDispatcherEnd(mems, numMems, signal, loopStart);
+ m_cs->VirtualQueueDispatcherEnd(mems, numMems, signal, loopStart, numTemplates);
eventEnd(MainEngine, event);
}
diff --git a/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.h b/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.h
index ecd04024d9..7310fd6266 100644
--- a/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.h
+++ b/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.h
@@ -46,7 +46,8 @@ public:
void runAqlDispatch(GpuEvent& event, const void* aqlPacket, const gslMemObject* mems,
uint32 numMems, gslMemObject scratch, uint32 scratchOffset, const void* cpuKernelCode, uint64 hsaQueueVA);
mcaddr virtualQueueDispatcherStart();
- void virtualQueueDispatcherEnd(GpuEvent& event, const gslMemObject* mems, uint32 numMems, mcaddr signal, mcaddr loopStart);
+ void virtualQueueDispatcherEnd(GpuEvent& event, const gslMemObject* mems, uint32 numMems,
+ mcaddr signal, mcaddr loopStart, uint32 numTemplates);
void virtualQueueHandshake(GpuEvent& event, const gslMemObject mem, mcaddr parentState,
uint32 newStateValue, mcaddr parentChildCounter, mcaddr signal, bool dedicatedQueue);
bool isDone(GpuEvent* event);