P4 to Git Change 1052832 by gandryey@gera-dev-w7 on 2014/07/07 18:44:29
ECR #304775 - Device enqueuing - Update the scheduler to handle the event mask. Affected files: //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuschedcl.cpp#18 (edit), //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.cpp#320 (edit)
This commit is contained in:
@@ -164,7 +164,8 @@ const uint StallExecution = 0x00000000; // 0x01000000
|
||||
const uint WavefrontSize = 64;
|
||||
const uint MaxWaveSize = 0x400;
|
||||
|
||||
void dispatch(
|
||||
static inline void
|
||||
dispatch(
|
||||
volatile __global HwDispatch* dispatch,
|
||||
__global HsaAqlDispatchPacket* aqlPkt,
|
||||
uint scratchSize,
|
||||
@@ -272,20 +273,21 @@ void dispatch(
|
||||
dispatch->startExe = ResumeExecution;
|
||||
}
|
||||
|
||||
bool
|
||||
checkWaitEvents(__global AmdEvent* events, uint numEvents)
|
||||
static inline bool
|
||||
checkWaitEvents(__global AmdEvent** events, uint numEvents)
|
||||
{
|
||||
for (uint i = 0; i < numEvents; ++i) {
|
||||
if (atomic_and(&events[i].state, 0xffffffff) != CL_COMPLETE) {
|
||||
if (atomic_and(&events[i]->state, 0xffffffff) != CL_COMPLETE) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
// Release a slot in a bitmask-controlled resource; i is the slot number
|
||||
static inline void
|
||||
release_slot(__global uint * restrict mask, uint i)
|
||||
release_slot(__global uint* restrict mask, uint i)
|
||||
{
|
||||
/* uint b = ~(1UL << (i & 0x1f)); */
|
||||
uint b = ~amd_bfm(1U, i);
|
||||
@@ -301,6 +303,26 @@ release_slot(__global uint * restrict mask, uint i)
|
||||
}
|
||||
}
|
||||
|
||||
// Drops one reference on an event and, when that was the last reference,
// releases the event's slot in the bitmask.
//   ev    - event whose counter is atomically decremented
//   emask - bitmask handed to release_slot()
//   eb    - base pointer subtracted from ev to obtain the slot index
//           (the scheduler passes queue->event_slots here)
static inline void
|
||||
releaseEvent(__global AmdEvent* ev, __global uint* emask, __global AmdEvent* eb)
|
||||
{
|
||||
// Atomic decrement with acq_rel ordering at device scope; the fetch-sub
// yields the counter value from before the decrement.
uint c = atomic_fetch_sub_explicit((__global atomic_uint *)&ev->counter, 1U,
|
||||
memory_order_acq_rel, memory_scope_device);
|
||||
// A previous value of 1 means this call took the counter to zero.
if (c == 1U) {
|
||||
// Slot number = element offset of ev from the base pointer eb.
uint i = ev - eb;
|
||||
release_slot(emask, i);
|
||||
}
|
||||
}
|
||||
|
||||
// Calls releaseEvent() on each of the first numEvents entries of an array
// of event pointers.
//   events    - array of event pointers
//   numEvents - number of entries to process; 0 makes this a no-op
//   emask, eb - forwarded unchanged to releaseEvent() for every entry
static inline void
|
||||
releaseWaitEvents(__global AmdEvent** events, uint numEvents,
|
||||
__global uint* emask, __global AmdEvent* eb)
|
||||
{
|
||||
for (uint i = 0; i < numEvents; ++i) {
|
||||
releaseEvent(events[i], emask, eb);
|
||||
}
|
||||
}
|
||||
|
||||
static inline uint
|
||||
min_command(uint slot_num, __global AmdAqlWrap* wraps)
|
||||
{
|
||||
@@ -370,7 +392,7 @@ scheduler(
|
||||
if (disp->wait_num != 0) {
|
||||
// Check if the wait list is COMPLETE
|
||||
launch = checkWaitEvents(
|
||||
(__global AmdEvent*)(disp->wait_list), disp->wait_num);
|
||||
(__global AmdEvent**)(disp->wait_list), disp->wait_num);
|
||||
}
|
||||
else {
|
||||
launch = 1;
|
||||
@@ -381,12 +403,15 @@ scheduler(
|
||||
memory_order_acq_rel, memory_order_acq_rel, memory_scope_device)) {
|
||||
if (event != 0) {
|
||||
event->timer[PROFILING_COMMAND_START] =
|
||||
(__hsail_get_clock() * 1000) / (ulong)param->eng_clk;
|
||||
(__hsail_get_clock() * (ulong)param->eng_clk) >> 10;
|
||||
}
|
||||
// Launch child kernel ....
|
||||
dispatch(hwDisp, &disp->aql, param->scratchSize, param->numMaxWaves,
|
||||
param->scratch, param->hsa_queue);
|
||||
disp->state = AQL_WRAP_BUSY;
|
||||
releaseWaitEvents((__global AmdEvent**)(disp->wait_list),
|
||||
disp->wait_num, (__global uint*)queue->event_slot_mask,
|
||||
(__global AmdEvent*)queue->event_slots);
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -402,8 +427,11 @@ scheduler(
|
||||
else {
|
||||
// Check if the wait list is COMPLETE
|
||||
if (checkWaitEvents(
|
||||
(__global AmdEvent*)(disp->wait_list), disp->wait_num)) {
|
||||
(__global AmdEvent**)(disp->wait_list), disp->wait_num)) {
|
||||
complete = true;
|
||||
releaseWaitEvents((__global AmdEvent**)(disp->wait_list),
|
||||
disp->wait_num, (__global uint*)queue->event_slot_mask,
|
||||
(__global AmdEvent*)queue->event_slots);
|
||||
}
|
||||
}
|
||||
if (complete) {
|
||||
@@ -414,6 +442,8 @@ scheduler(
|
||||
event->state = CL_COMPLETE;
|
||||
disp->state = AQL_WRAP_FREE;
|
||||
release_slot(amask, idx);
|
||||
releaseEvent(event, (__global uint*)queue->event_slot_mask,
|
||||
(__global AmdEvent*)queue->event_slots);
|
||||
}
|
||||
}
|
||||
else if (slotState == AQL_WRAP_DONE) {
|
||||
@@ -424,12 +454,14 @@ scheduler(
|
||||
event->state = CL_COMPLETE;
|
||||
event->timer[PROFILING_COMMAND_END] =
|
||||
event->timer[PROFILING_COMMAND_COMPLETE] =
|
||||
(__hsail_get_clock() * 1000) / (ulong)param->eng_clk;
|
||||
(__hsail_get_clock() * (ulong)param->eng_clk) >> 10;
|
||||
}
|
||||
else {
|
||||
event->timer[PROFILING_COMMAND_END] =
|
||||
(__hsail_get_clock() * 1000) / (ulong)param->eng_clk;
|
||||
(__hsail_get_clock() * (ulong)param->eng_clk) >> 10;
|
||||
}
|
||||
releaseEvent(event, (__global uint *)queue->event_slot_mask,
|
||||
(__global AmdEvent *)queue->event_slots);
|
||||
}
|
||||
// The current dispatch doesn't have any outstanding children
|
||||
if (disp->child_counter == 0) {
|
||||
|
||||
@@ -284,11 +284,11 @@ VirtualGPU::createVirtualQueue(uint deviceQueueSize)
|
||||
|
||||
uint eventMaskOffs = allocSize;
|
||||
// Add mask array for events
|
||||
allocSize += amd::alignUp(dev().settings().numDeviceEvents_, 32) / 32;
|
||||
allocSize += amd::alignUp(dev().settings().numDeviceEvents_, 32) / 8;
|
||||
|
||||
uint slotMaskOffs = allocSize;
|
||||
// Add mask array for AmdAqlWrap slots
|
||||
allocSize += amd::alignUp(numSlots, 32) / 32;
|
||||
allocSize += amd::alignUp(numSlots, 32) / 8;
|
||||
|
||||
virtualQueue_ = new Memory(dev(), allocSize);
|
||||
Resource::MemoryType type = (GPU_PRINT_CHILD_KERNEL == 0) ?
|
||||
@@ -1680,6 +1680,10 @@ VirtualGPU::submitKernelInternalHSA(
|
||||
gpuDefQueue = static_cast<VirtualGPU*>(defQueue->vDev());
|
||||
}
|
||||
vmDefQueue = gpuDefQueue->virtualQueue_->vmAddress();
|
||||
if (gpuDefQueue->hwRing() == hwRing()) {
|
||||
LogError("Can't submit the child kernels to the same HW ring as the host queue!");
|
||||
return false;
|
||||
}
|
||||
|
||||
// Add memory handles before the actual dispatch
|
||||
memList.push_back(gpuDefQueue->virtualQueue_);
|
||||
@@ -1811,7 +1815,8 @@ VirtualGPU::submitKernelInternalHSA(
|
||||
SchedulerParam* param = &reinterpret_cast<SchedulerParam*>
|
||||
(gpuDefQueue->schedParams_->data())[gpuDefQueue->schedParamIdx_];
|
||||
param->signal = 1;
|
||||
param->eng_clk = dev().info().maxClockFrequency_;
|
||||
// Scale clock to 1024 to avoid 64 bit div in the scheduler
|
||||
param->eng_clk = (1000 * 1024) / dev().info().maxClockFrequency_;
|
||||
param->hw_queue = patchStart + sizeof(uint32_t)/* Rewind packet*/;
|
||||
param->hsa_queue = gpuDefQueue->hsaQueueMem()->vmAddress();
|
||||
param->launch = 0;
|
||||
|
||||
Viittaa uudessa ongelmassa
Block a user