P4 to Git Change 1056280 by gandryey@gera-dev-w7 on 2014/07/17 11:55:30

ECR #304775 - Device enqueuing
	- Add extra sync point for the scheduler exit.

Affected files ...

... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpusched.hpp#9 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuschedcl.cpp#21 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.cpp#323 edit
This commit is contained in:
foreman
2014-07-17 12:20:49 -04:00
والد 4798473034
کامیت 6627cf5cf7
3 فایلهای تغییر یافته به همراه 24 افزوده شده و 10 حذف شده
@@ -65,7 +65,7 @@ struct SchedulerParam {
uint32_t scratchSize; //!< Scratch buffer size
uint64_t scratch; //!< GPU address to the scratch buffer
uint32_t numMaxWaves; //!< The max number of possible waves
uint32_t reserved; //!< reserved
uint32_t releaseHostCP; //!< Releases CP on the host queue
};
} // namespace gpu
@@ -8,7 +8,6 @@ namespace gpu {
const char* SchedulerSourceCode = SCHEDULER_KERNEL(
\n
//! AmdAqlWrap slot state
enum AqlWrapState {
AQL_WRAP_FREE = 0,
@@ -90,7 +89,7 @@ typedef struct _SchedulerParam {
uint scratchSize; //!< Scratch buffer size
ulong scratch; //!< GPU address to the scratch buffer
uint numMaxWaves; //!< Num max waves on the asic
uint reserved; //!< Reserved
uint releaseHostCP; //!< Releases CP on the host queue
} SchedulerParam;
typedef struct _HwDispatch {
@@ -353,11 +352,14 @@ scheduler(
__global AmdAqlWrap* wraps = (__global AmdAqlWrap*)&queue[1];
__global uint* amask = (__global uint *)queue->aql_slot_mask;
uint launch;
do {
launch = atomic_load_explicit((__global atomic_uint*)&param->launch,
memory_order_acquire, memory_scope_device);
} while (launch != 0);
//! @todo Unexplained behavior: the scheduler can be launched one more
//! time after termination, so exit early if releaseHostCP is already set.
if (1 == atomic_load_explicit((__global atomic_uint*)&param->releaseHostCP,
memory_order_acquire, memory_scope_device)) {
return;
}
uint launch = 0;
uint loop;
do {
@@ -494,7 +496,18 @@ scheduler(
} while ((launch == 0) && (loop == 1));
if (loop == 0) {
atomic_or(&hwDisp->startExe, ResumeExecution);
//! \todo Overwrite the dispatch template with 0xdeadc0de on exit; for an
//! unexplained reason the scheduler can still be launched one more time.
hwDisp->packet0 = 0xdeadc0de;
hwDisp[1].condExe0 = 0xdeadc0de;
hwDisp[1].condExe1 = 0xdeadc0de;
hwDisp[1].condExe2 = 0xdeadc0de;
hwDisp[1].condExe3 = 0xdeadc0de;
barrier(CLK_GLOBAL_MEM_FENCE);
atomic_store_explicit((__global atomic_uint*)&hwDisp->startExe,
ResumeExecution, memory_order_release, memory_scope_device);
atomic_store_explicit((__global atomic_uint*)&param->releaseHostCP,
1, memory_order_release, memory_scope_device);
}
}
\n
@@ -1792,7 +1792,7 @@ VirtualGPU::submitKernelInternalHSA(
address argum = gpuDefQueue->virtualQueue_->data() + offsArg;
print << "Kernel: " << child->name() << "\n";
static const char* Names[HSAILKernel::ExtraArguments] = {
"Offset0: ", "Offset1: ","Offset2: ","PrintfBuf: ", "VqueuePtr: ", "AqlWarap: "};
"Offset0: ", "Offset1: ","Offset2: ","PrintfBuf: ", "VqueuePtr: ", "AqlWrap: "};
for (j = 0; j < HSAILKernel::ExtraArguments; ++j) {
print << "\t" << Names[j] << *(size_t*)argum;
print << "\n";
@@ -1832,6 +1832,7 @@ VirtualGPU::submitKernelInternalHSA(
param->hw_queue = patchStart + sizeof(uint32_t)/* Rewind packet*/;
param->hsa_queue = gpuDefQueue->hsaQueueMem()->vmAddress();
param->launch = 0;
param->releaseHostCP = 0;
// Fill the scratch buffer information
if (hsaKernel.prog().maxScratchRegs() > 0) {
gpu::Memory* scratchBuf = dev().scratch(gpuDefQueue->hwRing())->memObjs_[0];