P4 to Git Change 1056280 by gandryey@gera-dev-w7 on 2014/07/17 11:55:30
ECR #304775 - Device enqueuing - Add extra sync point for the scheduler exit. Affected files ... ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpusched.hpp#9 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuschedcl.cpp#21 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.cpp#323 edit
This commit is contained in:
@@ -65,7 +65,7 @@ struct SchedulerParam {
|
||||
uint32_t scratchSize; //!< Scratch buffer size
|
||||
uint64_t scratch; //!< GPU address to the scratch buffer
|
||||
uint32_t numMaxWaves; //!< The max number of possible waves
|
||||
uint32_t reserved; //!< reserved
|
||||
uint32_t releaseHostCP; //!< Releases CP on the host queue
|
||||
};
|
||||
|
||||
} // namespace gpu
|
||||
|
||||
@@ -8,7 +8,6 @@ namespace gpu {
|
||||
|
||||
const char* SchedulerSourceCode = SCHEDULER_KERNEL(
|
||||
\n
|
||||
|
||||
//! AmdAqlWrap slot state
|
||||
enum AqlWrapState {
|
||||
AQL_WRAP_FREE = 0,
|
||||
@@ -90,7 +89,7 @@ typedef struct _SchedulerParam {
|
||||
uint scratchSize; //!< Scratch buffer size
|
||||
ulong scratch; //!< GPU address to the scratch buffer
|
||||
uint numMaxWaves; //!< Num max waves on the asic
|
||||
uint reserved; //!< Reserved
|
||||
uint releaseHostCP; //!< Releases CP on the host queue
|
||||
} SchedulerParam;
|
||||
|
||||
typedef struct _HwDispatch {
|
||||
@@ -353,11 +352,14 @@ scheduler(
|
||||
__global AmdAqlWrap* wraps = (__global AmdAqlWrap*)&queue[1];
|
||||
__global uint* amask = (__global uint *)queue->aql_slot_mask;
|
||||
|
||||
uint launch;
|
||||
do {
|
||||
launch = atomic_load_explicit((__global atomic_uint*)&param->launch,
|
||||
memory_order_acquire, memory_scope_device);
|
||||
} while (launch != 0);
|
||||
//! @todo This is an unexplained behavior.
|
||||
//! The scheduler can be launched one more time after termination.
|
||||
if (1 == atomic_load_explicit((__global atomic_uint*)&param->releaseHostCP,
|
||||
memory_order_acquire, memory_scope_device)) {
|
||||
return;
|
||||
}
|
||||
|
||||
uint launch = 0;
|
||||
uint loop;
|
||||
|
||||
do {
|
||||
@@ -494,7 +496,18 @@ scheduler(
|
||||
} while ((launch == 0) && (loop == 1));
|
||||
|
||||
if (loop == 0) {
|
||||
atomic_or(&hwDisp->startExe, ResumeExecution);
|
||||
//! \todo Write deadcode to the template, but somehow
|
||||
//! the scheduler will be launched one more time.
|
||||
hwDisp->packet0 = 0xdeadc0de;
|
||||
hwDisp[1].condExe0 = 0xdeadc0de;
|
||||
hwDisp[1].condExe1 = 0xdeadc0de;
|
||||
hwDisp[1].condExe2 = 0xdeadc0de;
|
||||
hwDisp[1].condExe3 = 0xdeadc0de;
|
||||
barrier(CLK_GLOBAL_MEM_FENCE);
|
||||
atomic_store_explicit((__global atomic_uint*)&hwDisp->startExe,
|
||||
ResumeExecution, memory_order_release, memory_scope_device);
|
||||
atomic_store_explicit((__global atomic_uint*)&param->releaseHostCP,
|
||||
1, memory_order_release, memory_scope_device);
|
||||
}
|
||||
}
|
||||
\n
|
||||
|
||||
@@ -1792,7 +1792,7 @@ VirtualGPU::submitKernelInternalHSA(
|
||||
address argum = gpuDefQueue->virtualQueue_->data() + offsArg;
|
||||
print << "Kernel: " << child->name() << "\n";
|
||||
static const char* Names[HSAILKernel::ExtraArguments] = {
|
||||
"Offset0: ", "Offset1: ","Offset2: ","PrintfBuf: ", "VqueuePtr: ", "AqlWarap: "};
|
||||
"Offset0: ", "Offset1: ","Offset2: ","PrintfBuf: ", "VqueuePtr: ", "AqlWrap: "};
|
||||
for (j = 0; j < HSAILKernel::ExtraArguments; ++j) {
|
||||
print << "\t" << Names[j] << *(size_t*)argum;
|
||||
print << "\n";
|
||||
@@ -1832,6 +1832,7 @@ VirtualGPU::submitKernelInternalHSA(
|
||||
param->hw_queue = patchStart + sizeof(uint32_t)/* Rewind packet*/;
|
||||
param->hsa_queue = gpuDefQueue->hsaQueueMem()->vmAddress();
|
||||
param->launch = 0;
|
||||
param->releaseHostCP = 0;
|
||||
// Fill the scratch buffer information
|
||||
if (hsaKernel.prog().maxScratchRegs() > 0) {
|
||||
gpu::Memory* scratchBuf = dev().scratch(gpuDefQueue->hwRing())->memObjs_[0];
|
||||
|
||||
مرجع در شماره جدید
Block a user