P4 to Git Change 1059564 by gandryey@gera-dev-w7 on 2014/07/25 18:14:33

ECR #304775 - Device enqueuing
	- Run the scheduler in the host queue by default.
	- Setting GPU_USE_DEVICE_QUEUE=1 forces execution in a dedicated device queue instead.

Affected files ...

... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudevice.cpp#451 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudevice.hpp#128 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.cpp#260 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpusched.hpp#10 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuschedcl.cpp#23 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpusettings.cpp#273 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpusettings.hpp#86 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.cpp#327 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.hpp#119 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp#62 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gslbe/src/rt/GSLContext.h#36 edit
... //depot/stg/opencl/drivers/opencl/runtime/utils/flags.hpp#208 edit


[ROCm/clr commit: 3a4400135d]
This commit is contained in:
foreman
2014-07-25 20:41:05 -04:00
parent 9aca4fedc5
commit f8dc564915
12 changed files with 107 additions and 51 deletions
@@ -359,7 +359,6 @@ Device::Device()
: NullDevice()
, CALGSLDevice()
, numOfVgpus_(0)
, numDeviceQueues_(0)
, context_(NULL)
, heap_(NULL)
, dummyPage_(NULL)
@@ -468,7 +468,6 @@ public:
//! Returns the number of virtual GPUs allocated on this device
uint numOfVgpus() const { return numOfVgpus_; }
uint numOfVgpus_; //!< The number of virtual GPUs (lock protected)
uint numDeviceQueues_; //!< Number of device queues
typedef std::vector<VirtualGPU*> VirtualGPUs;
@@ -3975,7 +3975,17 @@ HSAILKernel::loadArguments(
const amd::DeviceQueue* queue =
*reinterpret_cast<amd::DeviceQueue* const*>(paramaddr);
VirtualGPU* gpuQueue = static_cast<VirtualGPU*>(queue->vDev());
uint64_t vmQueue = gpuQueue->vQueue()->vmAddress();
uint64_t vmQueue;
if (dev().settings().useDeviceQueue_) {
vmQueue = gpuQueue->vQueue()->vmAddress();
}
else {
if (!gpu.createVirtualQueue(queue->size())) {
LogError( "Virtual queue creaiton failed!");
return false;
}
vmQueue = gpu.vQueue()->vmAddress();
}
WriteAqlArg(&aqlArgBuf, &vmQueue, sizeof(void*));
break;
}
@@ -66,6 +66,9 @@ struct SchedulerParam {
uint64_t scratch; //!< GPU address to the scratch buffer
uint32_t numMaxWaves; //!< The max number of possible waves
uint32_t releaseHostCP; //!< Releases CP on the host queue
uint64_t parentAQL; //!< Host parent AmdAqlWrap packet
uint32_t dedicatedQueue; //!< Scheduler uses a dedicated queue
uint32_t reserved; //!< Reserved field
};
} // namespace gpu
@@ -90,6 +90,9 @@ typedef struct _SchedulerParam {
ulong scratch; //!< GPU address to the scratch buffer
uint numMaxWaves; //!< Num max waves on the asic
uint releaseHostCP; //!< Releases CP on the host queue
ulong parentAQL; //!< Host parent AmdAqlWrap packet
uint dedicatedQueue; //!< Scheduler uses a dedicated queue
uint reserved; //!< Reserved field
} SchedulerParam;
typedef struct _HwDispatch {
@@ -276,7 +279,9 @@ static inline bool
checkWaitEvents(__global AmdEvent** events, uint numEvents)
{
for (uint i = 0; i < numEvents; ++i) {
if (atomic_and(&events[i]->state, 0xffffffff) != CL_COMPLETE) {
if (atomic_load_explicit(
(__global atomic_uint*)(&events[i]->state),
memory_order_acquire, memory_scope_device) != CL_COMPLETE) {
return false;
}
}
@@ -348,6 +353,8 @@ scheduler(
__global SchedulerParam* param = &params[paramIdx];
volatile __global HwDispatch* hwDisp =
(volatile __global HwDispatch*)param->hw_queue;
__global AmdAqlWrap* hostParent = (__global AmdAqlWrap*)(param->parentAQL);
__global uint* counter = (__global uint*)(&hostParent->child_counter);
__global uint* signal = (__global uint*)(&param->signal);
__global AmdAqlWrap* wraps = (__global AmdAqlWrap*)&queue[1];
__global uint* amask = (__global uint *)queue->aql_slot_mask;
@@ -360,7 +367,7 @@ scheduler(
}
uint launch = 0;
uint loop;
uint loop = 1;
do {
uint mask = atomic_load_explicit((__global atomic_uint*)(&amask[get_group_id(0)]),
@@ -491,10 +498,16 @@ scheduler(
launch = atomic_load_explicit((__global atomic_uint*)&param->launch,
memory_order_acquire, memory_scope_device);
loop = atomic_load_explicit((__global atomic_uint*)signal,
memory_order_acquire, memory_scope_device);
if (param->dedicatedQueue) {
loop = atomic_load_explicit((__global atomic_uint*)signal,
memory_order_acquire, memory_scope_device);
}
else {
loop = atomic_load_explicit((__global atomic_uint*)counter,
memory_order_acquire, memory_scope_device);
}
} while ((launch == 0) && (loop == 1));
} while ((launch == 0) && (loop != 0));
if (loop == 0) {
//! \todo Write deadcode to the template, but somehow
@@ -504,6 +517,8 @@ scheduler(
hwDisp[1].condExe1 = 0xdeadc0de;
hwDisp[1].condExe2 = 0xdeadc0de;
hwDisp[1].condExe3 = 0xdeadc0de;
atomic_store_explicit((__global atomic_uint*)signal,
0, memory_order_release, memory_scope_device);
barrier(CLK_GLOBAL_MEM_FENCE);
atomic_store_explicit((__global atomic_uint*)&hwDisp->startExe,
ResumeExecution, memory_order_release, memory_scope_device);
@@ -137,6 +137,9 @@ Settings::Settings()
// Use direct SRD by default
hsailDirectSRD_ = GPU_DIRECT_SRD;
// Use host queue for device enqueuing by default
useDeviceQueue_ = GPU_USE_DEVICE_QUEUE;
}
bool
@@ -73,7 +73,8 @@ public:
uint apuSystem_: 1; //!< Device is APU system with shared memory
uint asyncMemCopy_: 1; //!< Use async memory transfers
uint hsailDirectSRD_: 1; //!< Controls direct SRD for HSAIL
uint reserved_: 2;
uint useDeviceQueue_: 1; //!< Submit to separate device queue
uint reserved_: 1;
};
uint value_;
};
@@ -264,6 +264,24 @@ VirtualGPU::releasePinnedMem()
bool
VirtualGPU::createVirtualQueue(uint deviceQueueSize)
{
if (deviceQueueSize_ == deviceQueueSize) {
return true;
}
else {
//! @todo Temporarily keep the buffer mapped for debug purpose
if (NULL != schedParams_) {
schedParams_->unmap(this);
}
delete vqHeader_;
delete virtualQueue_;
delete schedParams_;
vqHeader_ = NULL;
virtualQueue_ = NULL;
schedParams_ = NULL;
schedParamIdx_ = 0;
deviceQueueSize_ = 0;
}
uint numSlots = deviceQueueSize / sizeof(AmdAqlWrap);
uint allocSize = deviceQueueSize;
@@ -339,6 +357,8 @@ VirtualGPU::createVirtualQueue(uint deviceQueueSize)
ptr = reinterpret_cast<address>(schedParams_->map(this));
deviceQueueSize_ = deviceQueueSize;
return true;
}
@@ -365,6 +385,7 @@ VirtualGPU::VirtualGPU(
, virtualQueue_(NULL)
, schedParams_(NULL)
, schedParamIdx_(0)
, deviceQueueSize_(0)
, hsaQueueMem_(NULL)
{
memset(&cal_, 0, sizeof(CalVirtualDesc));
@@ -453,23 +474,7 @@ VirtualGPU::create(
#endif // !cl_amd_open_video
{
if (dev().engines().numComputeRings()) {
uint idx;
//! @todo Temporary workaround for Linux, because 2 HW queues only
//! Fixes conformance failures with multi queues
if ((0 == deviceQueueSize) || IS_WINDOWS) {
//! @note: Add 1 to account the device queue for transfers
idx = (index() + 1) % (dev().engines().numComputeRings() -
gpuDevice_.numDeviceQueues_);
}
else {
gpuDevice_.numDeviceQueues_++;
if (gpuDevice_.numDeviceQueues_ >= dev().engines().numComputeRings()) {
return false;
}
idx = (dev().engines().numComputeRings() - gpuDevice_.numDeviceQueues_)
% dev().engines().numComputeRings();
}
uint idx = index() % dev().engines().numComputeRings();
// hwRing_ should be set 0 if forced to have single scratch buffer
hwRing_ = (dev().settings().useSingleScratch_) ? 0 : idx;
@@ -558,10 +563,9 @@ VirtualGPU::create(
return false;
}
//! @todo for testing only
//deviceQueueSize = (deviceQueueSize == 0) ? (128 * Ki) : deviceQueueSize;
// Check if the app requested a device queue creation
if ((0 != deviceQueueSize) && !createVirtualQueue(deviceQueueSize)) {
if (dev().settings().useDeviceQueue_ &&
(0 != deviceQueueSize) && !createVirtualQueue(deviceQueueSize)) {
LogError("Could not create a virtual queue!");
return false;
}
@@ -598,10 +602,6 @@ VirtualGPU::~VirtualGPU()
amd::ScopedLock k(dev().lockAsyncOps());
amd::ScopedLock lock(dev().vgpusAccess());
if ((NULL != virtualQueue_) && IS_LINUX) {
gpuDevice_.numDeviceQueues_--;
}
uint i;
// Destroy all kernels
for (GslKernels::const_iterator it = gslKernels_.begin();
@@ -1696,13 +1696,19 @@ VirtualGPU::submitKernelInternalHSA(
return false;
}
else {
gpuDefQueue = static_cast<VirtualGPU*>(defQueue->vDev());
if (dev().settings().useDeviceQueue_) {
gpuDefQueue = static_cast<VirtualGPU*>(defQueue->vDev());
if (gpuDefQueue->hwRing() == hwRing()) {
LogError("Can't submit the child kernels to the same HW ring as the host queue!");
return false;
}
}
else {
createVirtualQueue(defQueue->size());
gpuDefQueue = this;
}
}
vmDefQueue = gpuDefQueue->virtualQueue_->vmAddress();
if (gpuDefQueue->hwRing() == hwRing()) {
LogError("Can't submit the child kernels to the same HW ring as the host queue!");
return false;
}
// Add memory handles before the actual dispatch
memList.push_back(gpuDefQueue->virtualQueue_);
@@ -1830,6 +1836,14 @@ VirtualGPU::submitKernelInternalHSA(
}
}
if (!dev().settings().useDeviceQueue_) {
// Add the termination handshake to the host queue
virtualQueueHandshake(gpuEvent, gpuDefQueue->schedParams_->gslResource(),
vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE,
vmParentWrap + offsetof(AmdAqlWrap, child_counter),
0, dev().settings().useDeviceQueue_);
}
// Get the global loop start before the scheduler
mcaddr loopStart = gpuDefQueue->virtualQueueDispatcherStart();
static_cast<KernelBlitManager&>(gpuDefQueue->blitMgr()).runScheduler(
@@ -1842,6 +1856,7 @@ VirtualGPU::submitKernelInternalHSA(
// Get the address of PM4 template and add write it to params
//! @note DMA flush must not occur between patch and the scheduler
mcaddr patchStart = gpuDefQueue->virtualQueueDispatcherStart();
// Program parameters for the scheduler
SchedulerParam* param = &reinterpret_cast<SchedulerParam*>
(gpuDefQueue->schedParams_->data())[gpuDefQueue->schedParamIdx_];
@@ -1852,6 +1867,9 @@ VirtualGPU::submitKernelInternalHSA(
param->hsa_queue = gpuDefQueue->hsaQueueMem()->vmAddress();
param->launch = 0;
param->releaseHostCP = 0;
param->parentAQL = vmParentWrap;
param->dedicatedQueue = dev().settings().useDeviceQueue_;
// Fill the scratch buffer information
if (hsaKernel.prog().maxScratchRegs() > 0) {
gpu::Memory* scratchBuf = dev().scratch(gpuDefQueue->hwRing())->memObjs_[0];
@@ -1880,16 +1898,20 @@ VirtualGPU::submitKernelInternalHSA(
gpuDefQueue->virtualQueueDispatcherEnd(gpuEvent,
gpuDefQueue->vmMems(), gpuDefQueue->cal_.memCount_,
signalAddr, loopStart);
// Set GPU event for the used resources
for (uint i = 0; i < memList.size(); ++i) {
memList[i]->setBusy(*gpuDefQueue, gpuEvent);
}
// Add the termination handshake to the host queue
virtualQueueHandshake(gpuEvent, gpuDefQueue->schedParams_->gslResource(),
vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE,
vmParentWrap + offsetof(AmdAqlWrap, child_counter),
signalAddr);
if (dev().settings().useDeviceQueue_) {
// Add the termination handshake to the host queue
virtualQueueHandshake(gpuEvent, gpuDefQueue->schedParams_->gslResource(),
vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE,
vmParentWrap + offsetof(AmdAqlWrap, child_counter),
signalAddr, dev().settings().useDeviceQueue_);
}
++gpuDefQueue->schedParamIdx_ %=
gpuDefQueue->schedParams_->size() / sizeof(SchedulerParam);
//! \todo optimize the wrap around
@@ -387,6 +387,11 @@ public:
//! Update virtual queue header
void writeVQueueHeader(VirtualGPU& hostQ, uint64_t kernelTable);
//! Returns TRUE if virtual queue was successfully allocated
bool createVirtualQueue(
uint deviceQueueSize //!< Device queue size
);
EngineType engineID_; //!< Engine ID for this VirtualGPU
ResourceSlots slots_; //!< Resource slots for kernel arguments
State state_; //!< virtual GPU current state
@@ -488,11 +493,6 @@ private:
const amd::BufferRect& dstRect //!< region of destination for copy
);
//! Returns TRUE if virtual queue was successfully allocated
bool createVirtualQueue(
uint deviceQueueSize //!< Device queue size
);
GslKernels gslKernels_; //!< GSL kernel descriptors
GslKernelDesc* activeKernelDesc_; //!< active GSL kernel descriptors
GpuEvents gpuEvents_; //!< GPU events
@@ -529,6 +529,7 @@ private:
Memory* virtualQueue_; //!< Virtual device queue
Memory* schedParams_; //!< The scheduler parameters
uint schedParamIdx_; //!< Index in the scheduler parameters buffer
uint deviceQueueSize_; //!< Device queue size
Memory* hsaQueueMem_; //!< Memory for the amd_queue_t object
};
@@ -1290,9 +1290,9 @@ CALGSLContext::virtualQueueDispatcherEnd(GpuEvent& event, const gslMemObject* me
void
CALGSLContext::virtualQueueHandshake(GpuEvent& event, const gslMemObject mem, mcaddr parentState,
uint32 newStateValue, mcaddr parentChildCounter, mcaddr signal)
uint32 newStateValue, mcaddr parentChildCounter, mcaddr signal, bool dedicatedQueue)
{
eventBegin(MainEngine);
m_cs->VirtualQueueHandshake(mem, parentState, newStateValue, parentChildCounter, signal);
m_cs->VirtualQueueHandshake(mem, parentState, newStateValue, parentChildCounter, signal, dedicatedQueue);
eventEnd(MainEngine, event);
}
@@ -47,7 +47,8 @@ public:
uint32 numMems, gslMemObject scratch, const void* cpuKernelCode, uint64 hsaQueueVA);
mcaddr virtualQueueDispatcherStart();
void virtualQueueDispatcherEnd(GpuEvent& event, const gslMemObject* mems, uint32 numMems, mcaddr signal, mcaddr loopStart);
void virtualQueueHandshake(GpuEvent& event, const gslMemObject mem, mcaddr parentState, uint32 newStateValue, mcaddr parentChildCounter, mcaddr signal);
void virtualQueueHandshake(GpuEvent& event, const gslMemObject mem, mcaddr parentState,
uint32 newStateValue, mcaddr parentChildCounter, mcaddr signal, bool dedicatedQueue);
bool isDone(GpuEvent* event);
void waitForEvent(GpuEvent* event);
void flushIOCaches() const;
@@ -150,6 +150,8 @@ release(uint, GPU_PRINT_CHILD_KERNEL, 0, \
"Prints the specified number of the child kernels") \
release(bool, GPU_DIRECT_SRD, true, \
"Use indirect SRD access in HSAIL") \
release(bool, GPU_USE_DEVICE_QUEUE, false, \
"Use a dedicated device queue for the actual submissions") \
release(bool, AMD_DEPTH_MSAA_INTEROP, false, \
"Enable depth stencil and MSAA buffer interop") \
release(bool, AMD_THREAD_TRACE_ENABLE, false, \