P4 to Git Change 1059564 by gandryey@gera-dev-w7 on 2014/07/25 18:14:33
ECR #304775 - Device enqueuing
- Run the scheduler in the host queue by default.
- GPU_USE_DEVICE_QUEUE=1 can force execution in the device queue
Affected files ...
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudevice.cpp#451 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudevice.hpp#128 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.cpp#260 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpusched.hpp#10 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuschedcl.cpp#23 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpusettings.cpp#273 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpusettings.hpp#86 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.cpp#327 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.hpp#119 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp#62 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gslbe/src/rt/GSLContext.h#36 edit
... //depot/stg/opencl/drivers/opencl/runtime/utils/flags.hpp#208 edit
[ROCm/clr commit: 3a4400135d]
This commit is contained in:
@@ -359,7 +359,6 @@ Device::Device()
|
||||
: NullDevice()
|
||||
, CALGSLDevice()
|
||||
, numOfVgpus_(0)
|
||||
, numDeviceQueues_(0)
|
||||
, context_(NULL)
|
||||
, heap_(NULL)
|
||||
, dummyPage_(NULL)
|
||||
|
||||
@@ -468,7 +468,6 @@ public:
|
||||
//! Returns the number of virtual GPUs allocated on this device
|
||||
uint numOfVgpus() const { return numOfVgpus_; }
|
||||
uint numOfVgpus_; //!< The number of virtual GPUs (lock protected)
|
||||
uint numDeviceQueues_; //!< Number of device queues
|
||||
|
||||
typedef std::vector<VirtualGPU*> VirtualGPUs;
|
||||
|
||||
|
||||
@@ -3975,7 +3975,17 @@ HSAILKernel::loadArguments(
|
||||
const amd::DeviceQueue* queue =
|
||||
*reinterpret_cast<amd::DeviceQueue* const*>(paramaddr);
|
||||
VirtualGPU* gpuQueue = static_cast<VirtualGPU*>(queue->vDev());
|
||||
uint64_t vmQueue = gpuQueue->vQueue()->vmAddress();
|
||||
uint64_t vmQueue;
|
||||
if (dev().settings().useDeviceQueue_) {
|
||||
vmQueue = gpuQueue->vQueue()->vmAddress();
|
||||
}
|
||||
else {
|
||||
if (!gpu.createVirtualQueue(queue->size())) {
|
||||
LogError( "Virtual queue creaiton failed!");
|
||||
return false;
|
||||
}
|
||||
vmQueue = gpu.vQueue()->vmAddress();
|
||||
}
|
||||
WriteAqlArg(&aqlArgBuf, &vmQueue, sizeof(void*));
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -66,6 +66,9 @@ struct SchedulerParam {
|
||||
uint64_t scratch; //!< GPU address to the scratch buffer
|
||||
uint32_t numMaxWaves; //!< The max number of possible waves
|
||||
uint32_t releaseHostCP; //!< Releases CP on the host queue
|
||||
uint64_t parentAQL; //!< Host parent AmdAqlWrap packet
|
||||
uint32_t dedicatedQueue; //!< Scheduler uses a dedicated queue
|
||||
uint32_t reserved; //!< Reserved field
|
||||
};
|
||||
|
||||
} // namespace gpu
|
||||
|
||||
@@ -90,6 +90,9 @@ typedef struct _SchedulerParam {
|
||||
ulong scratch; //!< GPU address to the scratch buffer
|
||||
uint numMaxWaves; //!< Num max waves on the asic
|
||||
uint releaseHostCP; //!< Releases CP on the host queue
|
||||
ulong parentAQL; //!< Host parent AmdAqlWrap packet
|
||||
uint dedicatedQueue; //!< Scheduler uses a dedicated queue
|
||||
uint reserved; //!< Reserved field
|
||||
} SchedulerParam;
|
||||
|
||||
typedef struct _HwDispatch {
|
||||
@@ -276,7 +279,9 @@ static inline bool
|
||||
checkWaitEvents(__global AmdEvent** events, uint numEvents)
|
||||
{
|
||||
for (uint i = 0; i < numEvents; ++i) {
|
||||
if (atomic_and(&events[i]->state, 0xffffffff) != CL_COMPLETE) {
|
||||
if (atomic_load_explicit(
|
||||
(__global atomic_uint*)(&events[i]->state),
|
||||
memory_order_acquire, memory_scope_device) != CL_COMPLETE) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
@@ -348,6 +353,8 @@ scheduler(
|
||||
__global SchedulerParam* param = &params[paramIdx];
|
||||
volatile __global HwDispatch* hwDisp =
|
||||
(volatile __global HwDispatch*)param->hw_queue;
|
||||
__global AmdAqlWrap* hostParent = (__global AmdAqlWrap*)(param->parentAQL);
|
||||
__global uint* counter = (__global uint*)(&hostParent->child_counter);
|
||||
__global uint* signal = (__global uint*)(&param->signal);
|
||||
__global AmdAqlWrap* wraps = (__global AmdAqlWrap*)&queue[1];
|
||||
__global uint* amask = (__global uint *)queue->aql_slot_mask;
|
||||
@@ -360,7 +367,7 @@ scheduler(
|
||||
}
|
||||
|
||||
uint launch = 0;
|
||||
uint loop;
|
||||
uint loop = 1;
|
||||
|
||||
do {
|
||||
uint mask = atomic_load_explicit((__global atomic_uint*)(&amask[get_group_id(0)]),
|
||||
@@ -491,10 +498,16 @@ scheduler(
|
||||
launch = atomic_load_explicit((__global atomic_uint*)&param->launch,
|
||||
memory_order_acquire, memory_scope_device);
|
||||
|
||||
loop = atomic_load_explicit((__global atomic_uint*)signal,
|
||||
memory_order_acquire, memory_scope_device);
|
||||
if (param->dedicatedQueue) {
|
||||
loop = atomic_load_explicit((__global atomic_uint*)signal,
|
||||
memory_order_acquire, memory_scope_device);
|
||||
}
|
||||
else {
|
||||
loop = atomic_load_explicit((__global atomic_uint*)counter,
|
||||
memory_order_acquire, memory_scope_device);
|
||||
}
|
||||
|
||||
} while ((launch == 0) && (loop == 1));
|
||||
} while ((launch == 0) && (loop != 0));
|
||||
|
||||
if (loop == 0) {
|
||||
//! \todo Write deadcode to the template, but somehow
|
||||
@@ -504,6 +517,8 @@ scheduler(
|
||||
hwDisp[1].condExe1 = 0xdeadc0de;
|
||||
hwDisp[1].condExe2 = 0xdeadc0de;
|
||||
hwDisp[1].condExe3 = 0xdeadc0de;
|
||||
atomic_store_explicit((__global atomic_uint*)signal,
|
||||
0, memory_order_release, memory_scope_device);
|
||||
barrier(CLK_GLOBAL_MEM_FENCE);
|
||||
atomic_store_explicit((__global atomic_uint*)&hwDisp->startExe,
|
||||
ResumeExecution, memory_order_release, memory_scope_device);
|
||||
|
||||
@@ -137,6 +137,9 @@ Settings::Settings()
|
||||
|
||||
// Use direct SRD by default
|
||||
hsailDirectSRD_ = GPU_DIRECT_SRD;
|
||||
|
||||
// Use host queue for device enqueuing by default
|
||||
useDeviceQueue_ = GPU_USE_DEVICE_QUEUE;
|
||||
}
|
||||
|
||||
bool
|
||||
|
||||
@@ -73,7 +73,8 @@ public:
|
||||
uint apuSystem_: 1; //!< Device is APU system with shared memory
|
||||
uint asyncMemCopy_: 1; //!< Use async memory transfers
|
||||
uint hsailDirectSRD_: 1; //!< Controls direct SRD for HSAIL
|
||||
uint reserved_: 2;
|
||||
uint useDeviceQueue_: 1; //!< Submit to separate device queue
|
||||
uint reserved_: 1;
|
||||
};
|
||||
uint value_;
|
||||
};
|
||||
|
||||
@@ -264,6 +264,24 @@ VirtualGPU::releasePinnedMem()
|
||||
bool
|
||||
VirtualGPU::createVirtualQueue(uint deviceQueueSize)
|
||||
{
|
||||
if (deviceQueueSize_ == deviceQueueSize) {
|
||||
return true;
|
||||
}
|
||||
else {
|
||||
//! @todo Temporarily keep the buffer mapped for debug purpose
|
||||
if (NULL != schedParams_) {
|
||||
schedParams_->unmap(this);
|
||||
}
|
||||
delete vqHeader_;
|
||||
delete virtualQueue_;
|
||||
delete schedParams_;
|
||||
vqHeader_ = NULL;
|
||||
virtualQueue_ = NULL;
|
||||
schedParams_ = NULL;
|
||||
schedParamIdx_ = 0;
|
||||
deviceQueueSize_ = 0;
|
||||
}
|
||||
|
||||
uint numSlots = deviceQueueSize / sizeof(AmdAqlWrap);
|
||||
uint allocSize = deviceQueueSize;
|
||||
|
||||
@@ -339,6 +357,8 @@ VirtualGPU::createVirtualQueue(uint deviceQueueSize)
|
||||
|
||||
ptr = reinterpret_cast<address>(schedParams_->map(this));
|
||||
|
||||
deviceQueueSize_ = deviceQueueSize;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -365,6 +385,7 @@ VirtualGPU::VirtualGPU(
|
||||
, virtualQueue_(NULL)
|
||||
, schedParams_(NULL)
|
||||
, schedParamIdx_(0)
|
||||
, deviceQueueSize_(0)
|
||||
, hsaQueueMem_(NULL)
|
||||
{
|
||||
memset(&cal_, 0, sizeof(CalVirtualDesc));
|
||||
@@ -453,23 +474,7 @@ VirtualGPU::create(
|
||||
#endif // !cl_amd_open_video
|
||||
{
|
||||
if (dev().engines().numComputeRings()) {
|
||||
uint idx;
|
||||
|
||||
//! @todo Temporary workaround for Linux, because 2 HW queues only
|
||||
//! Fixes conformance failures with multi queues
|
||||
if ((0 == deviceQueueSize) || IS_WINDOWS) {
|
||||
//! @note: Add 1 to account the device queue for transfers
|
||||
idx = (index() + 1) % (dev().engines().numComputeRings() -
|
||||
gpuDevice_.numDeviceQueues_);
|
||||
}
|
||||
else {
|
||||
gpuDevice_.numDeviceQueues_++;
|
||||
if (gpuDevice_.numDeviceQueues_ >= dev().engines().numComputeRings()) {
|
||||
return false;
|
||||
}
|
||||
idx = (dev().engines().numComputeRings() - gpuDevice_.numDeviceQueues_)
|
||||
% dev().engines().numComputeRings();
|
||||
}
|
||||
uint idx = index() % dev().engines().numComputeRings();
|
||||
|
||||
// hwRing_ should be set 0 if forced to have single scratch buffer
|
||||
hwRing_ = (dev().settings().useSingleScratch_) ? 0 : idx;
|
||||
@@ -558,10 +563,9 @@ VirtualGPU::create(
|
||||
return false;
|
||||
}
|
||||
|
||||
//! @todo for testing only
|
||||
//deviceQueueSize = (deviceQueueSize == 0) ? (128 * Ki) : deviceQueueSize;
|
||||
// Check if the app requested a device queue creation
|
||||
if ((0 != deviceQueueSize) && !createVirtualQueue(deviceQueueSize)) {
|
||||
if (dev().settings().useDeviceQueue_ &&
|
||||
(0 != deviceQueueSize) && !createVirtualQueue(deviceQueueSize)) {
|
||||
LogError("Could not create a virtual queue!");
|
||||
return false;
|
||||
}
|
||||
@@ -598,10 +602,6 @@ VirtualGPU::~VirtualGPU()
|
||||
amd::ScopedLock k(dev().lockAsyncOps());
|
||||
amd::ScopedLock lock(dev().vgpusAccess());
|
||||
|
||||
if ((NULL != virtualQueue_) && IS_LINUX) {
|
||||
gpuDevice_.numDeviceQueues_--;
|
||||
}
|
||||
|
||||
uint i;
|
||||
// Destroy all kernels
|
||||
for (GslKernels::const_iterator it = gslKernels_.begin();
|
||||
@@ -1696,13 +1696,19 @@ VirtualGPU::submitKernelInternalHSA(
|
||||
return false;
|
||||
}
|
||||
else {
|
||||
gpuDefQueue = static_cast<VirtualGPU*>(defQueue->vDev());
|
||||
if (dev().settings().useDeviceQueue_) {
|
||||
gpuDefQueue = static_cast<VirtualGPU*>(defQueue->vDev());
|
||||
if (gpuDefQueue->hwRing() == hwRing()) {
|
||||
LogError("Can't submit the child kernels to the same HW ring as the host queue!");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
else {
|
||||
createVirtualQueue(defQueue->size());
|
||||
gpuDefQueue = this;
|
||||
}
|
||||
}
|
||||
vmDefQueue = gpuDefQueue->virtualQueue_->vmAddress();
|
||||
if (gpuDefQueue->hwRing() == hwRing()) {
|
||||
LogError("Can't submit the child kernels to the same HW ring as the host queue!");
|
||||
return false;
|
||||
}
|
||||
|
||||
// Add memory handles before the actual dispatch
|
||||
memList.push_back(gpuDefQueue->virtualQueue_);
|
||||
@@ -1830,6 +1836,14 @@ VirtualGPU::submitKernelInternalHSA(
|
||||
}
|
||||
}
|
||||
|
||||
if (!dev().settings().useDeviceQueue_) {
|
||||
// Add the termination handshake to the host queue
|
||||
virtualQueueHandshake(gpuEvent, gpuDefQueue->schedParams_->gslResource(),
|
||||
vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE,
|
||||
vmParentWrap + offsetof(AmdAqlWrap, child_counter),
|
||||
0, dev().settings().useDeviceQueue_);
|
||||
}
|
||||
|
||||
// Get the global loop start before the scheduler
|
||||
mcaddr loopStart = gpuDefQueue->virtualQueueDispatcherStart();
|
||||
static_cast<KernelBlitManager&>(gpuDefQueue->blitMgr()).runScheduler(
|
||||
@@ -1842,6 +1856,7 @@ VirtualGPU::submitKernelInternalHSA(
|
||||
// Get the address of PM4 template and add write it to params
|
||||
//! @note DMA flush must not occur between patch and the scheduler
|
||||
mcaddr patchStart = gpuDefQueue->virtualQueueDispatcherStart();
|
||||
|
||||
// Program parameters for the scheduler
|
||||
SchedulerParam* param = &reinterpret_cast<SchedulerParam*>
|
||||
(gpuDefQueue->schedParams_->data())[gpuDefQueue->schedParamIdx_];
|
||||
@@ -1852,6 +1867,9 @@ VirtualGPU::submitKernelInternalHSA(
|
||||
param->hsa_queue = gpuDefQueue->hsaQueueMem()->vmAddress();
|
||||
param->launch = 0;
|
||||
param->releaseHostCP = 0;
|
||||
param->parentAQL = vmParentWrap;
|
||||
param->dedicatedQueue = dev().settings().useDeviceQueue_;
|
||||
|
||||
// Fill the scratch buffer information
|
||||
if (hsaKernel.prog().maxScratchRegs() > 0) {
|
||||
gpu::Memory* scratchBuf = dev().scratch(gpuDefQueue->hwRing())->memObjs_[0];
|
||||
@@ -1880,16 +1898,20 @@ VirtualGPU::submitKernelInternalHSA(
|
||||
gpuDefQueue->virtualQueueDispatcherEnd(gpuEvent,
|
||||
gpuDefQueue->vmMems(), gpuDefQueue->cal_.memCount_,
|
||||
signalAddr, loopStart);
|
||||
|
||||
// Set GPU event for the used resources
|
||||
for (uint i = 0; i < memList.size(); ++i) {
|
||||
memList[i]->setBusy(*gpuDefQueue, gpuEvent);
|
||||
}
|
||||
|
||||
// Add the termination handshake to the host queue
|
||||
virtualQueueHandshake(gpuEvent, gpuDefQueue->schedParams_->gslResource(),
|
||||
vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE,
|
||||
vmParentWrap + offsetof(AmdAqlWrap, child_counter),
|
||||
signalAddr);
|
||||
if (dev().settings().useDeviceQueue_) {
|
||||
// Add the termination handshake to the host queue
|
||||
virtualQueueHandshake(gpuEvent, gpuDefQueue->schedParams_->gslResource(),
|
||||
vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE,
|
||||
vmParentWrap + offsetof(AmdAqlWrap, child_counter),
|
||||
signalAddr, dev().settings().useDeviceQueue_);
|
||||
}
|
||||
|
||||
++gpuDefQueue->schedParamIdx_ %=
|
||||
gpuDefQueue->schedParams_->size() / sizeof(SchedulerParam);
|
||||
//! \todo optimize the wrap around
|
||||
|
||||
@@ -387,6 +387,11 @@ public:
|
||||
//! Update virtual queue header
|
||||
void writeVQueueHeader(VirtualGPU& hostQ, uint64_t kernelTable);
|
||||
|
||||
//! Returns TRUE if virtual queue was successfully allocated
|
||||
bool createVirtualQueue(
|
||||
uint deviceQueueSize //!< Device queue size
|
||||
);
|
||||
|
||||
EngineType engineID_; //!< Engine ID for this VirtualGPU
|
||||
ResourceSlots slots_; //!< Resource slots for kernel arguments
|
||||
State state_; //!< virtual GPU current state
|
||||
@@ -488,11 +493,6 @@ private:
|
||||
const amd::BufferRect& dstRect //!< region of destination for copy
|
||||
);
|
||||
|
||||
//! Returns TRUE if virtual queue was successfully allocated
|
||||
bool createVirtualQueue(
|
||||
uint deviceQueueSize //!< Device queue size
|
||||
);
|
||||
|
||||
GslKernels gslKernels_; //!< GSL kernel descriptors
|
||||
GslKernelDesc* activeKernelDesc_; //!< active GSL kernel descriptors
|
||||
GpuEvents gpuEvents_; //!< GPU events
|
||||
@@ -529,6 +529,7 @@ private:
|
||||
Memory* virtualQueue_; //!< Virtual device queue
|
||||
Memory* schedParams_; //!< The scheduler parameters
|
||||
uint schedParamIdx_; //!< Index in the scheduler parameters buffer
|
||||
uint deviceQueueSize_; //!< Device queue size
|
||||
|
||||
Memory* hsaQueueMem_; //!< Memory for the amd_queue_t object
|
||||
};
|
||||
|
||||
@@ -1290,9 +1290,9 @@ CALGSLContext::virtualQueueDispatcherEnd(GpuEvent& event, const gslMemObject* me
|
||||
|
||||
void
|
||||
CALGSLContext::virtualQueueHandshake(GpuEvent& event, const gslMemObject mem, mcaddr parentState,
|
||||
uint32 newStateValue, mcaddr parentChildCounter, mcaddr signal)
|
||||
uint32 newStateValue, mcaddr parentChildCounter, mcaddr signal, bool dedicatedQueue)
|
||||
{
|
||||
eventBegin(MainEngine);
|
||||
m_cs->VirtualQueueHandshake(mem, parentState, newStateValue, parentChildCounter, signal);
|
||||
m_cs->VirtualQueueHandshake(mem, parentState, newStateValue, parentChildCounter, signal, dedicatedQueue);
|
||||
eventEnd(MainEngine, event);
|
||||
}
|
||||
|
||||
@@ -47,7 +47,8 @@ public:
|
||||
uint32 numMems, gslMemObject scratch, const void* cpuKernelCode, uint64 hsaQueueVA);
|
||||
mcaddr virtualQueueDispatcherStart();
|
||||
void virtualQueueDispatcherEnd(GpuEvent& event, const gslMemObject* mems, uint32 numMems, mcaddr signal, mcaddr loopStart);
|
||||
void virtualQueueHandshake(GpuEvent& event, const gslMemObject mem, mcaddr parentState, uint32 newStateValue, mcaddr parentChildCounter, mcaddr signal);
|
||||
void virtualQueueHandshake(GpuEvent& event, const gslMemObject mem, mcaddr parentState,
|
||||
uint32 newStateValue, mcaddr parentChildCounter, mcaddr signal, bool dedicatedQueue);
|
||||
bool isDone(GpuEvent* event);
|
||||
void waitForEvent(GpuEvent* event);
|
||||
void flushIOCaches() const;
|
||||
|
||||
@@ -150,6 +150,8 @@ release(uint, GPU_PRINT_CHILD_KERNEL, 0, \
|
||||
"Prints the specified number of the child kernels") \
|
||||
release(bool, GPU_DIRECT_SRD, true, \
|
||||
"Use indirect SRD access in HSAIL") \
|
||||
release(bool, GPU_USE_DEVICE_QUEUE, false, \
|
||||
"Use a dedicated device queue for the actual submissions") \
|
||||
release(bool, AMD_DEPTH_MSAA_INTEROP, false, \
|
||||
"Enable depth stencil and MSAA buffer interop") \
|
||||
release(bool, AMD_THREAD_TRACE_ENABLE, false, \
|
||||
|
||||
Reference in New Issue
Block a user