From f8dc564915875c6eff34acc204091040c7facc0d Mon Sep 17 00:00:00 2001 From: foreman Date: Fri, 25 Jul 2014 20:41:05 -0400 Subject: [PATCH] P4 to Git Change 1059564 by gandryey@gera-dev-w7 on 2014/07/25 18:14:33 ECR #304775 - Device enqueuing - Run the scheduler in the host queue by default. - GPU_USE_DEVICE_QUEUE=1 can force execution in the device queue Affected files ... ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudevice.cpp#451 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudevice.hpp#128 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.cpp#260 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpusched.hpp#10 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuschedcl.cpp#23 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpusettings.cpp#273 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpusettings.hpp#86 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.cpp#327 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.hpp#119 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp#62 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gslbe/src/rt/GSLContext.h#36 edit ... 
//depot/stg/opencl/drivers/opencl/runtime/utils/flags.hpp#208 edit [ROCm/clr commit: 3a4400135d4f1716beb799527806e06582b97a75] --- .../rocclr/runtime/device/gpu/gpudevice.cpp | 1 - .../rocclr/runtime/device/gpu/gpudevice.hpp | 1 - .../rocclr/runtime/device/gpu/gpukernel.cpp | 12 ++- .../rocclr/runtime/device/gpu/gpusched.hpp | 3 + .../rocclr/runtime/device/gpu/gpuschedcl.cpp | 25 ++++-- .../rocclr/runtime/device/gpu/gpusettings.cpp | 3 + .../rocclr/runtime/device/gpu/gpusettings.hpp | 3 +- .../rocclr/runtime/device/gpu/gpuvirtual.cpp | 90 ++++++++++++------- .../rocclr/runtime/device/gpu/gpuvirtual.hpp | 11 +-- .../device/gpu/gslbe/src/rt/GSLContext.cpp | 4 +- .../device/gpu/gslbe/src/rt/GSLContext.h | 3 +- projects/clr/rocclr/runtime/utils/flags.hpp | 2 + 12 files changed, 107 insertions(+), 51 deletions(-) diff --git a/projects/clr/rocclr/runtime/device/gpu/gpudevice.cpp b/projects/clr/rocclr/runtime/device/gpu/gpudevice.cpp index 025322fa8c..1d088139e2 100644 --- a/projects/clr/rocclr/runtime/device/gpu/gpudevice.cpp +++ b/projects/clr/rocclr/runtime/device/gpu/gpudevice.cpp @@ -359,7 +359,6 @@ Device::Device() : NullDevice() , CALGSLDevice() , numOfVgpus_(0) - , numDeviceQueues_(0) , context_(NULL) , heap_(NULL) , dummyPage_(NULL) diff --git a/projects/clr/rocclr/runtime/device/gpu/gpudevice.hpp b/projects/clr/rocclr/runtime/device/gpu/gpudevice.hpp index 2a6cb24aaf..e7f384ecd5 100644 --- a/projects/clr/rocclr/runtime/device/gpu/gpudevice.hpp +++ b/projects/clr/rocclr/runtime/device/gpu/gpudevice.hpp @@ -468,7 +468,6 @@ public: //! 
Returns the number of virtual GPUs allocated on this device uint numOfVgpus() const { return numOfVgpus_; } uint numOfVgpus_; //!< The number of virtual GPUs (lock protected) - uint numDeviceQueues_; //!< Number of device queues typedef std::vector VirtualGPUs; diff --git a/projects/clr/rocclr/runtime/device/gpu/gpukernel.cpp b/projects/clr/rocclr/runtime/device/gpu/gpukernel.cpp index 0a40baefed..142e3d7029 100644 --- a/projects/clr/rocclr/runtime/device/gpu/gpukernel.cpp +++ b/projects/clr/rocclr/runtime/device/gpu/gpukernel.cpp @@ -3975,7 +3975,17 @@ HSAILKernel::loadArguments( const amd::DeviceQueue* queue = *reinterpret_cast(paramaddr); VirtualGPU* gpuQueue = static_cast(queue->vDev()); - uint64_t vmQueue = gpuQueue->vQueue()->vmAddress(); + uint64_t vmQueue; + if (dev().settings().useDeviceQueue_) { + vmQueue = gpuQueue->vQueue()->vmAddress(); + } + else { + if (!gpu.createVirtualQueue(queue->size())) { + LogError( "Virtual queue creaiton failed!"); + return false; + } + vmQueue = gpu.vQueue()->vmAddress(); + } WriteAqlArg(&aqlArgBuf, &vmQueue, sizeof(void*)); break; } diff --git a/projects/clr/rocclr/runtime/device/gpu/gpusched.hpp b/projects/clr/rocclr/runtime/device/gpu/gpusched.hpp index f0bcd2f4c0..bb456c3b60 100644 --- a/projects/clr/rocclr/runtime/device/gpu/gpusched.hpp +++ b/projects/clr/rocclr/runtime/device/gpu/gpusched.hpp @@ -66,6 +66,9 @@ struct SchedulerParam { uint64_t scratch; //!< GPU address to the scratch buffer uint32_t numMaxWaves; //!< The max number of possible waves uint32_t releaseHostCP; //!< Releases CP on the host queue + uint64_t parentAQL; //!< Host parent AmdAqlWrap packet + uint32_t dedicatedQueue; //!< Scheduler uses a dedicated queue + uint32_t reserved; //!< Reserved field }; } // namespace gpu diff --git a/projects/clr/rocclr/runtime/device/gpu/gpuschedcl.cpp b/projects/clr/rocclr/runtime/device/gpu/gpuschedcl.cpp index 6390025655..98be8f0c39 100644 --- a/projects/clr/rocclr/runtime/device/gpu/gpuschedcl.cpp +++ 
b/projects/clr/rocclr/runtime/device/gpu/gpuschedcl.cpp @@ -90,6 +90,9 @@ typedef struct _SchedulerParam { ulong scratch; //!< GPU address to the scratch buffer uint numMaxWaves; //!< Num max waves on the asic uint releaseHostCP; //!< Releases CP on the host queue + ulong parentAQL; //!< Host parent AmdAqlWrap packet + uint dedicatedQueue; //!< Scheduler uses a dedicated queue + uint reserved; //!< Reserved field } SchedulerParam; typedef struct _HwDispatch { @@ -276,7 +279,9 @@ static inline bool checkWaitEvents(__global AmdEvent** events, uint numEvents) { for (uint i = 0; i < numEvents; ++i) { - if (atomic_and(&events[i]->state, 0xffffffff) != CL_COMPLETE) { + if (atomic_load_explicit( + (__global atomic_uint*)(&events[i]->state), + memory_order_acquire, memory_scope_device) != CL_COMPLETE) { return false; } } @@ -348,6 +353,8 @@ scheduler( __global SchedulerParam* param = &params[paramIdx]; volatile __global HwDispatch* hwDisp = (volatile __global HwDispatch*)param->hw_queue; + __global AmdAqlWrap* hostParent = (__global AmdAqlWrap*)(param->parentAQL); + __global uint* counter = (__global uint*)(&hostParent->child_counter); __global uint* signal = (__global uint*)(&param->signal); __global AmdAqlWrap* wraps = (__global AmdAqlWrap*)&queue[1]; __global uint* amask = (__global uint *)queue->aql_slot_mask; @@ -360,7 +367,7 @@ scheduler( } uint launch = 0; - uint loop; + uint loop = 1; do { uint mask = atomic_load_explicit((__global atomic_uint*)(&amask[get_group_id(0)]), @@ -491,10 +498,16 @@ scheduler( launch = atomic_load_explicit((__global atomic_uint*)&param->launch, memory_order_acquire, memory_scope_device); - loop = atomic_load_explicit((__global atomic_uint*)signal, - memory_order_acquire, memory_scope_device); + if (param->dedicatedQueue) { + loop = atomic_load_explicit((__global atomic_uint*)signal, + memory_order_acquire, memory_scope_device); + } + else { + loop = atomic_load_explicit((__global atomic_uint*)counter, + memory_order_acquire, memory_scope_device); + } 
- } while ((launch == 0) && (loop == 1)); + } while ((launch == 0) && (loop != 0)); if (loop == 0) { //! \todo Write deadcode to the template, but somehow @@ -504,6 +517,8 @@ scheduler( hwDisp[1].condExe1 = 0xdeadc0de; hwDisp[1].condExe2 = 0xdeadc0de; hwDisp[1].condExe3 = 0xdeadc0de; + atomic_store_explicit((__global atomic_uint*)signal, + 0, memory_order_release, memory_scope_device); barrier(CLK_GLOBAL_MEM_FENCE); atomic_store_explicit((__global atomic_uint*)&hwDisp->startExe, ResumeExecution, memory_order_release, memory_scope_device); diff --git a/projects/clr/rocclr/runtime/device/gpu/gpusettings.cpp b/projects/clr/rocclr/runtime/device/gpu/gpusettings.cpp index 580976e51a..363ba49192 100644 --- a/projects/clr/rocclr/runtime/device/gpu/gpusettings.cpp +++ b/projects/clr/rocclr/runtime/device/gpu/gpusettings.cpp @@ -137,6 +137,9 @@ Settings::Settings() // Use direct SRD by default hsailDirectSRD_ = GPU_DIRECT_SRD; + + // Use host queue for device enqueuing by default + useDeviceQueue_ = GPU_USE_DEVICE_QUEUE; } bool diff --git a/projects/clr/rocclr/runtime/device/gpu/gpusettings.hpp b/projects/clr/rocclr/runtime/device/gpu/gpusettings.hpp index 5835b6ca39..9ea5f6b1c6 100644 --- a/projects/clr/rocclr/runtime/device/gpu/gpusettings.hpp +++ b/projects/clr/rocclr/runtime/device/gpu/gpusettings.hpp @@ -73,7 +73,8 @@ public: uint apuSystem_: 1; //!< Device is APU system with shared memory uint asyncMemCopy_: 1; //!< Use async memory transfers uint hsailDirectSRD_: 1; //!< Controls direct SRD for HSAIL - uint reserved_: 2; + uint useDeviceQueue_: 1; //!< Submit to separate device queue + uint reserved_: 1; }; uint value_; }; diff --git a/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.cpp b/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.cpp index 865c216ea4..2664676f5e 100644 --- a/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.cpp +++ b/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.cpp @@ -264,6 +264,24 @@ VirtualGPU::releasePinnedMem() bool 
VirtualGPU::createVirtualQueue(uint deviceQueueSize) { + if (deviceQueueSize_ == deviceQueueSize) { + return true; + } + else { + //! @todo Temporarily keep the buffer mapped for debug purpose + if (NULL != schedParams_) { + schedParams_->unmap(this); + } + delete vqHeader_; + delete virtualQueue_; + delete schedParams_; + vqHeader_ = NULL; + virtualQueue_ = NULL; + schedParams_ = NULL; + schedParamIdx_ = 0; + deviceQueueSize_ = 0; + } + uint numSlots = deviceQueueSize / sizeof(AmdAqlWrap); uint allocSize = deviceQueueSize; @@ -339,6 +357,8 @@ VirtualGPU::createVirtualQueue(uint deviceQueueSize) ptr = reinterpret_cast
(schedParams_->map(this)); + deviceQueueSize_ = deviceQueueSize; + return true; } @@ -365,6 +385,7 @@ VirtualGPU::VirtualGPU( , virtualQueue_(NULL) , schedParams_(NULL) , schedParamIdx_(0) + , deviceQueueSize_(0) , hsaQueueMem_(NULL) { memset(&cal_, 0, sizeof(CalVirtualDesc)); @@ -453,23 +474,7 @@ VirtualGPU::create( #endif // !cl_amd_open_video { if (dev().engines().numComputeRings()) { - uint idx; - - //! @todo Temporary workaround for Linux, because 2 HW queues only - //! Fixes conformance failures with multi queues - if ((0 == deviceQueueSize) || IS_WINDOWS) { - //! @note: Add 1 to account the device queue for transfers - idx = (index() + 1) % (dev().engines().numComputeRings() - - gpuDevice_.numDeviceQueues_); - } - else { - gpuDevice_.numDeviceQueues_++; - if (gpuDevice_.numDeviceQueues_ >= dev().engines().numComputeRings()) { - return false; - } - idx = (dev().engines().numComputeRings() - gpuDevice_.numDeviceQueues_) - % dev().engines().numComputeRings(); - } + uint idx = index() % dev().engines().numComputeRings(); // hwRing_ should be set 0 if forced to have single scratch buffer hwRing_ = (dev().settings().useSingleScratch_) ? 0 : idx; @@ -558,10 +563,9 @@ VirtualGPU::create( return false; } - //! @todo for testing only - //deviceQueueSize = (deviceQueueSize == 0) ? 
(128 * Ki) : deviceQueueSize; // Check if the app requested a device queue creation - if ((0 != deviceQueueSize) && !createVirtualQueue(deviceQueueSize)) { + if (dev().settings().useDeviceQueue_ && + (0 != deviceQueueSize) && !createVirtualQueue(deviceQueueSize)) { LogError("Could not create a virtual queue!"); return false; } @@ -598,10 +602,6 @@ VirtualGPU::~VirtualGPU() amd::ScopedLock k(dev().lockAsyncOps()); amd::ScopedLock lock(dev().vgpusAccess()); - if ((NULL != virtualQueue_) && IS_LINUX) { - gpuDevice_.numDeviceQueues_--; - } - uint i; // Destroy all kernels for (GslKernels::const_iterator it = gslKernels_.begin(); @@ -1696,13 +1696,19 @@ VirtualGPU::submitKernelInternalHSA( return false; } else { - gpuDefQueue = static_cast(defQueue->vDev()); + if (dev().settings().useDeviceQueue_) { + gpuDefQueue = static_cast(defQueue->vDev()); + if (gpuDefQueue->hwRing() == hwRing()) { + LogError("Can't submit the child kernels to the same HW ring as the host queue!"); + return false; + } + } + else { + createVirtualQueue(defQueue->size()); + gpuDefQueue = this; + } } vmDefQueue = gpuDefQueue->virtualQueue_->vmAddress(); - if (gpuDefQueue->hwRing() == hwRing()) { - LogError("Can't submit the child kernels to the same HW ring as the host queue!"); - return false; - } // Add memory handles before the actual dispatch memList.push_back(gpuDefQueue->virtualQueue_); @@ -1830,6 +1836,14 @@ VirtualGPU::submitKernelInternalHSA( } } + if (!dev().settings().useDeviceQueue_) { + // Add the termination handshake to the host queue + virtualQueueHandshake(gpuEvent, gpuDefQueue->schedParams_->gslResource(), + vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE, + vmParentWrap + offsetof(AmdAqlWrap, child_counter), + 0, dev().settings().useDeviceQueue_); + } + // Get the global loop start before the scheduler mcaddr loopStart = gpuDefQueue->virtualQueueDispatcherStart(); static_cast(gpuDefQueue->blitMgr()).runScheduler( @@ -1842,6 +1856,7 @@ VirtualGPU::submitKernelInternalHSA( 
// Get the address of PM4 template and add write it to params //! @note DMA flush must not occur between patch and the scheduler mcaddr patchStart = gpuDefQueue->virtualQueueDispatcherStart(); + // Program parameters for the scheduler SchedulerParam* param = &reinterpret_cast (gpuDefQueue->schedParams_->data())[gpuDefQueue->schedParamIdx_]; @@ -1852,6 +1867,9 @@ VirtualGPU::submitKernelInternalHSA( param->hsa_queue = gpuDefQueue->hsaQueueMem()->vmAddress(); param->launch = 0; param->releaseHostCP = 0; + param->parentAQL = vmParentWrap; + param->dedicatedQueue = dev().settings().useDeviceQueue_; + // Fill the scratch buffer information if (hsaKernel.prog().maxScratchRegs() > 0) { gpu::Memory* scratchBuf = dev().scratch(gpuDefQueue->hwRing())->memObjs_[0]; @@ -1880,16 +1898,20 @@ VirtualGPU::submitKernelInternalHSA( gpuDefQueue->virtualQueueDispatcherEnd(gpuEvent, gpuDefQueue->vmMems(), gpuDefQueue->cal_.memCount_, signalAddr, loopStart); + // Set GPU event for the used resources for (uint i = 0; i < memList.size(); ++i) { memList[i]->setBusy(*gpuDefQueue, gpuEvent); } - // Add the termination handshake to the host queue - virtualQueueHandshake(gpuEvent, gpuDefQueue->schedParams_->gslResource(), - vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE, - vmParentWrap + offsetof(AmdAqlWrap, child_counter), - signalAddr); + if (dev().settings().useDeviceQueue_) { + // Add the termination handshake to the host queue + virtualQueueHandshake(gpuEvent, gpuDefQueue->schedParams_->gslResource(), + vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE, + vmParentWrap + offsetof(AmdAqlWrap, child_counter), + signalAddr, dev().settings().useDeviceQueue_); + } + ++gpuDefQueue->schedParamIdx_ %= gpuDefQueue->schedParams_->size() / sizeof(SchedulerParam); //! 
\todo optimize the wrap around diff --git a/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.hpp b/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.hpp index 79af320c25..93b09a5a0f 100644 --- a/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.hpp +++ b/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.hpp @@ -387,6 +387,11 @@ public: //! Update virtual queue header void writeVQueueHeader(VirtualGPU& hostQ, uint64_t kernelTable); + //! Returns TRUE if virtual queue was successfully allocated + bool createVirtualQueue( + uint deviceQueueSize //!< Device queue size + ); + EngineType engineID_; //!< Engine ID for this VirtualGPU ResourceSlots slots_; //!< Resource slots for kernel arguments State state_; //!< virtual GPU current state @@ -488,11 +493,6 @@ private: const amd::BufferRect& dstRect //!< region of destination for copy ); - //! Returns TRUE if virtual queue was successfully allocatted - bool createVirtualQueue( - uint deviceQueueSize //!< Device queue size - ); - GslKernels gslKernels_; //!< GSL kernel descriptors GslKernelDesc* activeKernelDesc_; //!< active GSL kernel descriptors GpuEvents gpuEvents_; //!< GPU events @@ -529,6 +529,7 @@ private: Memory* virtualQueue_; //!< Virtual device queue Memory* schedParams_; //!< The scheduler parameters uint schedParamIdx_; //!< Index in the scheduler parameters buffer + uint deviceQueueSize_; //!< Device queue size Memory* hsaQueueMem_; //!< Memory for the amd_queue_t object }; diff --git a/projects/clr/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp b/projects/clr/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp index 46c078df59..303ff0c120 100644 --- a/projects/clr/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp +++ b/projects/clr/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp @@ -1290,9 +1290,9 @@ CALGSLContext::virtualQueueDispatcherEnd(GpuEvent& event, const gslMemObject* me void CALGSLContext::virtualQueueHandshake(GpuEvent& event, const gslMemObject mem, mcaddr parentState, - 
uint32 newStateValue, mcaddr parentChildCounter, mcaddr signal) + uint32 newStateValue, mcaddr parentChildCounter, mcaddr signal, bool dedicatedQueue) { eventBegin(MainEngine); - m_cs->VirtualQueueHandshake(mem, parentState, newStateValue, parentChildCounter, signal); + m_cs->VirtualQueueHandshake(mem, parentState, newStateValue, parentChildCounter, signal, dedicatedQueue); eventEnd(MainEngine, event); } diff --git a/projects/clr/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.h b/projects/clr/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.h index 849101423e..2f7cc62cea 100644 --- a/projects/clr/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.h +++ b/projects/clr/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.h @@ -47,7 +47,8 @@ public: uint32 numMems, gslMemObject scratch, const void* cpuKernelCode, uint64 hsaQueueVA); mcaddr virtualQueueDispatcherStart(); void virtualQueueDispatcherEnd(GpuEvent& event, const gslMemObject* mems, uint32 numMems, mcaddr signal, mcaddr loopStart); - void virtualQueueHandshake(GpuEvent& event, const gslMemObject mem, mcaddr parentState, uint32 newStateValue, mcaddr parentChildCounter, mcaddr signal); + void virtualQueueHandshake(GpuEvent& event, const gslMemObject mem, mcaddr parentState, + uint32 newStateValue, mcaddr parentChildCounter, mcaddr signal, bool dedicatedQueue); bool isDone(GpuEvent* event); void waitForEvent(GpuEvent* event); void flushIOCaches() const; diff --git a/projects/clr/rocclr/runtime/utils/flags.hpp b/projects/clr/rocclr/runtime/utils/flags.hpp index a36f7ea156..862c7d199b 100644 --- a/projects/clr/rocclr/runtime/utils/flags.hpp +++ b/projects/clr/rocclr/runtime/utils/flags.hpp @@ -150,6 +150,8 @@ release(uint, GPU_PRINT_CHILD_KERNEL, 0, \ "Prints the specified number of the child kernels") \ release(bool, GPU_DIRECT_SRD, true, \ "Use indirect SRD access in HSAIL") \ +release(bool, GPU_USE_DEVICE_QUEUE, false, \ + "Use a dedicated device queue for the actual submissions") \ release(bool, 
AMD_DEPTH_MSAA_INTEROP, false, \ "Enable depth stencil and MSAA buffer interop") \ release(bool, AMD_THREAD_TRACE_ENABLE, false, \