From 8fa184db38a510ab1d2b6aeb7450dc40a7fefeea Mon Sep 17 00:00:00 2001 From: foreman Date: Tue, 12 Aug 2014 18:49:08 -0400 Subject: [PATCH] P4 to Git Change 1065597 by gandryey@gera-dev-w7 on 2014/08/12 18:38:45 ECR #304775 - Device enqueuing - Provide scratch buffer offset for generic address space - Use single scratch buffer for all available queues. Each queue will have a unique subbuffer in the global buffer Affected files ... ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudevice.cpp#454 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudevice.hpp#129 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpusched.hpp#11 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuschedcl.cpp#24 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.cpp#329 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.hpp#120 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp#63 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gslbe/src/rt/GSLContext.h#37 edit --- rocclr/runtime/device/gpu/gpudevice.cpp | 78 ++++++++++++++----- rocclr/runtime/device/gpu/gpudevice.hpp | 10 ++- rocclr/runtime/device/gpu/gpusched.hpp | 2 +- rocclr/runtime/device/gpu/gpuschedcl.cpp | 20 ++--- rocclr/runtime/device/gpu/gpuvirtual.cpp | 19 ++--- rocclr/runtime/device/gpu/gpuvirtual.hpp | 1 - .../device/gpu/gslbe/src/rt/GSLContext.cpp | 5 +- .../device/gpu/gslbe/src/rt/GSLContext.h | 2 +- 8 files changed, 93 insertions(+), 44 deletions(-) diff --git a/rocclr/runtime/device/gpu/gpudevice.cpp b/rocclr/runtime/device/gpu/gpudevice.cpp index f8e5c32a66..036a75a995 100644 --- a/rocclr/runtime/device/gpu/gpudevice.cpp +++ b/rocclr/runtime/device/gpu/gpudevice.cpp @@ -373,6 +373,7 @@ Device::Device() , resourceCache_(NULL) , heapInitComplete_(false) , xferQueue_(NULL) + , globalScratchBuf_(NULL) , srdManager_(NULL) { } @@ -389,6 +390,8 @@ Device::~Device() scratch_[s] = NULL; } + delete globalScratchBuf_; + // Destroy transfer queue delete xferQueue_; @@ -2273,7 +2276,6 @@ Device::ScratchBuffer::destroyMemory() delete memObjs_[i]; memObjs_[i] = NULL; } - regNum_ = 0; } bool @@ -2282,30 +2284,63 @@ Device::allocScratch(uint regNum, const VirtualGPU* vgpu) if (regNum > 0) { // Serialize the scratch buffer allocation code amd::ScopedLock lk(*lockAsyncOps_); - uint s = vgpu->hwRing(); + uint sb = vgpu->hwRing(); // Check if the current buffer isn't big enough - if (regNum > scratch_[s]->regNum_) { + if (regNum > scratch_[sb]->regNum_) { // Stall all command queues, since runtime will reallocate memory ScopedLockVgpus lock(*this); - std::vector& mems = scratch_[s]->memObjs_; - // Calculate the size of the new buffer - size_t size = calcScratchBufferSize(regNum); + scratch_[sb]->regNum_ = regNum; + size_t size = 0; + uint offset = 0; - scratch_[s]->destroyMemory(); - - // Loop through all memory objects and reallocate them - for (uint i = 0; i < mems.size(); ++i) { - // Allocate new buffer - mems[i] = new gpu::Memory(*this, size); - if ((mems[i] == NULL) || !mems[i]->create(Resource::Scratch)) { - LogError("Couldn't allocate scratch memory"); - scratch_[s]->regNum_ = 0; - return false; + // Destroy all views + for (uint s = 0; s < scratch_.size(); ++s) { + ScratchBuffer* scratchBuf = scratch_[s]; + if (scratchBuf->regNum_ > 0) { + scratchBuf->destroyMemory(); + // Calculate the size of the scratch buffer for a queue + scratchBuf->size_ = calcScratchBufferSize(scratchBuf->regNum_); + scratchBuf->offset_ = offset; + size += scratchBuf->size_ * scratchBuf->memObjs_.size(); + offset += scratchBuf->size_; + } + } + + delete globalScratchBuf_; + + // Allocate new buffer. + globalScratchBuf_ = new gpu::Memory(*this, size); + if ((globalScratchBuf_ == NULL) || + !globalScratchBuf_->create(Resource::Scratch)) { + LogError("Couldn't allocate scratch memory"); + for (uint s = 0; s < scratch_.size(); ++s) { + scratch_[s]->regNum_ = 0; + } + return false; + } + + for (uint s = 0; s < scratch_.size(); ++s) { + std::vector& mems = scratch_[s]->memObjs_; + + // Loop through all memory objects and reallocate them + for (uint i = 0; i < mems.size(); ++i) { + if (scratch_[s]->regNum_ > 0) { + // Allocate new buffer + mems[i] = new gpu::Memory(*this, scratch_[s]->size_); + Resource::ViewParams view; + view.resource_ = globalScratchBuf_; + view.offset_ = scratch_[s]->offset_ + i * scratch_[s]->size_; + view.size_ = scratch_[s]->size_; + if ((mems[i] == NULL) || !mems[i]->create(Resource::View, &view)) { + LogError("Couldn't allocate a scratch view"); + scratch_[s]->regNum_ = 0; + return false; + } + } } } - scratch_[s]->regNum_ = regNum; } } return true; @@ -2341,8 +2376,13 @@ Device::validateKernel(const amd::Kernel& kernel, const device::VirtualDevice* v void Device::destroyScratchBuffers() { - for (uint s = 0; s < scratch_.size(); ++s) { - scratch_[s]->destroyMemory(); + if (globalScratchBuf_ != NULL) { + for (uint s = 0; s < scratch_.size(); ++s) { + scratch_[s]->destroyMemory(); + scratch_[s]->regNum_ = 0; + } + delete globalScratchBuf_; + globalScratchBuf_ = NULL; } } diff --git a/rocclr/runtime/device/gpu/gpudevice.hpp b/rocclr/runtime/device/gpu/gpudevice.hpp index e7f384ecd5..ffcc5f056a 100644 --- a/rocclr/runtime/device/gpu/gpudevice.hpp +++ b/rocclr/runtime/device/gpu/gpudevice.hpp @@ -297,9 +297,11 @@ public: { uint regNum_; //!< The number of used scratch registers std::vector memObjs_; //!< Memory objects for scratch buffers + uint offset_; //!< Offset from the global scratch store + uint size_; //!< Scratch buffer size on this queue //! Default constructor - ScratchBuffer(uint numMems): regNum_(0), memObjs_(numMems) {} + ScratchBuffer(uint numMems): regNum_(0), memObjs_(numMems), offset_(0) {} //! Default constructor ~ScratchBuffer(); @@ -524,6 +526,9 @@ public: const ScratchBuffer* scratch(uint idx) const { return scratch_[idx]; } + //! Returns the global scratch buffer + Memory* globalScratchBuf() const { return globalScratchBuf_; }; + //! Destroys scratch buffer memory void destroyScratchBuffers(); @@ -613,9 +618,10 @@ private: std::vector* mapCache_; //!< Map cache info structure ResourceCache* resourceCache_; //!< CAL resource cache Engines engines_; //!< Available engines on device - bool heapInitComplete_; //!< Keep track of initialization status of heap resources + bool heapInitComplete_; //!< Keep track of initialization status of heap resources VirtualGPU* xferQueue_; //!< Transfer queue std::vector scratch_; //!< Scratch buffers for kernels + Memory* globalScratchBuf_; //!< Global scratch buffer SrdManager* srdManager_; //!< SRD manager object static AppProfile appProfile_; //!< application profile diff --git a/rocclr/runtime/device/gpu/gpusched.hpp b/rocclr/runtime/device/gpu/gpusched.hpp index bb456c3b60..5ace3cb0c7 100644 --- a/rocclr/runtime/device/gpu/gpusched.hpp +++ b/rocclr/runtime/device/gpu/gpusched.hpp @@ -68,7 +68,7 @@ struct SchedulerParam { uint32_t releaseHostCP; //!< Releases CP on the host queue uint64_t parentAQL; //!< Host parent AmdAqlWrap packet uint32_t dedicatedQueue; //!< Scheduler uses a dedicated queue - uint32_t reserved; //!< Reserved field + uint32_t scratchOffset; //!< Scratch buffer offset }; } // namespace gpu diff --git a/rocclr/runtime/device/gpu/gpuschedcl.cpp b/rocclr/runtime/device/gpu/gpuschedcl.cpp index 98be8f0c39..c08a88e02c 100644 --- a/rocclr/runtime/device/gpu/gpuschedcl.cpp +++ b/rocclr/runtime/device/gpu/gpuschedcl.cpp @@ -92,7 +92,7 @@ typedef struct _SchedulerParam { uint releaseHostCP; //!< Releases CP on the host queue ulong parentAQL; //!< Host parent AmdAqlWrap packet uint dedicatedQueue; //!< Scheduler uses a dedicated queue - uint reserved; //!< Reserved field + uint scratchOffset; //!< Scratch buffer offset } SchedulerParam; typedef struct _HwDispatch { @@ -152,7 +152,7 @@ typedef struct _HwDispatch { uint shPrivateHi; // 0x00000000 ---- dstAddressHi uint user4; // 0xC0027602 -- TYPE 3, SET_SH_REG, TYPE:COMPUTE (2 values) uint offsUser4; // 0x00000248 ---- OFFSET - uint privOffs; // 0x00000000 ---- COMPUTE_USER_DATA_10: DATA = 0x0 + uint scratchOffs; // 0x00000000 ---- COMPUTE_USER_DATA_10: DATA = 0x0 uint privSize; // 0x00000030 ---- COMPUTE_USER_DATA_11: DATA = 0x30 uint packet4; // 0xC0031502 -- TYPE 3, DISPATCH_DIRECT, TYPE:COMPUTE uint glbSizeX; // 0x00000000 @@ -170,10 +170,11 @@ static inline void dispatch( volatile __global HwDispatch* dispatch, __global HsaAqlDispatchPacket* aqlPkt, - uint scratchSize, - uint numMaxWaves, - ulong scratch, - ulong hsaQueue) + ulong scratch, + ulong hsaQueue, + uint scratchSize, + uint scratchOffset, + uint numMaxWaves) { const uint UsrRegOffset = 0x240; const uint Pm4Nop = 0xC0001002; @@ -258,8 +259,9 @@ dispatch( // flatScratchEna = (flags & 0x20); if (flags & 0x20) { dispatch->copyData = Pm4CopyReg; - dispatch->scratchAddrLo = (uint)(scratch >> 16); + dispatch->scratchAddrLo = (uint)((scratch - scratchOffset) >> 16); dispatch->offsUser4 = UsrRegOffset + usrRegCnt; + dispatch->scratchOffs = scratchOffset; dispatch->privSize = privateSize; } else { @@ -421,8 +423,8 @@ scheduler( (__hsail_get_clock() * (ulong)param->eng_clk) >> 10; } // Launch child kernel .... - dispatch(hwDisp, &disp->aql, param->scratchSize, param->numMaxWaves, - param->scratch, param->hsa_queue); + dispatch(hwDisp, &disp->aql, param->scratch, param->hsa_queue, + param->scratchSize, param->scratchOffset, param->numMaxWaves); disp->state = AQL_WRAP_BUSY; releaseWaitEvents((__global AmdEvent**)(disp->wait_list), disp->wait_num, (__global uint*)queue->event_slot_mask, diff --git a/rocclr/runtime/device/gpu/gpuvirtual.cpp b/rocclr/runtime/device/gpu/gpuvirtual.cpp index 730c6e744e..3ddeb86371 100644 --- a/rocclr/runtime/device/gpu/gpuvirtual.cpp +++ b/rocclr/runtime/device/gpu/gpuvirtual.cpp @@ -377,7 +377,6 @@ VirtualGPU::VirtualGPU( , numVmMems_(0) , dmaFlushMgmt_(device) , numGrpCb_(NULL) - , scratchRegNum_(0) , hwRing_(0) , readjustTimeGPU_(0) , currTs_(NULL) @@ -645,7 +644,7 @@ VirtualGPU::~VirtualGPU() //!@note OCLtst uses single device with multiple tests //! Release memory only if it's the last command queue. //! The first queue is reserved for the transfers on device - if ((scratchRegNum_ > 0) && (gpuDevice_.numOfVgpus_ <= 1)) { + if (gpuDevice_.numOfVgpus_ <= 1) { gpuDevice_.destroyScratchBuffers(); } @@ -1736,12 +1735,14 @@ VirtualGPU::submitKernelInternalHSA( } gslMemObject scratch = NULL; + uint scratchOffset = 0; // Check if the device allocated more registers than the old setup if (hsaKernel.workGroupInfo()->scratchRegs_ > 0) { - const std::vector& mems = dev().scratch(hwRing())->memObjs_; + const Device::ScratchBuffer* scratchObj = dev().scratch(hwRing()); + const std::vector& mems = scratchObj->memObjs_; scratch = mems[0]->gslResource(); memList.push_back(mems[0]); - scratchRegNum_ = dev().scratch(hwRing())->regNum_; + scratchOffset = scratchObj->offset_; } // Add GSL handle to the memory list for VidMM @@ -1752,7 +1753,7 @@ VirtualGPU::submitKernelInternalHSA( GpuEvent gpuEvent; // Run AQL dispatch in HW runAqlDispatch(gpuEvent, aqlPkt, vmMems(), cal_.memCount_, - scratch, hsaKernel.cpuAqlCode(), hsaQueueMem_->vmAddress()); + scratch, scratchOffset, hsaKernel.cpuAqlCode(), hsaQueueMem_->vmAddress()); if (hsaKernel.dynamicParallelism()) { // Make sure exculsive access to the device queue @@ -1884,12 +1885,14 @@ VirtualGPU::submitKernelInternalHSA( param->scratchSize = scratchBuf->size(); param->scratch = scratchBuf->vmAddress(); param->numMaxWaves = 32 * dev().info().maxComputeUnits_; + param->scratchOffset = dev().scratch(gpuDefQueue->hwRing())->offset_; memList.push_back(scratchBuf); } else { param->numMaxWaves = 0; param->scratchSize = 0; param->scratch = 0; + param->scratchOffset = 0; } // Add all kernels in the program to the mem list. @@ -2180,7 +2183,6 @@ VirtualGPU::releaseMemory(gslMemObject gslResource, bool wait) for (uint i = 0; i < mems.size(); ++i) { if ((mems[i] != NULL) && (mems[i]->gslResource() == gslResource)) { setScratchBuffer(NULL, i); - scratchRegNum_ = 0; } } } @@ -2986,14 +2988,13 @@ VirtualGPU::waitEventLock(CommandBatch* cb) void VirtualGPU::validateScratchBuffer(const Kernel* kernel) { - // Check if the device allocated more registers than the old setup - if (dev().scratch(hwRing())->regNum_ > scratchRegNum_) { + // Check if a scratch buffer is required + if (dev().scratch(hwRing())->regNum_ > 0) { const std::vector& mems = dev().scratch(hwRing())->memObjs_; for (uint i = 0; i < mems.size(); ++i) { // Setup scratch buffer setScratchBuffer(mems[i]->gslResource(), i); } - scratchRegNum_ = dev().scratch(hwRing())->regNum_; } } diff --git a/rocclr/runtime/device/gpu/gpuvirtual.hpp b/rocclr/runtime/device/gpu/gpuvirtual.hpp index 93b09a5a0f..79e070bb33 100644 --- a/rocclr/runtime/device/gpu/gpuvirtual.hpp +++ b/rocclr/runtime/device/gpu/gpuvirtual.hpp @@ -519,7 +519,6 @@ private: CommandBatchList cbList_; //!< List of command batches ConstBuffer* numGrpCb_; //!< Constant buffer for 8xx workaround - uint scratchRegNum_; //!< Number of scratch registers used in this queue uint hwRing_; //!< HW ring used on this virtual device uint64_t readjustTimeGPU_; //!< Readjust time between GPU and CPU timestamps diff --git a/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp b/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp index 303ff0c120..6c657344e0 100644 --- a/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp +++ b/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp @@ -1266,10 +1266,11 @@ CALGSLContext::writeTimer(bool sdma, const gslMemObject mem, uint32 offset) cons void CALGSLContext::runAqlDispatch(GpuEvent& event, const void* aqlPacket, - const gslMemObject* mems, uint32 numMems, gslMemObject scratch, const void* cpuKernelCode, uint64 hsaQueueVA) + const gslMemObject* mems, uint32 numMems, gslMemObject scratch, uint32 scratchOffset, + const void* cpuKernelCode, uint64 hsaQueueVA) { eventBegin(MainEngine); - m_cs->AqlDispatch(aqlPacket, mems, numMems, scratch, cpuKernelCode, hsaQueueVA); + m_cs->AqlDispatch(aqlPacket, mems, numMems, scratch, scratchOffset, cpuKernelCode, hsaQueueVA); eventEnd(MainEngine, event); } diff --git a/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.h b/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.h index 2f7cc62cea..cd02a46bc3 100644 --- a/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.h +++ b/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.h @@ -44,7 +44,7 @@ public: bool runProgramGrid(GpuEvent& event, const ProgramGrid* pProgramGrid, const gslMemObject* mems, uint32 numMems); bool runProgramVideoDecode(GpuEvent& event, gslMemObject mo, const CALprogramVideoDecode& decode); void runAqlDispatch(GpuEvent& event, const void* aqlPacket, const gslMemObject* mems, - uint32 numMems, gslMemObject scratch, const void* cpuKernelCode, uint64 hsaQueueVA); + uint32 numMems, gslMemObject scratch, uint32 scratchOffset, const void* cpuKernelCode, uint64 hsaQueueVA); mcaddr virtualQueueDispatcherStart(); void virtualQueueDispatcherEnd(GpuEvent& event, const gslMemObject* mems, uint32 numMems, mcaddr signal, mcaddr loopStart); void virtualQueueHandshake(GpuEvent& event, const gslMemObject mem, mcaddr parentState,