From cd9d1dfd35b19a49fbe755092dc332c4172bb66f Mon Sep 17 00:00:00 2001 From: foreman Date: Thu, 7 Jul 2016 00:56:14 -0400 Subject: [PATCH] P4 to Git Change 1288063 by bsumner@bsumner-lin-opencl on 2016/07/06 17:07:15 SWDEV-97663 - optimize cbrt for AMD devices Affected files ... ... //depot/stg/opencl/drivers/opencl/library/ocml/src/cbrtF.cl#5 edit ... //depot/stg/opencl/drivers/opencl/library/ocml/src/rcbrtF.cl#4 edit --- rocclr/runtime/device/pal/palblit.cpp | 2 +- rocclr/runtime/device/pal/paldevice.cpp | 1 - rocclr/runtime/device/pal/palprintf.cpp | 2 +- rocclr/runtime/device/pal/palresource.cpp | 12 ++- rocclr/runtime/device/pal/palresource.hpp | 9 +- rocclr/runtime/device/pal/palvirtual.cpp | 121 +++++++++++----------- rocclr/runtime/device/pal/palvirtual.hpp | 25 ++--- 7 files changed, 82 insertions(+), 90 deletions(-) diff --git a/rocclr/runtime/device/pal/palblit.cpp b/rocclr/runtime/device/pal/palblit.cpp index 66d02aba9d..5d5ef73471 100644 --- a/rocclr/runtime/device/pal/palblit.cpp +++ b/rocclr/runtime/device/pal/palblit.cpp @@ -2678,7 +2678,7 @@ KernelBlitManager::writeRawData( const void* data ) const { - static_cast(memory).writeRawData(gpu(), 0, size, data, false); + static_cast(memory).writeRawData(gpu(), size, data, false); synchronize(); } diff --git a/rocclr/runtime/device/pal/paldevice.cpp b/rocclr/runtime/device/pal/paldevice.cpp index 608f676088..ab38c8059c 100644 --- a/rocclr/runtime/device/pal/paldevice.cpp +++ b/rocclr/runtime/device/pal/paldevice.cpp @@ -706,7 +706,6 @@ Device::create(Pal::IDevice* device) // palSettings ... palSettings->textureOptLevel = Pal::TextureFilterOptimizationsDisabled; palSettings->forceHighClocks = appProfile_.enableHighPerformanceState(); - palSettings->cmdBufBatchedSubmitChainLimit = 0; // Commit the new settings for the device result = iDev()->CommitSettingsAndInit(); diff --git a/rocclr/runtime/device/pal/palprintf.cpp b/rocclr/runtime/device/pal/palprintf.cpp index 1d81036668..40d902b377 100644 --- a/rocclr/runtime/device/pal/palprintf.cpp +++ b/rocclr/runtime/device/pal/palprintf.cpp @@ -619,7 +619,7 @@ PrintfDbgHSA::init( // Copy offset and number of bytes available for printf data // into the corresponding location in the debug buffer - dbgBuffer_->writeRawData(gpu, 0, initSize, sysMem, true); + dbgBuffer_->writeRawData(gpu, initSize, sysMem, true); } return true; } diff --git a/rocclr/runtime/device/pal/palresource.cpp b/rocclr/runtime/device/pal/palresource.cpp index b6f47d0f0e..54e56454b2 100644 --- a/rocclr/runtime/device/pal/palresource.cpp +++ b/rocclr/runtime/device/pal/palresource.cpp @@ -1072,10 +1072,9 @@ Resource::free() void Resource::writeRawData( VirtualGPU& gpu, - size_t offset, - size_t size, + size_t size, const void* data, - bool waitForEvent) const + bool waitForEvent) const { GpuEvent event; @@ -1083,8 +1082,11 @@ Resource::writeRawData( // size needs to be DWORD aligned assert((size & 3) == 0); gpu.eventBegin(MainEngine); + //! @todo Remove cache flush + //! It's a workaround for a PAL crash with embedded data, allocated before any command + gpu.flushCUCaches(); gpu.queue(MainEngine).addCmdMemRef(iMem()); - gpu.iCmd()->CmdUpdateMemory(*iMem(), offset, size, reinterpret_cast(data)); + gpu.iCmd()->CmdUpdateMemory(*iMem(), 0, size, reinterpret_cast(data)); gpu.eventEnd(MainEngine, event); setBusy(gpu, event); @@ -1936,7 +1938,7 @@ Resource::warmUpRenames(VirtualGPU& gpu) uint dummy = 0; const bool NoWait = false; // Write 0 for the buffer paging by VidMM - writeRawData(gpu, 0, sizeof(dummy), &dummy, NoWait); + writeRawData(gpu, sizeof(dummy), &dummy, NoWait); const bool Force = true; rename(gpu, Force); } diff --git a/rocclr/runtime/device/pal/palresource.hpp b/rocclr/runtime/device/pal/palresource.hpp index 9687674c6b..b1fac5b20c 100644 --- a/rocclr/runtime/device/pal/palresource.hpp +++ b/rocclr/runtime/device/pal/palresource.hpp @@ -240,11 +240,10 @@ public: * */ void writeRawData( - VirtualGPU& gpu, //!< Virtual GPU device object - size_t offset, //!< Offset for in the buffer for data - size_t size, //!< Size in bytes of data to be copied(multiple of DWORDS) - const void* data, //!< Data to be copied - bool waitForEvent //!< Wait for event complete + VirtualGPU& gpu, //!< Virtual GPU device object + size_t size, //!< Size in bytes of data to be copied(multiple of DWORDS) + const void* data, //!< Data to be copied + bool waitForEvent //!< Wait for event complete ) const; //! Returns the offset in GPU memory for aliases diff --git a/rocclr/runtime/device/pal/palvirtual.cpp b/rocclr/runtime/device/pal/palvirtual.cpp index e5beeac673..697ed8bb3c 100644 --- a/rocclr/runtime/device/pal/palvirtual.cpp +++ b/rocclr/runtime/device/pal/palvirtual.cpp @@ -155,11 +155,11 @@ VirtualGPU::Queue::removeCmdMemRef(Pal::IGpuMemory* iMem) } uint -VirtualGPU::Queue::submit(bool forceFlush) +VirtualGPU::Queue::submit() { cmdCnt_++; uint id = cmdBufIdCurrent_; - if ((cmdCnt_ > MaxCommands) || forceFlush) { + if ((cmdCnt_ > MaxCommands) || GPU_FLUSH_ON_EXECUTION) { if (!flush()) { return GpuEvent::InvalidID; } @@ -238,11 +238,6 @@ VirtualGPU::Queue::flush() return false; } - // Reset command buffer, so CB chunks could be reused - if (Pal::Result::Success != iCmdBuffs_[cmdBufIdSlot_]->Reset(nullptr, false)) { - LogError("PAL failed CB reset!"); - return false; - } // Start command buffer building Pal::CmdBufferBuildInfo cmdBuildInfo = {}; if (Pal::Result::Success != iCmdBuffs_[cmdBufIdSlot_]->Begin(cmdBuildInfo)) { @@ -596,44 +591,41 @@ VirtualGPU::createVirtualQueue(uint deviceQueueSize) if ((virtualQueue_ == nullptr) || !virtualQueue_->create(type)) { return false; } - - if (GPU_PRINT_CHILD_KERNEL != 0) { - address ptr = reinterpret_cast
( - virtualQueue_->map(this, Resource::WriteOnly)); - if (nullptr == ptr) { - return false; - } + address ptr = reinterpret_cast
( + virtualQueue_->map(this, Resource::WriteOnly)); + if (nullptr == ptr) { + return false; } + // Clear memory + memset(ptr, 0, allocSize); + uint64_t vaBase = virtualQueue_->vmAddress(); + AmdVQueueHeader* header = reinterpret_cast(ptr); - uint64_t vaBase = virtualQueue_->vmAddress(); - AmdVQueueHeader header = {}; // Initialize the virtual queue header - header.aql_slot_num = numSlots; - header.event_slot_num = dev().settings().numDeviceEvents_; - header.event_slot_mask = vaBase + eventMaskOffs; - header.event_slots = vaBase + eventsOffs; - header.aql_slot_mask = vaBase + slotMaskOffs; - header.wait_size = dev().settings().numWaitEvents_; - header.arg_size = dev().info().maxParameterSize_ + 64; - header.mask_groups = maskGroups_; - + header->aql_slot_num = numSlots; + header->event_slot_num = dev().settings().numDeviceEvents_; + header->event_slot_mask = vaBase + eventMaskOffs; + header->event_slots = vaBase + eventsOffs; + header->aql_slot_mask = vaBase + slotMaskOffs; + header->wait_size = dev().settings().numWaitEvents_; + header->arg_size = dev().info().maxParameterSize_ + 64; + header->mask_groups = maskGroups_; vqHeader_ = new AmdVQueueHeader; if (nullptr == vqHeader_) { return false; } - *vqHeader_ = header; - - virtualQueue_->writeRawData(*this, 0, sizeof(AmdVQueueHeader), &header, false); + *vqHeader_ = *header; // Go over all slots and perform initialization - AmdAqlWrap slot = {}; - size_t offset = sizeof(AmdVQueueHeader); + AmdAqlWrap* slots = reinterpret_cast(&header[1]); for (uint i = 0; i < numSlots; ++i) { uint64_t argStart = vaBase + argOffs + i * singleArgSize; - slot.aql.kernarg_address = reinterpret_cast(argStart); - slot.wait_list = argStart + dev().info().maxParameterSize_ + 64; - virtualQueue_->writeRawData(*this, offset, sizeof(AmdAqlWrap), &slot, false); - offset += sizeof(AmdAqlWrap); + slots[i].aql.kernarg_address = reinterpret_cast(argStart); + slots[i].wait_list = argStart + dev().info().maxParameterSize_ + 64; + } + // Upload data back to local memory + if (GPU_PRINT_CHILD_KERNEL == 0) { + virtualQueue_->unmap(this); } schedParams_ = new Memory(dev(), 64 * Ki); @@ -641,7 +633,7 @@ VirtualGPU::createVirtualQueue(uint deviceQueueSize) return false; } - address ptr = reinterpret_cast
(schedParams_->map(this)); + ptr = reinterpret_cast
(schedParams_->map(this)); deviceQueueSize_ = deviceQueueSize; @@ -697,9 +689,9 @@ VirtualGPU::create(bool profiling, uint deviceQueueSize) state_.profiling_ = profiling; Pal::CmdAllocatorCreateInfo createInfo = {}; - createInfo.flags.threadSafe = true; // \todo forces PAL to reuse CBs, but requires postamble - createInfo.flags.autoMemoryReuse = false; + createInfo.flags.autoMemoryReuse = true; + createInfo.flags.threadSafe = false; createInfo.allocInfo[Pal::CommandDataAlloc].allocHeap = Pal::GpuHeapGartCacheable; createInfo.allocInfo[Pal::CommandDataAlloc].allocSize = 128 * Ki; @@ -2108,28 +2100,34 @@ VirtualGPU::submitKernelInternal( } if (!dev().settings().useDeviceQueue_) { + Unimplemented(); +/* // Add the termination handshake to the host queue eventBegin(MainEngine); - //iCmd()->CmdVirtualQueueHandshake(*gpuDefQueue->schedParams_->iMem(), - // vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE, - // vmParentWrap + offsetof(AmdAqlWrap, child_counter), - // 0, dev().settings().useDeviceQueue_); + cs()->VirtualQueueHandshake(gpuDefQueue->schedParams_->iMem(), + vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE, + vmParentWrap + offsetof(AmdAqlWrap, child_counter), + 0, dev().settings().useDeviceQueue_); eventEnd(MainEngine, gpuEvent); +*/ } // Get the global loop start before the scheduler - //Pal::gpusize loopStart = gpuDefQueue->iCmd()->CmdVirtualQueueDispatcherStart(); - //static_cast(gpuDefQueue->blitMgr()).runScheduler( - // *gpuDefQueue->virtualQueue_, - // *gpuDefQueue->schedParams_, gpuDefQueue->schedParamIdx_, - // gpuDefQueue->vqHeader_->aql_slot_num / (DeviceQueueMaskSize * maskGroups_)); + Unimplemented(); +/* + mcaddr loopStart = gpuDefQueue->cs()->VirtualQueueDispatcherStart(); + static_cast(gpuDefQueue->blitMgr()).runScheduler( + *gpuDefQueue->virtualQueue_, + *gpuDefQueue->schedParams_, gpuDefQueue->schedParamIdx_, + gpuDefQueue->vqHeader_->aql_slot_num / (DeviceQueueMaskSize * maskGroups_)); const static bool FlushL2 = true; gpuDefQueue->flushCUCaches(FlushL2); // Get the address of PM4 template and add write it to params //! @note DMA flush must not occur between patch and the scheduler + mcaddr patchStart = gpuDefQueue->cs()->VirtualQueueDispatcherStart(); +*/ Pal::gpusize patchStart = 0; - //Pal::gpusize patchStart = gpuDefQueue->iCmd()->CmdVirtualQueueDispatcherStart(); // Program parameters for the scheduler SchedulerParam* param = &reinterpret_cast (gpuDefQueue->schedParams_->data())[gpuDefQueue->schedParamIdx_]; @@ -2170,28 +2168,31 @@ VirtualGPU::submitKernelInternal( Pal::gpusize signalAddr = gpuDefQueue->schedParams_->vmAddress() + gpuDefQueue->schedParamIdx_ * sizeof(SchedulerParam); + Unimplemented(); +/* gpuDefQueue->eventBegin(MainEngine); - //gpuDefQueue->iCmd()->CmdVirtualQueueDispatcherEnd( - // signalAddr, loopStart, gpuDefQueue->vqHeader_->aql_slot_num / - // (DeviceQueueMaskSize * maskGroups_)); - // Note: Device enqueue can't have extra commands after INDIRECT_BUFFER call. - // Thus TS command for profiling has to follow in the next CB. - constexpr bool ForceSubmitFirst = true; - gpuDefQueue->eventEnd(MainEngine, gpuEvent, ForceSubmitFirst); - + gpuDefQueue->cs()->VirtualQueueDispatcherEnd( + gpuDefQueue->vmMems(), gpuDefQueue->cal_.memCount_, + signalAddr, loopStart, gpuDefQueue->vqHeader_->aql_slot_num / + (DeviceQueueMaskSize * maskGroups_)); + gpuDefQueue->eventEnd(MainEngine, gpuEvent); +*/ // Set GPU event for the used resources for (uint i = 0; i < memList.size(); ++i) { memList[i]->setBusy(*gpuDefQueue, gpuEvent); } if (dev().settings().useDeviceQueue_) { + Unimplemented(); +/* // Add the termination handshake to the host queue eventBegin(MainEngine); - //iCmd()->CmdVirtualQueueHandshake(*gpuDefQueue->schedParams_->iMem(), - // vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE, - // vmParentWrap + offsetof(AmdAqlWrap, child_counter), - // signalAddr, dev().settings().useDeviceQueue_); + cs()->VirtualQueueHandshake(gpuDefQueue->schedParams_->iMem(), + vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE, + vmParentWrap + offsetof(AmdAqlWrap, child_counter), + signalAddr, dev().settings().useDeviceQueue_); eventEnd(MainEngine, gpuEvent); +*/ } ++gpuDefQueue->schedParamIdx_ %= @@ -3249,7 +3250,7 @@ VirtualGPU::writeVQueueHeader(VirtualGPU& hostQ, uint64_t kernelTable) { const static bool Wait = true; vqHeader_->kernel_table = kernelTable; - virtualQueue_->writeRawData(hostQ, 0, sizeof(AmdVQueueHeader), vqHeader_, Wait); + virtualQueue_->writeRawData(hostQ, sizeof(AmdVQueueHeader), vqHeader_, !Wait); } void diff --git a/rocclr/runtime/device/pal/palvirtual.hpp b/rocclr/runtime/device/pal/palvirtual.hpp index 5f9a73d303..c566a3f646 100644 --- a/rocclr/runtime/device/pal/palvirtual.hpp +++ b/rocclr/runtime/device/pal/palvirtual.hpp @@ -79,7 +79,7 @@ public: //! Flushes the current command buffer to HW //! Returns ID associated with the submission - uint submit(bool forceFlush); + uint submit(); bool flush(); @@ -401,17 +401,15 @@ public: //! Returns queue, associated with VirtualGPU Queue& queue(EngineType id) const { return *queues_[id]; } - void flushCUCaches(bool flushL2 = false) const + void flushCUCaches() const { Pal::BarrierInfo barrier = {}; barrier.pipePointWaitCount = 1; Pal::HwPipePoint point = Pal::HwPipePostCs; barrier.pPipePoints = &point; barrier.transitionCount = 1; - uint32_t cacheMask = (flushL2) ? Pal::CoherCopy : Pal::CoherShader; - Pal::BarrierTransition trans = { cacheMask, cacheMask, - { nullptr, { { Pal::ImageAspect::Color, 0, 0 }, 0, 0 }, - Pal::LayoutShaderRead, Pal::LayoutShaderRead}}; + Pal::BarrierTransition trans = {Pal::CoherShader, Pal::CoherShader, + {nullptr, { {Pal::ImageAspect::Color, 0, 0}, 0, 0 }, Pal::LayoutShaderRead, Pal::LayoutShaderRead}}; barrier.pTransitions = &trans; barrier.waitPoint = Pal::HwPipePreCs; iCmd()->CmdBarrier(barrier); @@ -422,17 +420,10 @@ public: profileEvent(engId, Begin); } - void eventEnd(EngineType engId, GpuEvent& event, bool forceExec = false) const { - constexpr bool End = false; - if (forceExec) { - constexpr bool ForceFlush = true; - event.id = queues_[engId]->submit(ForceFlush); - profileEvent(engId, End); - } - else { - profileEvent(engId, End); - event.id = queues_[engId]->submit(GPU_FLUSH_ON_EXECUTION); - } + void eventEnd(EngineType engId, GpuEvent& event) const { + const static bool End = false; + profileEvent(engId, End); + event.id = queues_[engId]->submit(); event.engineId_ = engId; }