From 13b474485bc307cbc64b2ebff161e3d843a58b1b Mon Sep 17 00:00:00 2001 From: foreman Date: Thu, 7 Jul 2016 03:52:33 -0400 Subject: [PATCH] P4 to Git Change 1288113 by jsjodin@jsjodin-git2p4-llvm on 2016/07/06 18:23:12 SWDEV-3 - AMDGPU: Expand unaligned accesses early Due to visit order problems, in the case of an unaligned copy the legalized DAG fails to eliminate extra instructions introduced by the expansion of both unaligned parts. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@274397 91177308-0d34-0410-b5e6-96231b3b80d8 GitHash: d4452f8fcf496a2e19c1a1c9792f5f063f4e9703 Affected files ... ... //depot/stg/opencl/drivers/opencl/compiler/llvm.git/lib/Target/AMDGPU/AMDGPUISelLowering.cpp#3 edit ... //depot/stg/opencl/drivers/opencl/compiler/llvm.git/lib/Target/AMDGPU/AMDGPUISelLowering.h#3 edit ... //depot/stg/opencl/drivers/opencl/compiler/llvm.git/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll#3 edit ... //depot/stg/opencl/drivers/opencl/compiler/llvm.git/test/CodeGen/AMDGPU/sext-in-reg-failure-r600.ll#1 add ... //depot/stg/opencl/drivers/opencl/compiler/llvm.git/test/CodeGen/AMDGPU/sext-in-reg.ll#3 edit ... //depot/stg/opencl/drivers/opencl/compiler/llvm.git/test/CodeGen/AMDGPU/unaligned-load-store.ll#2 edit [ROCm/clr commit: 739bdacc65dc54b68b06f888f43111df65e96bab] --- .../clr/rocclr/runtime/device/pal/palblit.cpp | 2 +- .../rocclr/runtime/device/pal/paldevice.cpp | 1 - .../rocclr/runtime/device/pal/palprintf.cpp | 2 +- .../rocclr/runtime/device/pal/palresource.cpp | 12 +- .../rocclr/runtime/device/pal/palresource.hpp | 9 +- .../rocclr/runtime/device/pal/palvirtual.cpp | 121 +++++++++--------- .../rocclr/runtime/device/pal/palvirtual.hpp | 25 ++-- 7 files changed, 82 insertions(+), 90 deletions(-) diff --git a/projects/clr/rocclr/runtime/device/pal/palblit.cpp b/projects/clr/rocclr/runtime/device/pal/palblit.cpp index 66d02aba9d..5d5ef73471 100644 --- a/projects/clr/rocclr/runtime/device/pal/palblit.cpp +++ b/projects/clr/rocclr/runtime/device/pal/palblit.cpp @@ -2678,7 +2678,7 @@ KernelBlitManager::writeRawData( const void* data ) const { - static_cast(memory).writeRawData(gpu(), 0, size, data, false); + static_cast(memory).writeRawData(gpu(), size, data, false); synchronize(); } diff --git a/projects/clr/rocclr/runtime/device/pal/paldevice.cpp b/projects/clr/rocclr/runtime/device/pal/paldevice.cpp index 608f676088..ab38c8059c 100644 --- a/projects/clr/rocclr/runtime/device/pal/paldevice.cpp +++ b/projects/clr/rocclr/runtime/device/pal/paldevice.cpp @@ -706,7 +706,6 @@ Device::create(Pal::IDevice* device) // palSettings ... palSettings->textureOptLevel = Pal::TextureFilterOptimizationsDisabled; palSettings->forceHighClocks = appProfile_.enableHighPerformanceState(); - palSettings->cmdBufBatchedSubmitChainLimit = 0; // Commit the new settings for the device result = iDev()->CommitSettingsAndInit(); diff --git a/projects/clr/rocclr/runtime/device/pal/palprintf.cpp b/projects/clr/rocclr/runtime/device/pal/palprintf.cpp index 1d81036668..40d902b377 100644 --- a/projects/clr/rocclr/runtime/device/pal/palprintf.cpp +++ b/projects/clr/rocclr/runtime/device/pal/palprintf.cpp @@ -619,7 +619,7 @@ PrintfDbgHSA::init( // Copy offset and number of bytes available for printf data // into the corresponding location in the debug buffer - dbgBuffer_->writeRawData(gpu, 0, initSize, sysMem, true); + dbgBuffer_->writeRawData(gpu, initSize, sysMem, true); } return true; } diff --git a/projects/clr/rocclr/runtime/device/pal/palresource.cpp b/projects/clr/rocclr/runtime/device/pal/palresource.cpp index b6f47d0f0e..54e56454b2 100644 --- a/projects/clr/rocclr/runtime/device/pal/palresource.cpp +++ b/projects/clr/rocclr/runtime/device/pal/palresource.cpp @@ -1072,10 +1072,9 @@ Resource::free() void Resource::writeRawData( VirtualGPU& gpu, - size_t offset, - size_t size, + size_t size, const void* data, - bool waitForEvent) const + bool waitForEvent) const { GpuEvent event; @@ -1083,8 +1082,11 @@ Resource::writeRawData( // size needs to be DWORD aligned assert((size & 3) == 0); gpu.eventBegin(MainEngine); + //! @todo Remove cache flush + //! It's a workaround for a PAL crash with embedded data, allocated before any command + gpu.flushCUCaches(); gpu.queue(MainEngine).addCmdMemRef(iMem()); - gpu.iCmd()->CmdUpdateMemory(*iMem(), offset, size, reinterpret_cast(data)); + gpu.iCmd()->CmdUpdateMemory(*iMem(), 0, size, reinterpret_cast(data)); gpu.eventEnd(MainEngine, event); setBusy(gpu, event); @@ -1936,7 +1938,7 @@ Resource::warmUpRenames(VirtualGPU& gpu) uint dummy = 0; const bool NoWait = false; // Write 0 for the buffer paging by VidMM - writeRawData(gpu, 0, sizeof(dummy), &dummy, NoWait); + writeRawData(gpu, sizeof(dummy), &dummy, NoWait); const bool Force = true; rename(gpu, Force); } diff --git a/projects/clr/rocclr/runtime/device/pal/palresource.hpp b/projects/clr/rocclr/runtime/device/pal/palresource.hpp index 9687674c6b..b1fac5b20c 100644 --- a/projects/clr/rocclr/runtime/device/pal/palresource.hpp +++ b/projects/clr/rocclr/runtime/device/pal/palresource.hpp @@ -240,11 +240,10 @@ public: * */ void writeRawData( - VirtualGPU& gpu, //!< Virtual GPU device object - size_t offset, //!< Offset for in the buffer for data - size_t size, //!< Size in bytes of data to be copied(multiple of DWORDS) - const void* data, //!< Data to be copied - bool waitForEvent //!< Wait for event complete + VirtualGPU& gpu, //!< Virtual GPU device object + size_t size, //!< Size in bytes of data to be copied(multiple of DWORDS) + const void* data, //!< Data to be copied + bool waitForEvent //!< Wait for event complete ) const; //! Returns the offset in GPU memory for aliases diff --git a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp index e5beeac673..697ed8bb3c 100644 --- a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp +++ b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp @@ -155,11 +155,11 @@ VirtualGPU::Queue::removeCmdMemRef(Pal::IGpuMemory* iMem) } uint -VirtualGPU::Queue::submit(bool forceFlush) +VirtualGPU::Queue::submit() { cmdCnt_++; uint id = cmdBufIdCurrent_; - if ((cmdCnt_ > MaxCommands) || forceFlush) { + if ((cmdCnt_ > MaxCommands) || GPU_FLUSH_ON_EXECUTION) { if (!flush()) { return GpuEvent::InvalidID; } @@ -238,11 +238,6 @@ VirtualGPU::Queue::flush() return false; } - // Reset command buffer, so CB chunks could be reused - if (Pal::Result::Success != iCmdBuffs_[cmdBufIdSlot_]->Reset(nullptr, false)) { - LogError("PAL failed CB reset!"); - return false; - } // Start command buffer building Pal::CmdBufferBuildInfo cmdBuildInfo = {}; if (Pal::Result::Success != iCmdBuffs_[cmdBufIdSlot_]->Begin(cmdBuildInfo)) { @@ -596,44 +591,41 @@ VirtualGPU::createVirtualQueue(uint deviceQueueSize) if ((virtualQueue_ == nullptr) || !virtualQueue_->create(type)) { return false; } - - if (GPU_PRINT_CHILD_KERNEL != 0) { - address ptr = reinterpret_cast
( - virtualQueue_->map(this, Resource::WriteOnly)); - if (nullptr == ptr) { - return false; - } + address ptr = reinterpret_cast
( + virtualQueue_->map(this, Resource::WriteOnly)); + if (nullptr == ptr) { + return false; } + // Clear memory + memset(ptr, 0, allocSize); + uint64_t vaBase = virtualQueue_->vmAddress(); + AmdVQueueHeader* header = reinterpret_cast(ptr); - uint64_t vaBase = virtualQueue_->vmAddress(); - AmdVQueueHeader header = {}; // Initialize the virtual queue header - header.aql_slot_num = numSlots; - header.event_slot_num = dev().settings().numDeviceEvents_; - header.event_slot_mask = vaBase + eventMaskOffs; - header.event_slots = vaBase + eventsOffs; - header.aql_slot_mask = vaBase + slotMaskOffs; - header.wait_size = dev().settings().numWaitEvents_; - header.arg_size = dev().info().maxParameterSize_ + 64; - header.mask_groups = maskGroups_; - + header->aql_slot_num = numSlots; + header->event_slot_num = dev().settings().numDeviceEvents_; + header->event_slot_mask = vaBase + eventMaskOffs; + header->event_slots = vaBase + eventsOffs; + header->aql_slot_mask = vaBase + slotMaskOffs; + header->wait_size = dev().settings().numWaitEvents_; + header->arg_size = dev().info().maxParameterSize_ + 64; + header->mask_groups = maskGroups_; vqHeader_ = new AmdVQueueHeader; if (nullptr == vqHeader_) { return false; } - *vqHeader_ = header; - - virtualQueue_->writeRawData(*this, 0, sizeof(AmdVQueueHeader), &header, false); + *vqHeader_ = *header; // Go over all slots and perform initialization - AmdAqlWrap slot = {}; - size_t offset = sizeof(AmdVQueueHeader); + AmdAqlWrap* slots = reinterpret_cast(&header[1]); for (uint i = 0; i < numSlots; ++i) { uint64_t argStart = vaBase + argOffs + i * singleArgSize; - slot.aql.kernarg_address = reinterpret_cast(argStart); - slot.wait_list = argStart + dev().info().maxParameterSize_ + 64; - virtualQueue_->writeRawData(*this, offset, sizeof(AmdAqlWrap), &slot, false); - offset += sizeof(AmdAqlWrap); + slots[i].aql.kernarg_address = reinterpret_cast(argStart); + slots[i].wait_list = argStart + dev().info().maxParameterSize_ + 64; + } + // Upload data back to local memory + if (GPU_PRINT_CHILD_KERNEL == 0) { + virtualQueue_->unmap(this); } schedParams_ = new Memory(dev(), 64 * Ki); @@ -641,7 +633,7 @@ VirtualGPU::createVirtualQueue(uint deviceQueueSize) return false; } - address ptr = reinterpret_cast
(schedParams_->map(this)); + ptr = reinterpret_cast
(schedParams_->map(this)); deviceQueueSize_ = deviceQueueSize; @@ -697,9 +689,9 @@ VirtualGPU::create(bool profiling, uint deviceQueueSize) state_.profiling_ = profiling; Pal::CmdAllocatorCreateInfo createInfo = {}; - createInfo.flags.threadSafe = true; // \todo forces PAL to reuse CBs, but requires postamble - createInfo.flags.autoMemoryReuse = false; + createInfo.flags.autoMemoryReuse = true; + createInfo.flags.threadSafe = false; createInfo.allocInfo[Pal::CommandDataAlloc].allocHeap = Pal::GpuHeapGartCacheable; createInfo.allocInfo[Pal::CommandDataAlloc].allocSize = 128 * Ki; @@ -2108,28 +2100,34 @@ VirtualGPU::submitKernelInternal( } if (!dev().settings().useDeviceQueue_) { + Unimplemented(); +/* // Add the termination handshake to the host queue eventBegin(MainEngine); - //iCmd()->CmdVirtualQueueHandshake(*gpuDefQueue->schedParams_->iMem(), - // vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE, - // vmParentWrap + offsetof(AmdAqlWrap, child_counter), - // 0, dev().settings().useDeviceQueue_); + cs()->VirtualQueueHandshake(gpuDefQueue->schedParams_->iMem(), + vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE, + vmParentWrap + offsetof(AmdAqlWrap, child_counter), + 0, dev().settings().useDeviceQueue_); eventEnd(MainEngine, gpuEvent); +*/ } // Get the global loop start before the scheduler - //Pal::gpusize loopStart = gpuDefQueue->iCmd()->CmdVirtualQueueDispatcherStart(); - //static_cast(gpuDefQueue->blitMgr()).runScheduler( - // *gpuDefQueue->virtualQueue_, - // *gpuDefQueue->schedParams_, gpuDefQueue->schedParamIdx_, - // gpuDefQueue->vqHeader_->aql_slot_num / (DeviceQueueMaskSize * maskGroups_)); + Unimplemented(); +/* + mcaddr loopStart = gpuDefQueue->cs()->VirtualQueueDispatcherStart(); + static_cast(gpuDefQueue->blitMgr()).runScheduler( + *gpuDefQueue->virtualQueue_, + *gpuDefQueue->schedParams_, gpuDefQueue->schedParamIdx_, + gpuDefQueue->vqHeader_->aql_slot_num / (DeviceQueueMaskSize * maskGroups_)); const static bool FlushL2 = true; gpuDefQueue->flushCUCaches(FlushL2); // Get the address of PM4 template and add write it to params //! @note DMA flush must not occur between patch and the scheduler + mcaddr patchStart = gpuDefQueue->cs()->VirtualQueueDispatcherStart(); +*/ Pal::gpusize patchStart = 0; - //Pal::gpusize patchStart = gpuDefQueue->iCmd()->CmdVirtualQueueDispatcherStart(); // Program parameters for the scheduler SchedulerParam* param = &reinterpret_cast (gpuDefQueue->schedParams_->data())[gpuDefQueue->schedParamIdx_]; @@ -2170,28 +2168,31 @@ VirtualGPU::submitKernelInternal( Pal::gpusize signalAddr = gpuDefQueue->schedParams_->vmAddress() + gpuDefQueue->schedParamIdx_ * sizeof(SchedulerParam); + Unimplemented(); +/* gpuDefQueue->eventBegin(MainEngine); - //gpuDefQueue->iCmd()->CmdVirtualQueueDispatcherEnd( - // signalAddr, loopStart, gpuDefQueue->vqHeader_->aql_slot_num / - // (DeviceQueueMaskSize * maskGroups_)); - // Note: Device enqueue can't have extra commands after INDIRECT_BUFFER call. - // Thus TS command for profiling has to follow in the next CB. - constexpr bool ForceSubmitFirst = true; - gpuDefQueue->eventEnd(MainEngine, gpuEvent, ForceSubmitFirst); - + gpuDefQueue->cs()->VirtualQueueDispatcherEnd( + gpuDefQueue->vmMems(), gpuDefQueue->cal_.memCount_, + signalAddr, loopStart, gpuDefQueue->vqHeader_->aql_slot_num / + (DeviceQueueMaskSize * maskGroups_)); + gpuDefQueue->eventEnd(MainEngine, gpuEvent); +*/ // Set GPU event for the used resources for (uint i = 0; i < memList.size(); ++i) { memList[i]->setBusy(*gpuDefQueue, gpuEvent); } if (dev().settings().useDeviceQueue_) { + Unimplemented(); +/* // Add the termination handshake to the host queue eventBegin(MainEngine); - //iCmd()->CmdVirtualQueueHandshake(*gpuDefQueue->schedParams_->iMem(), - // vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE, - // vmParentWrap + offsetof(AmdAqlWrap, child_counter), - // signalAddr, dev().settings().useDeviceQueue_); + cs()->VirtualQueueHandshake(gpuDefQueue->schedParams_->iMem(), + vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE, + vmParentWrap + offsetof(AmdAqlWrap, child_counter), + signalAddr, dev().settings().useDeviceQueue_); eventEnd(MainEngine, gpuEvent); +*/ } ++gpuDefQueue->schedParamIdx_ %= @@ -3249,7 +3250,7 @@ VirtualGPU::writeVQueueHeader(VirtualGPU& hostQ, uint64_t kernelTable) { const static bool Wait = true; vqHeader_->kernel_table = kernelTable; - virtualQueue_->writeRawData(hostQ, 0, sizeof(AmdVQueueHeader), vqHeader_, Wait); + virtualQueue_->writeRawData(hostQ, sizeof(AmdVQueueHeader), vqHeader_, !Wait); } void diff --git a/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp b/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp index 5f9a73d303..c566a3f646 100644 --- a/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp +++ b/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp @@ -79,7 +79,7 @@ public: //! Flushes the current command buffer to HW //! Returns ID associated with the submission - uint submit(bool forceFlush); + uint submit(); bool flush(); @@ -401,17 +401,15 @@ public: //! Returns queue, associated with VirtualGPU Queue& queue(EngineType id) const { return *queues_[id]; } - void flushCUCaches(bool flushL2 = false) const + void flushCUCaches() const { Pal::BarrierInfo barrier = {}; barrier.pipePointWaitCount = 1; Pal::HwPipePoint point = Pal::HwPipePostCs; barrier.pPipePoints = &point; barrier.transitionCount = 1; - uint32_t cacheMask = (flushL2) ? Pal::CoherCopy : Pal::CoherShader; - Pal::BarrierTransition trans = { cacheMask, cacheMask, - { nullptr, { { Pal::ImageAspect::Color, 0, 0 }, 0, 0 }, - Pal::LayoutShaderRead, Pal::LayoutShaderRead}}; + Pal::BarrierTransition trans = {Pal::CoherShader, Pal::CoherShader, + {nullptr, { {Pal::ImageAspect::Color, 0, 0}, 0, 0 }, Pal::LayoutShaderRead, Pal::LayoutShaderRead}}; barrier.pTransitions = &trans; barrier.waitPoint = Pal::HwPipePreCs; iCmd()->CmdBarrier(barrier); @@ -422,17 +420,10 @@ public: profileEvent(engId, Begin); } - void eventEnd(EngineType engId, GpuEvent& event, bool forceExec = false) const { - constexpr bool End = false; - if (forceExec) { - constexpr bool ForceFlush = true; - event.id = queues_[engId]->submit(ForceFlush); - profileEvent(engId, End); - } - else { - profileEvent(engId, End); - event.id = queues_[engId]->submit(GPU_FLUSH_ON_EXECUTION); - } + void eventEnd(EngineType engId, GpuEvent& event) const { + const static bool End = false; + profileEvent(engId, End); + event.id = queues_[engId]->submit(); event.engineId_ = engId; }