From 13b474485bc307cbc64b2ebff161e3d843a58b1b Mon Sep 17 00:00:00 2001
From: foreman <dl.constructicon@amd.com>
Date: Thu, 7 Jul 2016 03:52:33 -0400
Subject: [PATCH] P4 to Git Change 1288113 by jsjodin@jsjodin-git2p4-llvm on
 2016/07/06 18:23:12

	SWDEV-3 - AMDGPU: Expand unaligned accesses early

	Due to visit order problems, in the case of an unaligned copy
	the legalized DAG fails to eliminate extra instructions introduced
	by the expansion of both unaligned parts.

	git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@274397 91177308-0d34-0410-b5e6-96231b3b80d8

	GitHash: d4452f8fcf496a2e19c1a1c9792f5f063f4e9703

Affected files ...

... //depot/stg/opencl/drivers/opencl/compiler/llvm.git/lib/Target/AMDGPU/AMDGPUISelLowering.cpp#3 edit
... //depot/stg/opencl/drivers/opencl/compiler/llvm.git/lib/Target/AMDGPU/AMDGPUISelLowering.h#3 edit
... //depot/stg/opencl/drivers/opencl/compiler/llvm.git/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll#3 edit
... //depot/stg/opencl/drivers/opencl/compiler/llvm.git/test/CodeGen/AMDGPU/sext-in-reg-failure-r600.ll#1 add
... //depot/stg/opencl/drivers/opencl/compiler/llvm.git/test/CodeGen/AMDGPU/sext-in-reg.ll#3 edit
... //depot/stg/opencl/drivers/opencl/compiler/llvm.git/test/CodeGen/AMDGPU/unaligned-load-store.ll#2 edit


[ROCm/clr commit: 739bdacc65dc54b68b06f888f43111df65e96bab]
---
 .../clr/rocclr/runtime/device/pal/palblit.cpp |   2 +-
 .../rocclr/runtime/device/pal/paldevice.cpp   |   1 -
 .../rocclr/runtime/device/pal/palprintf.cpp   |   2 +-
 .../rocclr/runtime/device/pal/palresource.cpp |  12 +-
 .../rocclr/runtime/device/pal/palresource.hpp |   9 +-
 .../rocclr/runtime/device/pal/palvirtual.cpp  | 121 +++++++++---------
 .../rocclr/runtime/device/pal/palvirtual.hpp  |  25 ++--
 7 files changed, 82 insertions(+), 90 deletions(-)

diff --git a/projects/clr/rocclr/runtime/device/pal/palblit.cpp b/projects/clr/rocclr/runtime/device/pal/palblit.cpp
index 66d02aba9d..5d5ef73471 100644
--- a/projects/clr/rocclr/runtime/device/pal/palblit.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/palblit.cpp
@@ -2678,7 +2678,7 @@ KernelBlitManager::writeRawData(
     const void* data
     ) const
 {
-    static_cast<pal::Memory&>(memory).writeRawData(gpu(), 0, size, data, false);
+    static_cast<pal::Memory&>(memory).writeRawData(gpu(), size, data, false);
 
     synchronize();
 }
diff --git a/projects/clr/rocclr/runtime/device/pal/paldevice.cpp b/projects/clr/rocclr/runtime/device/pal/paldevice.cpp
index 608f676088..ab38c8059c 100644
--- a/projects/clr/rocclr/runtime/device/pal/paldevice.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/paldevice.cpp
@@ -706,7 +706,6 @@ Device::create(Pal::IDevice* device)
     // palSettings ...
     palSettings->textureOptLevel = Pal::TextureFilterOptimizationsDisabled;
     palSettings->forceHighClocks = appProfile_.enableHighPerformanceState();
-    palSettings->cmdBufBatchedSubmitChainLimit = 0;
 
     // Commit the new settings for the device
     result = iDev()->CommitSettingsAndInit();
diff --git a/projects/clr/rocclr/runtime/device/pal/palprintf.cpp b/projects/clr/rocclr/runtime/device/pal/palprintf.cpp
index 1d81036668..40d902b377 100644
--- a/projects/clr/rocclr/runtime/device/pal/palprintf.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/palprintf.cpp
@@ -619,7 +619,7 @@ PrintfDbgHSA::init(
 
         // Copy offset and number of bytes available for printf data
         // into the corresponding location in the debug buffer
-        dbgBuffer_->writeRawData(gpu, 0, initSize, sysMem, true);
+        dbgBuffer_->writeRawData(gpu, initSize, sysMem, true);
     }
     return true;
 }
diff --git a/projects/clr/rocclr/runtime/device/pal/palresource.cpp b/projects/clr/rocclr/runtime/device/pal/palresource.cpp
index b6f47d0f0e..54e56454b2 100644
--- a/projects/clr/rocclr/runtime/device/pal/palresource.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/palresource.cpp
@@ -1072,10 +1072,9 @@ Resource::free()
 void
 Resource::writeRawData(
     VirtualGPU& gpu,
-    size_t      offset,
-    size_t      size,
+    size_t size,
     const void* data,
-    bool        waitForEvent) const
+    bool waitForEvent) const
 {
     GpuEvent    event;
 
@@ -1083,8 +1082,11 @@ Resource::writeRawData(
     // size needs to be DWORD aligned
     assert((size & 3) == 0);
     gpu.eventBegin(MainEngine);
+    //! @todo Remove this cache flush.
+    //! It is a workaround for a PAL crash with embedded data allocated before any command.
+    gpu.flushCUCaches();
     gpu.queue(MainEngine).addCmdMemRef(iMem());
-    gpu.iCmd()->CmdUpdateMemory(*iMem(), offset, size, reinterpret_cast<const uint32_t*>(data));
+    gpu.iCmd()->CmdUpdateMemory(*iMem(), 0, size, reinterpret_cast<const uint32_t*>(data));
     gpu.eventEnd(MainEngine, event);
 
     setBusy(gpu, event);
@@ -1936,7 +1938,7 @@ Resource::warmUpRenames(VirtualGPU& gpu)
         uint    dummy = 0;
         const bool NoWait = false;
         // Write 0 for the buffer paging by VidMM
-        writeRawData(gpu, 0, sizeof(dummy), &dummy, NoWait);
+        writeRawData(gpu, sizeof(dummy), &dummy, NoWait);
         const bool Force = true;
         rename(gpu, Force);
     }
diff --git a/projects/clr/rocclr/runtime/device/pal/palresource.hpp b/projects/clr/rocclr/runtime/device/pal/palresource.hpp
index 9687674c6b..b1fac5b20c 100644
--- a/projects/clr/rocclr/runtime/device/pal/palresource.hpp
+++ b/projects/clr/rocclr/runtime/device/pal/palresource.hpp
@@ -240,11 +240,10 @@ public:
      *
      */
     void writeRawData(
-        VirtualGPU& gpu,            //!< Virtual GPU device object
-        size_t      offset,         //!< Offset for in the buffer for data
-        size_t      size,           //!< Size in bytes of data to be copied(multiple of DWORDS)
-        const void* data,           //!< Data to be copied
-        bool        waitForEvent    //!< Wait for event complete
+        VirtualGPU& gpu,                //!< Virtual GPU device object
+        size_t size,                    //!< Size in bytes of data to be copied (a multiple of DWORDs)
+        const void* data,               //!< Data to be copied
+        bool waitForEvent               //!< Wait for event complete
         ) const;
 
     //! Returns the offset in GPU memory for aliases
diff --git a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp
index e5beeac673..697ed8bb3c 100644
--- a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp
@@ -155,11 +155,11 @@ VirtualGPU::Queue::removeCmdMemRef(Pal::IGpuMemory* iMem)
 }
 
 uint
-VirtualGPU::Queue::submit(bool forceFlush)
+VirtualGPU::Queue::submit()
 {
     cmdCnt_++;
     uint id = cmdBufIdCurrent_;
-    if ((cmdCnt_ > MaxCommands) || forceFlush) {
+    if ((cmdCnt_ > MaxCommands) || GPU_FLUSH_ON_EXECUTION) {
         if (!flush()) {
             return GpuEvent::InvalidID;
         }
@@ -238,11 +238,6 @@ VirtualGPU::Queue::flush()
         return false;
     }
 
-    // Reset command buffer, so CB chunks could be reused
-    if (Pal::Result::Success != iCmdBuffs_[cmdBufIdSlot_]->Reset(nullptr, false)) {
-        LogError("PAL failed CB reset!");
-        return false;
-    }
     // Start command buffer building
     Pal::CmdBufferBuildInfo cmdBuildInfo = {};
     if (Pal::Result::Success != iCmdBuffs_[cmdBufIdSlot_]->Begin(cmdBuildInfo)) {
@@ -596,44 +591,41 @@ VirtualGPU::createVirtualQueue(uint deviceQueueSize)
     if  ((virtualQueue_ == nullptr) || !virtualQueue_->create(type)) {
         return false;
     }
-
-    if (GPU_PRINT_CHILD_KERNEL != 0) {
-        address ptr  = reinterpret_cast<address>(
-            virtualQueue_->map(this, Resource::WriteOnly));
-        if (nullptr == ptr) {
-            return false;
-        }
+    address ptr  = reinterpret_cast<address>(
+        virtualQueue_->map(this, Resource::WriteOnly));
+    if (nullptr == ptr) {
+        return false;
     }
+    // Clear memory
+    memset(ptr, 0, allocSize);
+    uint64_t    vaBase = virtualQueue_->vmAddress();
+    AmdVQueueHeader* header = reinterpret_cast<AmdVQueueHeader*>(ptr);
 
-    uint64_t        vaBase = virtualQueue_->vmAddress();
-    AmdVQueueHeader header = {};
     // Initialize the virtual queue header
-    header.aql_slot_num    = numSlots;
-    header.event_slot_num  = dev().settings().numDeviceEvents_;
-    header.event_slot_mask = vaBase + eventMaskOffs;
-    header.event_slots     = vaBase + eventsOffs;
-    header.aql_slot_mask   = vaBase + slotMaskOffs;
-    header.wait_size       = dev().settings().numWaitEvents_;
-    header.arg_size        = dev().info().maxParameterSize_ + 64;
-    header.mask_groups     = maskGroups_;
-
+    header->aql_slot_num    = numSlots;
+    header->event_slot_num  = dev().settings().numDeviceEvents_;
+    header->event_slot_mask = vaBase + eventMaskOffs;
+    header->event_slots     = vaBase + eventsOffs;
+    header->aql_slot_mask   = vaBase + slotMaskOffs;
+    header->wait_size       = dev().settings().numWaitEvents_;
+    header->arg_size        = dev().info().maxParameterSize_ + 64;
+    header->mask_groups     = maskGroups_;
     vqHeader_ = new AmdVQueueHeader;
     if (nullptr == vqHeader_) {
         return false;
     }
-    *vqHeader_ = header;
-
-    virtualQueue_->writeRawData(*this, 0, sizeof(AmdVQueueHeader), &header, false);
+    *vqHeader_ = *header;
 
     // Go over all slots and perform initialization
-    AmdAqlWrap  slot = {};
-    size_t      offset = sizeof(AmdVQueueHeader);
+    AmdAqlWrap* slots = reinterpret_cast<AmdAqlWrap*>(&header[1]);
     for (uint i = 0; i < numSlots; ++i) {
         uint64_t argStart = vaBase + argOffs + i * singleArgSize;
-        slot.aql.kernarg_address = reinterpret_cast<void*>(argStart);
-        slot.wait_list = argStart + dev().info().maxParameterSize_ + 64;
-        virtualQueue_->writeRawData(*this, offset, sizeof(AmdAqlWrap), &slot, false);
-        offset += sizeof(AmdAqlWrap);
+        slots[i].aql.kernarg_address = reinterpret_cast<void*>(argStart);
+        slots[i].wait_list = argStart + dev().info().maxParameterSize_ + 64;
+    }
+    // Upload data back to local memory
+    if (GPU_PRINT_CHILD_KERNEL == 0) {
+        virtualQueue_->unmap(this);
     }
 
     schedParams_ = new Memory(dev(), 64 * Ki);
@@ -641,7 +633,7 @@ VirtualGPU::createVirtualQueue(uint deviceQueueSize)
         return false;
     }
 
-    address ptr  = reinterpret_cast<address>(schedParams_->map(this));
+    ptr  = reinterpret_cast<address>(schedParams_->map(this));
 
     deviceQueueSize_ = deviceQueueSize;
 
@@ -697,9 +689,9 @@ VirtualGPU::create(bool profiling, uint  deviceQueueSize)
     state_.profiling_ = profiling;
 
     Pal::CmdAllocatorCreateInfo createInfo = {};
-    createInfo.flags.threadSafe = true;
     // \todo forces PAL to reuse CBs, but requires postamble
-    createInfo.flags.autoMemoryReuse = false;
+    createInfo.flags.autoMemoryReuse = true;
+    createInfo.flags.threadSafe = false;
     createInfo.allocInfo[Pal::CommandDataAlloc].allocHeap =
         Pal::GpuHeapGartCacheable;
     createInfo.allocInfo[Pal::CommandDataAlloc].allocSize = 128 * Ki;
@@ -2108,28 +2100,34 @@ VirtualGPU::submitKernelInternal(
             }
 
             if (!dev().settings().useDeviceQueue_) {
+                Unimplemented();
+/*
                 // Add the termination handshake to the host queue
                 eventBegin(MainEngine);
-                //iCmd()->CmdVirtualQueueHandshake(*gpuDefQueue->schedParams_->iMem(),
-                //    vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE,
-                //    vmParentWrap + offsetof(AmdAqlWrap, child_counter),
-                //    0, dev().settings().useDeviceQueue_);
+                cs()->VirtualQueueHandshake(gpuDefQueue->schedParams_->iMem(),
+                    vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE,
+                    vmParentWrap + offsetof(AmdAqlWrap, child_counter),
+                    0, dev().settings().useDeviceQueue_);
                 eventEnd(MainEngine, gpuEvent);
+*/
             }
 
             // Get the global loop start before the scheduler
-            //Pal::gpusize loopStart = gpuDefQueue->iCmd()->CmdVirtualQueueDispatcherStart();
-            //static_cast<KernelBlitManager&>(gpuDefQueue->blitMgr()).runScheduler(
-            //    *gpuDefQueue->virtualQueue_,
-            //    *gpuDefQueue->schedParams_, gpuDefQueue->schedParamIdx_,
-            //    gpuDefQueue->vqHeader_->aql_slot_num / (DeviceQueueMaskSize * maskGroups_));
+            Unimplemented();
+/*
+            mcaddr loopStart = gpuDefQueue->cs()->VirtualQueueDispatcherStart();
+            static_cast<KernelBlitManager&>(gpuDefQueue->blitMgr()).runScheduler(
+                *gpuDefQueue->virtualQueue_,
+                *gpuDefQueue->schedParams_, gpuDefQueue->schedParamIdx_,
+                gpuDefQueue->vqHeader_->aql_slot_num / (DeviceQueueMaskSize * maskGroups_));
             const static bool FlushL2 = true;
             gpuDefQueue->flushCUCaches(FlushL2);
 
             // Get the address of PM4 template and add write it to params
             //! @note DMA flush must not occur between patch and the scheduler
+            mcaddr patchStart = gpuDefQueue->cs()->VirtualQueueDispatcherStart();
+*/
             Pal::gpusize patchStart = 0;
-            //Pal::gpusize patchStart = gpuDefQueue->iCmd()->CmdVirtualQueueDispatcherStart();
             // Program parameters for the scheduler
             SchedulerParam* param = &reinterpret_cast<SchedulerParam*>
                 (gpuDefQueue->schedParams_->data())[gpuDefQueue->schedParamIdx_];
@@ -2170,28 +2168,31 @@ VirtualGPU::submitKernelInternal(
 
             Pal::gpusize  signalAddr = gpuDefQueue->schedParams_->vmAddress() +
                 gpuDefQueue->schedParamIdx_ * sizeof(SchedulerParam);
+            Unimplemented();
+/*
             gpuDefQueue->eventBegin(MainEngine);
-            //gpuDefQueue->iCmd()->CmdVirtualQueueDispatcherEnd(
-            //    signalAddr, loopStart, gpuDefQueue->vqHeader_->aql_slot_num /
-            //    (DeviceQueueMaskSize * maskGroups_));
-            // Note: Device enqueue can't have extra commands after INDIRECT_BUFFER call.
-            // Thus TS command for profiling has to follow in the next CB.
-            constexpr bool ForceSubmitFirst = true;
-            gpuDefQueue->eventEnd(MainEngine, gpuEvent, ForceSubmitFirst);
-
+            gpuDefQueue->cs()->VirtualQueueDispatcherEnd(
+                gpuDefQueue->vmMems(), gpuDefQueue->cal_.memCount_,
+                signalAddr, loopStart, gpuDefQueue->vqHeader_->aql_slot_num /
+                (DeviceQueueMaskSize * maskGroups_));
+            gpuDefQueue->eventEnd(MainEngine, gpuEvent);
+*/
             // Set GPU event for the used resources
             for (uint i = 0; i < memList.size(); ++i) {
                 memList[i]->setBusy(*gpuDefQueue, gpuEvent);
             }
 
             if (dev().settings().useDeviceQueue_) {
+                Unimplemented();
+/*
                 // Add the termination handshake to the host queue
                 eventBegin(MainEngine);
-                //iCmd()->CmdVirtualQueueHandshake(*gpuDefQueue->schedParams_->iMem(),
-                //    vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE,
-                //    vmParentWrap + offsetof(AmdAqlWrap, child_counter),
-                //    signalAddr, dev().settings().useDeviceQueue_);
+                cs()->VirtualQueueHandshake(gpuDefQueue->schedParams_->iMem(),
+                    vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE,
+                    vmParentWrap + offsetof(AmdAqlWrap, child_counter),
+                    signalAddr, dev().settings().useDeviceQueue_);
                 eventEnd(MainEngine, gpuEvent);
+*/
             }
 
             ++gpuDefQueue->schedParamIdx_ %=
@@ -3249,7 +3250,7 @@ VirtualGPU::writeVQueueHeader(VirtualGPU& hostQ, uint64_t kernelTable)
 {
     const static bool Wait = true;
     vqHeader_->kernel_table = kernelTable;
-    virtualQueue_->writeRawData(hostQ, 0, sizeof(AmdVQueueHeader), vqHeader_, Wait);
+    virtualQueue_->writeRawData(hostQ, sizeof(AmdVQueueHeader), vqHeader_, !Wait);
 }
 
 void
diff --git a/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp b/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp
index 5f9a73d303..c566a3f646 100644
--- a/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp
+++ b/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp
@@ -79,7 +79,7 @@ public:
 
         //! Flushes the current command buffer to HW
         //! Returns ID associated with the submission
-        uint submit(bool forceFlush);
+        uint submit();
 
         bool flush();
 
@@ -401,17 +401,15 @@ public:
     //! Returns queue, associated with VirtualGPU
     Queue& queue(EngineType id) const { return *queues_[id]; }
 
-    void flushCUCaches(bool flushL2 = false) const
+    void flushCUCaches() const
     {
         Pal::BarrierInfo barrier = {};
         barrier.pipePointWaitCount = 1;
         Pal::HwPipePoint point = Pal::HwPipePostCs;
         barrier.pPipePoints = &point;
         barrier.transitionCount = 1;
-        uint32_t    cacheMask = (flushL2) ? Pal::CoherCopy : Pal::CoherShader;
-        Pal::BarrierTransition trans = { cacheMask, cacheMask,
-            { nullptr, { { Pal::ImageAspect::Color, 0, 0 }, 0, 0 },
-            Pal::LayoutShaderRead, Pal::LayoutShaderRead}};
+        Pal::BarrierTransition trans = {Pal::CoherShader, Pal::CoherShader,
+            {nullptr, { {Pal::ImageAspect::Color, 0, 0}, 0, 0 }, Pal::LayoutShaderRead, Pal::LayoutShaderRead}};
         barrier.pTransitions = &trans;
         barrier.waitPoint = Pal::HwPipePreCs;
         iCmd()->CmdBarrier(barrier);
@@ -422,17 +420,10 @@ public:
         profileEvent(engId, Begin);
     }
 
-    void eventEnd(EngineType engId, GpuEvent& event, bool forceExec = false) const {
-        constexpr bool End = false;
-        if (forceExec) {
-            constexpr bool ForceFlush = true;
-            event.id = queues_[engId]->submit(ForceFlush);
-            profileEvent(engId, End);
-        }
-        else {
-            profileEvent(engId, End);
-            event.id = queues_[engId]->submit(GPU_FLUSH_ON_EXECUTION);
-        }
+    void eventEnd(EngineType engId, GpuEvent& event) const {
+        const static bool End = false;
+        profileEvent(engId, End);
+        event.id = queues_[engId]->submit();
         event.engineId_ = engId;
     }