From cd9d1dfd35b19a49fbe755092dc332c4172bb66f Mon Sep 17 00:00:00 2001
From: foreman <dl.constructicon@amd.com>
Date: Thu, 7 Jul 2016 00:56:14 -0400
Subject: [PATCH] P4 to Git Change 1288063 by bsumner@bsumner-lin-opencl on
 2016/07/06 17:07:15

	SWDEV-97663 - optimize cbrt for AMD devices

Affected files ...

... //depot/stg/opencl/drivers/opencl/library/ocml/src/cbrtF.cl#5 edit
... //depot/stg/opencl/drivers/opencl/library/ocml/src/rcbrtF.cl#4 edit
---
 rocclr/runtime/device/pal/palblit.cpp     |   2 +-
 rocclr/runtime/device/pal/paldevice.cpp   |   1 -
 rocclr/runtime/device/pal/palprintf.cpp   |   2 +-
 rocclr/runtime/device/pal/palresource.cpp |  12 ++-
 rocclr/runtime/device/pal/palresource.hpp |   9 +-
 rocclr/runtime/device/pal/palvirtual.cpp  | 121 +++++++++++-----------
 rocclr/runtime/device/pal/palvirtual.hpp  |  25 ++---
 7 files changed, 82 insertions(+), 90 deletions(-)
diff --git a/rocclr/runtime/device/pal/palblit.cpp b/rocclr/runtime/device/pal/palblit.cpp
index 66d02aba9d..5d5ef73471 100644
--- a/rocclr/runtime/device/pal/palblit.cpp
+++ b/rocclr/runtime/device/pal/palblit.cpp
@@ -2678,7 +2678,7 @@ KernelBlitManager::writeRawData(
     const void* data
     ) const
 {
-    static_cast<pal::Memory&>(memory).writeRawData(gpu(), 0, size, data, false);
+    static_cast<pal::Memory&>(memory).writeRawData(gpu(), size, data, false);
 
     synchronize();
 }
diff --git a/rocclr/runtime/device/pal/paldevice.cpp b/rocclr/runtime/device/pal/paldevice.cpp
index 608f676088..ab38c8059c 100644
--- a/rocclr/runtime/device/pal/paldevice.cpp
+++ b/rocclr/runtime/device/pal/paldevice.cpp
@@ -706,7 +706,6 @@ Device::create(Pal::IDevice* device)
     // palSettings ...
     palSettings->textureOptLevel = Pal::TextureFilterOptimizationsDisabled;
     palSettings->forceHighClocks = appProfile_.enableHighPerformanceState();
-    palSettings->cmdBufBatchedSubmitChainLimit = 0;
 
     // Commit the new settings for the device
     result = iDev()->CommitSettingsAndInit();
diff --git a/rocclr/runtime/device/pal/palprintf.cpp b/rocclr/runtime/device/pal/palprintf.cpp
index 1d81036668..40d902b377 100644
--- a/rocclr/runtime/device/pal/palprintf.cpp
+++ b/rocclr/runtime/device/pal/palprintf.cpp
@@ -619,7 +619,7 @@ PrintfDbgHSA::init(
 
         // Copy offset and number of bytes available for printf data
         // into the corresponding location in the debug buffer
-        dbgBuffer_->writeRawData(gpu, 0, initSize, sysMem, true);
+        dbgBuffer_->writeRawData(gpu, initSize, sysMem, true);
     }
     return true;
 }
diff --git a/rocclr/runtime/device/pal/palresource.cpp b/rocclr/runtime/device/pal/palresource.cpp
index b6f47d0f0e..54e56454b2 100644
--- a/rocclr/runtime/device/pal/palresource.cpp
+++ b/rocclr/runtime/device/pal/palresource.cpp
@@ -1072,10 +1072,9 @@ Resource::free()
 void
 Resource::writeRawData(
     VirtualGPU& gpu,
-    size_t      offset,
-    size_t      size,
+    size_t size,
     const void* data,
-    bool        waitForEvent) const
+    bool waitForEvent) const
 {
     GpuEvent    event;
 
@@ -1083,8 +1082,11 @@ Resource::writeRawData(
     // size needs to be DWORD aligned
     assert((size & 3) == 0);
     gpu.eventBegin(MainEngine);
+    //! @todo Remove this cache flush.
+    //! It is a workaround for a PAL crash that occurs when embedded data is allocated before any command.
+    gpu.flushCUCaches();
     gpu.queue(MainEngine).addCmdMemRef(iMem());
-    gpu.iCmd()->CmdUpdateMemory(*iMem(), offset, size, reinterpret_cast<const uint32_t*>(data));
+    gpu.iCmd()->CmdUpdateMemory(*iMem(), 0, size, reinterpret_cast<const uint32_t*>(data));
     gpu.eventEnd(MainEngine, event);
 
     setBusy(gpu, event);
@@ -1936,7 +1938,7 @@ Resource::warmUpRenames(VirtualGPU& gpu)
         uint    dummy = 0;
         const bool NoWait = false;
         // Write 0 for the buffer paging by VidMM
-        writeRawData(gpu, 0, sizeof(dummy), &dummy, NoWait);
+        writeRawData(gpu, sizeof(dummy), &dummy, NoWait);
         const bool Force = true;
         rename(gpu, Force);
     }
diff --git a/rocclr/runtime/device/pal/palresource.hpp b/rocclr/runtime/device/pal/palresource.hpp
index 9687674c6b..b1fac5b20c 100644
--- a/rocclr/runtime/device/pal/palresource.hpp
+++ b/rocclr/runtime/device/pal/palresource.hpp
@@ -240,11 +240,10 @@ public:
      *
      */
     void writeRawData(
-        VirtualGPU& gpu,            //!< Virtual GPU device object
-        size_t      offset,         //!< Offset for in the buffer for data
-        size_t      size,           //!< Size in bytes of data to be copied(multiple of DWORDS)
-        const void* data,           //!< Data to be copied
-        bool        waitForEvent    //!< Wait for event complete
+        VirtualGPU& gpu,                //!< Virtual GPU device object
+        size_t size,                    //!< Size in bytes of data to be copied(multiple of DWORDS)
+        const void* data,               //!< Data to be copied
+        bool waitForEvent               //!< Wait for event complete
         ) const;
 
     //! Returns the offset in GPU memory for aliases
diff --git a/rocclr/runtime/device/pal/palvirtual.cpp b/rocclr/runtime/device/pal/palvirtual.cpp
index e5beeac673..697ed8bb3c 100644
--- a/rocclr/runtime/device/pal/palvirtual.cpp
+++ b/rocclr/runtime/device/pal/palvirtual.cpp
@@ -155,11 +155,11 @@ VirtualGPU::Queue::removeCmdMemRef(Pal::IGpuMemory* iMem)
 }
 
 uint
-VirtualGPU::Queue::submit(bool forceFlush)
+VirtualGPU::Queue::submit()
 {
     cmdCnt_++;
     uint id = cmdBufIdCurrent_;
-    if ((cmdCnt_ > MaxCommands) || forceFlush) {
+    if ((cmdCnt_ > MaxCommands) || GPU_FLUSH_ON_EXECUTION) {
         if (!flush()) {
             return GpuEvent::InvalidID;
         }
@@ -238,11 +238,6 @@ VirtualGPU::Queue::flush()
         return false;
     }
 
-    // Reset command buffer, so CB chunks could be reused
-    if (Pal::Result::Success != iCmdBuffs_[cmdBufIdSlot_]->Reset(nullptr, false)) {
-        LogError("PAL failed CB reset!");
-        return false;
-    }
     // Start command buffer building
     Pal::CmdBufferBuildInfo cmdBuildInfo = {};
     if (Pal::Result::Success != iCmdBuffs_[cmdBufIdSlot_]->Begin(cmdBuildInfo)) {
@@ -596,44 +591,41 @@ VirtualGPU::createVirtualQueue(uint deviceQueueSize)
     if  ((virtualQueue_ == nullptr) || !virtualQueue_->create(type)) {
         return false;
     }
-
-    if (GPU_PRINT_CHILD_KERNEL != 0) {
-        address ptr  = reinterpret_cast<address>(
-            virtualQueue_->map(this, Resource::WriteOnly));
-        if (nullptr == ptr) {
-            return false;
-        }
+    address ptr  = reinterpret_cast<address>(
+        virtualQueue_->map(this, Resource::WriteOnly));
+    if (nullptr == ptr) {
+        return false;
     }
+    // Clear memory
+    memset(ptr, 0, allocSize);
+    uint64_t    vaBase = virtualQueue_->vmAddress();
+    AmdVQueueHeader* header = reinterpret_cast<AmdVQueueHeader*>(ptr);
 
-    uint64_t        vaBase = virtualQueue_->vmAddress();
-    AmdVQueueHeader header = {};
     // Initialize the virtual queue header
-    header.aql_slot_num    = numSlots;
-    header.event_slot_num  = dev().settings().numDeviceEvents_;
-    header.event_slot_mask = vaBase + eventMaskOffs;
-    header.event_slots     = vaBase + eventsOffs;
-    header.aql_slot_mask   = vaBase + slotMaskOffs;
-    header.wait_size       = dev().settings().numWaitEvents_;
-    header.arg_size        = dev().info().maxParameterSize_ + 64;
-    header.mask_groups     = maskGroups_;
-
+    header->aql_slot_num    = numSlots;
+    header->event_slot_num  = dev().settings().numDeviceEvents_;
+    header->event_slot_mask = vaBase + eventMaskOffs;
+    header->event_slots     = vaBase + eventsOffs;
+    header->aql_slot_mask   = vaBase + slotMaskOffs;
+    header->wait_size       = dev().settings().numWaitEvents_;
+    header->arg_size        = dev().info().maxParameterSize_ + 64;
+    header->mask_groups     = maskGroups_;
     vqHeader_ = new AmdVQueueHeader;
     if (nullptr == vqHeader_) {
         return false;
     }
-    *vqHeader_ = header;
-
-    virtualQueue_->writeRawData(*this, 0, sizeof(AmdVQueueHeader), &header, false);
+    *vqHeader_ = *header;
 
     // Go over all slots and perform initialization
-    AmdAqlWrap  slot = {};
-    size_t      offset = sizeof(AmdVQueueHeader);
+    AmdAqlWrap* slots = reinterpret_cast<AmdAqlWrap*>(&header[1]);
     for (uint i = 0; i < numSlots; ++i) {
         uint64_t argStart = vaBase + argOffs + i * singleArgSize;
-        slot.aql.kernarg_address = reinterpret_cast<void*>(argStart);
-        slot.wait_list = argStart + dev().info().maxParameterSize_ + 64;
-        virtualQueue_->writeRawData(*this, offset, sizeof(AmdAqlWrap), &slot, false);
-        offset += sizeof(AmdAqlWrap);
+        slots[i].aql.kernarg_address = reinterpret_cast<void*>(argStart);
+        slots[i].wait_list = argStart + dev().info().maxParameterSize_ + 64;
+    }
+    // Upload data back to local memory
+    if (GPU_PRINT_CHILD_KERNEL == 0) {
+        virtualQueue_->unmap(this);
     }
 
     schedParams_ = new Memory(dev(), 64 * Ki);
@@ -641,7 +633,7 @@ VirtualGPU::createVirtualQueue(uint deviceQueueSize)
         return false;
     }
 
-    address ptr  = reinterpret_cast<address>(schedParams_->map(this));
+    ptr  = reinterpret_cast<address>(schedParams_->map(this));
 
     deviceQueueSize_ = deviceQueueSize;
 
@@ -697,9 +689,9 @@ VirtualGPU::create(bool profiling, uint  deviceQueueSize)
     state_.profiling_ = profiling;
 
     Pal::CmdAllocatorCreateInfo createInfo = {};
-    createInfo.flags.threadSafe = true;
     // \todo forces PAL to reuse CBs, but requires postamble
-    createInfo.flags.autoMemoryReuse = false;
+    createInfo.flags.autoMemoryReuse = true;
+    createInfo.flags.threadSafe = false;
     createInfo.allocInfo[Pal::CommandDataAlloc].allocHeap =
         Pal::GpuHeapGartCacheable;
     createInfo.allocInfo[Pal::CommandDataAlloc].allocSize = 128 * Ki;
@@ -2108,28 +2100,34 @@ VirtualGPU::submitKernelInternal(
             }
 
             if (!dev().settings().useDeviceQueue_) {
+                Unimplemented();
+/*
                 // Add the termination handshake to the host queue
                 eventBegin(MainEngine);
-                //iCmd()->CmdVirtualQueueHandshake(*gpuDefQueue->schedParams_->iMem(),
-                //    vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE,
-                //    vmParentWrap + offsetof(AmdAqlWrap, child_counter),
-                //    0, dev().settings().useDeviceQueue_);
+                cs()->VirtualQueueHandshake(gpuDefQueue->schedParams_->iMem(),
+                    vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE,
+                    vmParentWrap + offsetof(AmdAqlWrap, child_counter),
+                    0, dev().settings().useDeviceQueue_);
                 eventEnd(MainEngine, gpuEvent);
+*/
             }
 
             // Get the global loop start before the scheduler
-            //Pal::gpusize loopStart = gpuDefQueue->iCmd()->CmdVirtualQueueDispatcherStart();
-            //static_cast<KernelBlitManager&>(gpuDefQueue->blitMgr()).runScheduler(
-            //    *gpuDefQueue->virtualQueue_,
-            //    *gpuDefQueue->schedParams_, gpuDefQueue->schedParamIdx_,
-            //    gpuDefQueue->vqHeader_->aql_slot_num / (DeviceQueueMaskSize * maskGroups_));
+            Unimplemented();
+/*
+            mcaddr loopStart = gpuDefQueue->cs()->VirtualQueueDispatcherStart();
+            static_cast<KernelBlitManager&>(gpuDefQueue->blitMgr()).runScheduler(
+                *gpuDefQueue->virtualQueue_,
+                *gpuDefQueue->schedParams_, gpuDefQueue->schedParamIdx_,
+                gpuDefQueue->vqHeader_->aql_slot_num / (DeviceQueueMaskSize * maskGroups_));
             const static bool FlushL2 = true;
             gpuDefQueue->flushCUCaches(FlushL2);
 
             // Get the address of PM4 template and add write it to params
             //! @note DMA flush must not occur between patch and the scheduler
+            mcaddr patchStart = gpuDefQueue->cs()->VirtualQueueDispatcherStart();
+*/
             Pal::gpusize patchStart = 0;
-            //Pal::gpusize patchStart = gpuDefQueue->iCmd()->CmdVirtualQueueDispatcherStart();
             // Program parameters for the scheduler
             SchedulerParam* param = &reinterpret_cast<SchedulerParam*>
                 (gpuDefQueue->schedParams_->data())[gpuDefQueue->schedParamIdx_];
@@ -2170,28 +2168,31 @@ VirtualGPU::submitKernelInternal(
 
             Pal::gpusize  signalAddr = gpuDefQueue->schedParams_->vmAddress() +
                 gpuDefQueue->schedParamIdx_ * sizeof(SchedulerParam);
+            Unimplemented();
+/*
             gpuDefQueue->eventBegin(MainEngine);
-            //gpuDefQueue->iCmd()->CmdVirtualQueueDispatcherEnd(
-            //    signalAddr, loopStart, gpuDefQueue->vqHeader_->aql_slot_num /
-            //    (DeviceQueueMaskSize * maskGroups_));
-            // Note: Device enqueue can't have extra commands after INDIRECT_BUFFER call.
-            // Thus TS command for profiling has to follow in the next CB.
-            constexpr bool ForceSubmitFirst = true;
-            gpuDefQueue->eventEnd(MainEngine, gpuEvent, ForceSubmitFirst);
-
+            gpuDefQueue->cs()->VirtualQueueDispatcherEnd(
+                gpuDefQueue->vmMems(), gpuDefQueue->cal_.memCount_,
+                signalAddr, loopStart, gpuDefQueue->vqHeader_->aql_slot_num /
+                (DeviceQueueMaskSize * maskGroups_));
+            gpuDefQueue->eventEnd(MainEngine, gpuEvent);
+*/
             // Set GPU event for the used resources
             for (uint i = 0; i < memList.size(); ++i) {
                 memList[i]->setBusy(*gpuDefQueue, gpuEvent);
             }
 
             if (dev().settings().useDeviceQueue_) {
+                Unimplemented();
+/*
                 // Add the termination handshake to the host queue
                 eventBegin(MainEngine);
-                //iCmd()->CmdVirtualQueueHandshake(*gpuDefQueue->schedParams_->iMem(),
-                //    vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE,
-                //    vmParentWrap + offsetof(AmdAqlWrap, child_counter),
-                //    signalAddr, dev().settings().useDeviceQueue_);
+                cs()->VirtualQueueHandshake(gpuDefQueue->schedParams_->iMem(),
+                    vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE,
+                    vmParentWrap + offsetof(AmdAqlWrap, child_counter),
+                    signalAddr, dev().settings().useDeviceQueue_);
                 eventEnd(MainEngine, gpuEvent);
+*/
             }
 
             ++gpuDefQueue->schedParamIdx_ %=
@@ -3249,7 +3250,7 @@ VirtualGPU::writeVQueueHeader(VirtualGPU& hostQ, uint64_t kernelTable)
 {
     const static bool Wait = true;
     vqHeader_->kernel_table = kernelTable;
-    virtualQueue_->writeRawData(hostQ, 0, sizeof(AmdVQueueHeader), vqHeader_, Wait);
+    virtualQueue_->writeRawData(hostQ, sizeof(AmdVQueueHeader), vqHeader_, !Wait);
 }
 
 void
diff --git a/rocclr/runtime/device/pal/palvirtual.hpp b/rocclr/runtime/device/pal/palvirtual.hpp
index 5f9a73d303..c566a3f646 100644
--- a/rocclr/runtime/device/pal/palvirtual.hpp
+++ b/rocclr/runtime/device/pal/palvirtual.hpp
@@ -79,7 +79,7 @@ public:
 
         //! Flushes the current command buffer to HW
         //! Returns ID associated with the submission
-        uint submit(bool forceFlush);
+        uint submit();
 
         bool flush();
 
@@ -401,17 +401,15 @@ public:
     //! Returns queue, associated with VirtualGPU
     Queue& queue(EngineType id) const { return *queues_[id]; }
 
-    void flushCUCaches(bool flushL2 = false) const
+    void flushCUCaches() const
     {
         Pal::BarrierInfo barrier = {};
         barrier.pipePointWaitCount = 1;
         Pal::HwPipePoint point = Pal::HwPipePostCs;
         barrier.pPipePoints = &point;
         barrier.transitionCount = 1;
-        uint32_t    cacheMask = (flushL2) ? Pal::CoherCopy : Pal::CoherShader;
-        Pal::BarrierTransition trans = { cacheMask, cacheMask,
-            { nullptr, { { Pal::ImageAspect::Color, 0, 0 }, 0, 0 },
-            Pal::LayoutShaderRead, Pal::LayoutShaderRead}};
+        Pal::BarrierTransition trans = {Pal::CoherShader, Pal::CoherShader,
+            {nullptr, { {Pal::ImageAspect::Color, 0, 0}, 0, 0 }, Pal::LayoutShaderRead, Pal::LayoutShaderRead}};
         barrier.pTransitions = &trans;
         barrier.waitPoint = Pal::HwPipePreCs;
         iCmd()->CmdBarrier(barrier);
@@ -422,17 +420,10 @@ public:
         profileEvent(engId, Begin);
     }
 
-    void eventEnd(EngineType engId, GpuEvent& event, bool forceExec = false) const {
-        constexpr bool End = false;
-        if (forceExec) {
-            constexpr bool ForceFlush = true;
-            event.id = queues_[engId]->submit(ForceFlush);
-            profileEvent(engId, End);
-        }
-        else {
-            profileEvent(engId, End);
-            event.id = queues_[engId]->submit(GPU_FLUSH_ON_EXECUTION);
-        }
+    void eventEnd(EngineType engId, GpuEvent& event) const {
+        const static bool End = false;
+        profileEvent(engId, End);
+        event.id = queues_[engId]->submit();
         event.engineId_ = engId;
     }