From 8fa184db38a510ab1d2b6aeb7450dc40a7fefeea Mon Sep 17 00:00:00 2001
From: foreman <dl.constructicon@amd.com>
Date: Tue, 12 Aug 2014 18:49:08 -0400
Subject: [PATCH] P4 to Git Change 1065597 by gandryey@gera-dev-w7 on
 2014/08/12 18:38:45

	ECR #304775 - Device enqueuing
	- Provide the scratch buffer offset for the generic address space
	- Use a single global scratch buffer for all available queues; each queue gets its own unique sub-buffer within the global buffer

Affected files ...

... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudevice.cpp#454 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudevice.hpp#129 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpusched.hpp#11 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuschedcl.cpp#24 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.cpp#329 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.hpp#120 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp#63 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gslbe/src/rt/GSLContext.h#37 edit
---
 rocclr/runtime/device/gpu/gpudevice.cpp       | 78 ++++++++++++++-----
 rocclr/runtime/device/gpu/gpudevice.hpp       | 10 ++-
 rocclr/runtime/device/gpu/gpusched.hpp        |  2 +-
 rocclr/runtime/device/gpu/gpuschedcl.cpp      | 20 ++---
 rocclr/runtime/device/gpu/gpuvirtual.cpp      | 19 ++---
 rocclr/runtime/device/gpu/gpuvirtual.hpp      |  1 -
 .../device/gpu/gslbe/src/rt/GSLContext.cpp    |  5 +-
 .../device/gpu/gslbe/src/rt/GSLContext.h      |  2 +-
 8 files changed, 93 insertions(+), 44 deletions(-)

diff --git a/rocclr/runtime/device/gpu/gpudevice.cpp b/rocclr/runtime/device/gpu/gpudevice.cpp
index f8e5c32a66..036a75a995 100644
--- a/rocclr/runtime/device/gpu/gpudevice.cpp
+++ b/rocclr/runtime/device/gpu/gpudevice.cpp
@@ -373,6 +373,7 @@ Device::Device()
     , resourceCache_(NULL)
     , heapInitComplete_(false)
     , xferQueue_(NULL)
+    , globalScratchBuf_(NULL)
     , srdManager_(NULL)
 {
 }
@@ -389,6 +390,8 @@ Device::~Device()
         scratch_[s] = NULL;
     }
 
+    delete globalScratchBuf_;
+
     // Destroy transfer queue
     delete xferQueue_;
 
@@ -2273,7 +2276,6 @@ Device::ScratchBuffer::destroyMemory()
         delete memObjs_[i];
         memObjs_[i] = NULL;
     }
-    regNum_ = 0;
 }
 
 bool
@@ -2282,30 +2284,63 @@ Device::allocScratch(uint regNum, const VirtualGPU* vgpu)
     if (regNum > 0) {
         // Serialize the scratch buffer allocation code
         amd::ScopedLock lk(*lockAsyncOps_);
-        uint    s = vgpu->hwRing();
+        uint    sb = vgpu->hwRing();
 
         // Check if the current buffer isn't big enough
-        if (regNum > scratch_[s]->regNum_) {
+        if (regNum > scratch_[sb]->regNum_) {
             // Stall all command queues, since runtime will reallocate memory
             ScopedLockVgpus lock(*this);
-            std::vector<Memory*>& mems = scratch_[s]->memObjs_;
 
-            // Calculate the size of the new buffer
-            size_t size = calcScratchBufferSize(regNum);
+            scratch_[sb]->regNum_ = regNum;
+            size_t size = 0;
+            uint offset = 0;
 
-            scratch_[s]->destroyMemory();
-
-            // Loop through all memory objects and reallocate them
-            for (uint i = 0; i < mems.size(); ++i) {
-                // Allocate new buffer
-                mems[i] = new gpu::Memory(*this, size);
-                if ((mems[i] == NULL) || !mems[i]->create(Resource::Scratch)) {
-                    LogError("Couldn't allocate scratch memory");
-                    scratch_[s]->regNum_ = 0;
-                    return false;
+            // Destroy all views
+            for (uint s = 0; s < scratch_.size(); ++s) {
+                ScratchBuffer*  scratchBuf = scratch_[s];
+                if (scratchBuf->regNum_ > 0) {
+                    scratchBuf->destroyMemory();
+                    // Calculate the size of the scratch buffer for a queue
+                    scratchBuf->size_ = calcScratchBufferSize(scratchBuf->regNum_);
+                    scratchBuf->offset_ = offset;
+                    size += scratchBuf->size_ * scratchBuf->memObjs_.size();
+                    offset += scratchBuf->size_;
+                }
+            }
+
+            delete globalScratchBuf_;
+
+            // Allocate new buffer.
+            globalScratchBuf_ = new gpu::Memory(*this, size);
+            if ((globalScratchBuf_ == NULL) ||
+                !globalScratchBuf_->create(Resource::Scratch)) {
+                LogError("Couldn't allocate scratch memory");
+                for (uint s = 0; s < scratch_.size(); ++s) {
+                    scratch_[s]->regNum_ = 0;
+                }
+                return false;
+            }
+
+            for (uint s = 0; s < scratch_.size(); ++s) {
+                std::vector<Memory*>& mems = scratch_[s]->memObjs_;
+
+                // Loop through all memory objects and reallocate them
+                for (uint i = 0; i < mems.size(); ++i) {
+                    if (scratch_[s]->regNum_ > 0) {
+                        // Allocate new buffer
+                        mems[i] = new gpu::Memory(*this, scratch_[s]->size_);
+                        Resource::ViewParams    view;
+                        view.resource_ = globalScratchBuf_;
+                        view.offset_ = scratch_[s]->offset_ + i * scratch_[s]->size_;
+                        view.size_ = scratch_[s]->size_;
+                        if ((mems[i] == NULL) || !mems[i]->create(Resource::View, &view)) {
+                            LogError("Couldn't allocate a scratch view");
+                            scratch_[s]->regNum_ = 0;
+                            return false;
+                        }
+                    }
                 }
             }
-            scratch_[s]->regNum_ = regNum;
         }
     }
     return true;
@@ -2341,8 +2376,13 @@ Device::validateKernel(const amd::Kernel& kernel, const device::VirtualDevice* v
 void
 Device::destroyScratchBuffers()
 {
-    for (uint s = 0; s < scratch_.size(); ++s) {
-        scratch_[s]->destroyMemory();
+    if (globalScratchBuf_ != NULL) {
+        for (uint s = 0; s < scratch_.size(); ++s) {
+            scratch_[s]->destroyMemory();
+            scratch_[s]->regNum_ = 0;
+        }
+        delete globalScratchBuf_;
+        globalScratchBuf_ = NULL;
     }
 }
 
diff --git a/rocclr/runtime/device/gpu/gpudevice.hpp b/rocclr/runtime/device/gpu/gpudevice.hpp
index e7f384ecd5..ffcc5f056a 100644
--- a/rocclr/runtime/device/gpu/gpudevice.hpp
+++ b/rocclr/runtime/device/gpu/gpudevice.hpp
@@ -297,9 +297,11 @@ public:
     {
         uint    regNum_;    //!< The number of used scratch registers
         std::vector<Memory*>   memObjs_;   //!< Memory objects for scratch buffers
+        uint    offset_;    //!< Offset from the global scratch store
+        uint    size_;      //!< Scratch buffer size on this queue
 
         //! Default constructor
-        ScratchBuffer(uint numMems): regNum_(0), memObjs_(numMems) {}
+        ScratchBuffer(uint numMems): regNum_(0), memObjs_(numMems), offset_(0) {}
 
         //! Default constructor
         ~ScratchBuffer();
@@ -524,6 +526,9 @@ public:
 
     const ScratchBuffer* scratch(uint idx) const { return scratch_[idx]; }
 
+    //! Returns the global scratch buffer
+    Memory* globalScratchBuf() const { return globalScratchBuf_; };
+
     //! Destroys scratch buffer memory
     void destroyScratchBuffers();
 
@@ -613,9 +618,10 @@ private:
     std::vector<amd::Memory*>*  mapCache_;  //!< Map cache info structure
     ResourceCache*  resourceCache_; //!< CAL resource cache
     Engines         engines_;       //!< Available engines on device
-    bool            heapInitComplete_; //!< Keep track of initialization status of heap resources
+    bool            heapInitComplete_;  //!< Keep track of initialization status of heap resources
     VirtualGPU*     xferQueue_;     //!< Transfer queue
     std::vector<ScratchBuffer*> scratch_;   //!< Scratch buffers for kernels
+    Memory*         globalScratchBuf_;  //!< Global scratch buffer
     SrdManager*     srdManager_;    //!< SRD manager object
 
     static AppProfile appProfile_; //!< application profile
diff --git a/rocclr/runtime/device/gpu/gpusched.hpp b/rocclr/runtime/device/gpu/gpusched.hpp
index bb456c3b60..5ace3cb0c7 100644
--- a/rocclr/runtime/device/gpu/gpusched.hpp
+++ b/rocclr/runtime/device/gpu/gpusched.hpp
@@ -68,7 +68,7 @@ struct SchedulerParam {
     uint32_t    releaseHostCP;  //!< Releases CP on the host queue
     uint64_t    parentAQL;      //!< Host parent AmdAqlWrap packet
     uint32_t    dedicatedQueue; //!< Scheduler uses a dedicated queue
-    uint32_t    reserved;       //!< Reserved field
+    uint32_t    scratchOffset;  //!< Scratch buffer offset
 };
 
 } // namespace gpu
diff --git a/rocclr/runtime/device/gpu/gpuschedcl.cpp b/rocclr/runtime/device/gpu/gpuschedcl.cpp
index 98be8f0c39..c08a88e02c 100644
--- a/rocclr/runtime/device/gpu/gpuschedcl.cpp
+++ b/rocclr/runtime/device/gpu/gpuschedcl.cpp
@@ -92,7 +92,7 @@ typedef struct _SchedulerParam {
     uint    releaseHostCP;  //!< Releases CP on the host queue
     ulong   parentAQL;      //!< Host parent AmdAqlWrap packet
     uint    dedicatedQueue; //!< Scheduler uses a dedicated queue
-    uint    reserved;       //!< Reserved field
+    uint    scratchOffset;  //!< Scratch buffer offset
 } SchedulerParam;
 
 typedef struct _HwDispatch {
@@ -152,7 +152,7 @@ typedef struct _HwDispatch {
     uint    shPrivateHi;    // 0x00000000 ---- dstAddressHi
     uint    user4;          // 0xC0027602 -- TYPE 3, SET_SH_REG, TYPE:COMPUTE (2 values)
     uint    offsUser4;      // 0x00000248 ---- OFFSET
-    uint    privOffs;       // 0x00000000 ---- COMPUTE_USER_DATA_10: DATA = 0x0
+    uint    scratchOffs;    // 0x00000000 ---- COMPUTE_USER_DATA_10: DATA = 0x0
     uint    privSize;       // 0x00000030 ---- COMPUTE_USER_DATA_11: DATA = 0x30
     uint    packet4;        // 0xC0031502 -- TYPE 3, DISPATCH_DIRECT, TYPE:COMPUTE
     uint    glbSizeX;       // 0x00000000
@@ -170,10 +170,11 @@ static inline void
 dispatch(
     volatile __global HwDispatch*   dispatch,
     __global HsaAqlDispatchPacket*  aqlPkt,
-    uint                            scratchSize,
-    uint                            numMaxWaves,
-    ulong                           scratch,
-    ulong                           hsaQueue)
+    ulong   scratch,
+    ulong   hsaQueue,
+    uint    scratchSize,
+    uint    scratchOffset,
+    uint    numMaxWaves)
 {
     const uint UsrRegOffset = 0x240;
     const uint Pm4Nop = 0xC0001002;
@@ -258,8 +259,9 @@ dispatch(
     // flatScratchEna = (flags & 0x20);
     if (flags & 0x20) {
         dispatch->copyData = Pm4CopyReg;
-        dispatch->scratchAddrLo = (uint)(scratch >> 16);
+        dispatch->scratchAddrLo = (uint)((scratch - scratchOffset) >> 16);
         dispatch->offsUser4 = UsrRegOffset + usrRegCnt;
+        dispatch->scratchOffs = scratchOffset;
         dispatch->privSize = privateSize;
     }
     else {
@@ -421,8 +423,8 @@ scheduler(
                                 (__hsail_get_clock() * (ulong)param->eng_clk) >> 10;
                         }
                         // Launch child kernel ....
-                        dispatch(hwDisp, &disp->aql, param->scratchSize, param->numMaxWaves,
-                            param->scratch, param->hsa_queue);
+                        dispatch(hwDisp, &disp->aql, param->scratch, param->hsa_queue,
+                            param->scratchSize, param->scratchOffset, param->numMaxWaves);
                         disp->state = AQL_WRAP_BUSY;
                         releaseWaitEvents((__global AmdEvent**)(disp->wait_list),
                             disp->wait_num, (__global uint*)queue->event_slot_mask,
diff --git a/rocclr/runtime/device/gpu/gpuvirtual.cpp b/rocclr/runtime/device/gpu/gpuvirtual.cpp
index 730c6e744e..3ddeb86371 100644
--- a/rocclr/runtime/device/gpu/gpuvirtual.cpp
+++ b/rocclr/runtime/device/gpu/gpuvirtual.cpp
@@ -377,7 +377,6 @@ VirtualGPU::VirtualGPU(
     , numVmMems_(0)
     , dmaFlushMgmt_(device)
     , numGrpCb_(NULL)
-    , scratchRegNum_(0)
     , hwRing_(0)
     , readjustTimeGPU_(0)
     , currTs_(NULL)
@@ -645,7 +644,7 @@ VirtualGPU::~VirtualGPU()
     //!@note OCLtst uses single device with multiple tests
     //! Release memory only if it's the last command queue.
     //! The first queue is reserved for the transfers on device
-    if ((scratchRegNum_ > 0) && (gpuDevice_.numOfVgpus_ <= 1)) {
+    if (gpuDevice_.numOfVgpus_ <= 1) {
         gpuDevice_.destroyScratchBuffers();
     }
 
@@ -1736,12 +1735,14 @@ VirtualGPU::submitKernelInternalHSA(
     }
 
     gslMemObject    scratch = NULL;
+    uint            scratchOffset = 0;
     // Check if the device allocated more registers than the old setup
     if (hsaKernel.workGroupInfo()->scratchRegs_ > 0) {
-        const std::vector<Memory*>& mems = dev().scratch(hwRing())->memObjs_;
+        const Device::ScratchBuffer* scratchObj = dev().scratch(hwRing());
+        const std::vector<Memory*>& mems = scratchObj->memObjs_;
         scratch = mems[0]->gslResource();
         memList.push_back(mems[0]);
-        scratchRegNum_ = dev().scratch(hwRing())->regNum_;
+        scratchOffset = scratchObj->offset_;
     }
 
     // Add GSL handle to the memory list for VidMM
@@ -1752,7 +1753,7 @@ VirtualGPU::submitKernelInternalHSA(
     GpuEvent    gpuEvent;
     // Run AQL dispatch in HW
     runAqlDispatch(gpuEvent, aqlPkt, vmMems(), cal_.memCount_,
-        scratch, hsaKernel.cpuAqlCode(), hsaQueueMem_->vmAddress());
+        scratch, scratchOffset, hsaKernel.cpuAqlCode(), hsaQueueMem_->vmAddress());
 
     if (hsaKernel.dynamicParallelism()) {
         // Make sure exculsive access to the device queue
@@ -1884,12 +1885,14 @@ VirtualGPU::submitKernelInternalHSA(
             param->scratchSize = scratchBuf->size();
             param->scratch = scratchBuf->vmAddress();
             param->numMaxWaves = 32 * dev().info().maxComputeUnits_;
+            param->scratchOffset = dev().scratch(gpuDefQueue->hwRing())->offset_;
             memList.push_back(scratchBuf);
         }
         else {
             param->numMaxWaves = 0;
             param->scratchSize = 0;
             param->scratch = 0;
+            param->scratchOffset = 0;
         }
 
         // Add all kernels in the program to the mem list.
@@ -2180,7 +2183,6 @@ VirtualGPU::releaseMemory(gslMemObject gslResource, bool wait)
         for (uint i = 0; i < mems.size(); ++i) {
             if ((mems[i] != NULL) && (mems[i]->gslResource() == gslResource)) {
                 setScratchBuffer(NULL, i);
-                scratchRegNum_ = 0;
             }
         }
     }
@@ -2986,14 +2988,13 @@ VirtualGPU::waitEventLock(CommandBatch* cb)
 void
 VirtualGPU::validateScratchBuffer(const Kernel* kernel)
 {
-    // Check if the device allocated more registers than the old setup
-    if (dev().scratch(hwRing())->regNum_ > scratchRegNum_) {
+    // Check if a scratch buffer is required
+    if (dev().scratch(hwRing())->regNum_ > 0) {
         const std::vector<Memory*>& mems = dev().scratch(hwRing())->memObjs_;
         for (uint i = 0; i < mems.size(); ++i) {
             // Setup scratch buffer
             setScratchBuffer(mems[i]->gslResource(), i);
         }
-        scratchRegNum_ = dev().scratch(hwRing())->regNum_;
     }
 }
 
diff --git a/rocclr/runtime/device/gpu/gpuvirtual.hpp b/rocclr/runtime/device/gpu/gpuvirtual.hpp
index 93b09a5a0f..79e070bb33 100644
--- a/rocclr/runtime/device/gpu/gpuvirtual.hpp
+++ b/rocclr/runtime/device/gpu/gpuvirtual.hpp
@@ -519,7 +519,6 @@ private:
     CommandBatchList    cbList_;        //!< List of command batches
 
     ConstBuffer*    numGrpCb_;      //!< Constant buffer for 8xx workaround
-    uint            scratchRegNum_; //!< Number of scratch registers used in this queue
     uint            hwRing_;        //!< HW ring used on this virtual device
 
     uint64_t        readjustTimeGPU_;   //!< Readjust time between GPU and CPU timestamps
diff --git a/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp b/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp
index 303ff0c120..6c657344e0 100644
--- a/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp
+++ b/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp
@@ -1266,10 +1266,11 @@ CALGSLContext::writeTimer(bool sdma, const gslMemObject mem, uint32 offset) cons
 
 void
 CALGSLContext::runAqlDispatch(GpuEvent& event, const void* aqlPacket,
-     const gslMemObject* mems, uint32 numMems, gslMemObject scratch, const void* cpuKernelCode, uint64 hsaQueueVA)
+    const gslMemObject* mems, uint32 numMems, gslMemObject scratch, uint32 scratchOffset,
+    const void* cpuKernelCode, uint64 hsaQueueVA)
 {
     eventBegin(MainEngine);
-    m_cs->AqlDispatch(aqlPacket, mems, numMems, scratch, cpuKernelCode, hsaQueueVA);
+    m_cs->AqlDispatch(aqlPacket, mems, numMems, scratch, scratchOffset, cpuKernelCode, hsaQueueVA);
     eventEnd(MainEngine, event);
 }
 
diff --git a/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.h b/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.h
index 2f7cc62cea..cd02a46bc3 100644
--- a/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.h
+++ b/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.h
@@ -44,7 +44,7 @@ public:
     bool             runProgramGrid(GpuEvent& event, const ProgramGrid* pProgramGrid, const gslMemObject* mems, uint32 numMems);
     bool             runProgramVideoDecode(GpuEvent& event, gslMemObject mo, const CALprogramVideoDecode& decode);
     void             runAqlDispatch(GpuEvent& event, const void* aqlPacket, const gslMemObject* mems,
-                        uint32 numMems, gslMemObject scratch, const void* cpuKernelCode, uint64 hsaQueueVA);
+                        uint32 numMems, gslMemObject scratch, uint32 scratchOffset, const void* cpuKernelCode, uint64 hsaQueueVA);
     mcaddr           virtualQueueDispatcherStart();
     void             virtualQueueDispatcherEnd(GpuEvent& event, const gslMemObject* mems, uint32 numMems, mcaddr signal, mcaddr loopStart);
     void             virtualQueueHandshake(GpuEvent& event, const gslMemObject mem, mcaddr parentState,