From 34e526d77f3069a8df8f2b7bc9fa6be67be1f99f Mon Sep 17 00:00:00 2001
From: foreman <dl.swbuild@amd.com>
Date: Tue, 13 Aug 2019 17:43:10 -0400
Subject: [PATCH] P4 to Git Change 1982729 by gandryey@gera-win10 on 2019/08/13
 17:40:55

	SWDEV-79445 - OCL generic changes and code clean-up
	- Use the maximum number of waves per SIMD in the scratch size calculation so that kernels using the scratch buffer can execute asynchronously

Affected files ...

... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.cpp#155 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.hpp#43 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#147 edit
---
 rocclr/runtime/device/pal/paldevice.cpp  | 17 +++++++++++------
 rocclr/runtime/device/pal/paldevice.hpp  |  2 +-
 rocclr/runtime/device/pal/palvirtual.cpp |  2 +-
 3 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/rocclr/runtime/device/pal/paldevice.cpp b/rocclr/runtime/device/pal/paldevice.cpp
index 3c18de4411..d58e56a3c6 100644
--- a/rocclr/runtime/device/pal/paldevice.cpp
+++ b/rocclr/runtime/device/pal/paldevice.cpp
@@ -1996,7 +1996,9 @@ bool Device::allocScratch(uint regNum, const VirtualGPU* vgpu, uint vgprs) {
     // Calculate the size of the scratch buffer for a queue
     uint32_t numTotalCUs = properties().gfxipProperties.shaderCore.numAvailableCus;
     // Find max waves based on VGPR per SIMD
-    uint32_t numMaxWaves = properties().gfxipProperties.shaderCore.numAvailableVgprs / vgprs;
+    // Note: Select the maximum to allow possible asynchronous kernel execution,
+    // although the optimal value is (numAvailableVgprs / vgprs)
+    uint32_t numMaxWaves = properties().gfxipProperties.shaderCore.numWavefrontsPerSimd;
     // Find max waves per CU
     numMaxWaves *= properties().gfxipProperties.shaderCore.numSimdsPerCu;
     // Find max waves per device
@@ -2019,11 +2021,14 @@ bool Device::allocScratch(uint regNum, const VirtualGPU* vgpu, uint vgprs) {
         ScratchBuffer* scratchBuf = scratch_[s];
         if (scratchBuf->size_ > 0) {
           scratchBuf->destroyMemory();
-          scratchBuf->size_ = std::min(newSize, info().maxMemAllocSize_);
-          scratchBuf->size_ = std::min(newSize, uint64_t(3 * Gi));
-          // Note: Generic address space setup in HW requires 64KB alignment for scratch
-          scratchBuf->size_ = amd::alignUp(newSize, 64 * Ki);
-          scratchBuf->privateMemSize_ = regNum * sizeof(uint32_t);
+          // Adjust the size for the current queue only
+          if (s == sb) {
+            scratchBuf->size_ = std::min(newSize, info().maxMemAllocSize_);
+            scratchBuf->size_ = std::min(newSize, uint64_t(3 * Gi));
+            // Note: Generic address space setup in HW requires 64KB alignment for scratch
+            scratchBuf->size_ = amd::alignUp(newSize, 64 * Ki);
+            scratchBuf->privateMemSize_ = regNum * sizeof(uint32_t);
+          }
           scratchBuf->offset_ = offset;
           size += scratchBuf->size_;
           offset += scratchBuf->size_;
diff --git a/rocclr/runtime/device/pal/paldevice.hpp b/rocclr/runtime/device/pal/paldevice.hpp
index 160a0a554c..72e79dac75 100644
--- a/rocclr/runtime/device/pal/paldevice.hpp
+++ b/rocclr/runtime/device/pal/paldevice.hpp
@@ -250,7 +250,7 @@ class Device : public NullDevice {
     Memory* memObj_;           //!< Memory objects for scratch buffers
     uint64_t offset_;          //!< Offset from the global scratch store
     uint64_t size_;            //!< Scratch buffer size on this queue
-    uint64_t privateMemSize_;  //!< Private memory size per thread, allowed by the current scratch
+    uint32_t privateMemSize_;  //!< Private memory size per thread, allowed by the current scratch
 
     //! Default constructor
     ScratchBuffer() : memObj_(nullptr), offset_(0), size_(0), privateMemSize_(0) {}
diff --git a/rocclr/runtime/device/pal/palvirtual.cpp b/rocclr/runtime/device/pal/palvirtual.cpp
index 5e3e4dda5b..0e66799883 100644
--- a/rocclr/runtime/device/pal/palvirtual.cpp
+++ b/rocclr/runtime/device/pal/palvirtual.cpp
@@ -2444,7 +2444,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
       // Use maximum available slots for all dispatches to allow async on the same queue
       // HW value loaded into SGPR is an offset value calculated as
       // wave_slot * COMPUTE_TMPRING_SIZE.WAVESIZE
-      dispatchParam.workitemPrivateSegmentSize = scratch->privateMemSize_;
+      dispatchParam.workitemPrivateSegmentSize = std::max(hsaKernel.spillSegSize(), scratch->privateMemSize_);
     }
     dispatchParam.pCpuAqlCode = hsaKernel.cpuAqlCode();
     dispatchParam.hsaQueueVa = hsaQueueMem_->vmAddress();