From 0a2d4e2dc36e49cc7d37d28cb97575241596c3fe Mon Sep 17 00:00:00 2001
From: foreman <dl.swbuild@amd.com>
Date: Fri, 9 Aug 2019 18:04:15 -0400
Subject: [PATCH] P4 to Git Change 1981122 by gandryey@gera-win10 on 2019/08/09
 17:59:31

	SWDEV-79445 - OCL generic changes and code clean-up
	- Allow async execution with scratch on the same queue. COMPUTE_TMPRING_SIZE.WAVESIZE should be constant across all dispatches.

Affected files ...

... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.cpp#151 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.hpp#40 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#145 edit
---
 rocclr/runtime/device/pal/paldevice.cpp  | 47 ++++++++++++------------
 rocclr/runtime/device/pal/paldevice.hpp  |  3 +-
 rocclr/runtime/device/pal/palvirtual.cpp |  5 ++-
 3 files changed, 29 insertions(+), 26 deletions(-)
diff --git a/rocclr/runtime/device/pal/paldevice.cpp b/rocclr/runtime/device/pal/paldevice.cpp
index 101e90de41..7d6c2c94ff 100644
--- a/rocclr/runtime/device/pal/paldevice.cpp
+++ b/rocclr/runtime/device/pal/paldevice.cpp
@@ -2011,6 +2011,8 @@ bool Device::allocScratch(uint regNum, const VirtualGPU* vgpu, uint vgprs) {
       ScopedLockVgpus lock(*this);
 
       scratch_[sb]->size_ = newSize;
+      scratch_[sb]->privateMemSize_ = regNum * sizeof(uint32_t);
+
       uint64_t size = 0;
       uint64_t offset = 0;
 
@@ -2064,8 +2066,8 @@ bool Device::allocScratch(uint regNum, const VirtualGPU* vgpu, uint vgprs) {
   return true;
 }
 
-bool Device::validateKernel(
-    const amd::Kernel& kernel, const device::VirtualDevice* vdev, bool coop_groups) {
+bool Device::validateKernel(const amd::Kernel& kernel, const device::VirtualDevice* vdev,
+                            bool coop_groups) {
   // Find the number of scratch registers used in the kernel
   const device::Kernel* devKernel = kernel.getDeviceKernel(*this);
   uint regNum = static_cast<uint>(devKernel->workGroupInfo()->scratchRegs_);
@@ -2239,33 +2241,30 @@ void Device::svmFree(void* ptr) const {
 }
 
 bool Device::AcquireExclusiveGpuAccess() {
-    // Lock the virtual GPU list
-    vgpusAccess().lock();
+  // Lock the virtual GPU list
+  vgpusAccess().lock();
 
-    // Find all available virtual GPUs and lock them
-    // from the execution of commands
-    for (uint idx = 0; idx < vgpus().size(); ++idx) {
-        vgpus()[idx]->execution().lock();
-        // Make sure a wait is done
-        vgpus()[idx]->WaitForIdleCompute();
-    }
-//    if (!hsa_exclusive_gpu_access_) {
-        // @todo call rocr
-//        hsa_exclusive_gpu_access_ = true;
-//    }
-    return true;
+  // Find all available virtual GPUs and lock them
+  // from the execution of commands
+  for (uint idx = 0; idx < vgpus().size(); ++idx) {
+    vgpus()[idx]->execution().lock();
+    // Make sure a wait is done
+    vgpus()[idx]->WaitForIdleCompute();
+  }
+
+  return true;
 }
 
 void Device::ReleaseExclusiveGpuAccess(VirtualGPU& vgpu) const {
-    vgpu.WaitForIdleCompute();
-    // Find all available virtual GPUs and unlock them
-    // for the execution of commands
-    for (uint idx = 0; idx < vgpus().size(); ++idx) {
-        vgpus()[idx]->execution().unlock();
-    }
+  vgpu.WaitForIdleCompute();
+  // Find all available virtual GPUs and unlock them
+  // for the execution of commands
+  for (uint idx = 0; idx < vgpus().size(); ++idx) {
+    vgpus()[idx]->execution().unlock();
+  }
 
-    // Unock the virtual GPU list
-    vgpusAccess().unlock();
+  // Unlock the virtual GPU list
+  vgpusAccess().unlock();
 }
 
 Device::SrdManager::~SrdManager() {
diff --git a/rocclr/runtime/device/pal/paldevice.hpp b/rocclr/runtime/device/pal/paldevice.hpp
index 1f51b560fb..e04abfba94 100644
--- a/rocclr/runtime/device/pal/paldevice.hpp
+++ b/rocclr/runtime/device/pal/paldevice.hpp
@@ -242,9 +242,10 @@ class Device : public NullDevice {
     Memory* memObj_;   //!< Memory objects for scratch buffers
     uint64_t offset_;  //!< Offset from the global scratch store
     uint64_t size_;    //!< Scratch buffer size on this queue
+    uint64_t privateMemSize_; //!< Private memory size per thread in bytes, allowed by the current scratch
 
     //! Default constructor
-    ScratchBuffer() : memObj_(NULL), offset_(0), size_(0) {}
+    ScratchBuffer() : memObj_(nullptr), offset_(0), size_(0), privateMemSize_(0) {}
 
     //! Default constructor
     ~ScratchBuffer();
diff --git a/rocclr/runtime/device/pal/palvirtual.cpp b/rocclr/runtime/device/pal/palvirtual.cpp
index 972c776cd3..281d307de3 100644
--- a/rocclr/runtime/device/pal/palvirtual.cpp
+++ b/rocclr/runtime/device/pal/palvirtual.cpp
@@ -2370,12 +2370,15 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
       dispatchParam.scratchAddr = scratch->memObj_->vmAddress();
       dispatchParam.scratchSize = scratch->size_;
       dispatchParam.scratchOffset = scratch->offset_;
+      // Use maximum available slots for all dispatches to allow async execution on the
+      // same queue. The HW value loaded into SGPR is an offset value calculated as
+      // wave_slot * COMPUTE_TMPRING_SIZE.WAVESIZE
+      dispatchParam.workitemPrivateSegmentSize = scratch->privateMemSize_;
     }
     dispatchParam.pCpuAqlCode = hsaKernel.cpuAqlCode();
     dispatchParam.hsaQueueVa = hsaQueueMem_->vmAddress();
     dispatchParam.wavesPerSh = (enqueueEvent != nullptr) ? enqueueEvent->profilingInfo().waves_ : 0;
     dispatchParam.useAtc = dev().settings().svmFineGrainSystem_ ? true : false;
-    dispatchParam.workitemPrivateSegmentSize = hsaKernel.spillSegSize();
     dispatchParam.kernargSegmentSize = hsaKernel.argsBufferSize();
     // Run AQL dispatch in HW
     eventBegin(MainEngine);