P4 to Git Change 1981122 by gandryey@gera-win10 on 2019/08/09 17:59:31

SWDEV-79445 - OCL generic changes and code clean-up - Allow async execution with scratch on the same queue. COMPUTE_TMPRING_SIZE.WAVESIZE should be constant across all dispatches. Affected files ... ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.cpp#151 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.hpp#40 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#145 edit
2019-08-09 18:04:15 -04:00
@@ -2011,6 +2011,8 @@ bool Device::allocScratch(uint regNum, const VirtualGPU* vgpu, uint vgprs) {
      ScopedLockVgpus lock(*this);

      scratch_[sb]->size_ = newSize;
+      scratch_[sb]->privateMemSize_ = regNum * sizeof(uint32_t);
+
      uint64_t size = 0;
      uint64_t offset = 0;

@@ -2064,8 +2066,8 @@ bool Device::allocScratch(uint regNum, const VirtualGPU* vgpu, uint vgprs) {
  return true;
 }

-bool Device::validateKernel(
-    const amd::Kernel& kernel, const device::VirtualDevice* vdev, bool coop_groups) {
+bool Device::validateKernel(const amd::Kernel& kernel, const device::VirtualDevice* vdev,
+                            bool coop_groups) {
  // Find the number of scratch registers used in the kernel
  const device::Kernel* devKernel = kernel.getDeviceKernel(*this);
  uint regNum = static_cast<uint>(devKernel->workGroupInfo()->scratchRegs_);
@@ -2239,33 +2241,30 @@ void Device::svmFree(void* ptr) const {
 }

 bool Device::AcquireExclusiveGpuAccess() {
-    // Lock the virtual GPU list
-    vgpusAccess().lock();
+  // Lock the virtual GPU list
+  vgpusAccess().lock();

-    // Find all available virtual GPUs and lock them
-    // from the execution of commands
-    for (uint idx = 0; idx < vgpus().size(); ++idx) {
-        vgpus()[idx]->execution().lock();
-        // Make sure a wait is done
-        vgpus()[idx]->WaitForIdleCompute();
-    }
-//    if (!hsa_exclusive_gpu_access_) {
-        // @todo call rocr
-//        hsa_exclusive_gpu_access_ = true;
-//    }
-    return true;
+  // Find all available virtual GPUs and lock them
+  // from the execution of commands
+  for (uint idx = 0; idx < vgpus().size(); ++idx) {
+    vgpus()[idx]->execution().lock();
+    // Make sure a wait is done
+    vgpus()[idx]->WaitForIdleCompute();
+  }
+
+  return true;
 }

 void Device::ReleaseExclusiveGpuAccess(VirtualGPU& vgpu) const {
-    vgpu.WaitForIdleCompute();
-    // Find all available virtual GPUs and unlock them
-    // for the execution of commands
-    for (uint idx = 0; idx < vgpus().size(); ++idx) {
-        vgpus()[idx]->execution().unlock();
-    }
+  vgpu.WaitForIdleCompute();
+  // Find all available virtual GPUs and unlock them
+  // for the execution of commands
+  for (uint idx = 0; idx < vgpus().size(); ++idx) {
+    vgpus()[idx]->execution().unlock();
+  }

-    // Unock the virtual GPU list
-    vgpusAccess().unlock();
+  // Unlock the virtual GPU list
+  vgpusAccess().unlock();
 }

 Device::SrdManager::~SrdManager() {
@@ -242,9 +242,10 @@ class Device : public NullDevice {
    Memory* memObj_;   //!< Memory objects for scratch buffers
    uint64_t offset_;  //!< Offset from the global scratch store
    uint64_t size_;    //!< Scratch buffer size on this queue
+    uint64_t privateMemSize_; //!< Private memory size per thread, allowed by the current scratch

    //! Default constructor
-    ScratchBuffer() : memObj_(NULL), offset_(0), size_(0) {}
+    ScratchBuffer() : memObj_(nullptr), offset_(0), size_(0), privateMemSize_(0) {}

    //! Default constructor
    ~ScratchBuffer();
@@ -2370,12 +2370,15 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
      dispatchParam.scratchAddr = scratch->memObj_->vmAddress();
      dispatchParam.scratchSize = scratch->size_;
      dispatchParam.scratchOffset = scratch->offset_;
+      // Use maximum available slots for all dispatches to allow async on the same queue
+      // HW value loaded into SGPR is an offset value calculated as
+      // wave_slot * COMPUTE_TMPRING_SIZE.WAVESIZE
+      dispatchParam.workitemPrivateSegmentSize = scratch->privateMemSize_;
    }
    dispatchParam.pCpuAqlCode = hsaKernel.cpuAqlCode();
    dispatchParam.hsaQueueVa = hsaQueueMem_->vmAddress();
    dispatchParam.wavesPerSh = (enqueueEvent != nullptr) ? enqueueEvent->profilingInfo().waves_ : 0;
    dispatchParam.useAtc = dev().settings().svmFineGrainSystem_ ? true : false;
-    dispatchParam.workitemPrivateSegmentSize = hsaKernel.spillSegSize();
    dispatchParam.kernargSegmentSize = hsaKernel.argsBufferSize();
    // Run AQL dispatch in HW
    eventBegin(MainEngine);