From 0a2d4e2dc36e49cc7d37d28cb97575241596c3fe Mon Sep 17 00:00:00 2001
From: foreman
Date: Fri, 9 Aug 2019 18:04:15 -0400
Subject: [PATCH] P4 to Git Change 1981122 by gandryey@gera-win10 on 2019/08/09
17:59:31
SWDEV-79445 - OCL generic changes and code clean-up
- Allow async execution with scratch on the same queue. COMPUTE_TMPRING_SIZE.WAVESIZE should be constant across all dispatches.
Affected files ...
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.cpp#151 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.hpp#40 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#145 edit
---
rocclr/runtime/device/pal/paldevice.cpp | 47 ++++++++++++------------
rocclr/runtime/device/pal/paldevice.hpp | 3 +-
rocclr/runtime/device/pal/palvirtual.cpp | 5 ++-
3 files changed, 29 insertions(+), 26 deletions(-)
diff --git a/rocclr/runtime/device/pal/paldevice.cpp b/rocclr/runtime/device/pal/paldevice.cpp
index 101e90de41..7d6c2c94ff 100644
--- a/rocclr/runtime/device/pal/paldevice.cpp
+++ b/rocclr/runtime/device/pal/paldevice.cpp
@@ -2011,6 +2011,8 @@ bool Device::allocScratch(uint regNum, const VirtualGPU* vgpu, uint vgprs) {
ScopedLockVgpus lock(*this);
scratch_[sb]->size_ = newSize;
+ scratch_[sb]->privateMemSize_ = regNum * sizeof(uint32_t);
+
uint64_t size = 0;
uint64_t offset = 0;
@@ -2064,8 +2066,8 @@ bool Device::allocScratch(uint regNum, const VirtualGPU* vgpu, uint vgprs) {
return true;
}
-bool Device::validateKernel(
- const amd::Kernel& kernel, const device::VirtualDevice* vdev, bool coop_groups) {
+bool Device::validateKernel(const amd::Kernel& kernel, const device::VirtualDevice* vdev,
+ bool coop_groups) {
// Find the number of scratch registers used in the kernel
const device::Kernel* devKernel = kernel.getDeviceKernel(*this);
uint regNum = static_cast<uint>(devKernel->workGroupInfo()->scratchRegs_);
@@ -2239,33 +2241,30 @@ void Device::svmFree(void* ptr) const {
}
bool Device::AcquireExclusiveGpuAccess() {
- // Lock the virtual GPU list
- vgpusAccess().lock();
+ // Lock the virtual GPU list
+ vgpusAccess().lock();
- // Find all available virtual GPUs and lock them
- // from the execution of commands
- for (uint idx = 0; idx < vgpus().size(); ++idx) {
- vgpus()[idx]->execution().lock();
- // Make sure a wait is done
- vgpus()[idx]->WaitForIdleCompute();
- }
-// if (!hsa_exclusive_gpu_access_) {
- // @todo call rocr
-// hsa_exclusive_gpu_access_ = true;
-// }
- return true;
+ // Find all available virtual GPUs and lock them
+ // from the execution of commands
+ for (uint idx = 0; idx < vgpus().size(); ++idx) {
+ vgpus()[idx]->execution().lock();
+ // Make sure a wait is done
+ vgpus()[idx]->WaitForIdleCompute();
+ }
+
+ return true;
}
void Device::ReleaseExclusiveGpuAccess(VirtualGPU& vgpu) const {
- vgpu.WaitForIdleCompute();
- // Find all available virtual GPUs and unlock them
- // for the execution of commands
- for (uint idx = 0; idx < vgpus().size(); ++idx) {
- vgpus()[idx]->execution().unlock();
- }
+ vgpu.WaitForIdleCompute();
+ // Find all available virtual GPUs and unlock them
+ // for the execution of commands
+ for (uint idx = 0; idx < vgpus().size(); ++idx) {
+ vgpus()[idx]->execution().unlock();
+ }
- // Unock the virtual GPU list
- vgpusAccess().unlock();
+ // Unlock the virtual GPU list
+ vgpusAccess().unlock();
}
Device::SrdManager::~SrdManager() {
diff --git a/rocclr/runtime/device/pal/paldevice.hpp b/rocclr/runtime/device/pal/paldevice.hpp
index 1f51b560fb..e04abfba94 100644
--- a/rocclr/runtime/device/pal/paldevice.hpp
+++ b/rocclr/runtime/device/pal/paldevice.hpp
@@ -242,9 +242,10 @@ class Device : public NullDevice {
Memory* memObj_; //!< Memory objects for scratch buffers
uint64_t offset_; //!< Offset from the global scratch store
uint64_t size_; //!< Scratch buffer size on this queue
+ uint64_t privateMemSize_; //!< Private memory size per thread, allowed by the current scratch
//! Default constructor
- ScratchBuffer() : memObj_(NULL), offset_(0), size_(0) {}
+ ScratchBuffer() : memObj_(nullptr), offset_(0), size_(0), privateMemSize_(0) {}
//! Default constructor
~ScratchBuffer();
diff --git a/rocclr/runtime/device/pal/palvirtual.cpp b/rocclr/runtime/device/pal/palvirtual.cpp
index 972c776cd3..281d307de3 100644
--- a/rocclr/runtime/device/pal/palvirtual.cpp
+++ b/rocclr/runtime/device/pal/palvirtual.cpp
@@ -2370,12 +2370,15 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
dispatchParam.scratchAddr = scratch->memObj_->vmAddress();
dispatchParam.scratchSize = scratch->size_;
dispatchParam.scratchOffset = scratch->offset_;
+ // Use maximum available slots for all dispatches to allow async on the same queue
+ // HW value loaded into SGPR is an offset value calculated as
+ // wave_slot * COMPUTE_TMPRING_SIZE.WAVESIZE
+ dispatchParam.workitemPrivateSegmentSize = scratch->privateMemSize_;
}
dispatchParam.pCpuAqlCode = hsaKernel.cpuAqlCode();
dispatchParam.hsaQueueVa = hsaQueueMem_->vmAddress();
dispatchParam.wavesPerSh = (enqueueEvent != nullptr) ? enqueueEvent->profilingInfo().waves_ : 0;
dispatchParam.useAtc = dev().settings().svmFineGrainSystem_ ? true : false;
- dispatchParam.workitemPrivateSegmentSize = hsaKernel.spillSegSize();
dispatchParam.kernargSegmentSize = hsaKernel.argsBufferSize();
// Run AQL dispatch in HW
eventBegin(MainEngine);