From d31b976030ea33345db25101d7fe85f989aa5da8 Mon Sep 17 00:00:00 2001
From: foreman
Date: Wed, 14 Aug 2019 13:27:56 -0400
Subject: [PATCH] P4 to Git Change 1983258 by gandryey@gera-win10 on 2019/08/14
13:22:17
SWDEV-79445 - OCL generic changes and code clean-up
- Use a more optimal algorithm for the max scratch size calculation, which adds the current wave limit to the tracking
Affected files ...
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.cpp#156 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.hpp#44 edit
[ROCm/clr commit: aa2a32975c95b409d4802f7709cf4b1f989954c5]
---
.../rocclr/runtime/device/pal/paldevice.cpp | 20 +++++++++++++------
.../rocclr/runtime/device/pal/paldevice.hpp | 3 ++-
2 files changed, 16 insertions(+), 7 deletions(-)
diff --git a/projects/clr/rocclr/runtime/device/pal/paldevice.cpp b/projects/clr/rocclr/runtime/device/pal/paldevice.cpp
index d58e56a3c6..cc0ae18e18 100644
--- a/projects/clr/rocclr/runtime/device/pal/paldevice.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/paldevice.cpp
@@ -1993,18 +1993,25 @@ bool Device::allocScratch(uint regNum, const VirtualGPU* vgpu, uint vgprs) {
regNum = threadSizeLimit;
}
+ // The algorithm below attempts to keep max possible size to allow concurrent execution,
+ // where the scratch offset will be kept constant - wave_slot * COMPUTE_TMPRING_SIZE.WAVESIZE
+
// Calculate the size of the scratch buffer for a queue
uint32_t numTotalCUs = properties().gfxipProperties.shaderCore.numAvailableCus;
// Find max waves based on VGPR per SIMD
- // note: Select maximum to allow possible kernel async execution,
- // but optimal is (numAvailableVgprs / vgprs)
- uint32_t numMaxWaves = properties().gfxipProperties.shaderCore.numWavefrontsPerSimd;
+ uint32_t numMaxWaves = properties().gfxipProperties.shaderCore.numAvailableVgprs / vgprs;
// Find max waves per CU
numMaxWaves *= properties().gfxipProperties.shaderCore.numSimdsPerCu;
// Find max waves per device
- numMaxWaves = std::min(settings().numScratchWavesPerCu_, numMaxWaves) * numTotalCUs;
+ numMaxWaves = std::min(settings().numScratchWavesPerCu_, numMaxWaves);
+ // Find max between current alloc and the new limit
+ numMaxWaves = std::max(numMaxWaves, scratch_[sb]->numMaxWaves_);
+ // Current private mem size
+ uint32_t privateMemSize = regNum * sizeof(uint32_t);
+ // Max between the allocation and current
+ privateMemSize = std::max(privateMemSize, scratch_[sb]->numMaxWaves_);
uint64_t newSize =
- static_cast(info().wavefrontWidth_) * regNum * numMaxWaves * sizeof(uint32_t);
+ static_cast(info().wavefrontWidth_) * privateMemSize * numMaxWaves * numTotalCUs;
// Check if the current buffer isn't big enough
if (newSize > scratch_[sb]->size_) {
@@ -2027,7 +2034,8 @@ bool Device::allocScratch(uint regNum, const VirtualGPU* vgpu, uint vgprs) {
scratchBuf->size_ = std::min(newSize, uint64_t(3 * Gi));
// Note: Generic address space setup in HW requires 64KB alignment for scratch
scratchBuf->size_ = amd::alignUp(newSize, 64 * Ki);
- scratchBuf->privateMemSize_ = regNum * sizeof(uint32_t);
+ scratchBuf->privateMemSize_ = privateMemSize;
+ scratchBuf->numMaxWaves_ = numMaxWaves;
}
scratchBuf->offset_ = offset;
size += scratchBuf->size_;
diff --git a/projects/clr/rocclr/runtime/device/pal/paldevice.hpp b/projects/clr/rocclr/runtime/device/pal/paldevice.hpp
index 72e79dac75..dfb95b0433 100644
--- a/projects/clr/rocclr/runtime/device/pal/paldevice.hpp
+++ b/projects/clr/rocclr/runtime/device/pal/paldevice.hpp
@@ -251,9 +251,10 @@ class Device : public NullDevice {
uint64_t offset_; //!< Offset from the global scratch store
uint64_t size_; //!< Scratch buffer size on this queue
uint32_t privateMemSize_; //!< Private memory size per thread, allowed by the current scratch
+ uint32_t numMaxWaves_; //!< The max number of waves for this scratch alloc
//! Default constructor
- ScratchBuffer() : memObj_(nullptr), offset_(0), size_(0), privateMemSize_(0) {}
+ ScratchBuffer() : memObj_(nullptr), offset_(0), size_(0), privateMemSize_(0), numMaxWaves_(0) {}
//! Default constructor
~ScratchBuffer();