diff --git a/projects/clr/rocclr/runtime/device/pal/paldevice.cpp b/projects/clr/rocclr/runtime/device/pal/paldevice.cpp
index d58e56a3c6..cc0ae18e18 100644
--- a/projects/clr/rocclr/runtime/device/pal/paldevice.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/paldevice.cpp
@@ -1993,18 +1993,25 @@ bool Device::allocScratch(uint regNum, const VirtualGPU* vgpu, uint vgprs) {
     regNum = threadSizeLimit;
   }
 
+  // The algorithm below attempts to keep the max possible size to allow concurrent execution,
+  // where the scratch offset will be kept constant - wave_slot * COMPUTE_TMPRING_SIZE.WAVESIZE
+
   // Calculate the size of the scratch buffer for a queue
   uint32_t numTotalCUs = properties().gfxipProperties.shaderCore.numAvailableCus;
   // Find max waves based on VGPR per SIMD
-  // note: Select maximum to allow possible kernel async execution,
-  // but optimal is (numAvailableVgprs / vgprs)
-  uint32_t numMaxWaves = properties().gfxipProperties.shaderCore.numWavefrontsPerSimd;
+  uint32_t numMaxWaves = properties().gfxipProperties.shaderCore.numAvailableVgprs / vgprs;
   // Find max waves per CU
   numMaxWaves *= properties().gfxipProperties.shaderCore.numSimdsPerCu;
   // Find max waves per device
-  numMaxWaves = std::min(settings().numScratchWavesPerCu_, numMaxWaves) * numTotalCUs;
+  numMaxWaves = std::min(settings().numScratchWavesPerCu_, numMaxWaves);
+  // Never go below the wave count the current allocation was sized for
+  numMaxWaves = std::max(numMaxWaves, scratch_[sb]->numMaxWaves_);
+  // Private memory size per thread requested by this call, in bytes
+  uint32_t privateMemSize = regNum * sizeof(uint32_t);
+  // Never go below the per-thread private size of the current allocation
+  privateMemSize = std::max(privateMemSize, scratch_[sb]->privateMemSize_);
   uint64_t newSize =
-      static_cast<uint64_t>(info().wavefrontWidth_) * regNum * numMaxWaves * sizeof(uint32_t);
+      static_cast<uint64_t>(info().wavefrontWidth_) * privateMemSize * numMaxWaves * numTotalCUs;
 
   // Check if the current buffer isn't big enough
   if (newSize > scratch_[sb]->size_) {
@@ -2027,7 +2034,8 @@ bool Device::allocScratch(uint regNum, const VirtualGPU* vgpu, uint vgprs) {
           scratchBuf->size_ = std::min(newSize, uint64_t(3 * Gi));
           // Note: Generic address space setup in HW requires 64KB alignment for scratch
           scratchBuf->size_ = amd::alignUp(newSize, 64 * Ki);
-          scratchBuf->privateMemSize_ = regNum * sizeof(uint32_t);
+          scratchBuf->privateMemSize_ = privateMemSize;
+          scratchBuf->numMaxWaves_ = numMaxWaves;
         }
         scratchBuf->offset_ = offset;
         size += scratchBuf->size_;
diff --git a/projects/clr/rocclr/runtime/device/pal/paldevice.hpp b/projects/clr/rocclr/runtime/device/pal/paldevice.hpp
index 72e79dac75..dfb95b0433 100644
--- a/projects/clr/rocclr/runtime/device/pal/paldevice.hpp
+++ b/projects/clr/rocclr/runtime/device/pal/paldevice.hpp
@@ -251,9 +251,10 @@ class Device : public NullDevice {
     uint64_t offset_;  //!< Offset from the global scratch store
     uint64_t size_;    //!< Scratch buffer size on this queue
     uint32_t privateMemSize_;  //!< Private memory size per thread, allowed by the current scratch
+    uint32_t numMaxWaves_;     //!< The max number of waves for this scratch alloc
 
     //! Default constructor
-    ScratchBuffer() : memObj_(nullptr), offset_(0), size_(0), privateMemSize_(0) {}
+    ScratchBuffer() : memObj_(nullptr), offset_(0), size_(0), privateMemSize_(0), numMaxWaves_(0) {}
 
     //! Default constructor
     ~ScratchBuffer();