diff --git a/projects/clr/rocclr/runtime/device/pal/paldevice.cpp b/projects/clr/rocclr/runtime/device/pal/paldevice.cpp
index d58e56a3c6..cc0ae18e18 100644
--- a/projects/clr/rocclr/runtime/device/pal/paldevice.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/paldevice.cpp
@@ -1993,18 +1993,25 @@ bool Device::allocScratch(uint regNum, const VirtualGPU* vgpu, uint vgprs) {
       regNum = threadSizeLimit;
     }
 
+    // The algorithm below attempts to keep max possible size to allow concurrent execution,
+    // where the scratch offset will be kept constant - wave_slot * COMPUTE_TMPRING_SIZE.WAVESIZE
+
     // Calculate the size of the scratch buffer for a queue
     uint32_t numTotalCUs = properties().gfxipProperties.shaderCore.numAvailableCus;
     // Find max waves based on VGPR per SIMD
-    // note: Select maximum to allow possible kernel async execution,
-    // but optimal is (numAvailableVgprs / vgprs)
-    uint32_t numMaxWaves = properties().gfxipProperties.shaderCore.numWavefrontsPerSimd;
+    uint32_t numMaxWaves = properties().gfxipProperties.shaderCore.numAvailableVgprs / vgprs;
     // Find max waves per CU
     numMaxWaves *= properties().gfxipProperties.shaderCore.numSimdsPerCu;
     // Find max waves per device
-    numMaxWaves = std::min(settings().numScratchWavesPerCu_, numMaxWaves) * numTotalCUs;
+    numMaxWaves = std::min(settings().numScratchWavesPerCu_, numMaxWaves);
+    // Find max between current alloc and the new limit
+    numMaxWaves = std::max(numMaxWaves, scratch_[sb]->numMaxWaves_);
+    // Current private mem size
+    uint32_t privateMemSize = regNum * sizeof(uint32_t);
+    // Max between the allocation and current
+    privateMemSize = std::max(privateMemSize, scratch_[sb]->numMaxWaves_);
     uint64_t newSize =
-        static_cast<uint64_t>(info().wavefrontWidth_) * regNum * numMaxWaves * sizeof(uint32_t);
+        static_cast<uint64_t>(info().wavefrontWidth_) * privateMemSize * numMaxWaves * numTotalCUs;
 
     // Check if the current buffer isn't big enough
     if (newSize > scratch_[sb]->size_) {
@@ -2027,7 +2034,8 @@ bool Device::allocScratch(uint regNum, const VirtualGPU* vgpu, uint vgprs) {
             scratchBuf->size_ = std::min(newSize, uint64_t(3 * Gi));
             // Note: Generic address space setup in HW requires 64KB alignment for scratch
             scratchBuf->size_ = amd::alignUp(newSize, 64 * Ki);
-            scratchBuf->privateMemSize_ = regNum * sizeof(uint32_t);
+            scratchBuf->privateMemSize_ = privateMemSize;
+            scratchBuf->numMaxWaves_ = numMaxWaves;
           }
           scratchBuf->offset_ = offset;
           size += scratchBuf->size_;
diff --git a/projects/clr/rocclr/runtime/device/pal/paldevice.hpp b/projects/clr/rocclr/runtime/device/pal/paldevice.hpp
index 72e79dac75..dfb95b0433 100644
--- a/projects/clr/rocclr/runtime/device/pal/paldevice.hpp
+++ b/projects/clr/rocclr/runtime/device/pal/paldevice.hpp
@@ -251,9 +251,10 @@ class Device : public NullDevice {
     uint64_t offset_;          //!< Offset from the global scratch store
     uint64_t size_;            //!< Scratch buffer size on this queue
     uint32_t privateMemSize_;  //!< Private memory size per thread, allowed by the current scratch
+    uint32_t numMaxWaves_;     //!< The max number of waves for this scratch alloc
 
     //! Default constructor
-    ScratchBuffer() : memObj_(nullptr), offset_(0), size_(0), privateMemSize_(0) {}
+    ScratchBuffer() : memObj_(nullptr), offset_(0), size_(0), privateMemSize_(0), numMaxWaves_(0) {}
 
     //! Default constructor
     ~ScratchBuffer();