diff --git a/rocclr/runtime/device/pal/paldevice.cpp b/rocclr/runtime/device/pal/paldevice.cpp index 3c18de4411..d58e56a3c6 100644 --- a/rocclr/runtime/device/pal/paldevice.cpp +++ b/rocclr/runtime/device/pal/paldevice.cpp @@ -1996,7 +1996,9 @@ bool Device::allocScratch(uint regNum, const VirtualGPU* vgpu, uint vgprs) { // Calculate the size of the scratch buffer for a queue uint32_t numTotalCUs = properties().gfxipProperties.shaderCore.numAvailableCus; // Find max waves based on VGPR per SIMD - uint32_t numMaxWaves = properties().gfxipProperties.shaderCore.numAvailableVgprs / vgprs; + // note: Select maximum to allow possible kernel async execution, + // but optimal is (numAvailableVgprs / vgprs) + uint32_t numMaxWaves = properties().gfxipProperties.shaderCore.numWavefrontsPerSimd; // Find max waves per CU numMaxWaves *= properties().gfxipProperties.shaderCore.numSimdsPerCu; // Find max waves per device @@ -2019,11 +2021,14 @@ bool Device::allocScratch(uint regNum, const VirtualGPU* vgpu, uint vgprs) { ScratchBuffer* scratchBuf = scratch_[s]; if (scratchBuf->size_ > 0) { scratchBuf->destroyMemory(); - scratchBuf->size_ = std::min(newSize, info().maxMemAllocSize_); - scratchBuf->size_ = std::min(newSize, uint64_t(3 * Gi)); - // Note: Generic address space setup in HW requires 64KB alignment for scratch - scratchBuf->size_ = amd::alignUp(newSize, 64 * Ki); - scratchBuf->privateMemSize_ = regNum * sizeof(uint32_t); + // Adjust the size for the current queue only + if (s == sb) { + scratchBuf->size_ = std::min(newSize, info().maxMemAllocSize_); + scratchBuf->size_ = std::min(newSize, uint64_t(3 * Gi)); + // Note: Generic address space setup in HW requires 64KB alignment for scratch + scratchBuf->size_ = amd::alignUp(newSize, 64 * Ki); + scratchBuf->privateMemSize_ = regNum * sizeof(uint32_t); + } scratchBuf->offset_ = offset; size += scratchBuf->size_; offset += scratchBuf->size_; diff --git a/rocclr/runtime/device/pal/paldevice.hpp b/rocclr/runtime/device/pal/paldevice.hpp index 160a0a554c..72e79dac75 100644 --- a/rocclr/runtime/device/pal/paldevice.hpp +++ b/rocclr/runtime/device/pal/paldevice.hpp @@ -250,7 +250,7 @@ class Device : public NullDevice { Memory* memObj_; //!< Memory objects for scratch buffers uint64_t offset_; //!< Offset from the global scratch store uint64_t size_; //!< Scratch buffer size on this queue - uint64_t privateMemSize_; //!< Private memory size per thread, allowed by the current scratch + uint32_t privateMemSize_; //!< Private memory size per thread, allowed by the current scratch //! Default constructor ScratchBuffer() : memObj_(nullptr), offset_(0), size_(0), privateMemSize_(0) {} diff --git a/rocclr/runtime/device/pal/palvirtual.cpp b/rocclr/runtime/device/pal/palvirtual.cpp index 5e3e4dda5b..0e66799883 100644 --- a/rocclr/runtime/device/pal/palvirtual.cpp +++ b/rocclr/runtime/device/pal/palvirtual.cpp @@ -2444,7 +2444,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const // Use maximum available slots for all dispatches to allow async on the same queue // HW value loaded into SGPR is an offset value calculated as // wave_slot * COMPUTE_TMPRING_SIZE.WAVESIZE - dispatchParam.workitemPrivateSegmentSize = scratch->privateMemSize_; + dispatchParam.workitemPrivateSegmentSize = std::max(hsaKernel.spillSegSize(), scratch->privateMemSize_); } dispatchParam.pCpuAqlCode = hsaKernel.cpuAqlCode(); dispatchParam.hsaQueueVa = hsaQueueMem_->vmAddress();