diff --git a/rocclr/runtime/device/pal/paldevice.cpp b/rocclr/runtime/device/pal/paldevice.cpp index 101e90de41..7d6c2c94ff 100644 --- a/rocclr/runtime/device/pal/paldevice.cpp +++ b/rocclr/runtime/device/pal/paldevice.cpp @@ -2011,6 +2011,8 @@ bool Device::allocScratch(uint regNum, const VirtualGPU* vgpu, uint vgprs) { ScopedLockVgpus lock(*this); scratch_[sb]->size_ = newSize; + scratch_[sb]->privateMemSize_ = regNum * sizeof(uint32_t); + uint64_t size = 0; uint64_t offset = 0; @@ -2064,8 +2066,8 @@ bool Device::allocScratch(uint regNum, const VirtualGPU* vgpu, uint vgprs) { return true; } -bool Device::validateKernel( - const amd::Kernel& kernel, const device::VirtualDevice* vdev, bool coop_groups) { +bool Device::validateKernel(const amd::Kernel& kernel, const device::VirtualDevice* vdev, + bool coop_groups) { // Find the number of scratch registers used in the kernel const device::Kernel* devKernel = kernel.getDeviceKernel(*this); uint regNum = static_cast<uint>(devKernel->workGroupInfo()->scratchRegs_); @@ -2239,33 +2241,30 @@ void Device::svmFree(void* ptr) const { } bool Device::AcquireExclusiveGpuAccess() { - // Lock the virtual GPU list - vgpusAccess().lock(); + // Lock the virtual GPU list + vgpusAccess().lock(); - // Find all available virtual GPUs and lock them - // from the execution of commands - for (uint idx = 0; idx < vgpus().size(); ++idx) { - vgpus()[idx]->execution().lock(); - // Make sure a wait is done - vgpus()[idx]->WaitForIdleCompute(); - } -// if (!hsa_exclusive_gpu_access_) { - // @todo call rocr -// hsa_exclusive_gpu_access_ = true; -// } - return true; + // Find all available virtual GPUs and lock them + // from the execution of commands + for (uint idx = 0; idx < vgpus().size(); ++idx) { + vgpus()[idx]->execution().lock(); + // Make sure a wait is done + vgpus()[idx]->WaitForIdleCompute(); + } + + return true; } void Device::ReleaseExclusiveGpuAccess(VirtualGPU& vgpu) const { - vgpu.WaitForIdleCompute(); - // Find all available virtual GPUs 
and unlock them - // for the execution of commands - for (uint idx = 0; idx < vgpus().size(); ++idx) { - vgpus()[idx]->execution().unlock(); - } + vgpu.WaitForIdleCompute(); + // Find all available virtual GPUs and unlock them + // for the execution of commands + for (uint idx = 0; idx < vgpus().size(); ++idx) { + vgpus()[idx]->execution().unlock(); + } - // Unock the virtual GPU list - vgpusAccess().unlock(); + // Unlock the virtual GPU list + vgpusAccess().unlock(); } Device::SrdManager::~SrdManager() { diff --git a/rocclr/runtime/device/pal/paldevice.hpp b/rocclr/runtime/device/pal/paldevice.hpp index 1f51b560fb..e04abfba94 100644 --- a/rocclr/runtime/device/pal/paldevice.hpp +++ b/rocclr/runtime/device/pal/paldevice.hpp @@ -242,9 +242,10 @@ class Device : public NullDevice { Memory* memObj_; //!< Memory objects for scratch buffers uint64_t offset_; //!< Offset from the global scratch store uint64_t size_; //!< Scratch buffer size on this queue + uint64_t privateMemSize_; //!< Private memory size per thread, allowed by the current scratch //! Default constructor - ScratchBuffer() : memObj_(NULL), offset_(0), size_(0) {} + ScratchBuffer() : memObj_(nullptr), offset_(0), size_(0), privateMemSize_(0) {} //! 
Default constructor ~ScratchBuffer(); diff --git a/rocclr/runtime/device/pal/palvirtual.cpp b/rocclr/runtime/device/pal/palvirtual.cpp index 972c776cd3..281d307de3 100644 --- a/rocclr/runtime/device/pal/palvirtual.cpp +++ b/rocclr/runtime/device/pal/palvirtual.cpp @@ -2370,12 +2370,15 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const dispatchParam.scratchAddr = scratch->memObj_->vmAddress(); dispatchParam.scratchSize = scratch->size_; dispatchParam.scratchOffset = scratch->offset_; + // Use maximum available slots for all dispatches to allow async on the same queue + // HW value loaded into SGPR is an offset value calculated as + // wave_slot * COMPUTE_TMPRING_SIZE.WAVESIZE + dispatchParam.workitemPrivateSegmentSize = scratch->privateMemSize_; } dispatchParam.pCpuAqlCode = hsaKernel.cpuAqlCode(); dispatchParam.hsaQueueVa = hsaQueueMem_->vmAddress(); dispatchParam.wavesPerSh = (enqueueEvent != nullptr) ? enqueueEvent->profilingInfo().waves_ : 0; dispatchParam.useAtc = dev().settings().svmFineGrainSystem_ ? true : false; - dispatchParam.workitemPrivateSegmentSize = hsaKernel.spillSegSize(); dispatchParam.kernargSegmentSize = hsaKernel.argsBufferSize(); // Run AQL dispatch in HW eventBegin(MainEngine);