// Patch summary (unified diff against two ROCclr PAL backend files).
//
// rocclr/device/pal/palkernel.cpp -- LightningKernel::postLoad():
//   * Hunk 1: once setKernelCode(sym, &akc_) has succeeded, the kernel symbol
//     is queried with HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK and
//     the result is written into kernelHasDynamicCallStack_. postLoad()
//     returns false when that GetInfo() call reports failure.
//   * Hunk 2: right after the wavefront size is copied, the queried value is
//     stored in workGroupInfo_.usedStackSize_.
//
// rocclr/device/pal/palvirtual.cpp -- VirtualGPU::submitKernelInternal():
//   * After workitemPrivateSegmentSize is taken from hsaKernel.spillSegSize(),
//     and only when bit 0 of workGroupInfo()->usedStackSize_ is set, the size
//     is raised to at least dev().StackSize() and then capped at 16 * Ki.
//
// NOTE(review): the whole patch sits on a single physical line here; the
//   original line breaks presumably have to be restored before a patch tool
//   can apply it -- confirm against the source commit.
// NOTE(review): `reinterpret_cast(&kernelHasDynamicCallStack_)` carries no
//   template argument, which is not valid C++. The angle-bracketed target type
//   (presumably `void*`) looks like it was lost in extraction -- confirm
//   against the source commit.
// NOTE(review): usedStackSize_ receives the dynamic-call-stack query result
//   and its only visible consumer tests bit 0, so it appears to act as a flag
//   rather than a byte count -- confirm the field's intended meaning.
// NOTE(review): std::max needs both arguments to have the same type; the types
//   of dev().StackSize() and workitemPrivateSegmentSize are not visible in
//   this patch -- verify they match.
diff --git a/rocclr/device/pal/palkernel.cpp b/rocclr/device/pal/palkernel.cpp index 192a3dc109..84d4040fb1 100644 --- a/rocclr/device/pal/palkernel.cpp +++ b/rocclr/device/pal/palkernel.cpp @@ -514,7 +514,10 @@ bool LightningKernel::postLoad() { if (!setKernelCode(sym, &akc_)) { return false; } - + if (!sym->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK, + reinterpret_cast(&kernelHasDynamicCallStack_))) { + return false; + } if (!prog().isNull()) { codeSize_ = prog().codeSegGpu().owner()->getSize(); @@ -545,7 +548,7 @@ bool LightningKernel::postLoad() { // Copy wavefront size workGroupInfo_.wavefrontSize_ = device().info().wavefrontWidth_; - + workGroupInfo_.usedStackSize_ = kernelHasDynamicCallStack_; if (workGroupInfo_.size_ == 0) { return false; } diff --git a/rocclr/device/pal/palvirtual.cpp b/rocclr/device/pal/palvirtual.cpp index 10a896bc60..a1e3e92cc2 100644 --- a/rocclr/device/pal/palvirtual.cpp +++ b/rocclr/device/pal/palvirtual.cpp @@ -2678,6 +2678,13 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const dispatchParam.scratchSize = scratch->size_; dispatchParam.scratchOffset = scratch->offset_; dispatchParam.workitemPrivateSegmentSize = hsaKernel.spillSegSize(); + if ((hsaKernel.workGroupInfo()->usedStackSize_ & 0x1) == 0x1) { + dispatchParam.workitemPrivateSegmentSize = + std::max(dev().StackSize(), dispatchParam.workitemPrivateSegmentSize); + if (dispatchParam.workitemPrivateSegmentSize > 16 * Ki) { + dispatchParam.workitemPrivateSegmentSize = 16 * Ki; + } + } } dispatchParam.pCpuAqlCode = hsaKernel.cpuAqlCode(); dispatchParam.hsaQueueVa = hsaQueueMem_->vmAddress();