diff --git a/projects/clr/rocclr/device/device.cpp b/projects/clr/rocclr/device/device.cpp index 423b40312b..490f226cee 100644 --- a/projects/clr/rocclr/device/device.cpp +++ b/projects/clr/rocclr/device/device.cpp @@ -739,6 +739,20 @@ bool Device::ValidateHsail() { return true; } +// Returns the max stack size for an ISA processor name. Names containing gfx8/gfx9 map to +// kMaxStackSize9X, gfx10/gfx11 to kMaxStackSize11X, and every other name to kMaxStackSize12X. +size_t GetMaxStackSize(const std::string& procName) { + if (procName.find("gfx9") != std::string::npos || procName.find("gfx8") + != std::string::npos) { + return kMaxStackSize9X; + } else if (procName.find("gfx11") != std::string::npos || procName.find("gfx10") + != std::string::npos) { + return kMaxStackSize11X; + } else { + return kMaxStackSize12X; + } +} + bool Device::create(const Isa &isa) { assert(!vaCacheAccess_ && !vaCacheMap_); isa_ = &isa; @@ -755,6 +769,7 @@ bool Device::create(const Isa &isa) { if (!amd::IS_HIP) { stack_size_ = 16 * Ki; } + maxStackSize_ = GetMaxStackSize(isa_->processorName()); return true; } @@ -930,7 +945,8 @@ bool Device::disableP2P(amd::Device* ptrDev) { } bool Device::UpdateStackSize(uint64_t stackSize) { - if (stackSize > kMaxStackSize) { + // stack_size_ is rounded up to 16 below, so the rounded value must fit the limit as well. + if (stackSize > maxStackSize_ || amd::alignUp(stackSize, 16) > maxStackSize_) { return false; } stack_size_ = amd::alignUp(stackSize, 16); diff --git a/projects/clr/rocclr/device/device.hpp b/projects/clr/rocclr/device/device.hpp index bc08ea9775..eb4627b9ea 100644 --- a/projects/clr/rocclr/device/device.hpp +++ b/projects/clr/rocclr/device/device.hpp @@ -126,6 +126,16 @@ enum MemRangeAttribute : uint32_t { constexpr int CpuDeviceId = static_cast(-1); constexpr int InvalidDeviceId = static_cast(-2); +// Max scratch size is device dependent. 
+constexpr size_t kWave32 = 32; +constexpr size_t kWave64 = 64; +constexpr size_t kScratchBits12X = 18; +constexpr size_t kScratchBits9X = 15; +constexpr size_t kCompilerRequired = 64; +constexpr size_t kMaxStackSize12X = (((1 << kScratchBits12X) - 1) * 256 / kWave32) - kCompilerRequired; // = 2097080 +constexpr size_t kMaxStackSize11X = (((1 << kScratchBits9X) - 1) * 256 / kWave32) - kCompilerRequired; // = 262072 (9X bit width, wave32 divisor) +constexpr size_t kMaxStackSize9X = (((1 << kScratchBits9X) - 1) * 256 / kWave64) - kCompilerRequired; // = 131004 + enum class ExternalSemaphoreHandleType : uint32_t { OpaqueFd = 1, // Handle is an opaque file descriptor OpaqueWin32 = 2, // Handle is an opaque shared NT handle @@ -1652,11 +1662,9 @@ class Device : public RuntimeObject { static constexpr size_t kMGInfoSizePerDevice = kMGSyncDataSize + sizeof(MGSyncInfo); static constexpr size_t kSGInfoSize = kMGSyncDataSize; - // Amount of space used by each wave is in units of 256 dwords. - // As per COMPUTE_TMPRING_SIZE.WAVE_SIZE 24:12 - // The field size supports a range of 0->(2M-256) dwords per wave64. - // Per lane this works out to 131056 bytes or 128K - 16 - static constexpr size_t kMaxStackSize = ((128 * Ki) - 16); + // Max scratch (stack) size depends on the ISA, so it is stored per device. + // Defaults to kMaxStackSize9X (smallest limit); Device::create() sets the real value. + size_t maxStackSize_ = kMaxStackSize9X; typedef std::list CommandQueues; @@ -2131,6 +2139,9 @@ class Device : public RuntimeObject { return nullptr; } + //! 
Returns the maximum stack size allowed for the device + size_t MaxStackSize() const { return maxStackSize_; } + #if defined(__clang__) #if __has_feature(address_sanitizer) virtual device::UriLocator* createUriLocator() const = 0; diff --git a/projects/clr/rocclr/device/pal/palvirtual.cpp b/projects/clr/rocclr/device/pal/palvirtual.cpp index d12cc72ccc..4a6dc96bbc 100644 --- a/projects/clr/rocclr/device/pal/palvirtual.cpp +++ b/projects/clr/rocclr/device/pal/palvirtual.cpp @@ -2752,6 +2752,15 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, if ((hsaKernel.workGroupInfo()->usedStackSize_ & 0x1) == 0x1) { privateMemSize = std::max(static_cast(device().StackSize()), hsaKernel.workGroupInfo()->scratchRegs_ * sizeof(uint32_t)) ; + // Fail the dispatch when privateMemSize exceeds the device's max stack size. + size_t maxStackSize = device().MaxStackSize(); + if (privateMemSize > maxStackSize) { + ClPrint(amd::LOG_INFO, amd::LOG_KERN, + "Scratch size (%zu) exceeds max allowed (%zu) for kernel : %s", + privateMemSize, maxStackSize, hsaKernel.name().c_str()); + LogError("Scratch size exceeds max allowed."); + return false; + } } // Set up the dispatch information diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp index 491b72e802..3ad0098bab 100644 --- a/projects/clr/rocclr/device/rocm/rocvirtual.cpp +++ b/projects/clr/rocclr/device/rocm/rocvirtual.cpp @@ -3585,9 +3585,18 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, dispatchPacket.private_segment_size = devKernel->workGroupInfo()->privateMemSize_; if ((devKernel->workGroupInfo()->usedStackSize_ & 0x1) == 0x1) { - dispatchPacket.private_segment_size = std::min( - std::max(dev().StackSize(), dispatchPacket.private_segment_size), - Device::kMaxStackSize); + dispatchPacket.private_segment_size = std::max(dev().StackSize(), + dispatchPacket.private_segment_size); + // Fail the dispatch when the private segment size exceeds the device's max stack size. 
+ size_t maxStackSize = dev().MaxStackSize(); + if (dispatchPacket.private_segment_size > maxStackSize) { + ClPrint(amd::LOG_INFO, amd::LOG_KERN, + "Scratch size (%u) exceeds max allowed (%zu) for kernel : %s", + dispatchPacket.private_segment_size, maxStackSize, + gpuKernel.getDemangledName().c_str()); + LogError("Scratch size exceeds max allowed."); + return false; + } } // Pass the header accordingly