SWDEV-521011 - Allow max stack size as per ISA. (#73)
此提交包含在:
+14
-1
@@ -739,6 +739,18 @@ bool Device::ValidateHsail() {
|
||||
return true;
|
||||
}
|
||||
|
||||
size_t GetMaxStackSize(const std::string& procName) {
|
||||
if (procName.find("gfx9") != std::string::npos || procName.find("gfx8")
|
||||
!= std::string::npos) {
|
||||
return kMaxStackSize9X;
|
||||
} else if (procName.find("gfx11") != std::string::npos || procName.find("gfx10")
|
||||
!= std::string::npos) {
|
||||
return kMaxStackSize11X;
|
||||
} else {
|
||||
return kMaxStackSize12X;
|
||||
}
|
||||
}
|
||||
|
||||
bool Device::create(const Isa &isa) {
|
||||
assert(!vaCacheAccess_ && !vaCacheMap_);
|
||||
isa_ = &isa;
|
||||
@@ -755,6 +767,7 @@ bool Device::create(const Isa &isa) {
|
||||
if (!amd::IS_HIP) {
|
||||
stack_size_ = 16 * Ki;
|
||||
}
|
||||
maxStackSize_ = GetMaxStackSize(isa_->processorName());
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -930,7 +943,7 @@ bool Device::disableP2P(amd::Device* ptrDev) {
|
||||
}
|
||||
|
||||
bool Device::UpdateStackSize(uint64_t stackSize) {
|
||||
if (stackSize > kMaxStackSize) {
|
||||
if (stackSize > maxStackSize_) {
|
||||
return false;
|
||||
}
|
||||
stack_size_ = amd::alignUp(stackSize, 16);
|
||||
|
||||
+16
-5
@@ -126,6 +126,16 @@ enum MemRangeAttribute : uint32_t {
|
||||
constexpr int CpuDeviceId = static_cast<int>(-1);
|
||||
constexpr int InvalidDeviceId = static_cast<int>(-2);
|
||||
|
||||
// Max scratch (stack) size is device dependent.
// Each limit is computed as:
//   ((largest value of a <scratch bits>-wide field) * 256 / <wave size>) - kCompilerRequired
constexpr size_t kWave32 = 32;
constexpr size_t kWave64 = 64;
constexpr size_t kScratchBits12X = 18;
constexpr size_t kScratchBits9X = 15;
// Amount subtracted from every limit; presumably space reserved for the
// compiler - TODO confirm.
constexpr size_t kCompilerRequired = 64;

// Evaluates the limit formula for a given field width and wave size. The
// arithmetic is carried out in size_t so that it does not depend on the width
// of int.
constexpr size_t ComputeMaxStackSize(size_t scratchBits, size_t waveSize) {
  return (((size_t{1} << scratchBits) - 1) * 256 / waveSize) - kCompilerRequired;
}

constexpr size_t kMaxStackSize12X = ComputeMaxStackSize(kScratchBits12X, kWave32);
// The 11X limit shares the 9X field width but is divided by the wave32 size.
constexpr size_t kMaxStackSize11X = ComputeMaxStackSize(kScratchBits9X, kWave32);
constexpr size_t kMaxStackSize9X = ComputeMaxStackSize(kScratchBits9X, kWave64);

static_assert(kMaxStackSize9X < kMaxStackSize11X && kMaxStackSize11X < kMaxStackSize12X,
              "kMaxStackSize9X must remain the smallest limit");
|
||||
|
||||
enum class ExternalSemaphoreHandleType : uint32_t {
|
||||
OpaqueFd = 1, // Handle is an opaque file descriptor
|
||||
OpaqueWin32 = 2, // Handle is an opaque shared NT handle
|
||||
@@ -1652,11 +1662,9 @@ class Device : public RuntimeObject {
|
||||
static constexpr size_t kMGInfoSizePerDevice = kMGSyncDataSize + sizeof(MGSyncInfo);
|
||||
static constexpr size_t kSGInfoSize = kMGSyncDataSize;
|
||||
|
||||
// Amount of space used by each wave is in units of 256 dwords.
|
||||
// As per COMPUTE_TMPRING_SIZE.WAVE_SIZE 24:12
|
||||
// The field size supports a range of 0->(2M-256) dwords per wave64.
|
||||
// Per lane this works out to 131056 bytes or 128K - 16
|
||||
static constexpr size_t kMaxStackSize = ((128 * Ki) - 16);
|
||||
// Max Scratch size is based on ISA and thus per device.
|
||||
// The default is the GFX9 limit, the smallest among the supported devices.
|
||||
size_t maxStackSize_ = kMaxStackSize9X;
|
||||
|
||||
typedef std::list<CommandQueue*> CommandQueues;
|
||||
|
||||
@@ -2131,6 +2139,9 @@ class Device : public RuntimeObject {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
//! Returns the maximum stack size allowed for the device
|
||||
size_t MaxStackSize() const { return maxStackSize_; }
|
||||
|
||||
#if defined(__clang__)
|
||||
#if __has_feature(address_sanitizer)
|
||||
virtual device::UriLocator* createUriLocator() const = 0;
|
||||
|
||||
@@ -2752,6 +2752,15 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
|
||||
if ((hsaKernel.workGroupInfo()->usedStackSize_ & 0x1) == 0x1) {
|
||||
privateMemSize = std::max<uint32_t>(static_cast<uint32_t>(device().StackSize()),
|
||||
hsaKernel.workGroupInfo()->scratchRegs_ * sizeof(uint32_t)) ;
|
||||
// Fail the submission if privateMemSize exceeds the max allowed for the device.
|
||||
size_t maxStackSize = device().MaxStackSize();
|
||||
if (privateMemSize > maxStackSize) {
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_KERN,
|
||||
"Scratch size (%zu) exceeds max allowed (%zu) for kernel : %s",
|
||||
privateMemSize, maxStackSize, hsaKernel.name().c_str());
|
||||
LogError("Scratch size exceeds max allowed.");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Set up the dispatch information
|
||||
|
||||
@@ -3585,9 +3585,18 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
|
||||
dispatchPacket.private_segment_size = devKernel->workGroupInfo()->privateMemSize_;
|
||||
|
||||
if ((devKernel->workGroupInfo()->usedStackSize_ & 0x1) == 0x1) {
|
||||
dispatchPacket.private_segment_size = std::min<uint64_t>(
|
||||
std::max<uint64_t>(dev().StackSize(), dispatchPacket.private_segment_size),
|
||||
Device::kMaxStackSize);
|
||||
dispatchPacket.private_segment_size = std::max<uint64_t>(dev().StackSize(),
|
||||
dispatchPacket.private_segment_size);
|
||||
// Fail the submission if the private segment size exceeds the max allowed for the device.
|
||||
size_t maxStackSize = dev().MaxStackSize();
|
||||
if (dispatchPacket.private_segment_size > maxStackSize) {
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_KERN,
|
||||
"Scratch size (%u) exceeds max allowed (%zu) for kernel : %s",
|
||||
dispatchPacket.private_segment_size, maxStackSize,
|
||||
gpuKernel.getDemangledName().c_str());
|
||||
LogError("Scratch size exceeds max allowed.");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Pass the header accordingly
|
||||
|
||||
新增問題並參考
封鎖使用者