diff --git a/wddm/queue.cpp b/wddm/queue.cpp index ff74a8d086..c10a034705 100644 --- a/wddm/queue.cpp +++ b/wddm/queue.cpp @@ -252,9 +252,11 @@ ComputeQueue::ComputeQueue(WDDMDevice *device, platform_atomic_support_(false), signal_addr_(NULL), thread_stop_(false), - scratch_waves_(device->MaxScratchSlotsPerCu() * device->ComputeUnitCount()), + max_scratch_waves_(device->MaxScratchSlotsPerCu() * device->ComputeUnitCount()), + dispatch_waves_(0), scratch_size_per_wave_(0), scratch_size_(0), + total_scratch_size_(0), scratch_base_(nullptr) { bool ret = device->CreateQueue(this); assert(ret); @@ -270,6 +272,11 @@ ComputeQueue::ComputeQueue(WDDMDevice *device, amd_queue_rocr_ = (amd_queue_v2_t*)((char*)ring_rptr - offsetof(amd_queue_v2_t, read_dispatch_id)); aql_to_pm4_thread_ = std::thread(AqlToPm4Thread, this); + + if (device->Major() >= 11) + scratch_mem_alignment_size_ = 256; + else + scratch_mem_alignment_size_ = 1024; } ComputeQueue::~ComputeQueue() { @@ -411,22 +418,29 @@ void ComputeQueue::InitScratchSRD() { // then the effective size for a 64 lane wave is halved. amd_queue_->scratch_wave64_lane_byte_size = scratch_size_per_wave_ / 64; + uint64_t num_waves; if (device->Major() < 11) { COMPUTE_TMPRING_SIZE tmpring_size; - tmpring_size.bits.WAVESIZE = scratch_size_per_wave_ / 1024; - tmpring_size.bits.WAVES = scratch_waves_; + // Scratch Size per Wave is specified in terms of scratch_mem_alignment_size_ + tmpring_size.bits.WAVESIZE = scratch_size_per_wave_ / scratch_mem_alignment_size_; + num_waves = scratch_size_ / scratch_size_per_wave_; + tmpring_size.bits.WAVES = std::min(num_waves, max_scratch_waves_); amd_queue_->compute_tmpring_size = tmpring_size.u32All; } else if (device->Major() == 11) { COMPUTE_TMPRING_SIZE_GFX11 tmpring_size; - tmpring_size.bits.WAVESIZE = scratch_size_per_wave_ >> 8; - tmpring_size.bits.WAVES = scratch_waves_ / device->NumShaderEngine(); + tmpring_size.bits.WAVESIZE = scratch_size_per_wave_ / scratch_mem_alignment_size_; + // For GFX11 we specify number of waves per engine instead of total + num_waves = scratch_size_ / scratch_size_per_wave_ / device->NumShaderEngine(); + tmpring_size.bits.WAVES = std::min(num_waves, max_scratch_waves_); amd_queue_->compute_tmpring_size = tmpring_size.u32All; } else { COMPUTE_TMPRING_SIZE_GFX12 tmpring_size = {}; - tmpring_size.bits.WAVESIZE = scratch_size_per_wave_ >> 8; - tmpring_size.bits.WAVES = scratch_waves_ / device->NumShaderEngine(); + tmpring_size.bits.WAVESIZE = scratch_size_per_wave_ / scratch_mem_alignment_size_; + // For GFX12 we specify number of waves per engine instead of total + num_waves = scratch_size_ / scratch_size_per_wave_ / device->NumShaderEngine(); + tmpring_size.bits.WAVES = std::min(num_waves, max_scratch_waves_); amd_queue_->compute_tmpring_size = tmpring_size.u32All; } @@ -434,22 +448,77 @@ void ComputeQueue::InitScratchSRD() { return; } -bool ComputeQueue::UpdateScratch(uint32_t private_segment_size, bool wave32) { - const uint32_t wavefront = wave32 ? 32 : 64; - const uint32_t alignment = 1024 / wavefront; - private_segment_size = AlignUp(private_segment_size, alignment); +uint64_t ComputeQueue::CalcDispatchGroups(hsa_kernel_dispatch_packet_t *packet) +{ + const uint64_t lanes_per_group = + (uint64_t(packet->workgroup_size_x) * packet->workgroup_size_y) * packet->workgroup_size_z; - uint32_t scratch_size_per_wave = private_segment_size * wavefront; - uint32_t scratch_size = scratch_size_per_wave * scratch_waves_; + uint64_t groups = ((uint64_t(packet->grid_size_x) + packet->workgroup_size_x - 1) / + packet->workgroup_size_x) * + ((uint64_t(packet->grid_size_y) + packet->workgroup_size_y - 1) / + packet->workgroup_size_y) * + ((uint64_t(packet->grid_size_z) + packet->workgroup_size_z - 1) / + packet->workgroup_size_z); + const uint32_t cu_count = device->ComputeUnitCount(); + const uint32_t engines = device->NumShaderEngine(); - if (scratch_size_ >= scratch_size) + const uint32_t symmetric_cus = AlignDown(cu_count, engines); + const uint32_t asymmetryPerRound = cu_count - symmetric_cus; + const uint64_t rounds = groups / cu_count; + const uint64_t asymmetricGroups = rounds * asymmetryPerRound; + const uint64_t symmetricGroups = groups - asymmetricGroups; + + uint64_t maxGroupsPerEngine = + ((symmetricGroups + engines - 1) / engines) + (asymmetryPerRound ? rounds : 0); + + // For gfx10+ devices we must attempt to assign the smaller of 256 lanes or 16 groups to each + // engine. + if (device->Major() >= 10 && + maxGroupsPerEngine < 16 && + lanes_per_group * maxGroupsPerEngine < 256) { + uint64_t groups_per_interleave = (256 + lanes_per_group - 1) / lanes_per_group; + maxGroupsPerEngine = std::min(groups_per_interleave, uint64_t(16ul)); + } + + return maxGroupsPerEngine * engines; +} + +uint64_t ComputeQueue::CalcDispatchWavesPerGroup(hsa_kernel_dispatch_packet_t *packet, + bool wave32) +{ + const uint32_t lanes_per_wave = wave32 ? 32 : 64; + + const uint64_t lanes_per_group = + (uint64_t(packet->workgroup_size_x) * packet->workgroup_size_y) * packet->workgroup_size_z; + + return (lanes_per_group + lanes_per_wave - 1) / lanes_per_wave; +} + +bool ComputeQueue::UpdateScratch(hsa_kernel_dispatch_packet_t *packet, bool wave32) { + const uint32_t lanes_per_wave = wave32 ? 32 : 64; + const uint64_t size_per_thread = AlignUp(packet->private_segment_size, + scratch_mem_alignment_size_ / lanes_per_wave); + + scratch_size_per_wave_ = size_per_thread * lanes_per_wave; + + uint64_t groups = CalcDispatchGroups(packet); + uint64_t waves_per_group = CalcDispatchWavesPerGroup(packet, wave32); + + dispatch_waves_ = groups * waves_per_group; + + const uint64_t max_scratch_size = scratch_size_per_wave_ * max_scratch_waves_; + const uint64_t dispatch_size = scratch_size_per_wave_ * dispatch_waves_; + + scratch_size_ = std::min(dispatch_size, max_scratch_size); + + if (total_scratch_size_ >= scratch_size_) return true; pr_debug("need realloc scratch buffer, size %x -> %x\n", - scratch_size_, scratch_size); + total_scratch_size_, scratch_size_); GpuMemoryCreateInfo create_info{}; - create_info.size = scratch_size; + create_info.size = scratch_size_; create_info.domain = thunk_proxy::kLocal; GpuMemory *gpu_mem = nullptr; auto code = device->CreateGpuMemory(create_info, &gpu_mem); @@ -461,8 +530,7 @@ bool ComputeQueue::UpdateScratch(uint32_t private_segment_size, bool wave32) { delete scratch_gpu_mem; } - scratch_size_per_wave_ = scratch_size_per_wave; - scratch_size_ = scratch_size; + total_scratch_size_ = scratch_size_; scratch_base_ = reinterpret_cast(gpu_mem->GpuAddress()); scratch_mem_ = gpu_mem->GetGpuMemoryHandle(); @@ -647,7 +715,9 @@ ComputeQueue::KernelDispatchAqlToPm4(char *cpu, hsa_kernel_dispatch_packet_t *pa AMD_KERNEL_CODE_PROPERTIES_ENABLE_WAVEFRONT_SIZE32); assert(packet->private_segment_size >= kernel_object->workitem_private_segment_byte_size); - UpdateScratch(packet->private_segment_size, wave32); + + if (packet->private_segment_size != 0) + UpdateScratch(packet, wave32); amd_signal_t *signal = (amd_signal_t *)packet->completion_signal.handle;