From c3f55c8e59562cb217c3e01f9fa9f797b7d2c000 Mon Sep 17 00:00:00 2001
From: Longlong Yao <Longlong.Yao@amd.com>
Date: Mon, 5 Jan 2026 14:58:54 +0800
Subject: [PATCH] wsl/librocdxg: Change scratch memory allocation

Calculate the actual scratch memory size required based on the
packet information for kernel dispatch.

If the required size exceeds the total allocated memory, scratch
memory must be reallocated. Otherwise, no action is needed.

miopen_gtest: Full/GPU_MIOpenDriverRegressionTest_FP16.MIOpenDriverRegressionHalf/0

Signed-off-by: Longlong Yao <Longlong.Yao@amd.com>
Reviewed-by: Flora Cui <flora.cui@amd.com>
Reviewed-by: Horatio Zhang <Hongkun.Zhang@amd.com>
---
 wddm/queue.cpp | 108 ++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 89 insertions(+), 19 deletions(-)

diff --git a/wddm/queue.cpp b/wddm/queue.cpp
index ff74a8d086..c10a034705 100644
--- a/wddm/queue.cpp
+++ b/wddm/queue.cpp
@@ -252,9 +252,11 @@ ComputeQueue::ComputeQueue(WDDMDevice *device,
                platform_atomic_support_(false),
                signal_addr_(NULL),
                thread_stop_(false),
-               scratch_waves_(device->MaxScratchSlotsPerCu() * device->ComputeUnitCount()),
+               max_scratch_waves_(device->MaxScratchSlotsPerCu() * device->ComputeUnitCount()),
+               dispatch_waves_(0),
                scratch_size_per_wave_(0),
                scratch_size_(0),
+               total_scratch_size_(0),
                scratch_base_(nullptr) {
   bool ret = device->CreateQueue(this);
   assert(ret);
@@ -270,6 +272,11 @@ ComputeQueue::ComputeQueue(WDDMDevice *device,
 
   amd_queue_rocr_ = (amd_queue_v2_t*)((char*)ring_rptr - offsetof(amd_queue_v2_t, read_dispatch_id));
   aql_to_pm4_thread_ = std::thread(AqlToPm4Thread, this);
+
+  if (device->Major() >= 11)
+    scratch_mem_alignment_size_ = 256;
+  else
+    scratch_mem_alignment_size_ = 1024;
 }
 
 ComputeQueue::~ComputeQueue() {
@@ -411,22 +418,29 @@ void ComputeQueue::InitScratchSRD() {
   // then the effective size for a 64 lane wave is halved.
   amd_queue_->scratch_wave64_lane_byte_size = scratch_size_per_wave_ / 64;
 
+  uint64_t num_waves;
   if (device->Major() < 11) {
     COMPUTE_TMPRING_SIZE tmpring_size;
-    tmpring_size.bits.WAVESIZE = scratch_size_per_wave_ / 1024;
-    tmpring_size.bits.WAVES = scratch_waves_;
+    // Scratch Size per Wave is specified in terms of scratch_mem_alignment_size_
+    tmpring_size.bits.WAVESIZE = scratch_size_per_wave_ / scratch_mem_alignment_size_;
+    num_waves = scratch_size_ / scratch_size_per_wave_;
+    tmpring_size.bits.WAVES = std::min(num_waves, max_scratch_waves_);
 
     amd_queue_->compute_tmpring_size = tmpring_size.u32All;
   } else if (device->Major() == 11) {
     COMPUTE_TMPRING_SIZE_GFX11 tmpring_size;
-    tmpring_size.bits.WAVESIZE = scratch_size_per_wave_ >> 8;
-    tmpring_size.bits.WAVES = scratch_waves_ / device->NumShaderEngine();
+    tmpring_size.bits.WAVESIZE = scratch_size_per_wave_ / scratch_mem_alignment_size_;
+    // For GFX11 we specify number of waves per engine instead of total
+    num_waves = scratch_size_ / scratch_size_per_wave_ / device->NumShaderEngine();
+    tmpring_size.bits.WAVES = std::min(num_waves, max_scratch_waves_);
 
     amd_queue_->compute_tmpring_size = tmpring_size.u32All;
   } else {
     COMPUTE_TMPRING_SIZE_GFX12 tmpring_size = {};
-    tmpring_size.bits.WAVESIZE = scratch_size_per_wave_ >> 8;
-    tmpring_size.bits.WAVES = scratch_waves_ / device->NumShaderEngine();
+    tmpring_size.bits.WAVESIZE = scratch_size_per_wave_ / scratch_mem_alignment_size_;
+    // For GFX12 we specify number of waves per engine instead of total
+    num_waves = scratch_size_ / scratch_size_per_wave_ / device->NumShaderEngine();
+    tmpring_size.bits.WAVES = std::min(num_waves, max_scratch_waves_);
 
     amd_queue_->compute_tmpring_size = tmpring_size.u32All;
   }
@@ -434,22 +448,77 @@ void ComputeQueue::InitScratchSRD() {
   return;
 }
 
-bool ComputeQueue::UpdateScratch(uint32_t private_segment_size, bool wave32) {
-  const uint32_t wavefront = wave32 ? 32 : 64;
-  const uint32_t alignment = 1024 / wavefront;
-  private_segment_size = AlignUp(private_segment_size, alignment);
+uint64_t ComputeQueue::CalcDispatchGroups(hsa_kernel_dispatch_packet_t *packet)
+{
+  const uint64_t lanes_per_group =
+      (uint64_t(packet->workgroup_size_x) * packet->workgroup_size_y) * packet->workgroup_size_z;
 
-  uint32_t scratch_size_per_wave = private_segment_size * wavefront;
-  uint32_t scratch_size = scratch_size_per_wave * scratch_waves_;
+  uint64_t groups = ((uint64_t(packet->grid_size_x) + packet->workgroup_size_x - 1) /
+                      packet->workgroup_size_x) *
+                      ((uint64_t(packet->grid_size_y) + packet->workgroup_size_y - 1) /
+                      packet->workgroup_size_y) *
+                      ((uint64_t(packet->grid_size_z) + packet->workgroup_size_z - 1) /
+                      packet->workgroup_size_z);
+  const uint32_t cu_count = device->ComputeUnitCount();
+  const uint32_t engines = device->NumShaderEngine();
 
-  if (scratch_size_ >= scratch_size)
+  const uint32_t symmetric_cus = AlignDown(cu_count, engines);
+  const uint32_t asymmetryPerRound = cu_count - symmetric_cus;
+  const uint64_t rounds = groups / cu_count;
+  const uint64_t asymmetricGroups = rounds * asymmetryPerRound;
+  const uint64_t symmetricGroups = groups - asymmetricGroups;
+
+  uint64_t maxGroupsPerEngine =
+        ((symmetricGroups + engines - 1) / engines) + (asymmetryPerRound ? rounds : 0);
+
+  // For gfx10+ devices we must attempt to assign the smaller of 256 lanes or 16 groups to each
+  // engine.
+  if (device->Major() >= 10 &&
+      maxGroupsPerEngine < 16 &&
+      lanes_per_group * maxGroupsPerEngine < 256) {
+    uint64_t groups_per_interleave = (256 + lanes_per_group - 1) / lanes_per_group;
+    maxGroupsPerEngine = std::min(groups_per_interleave, uint64_t(16ul));
+  }
+
+  return maxGroupsPerEngine * engines;
+}
+
+uint64_t ComputeQueue::CalcDispatchWavesPerGroup(hsa_kernel_dispatch_packet_t *packet,
+                                                  bool wave32)
+{
+  const uint32_t lanes_per_wave = wave32 ? 32 : 64;
+
+  const uint64_t lanes_per_group =
+      (uint64_t(packet->workgroup_size_x) * packet->workgroup_size_y) * packet->workgroup_size_z;
+
+  return (lanes_per_group + lanes_per_wave - 1) / lanes_per_wave;
+}
+
+bool ComputeQueue::UpdateScratch(hsa_kernel_dispatch_packet_t *packet, bool wave32) {
+  const uint32_t lanes_per_wave = wave32 ? 32 : 64;
+  const uint64_t size_per_thread = AlignUp(packet->private_segment_size,
+                                  scratch_mem_alignment_size_ / lanes_per_wave);
+
+  scratch_size_per_wave_ = size_per_thread * lanes_per_wave;
+
+  uint64_t groups = CalcDispatchGroups(packet);
+  uint64_t waves_per_group = CalcDispatchWavesPerGroup(packet, wave32);
+
+  dispatch_waves_ = groups * waves_per_group;
+
+  const uint64_t max_scratch_size = scratch_size_per_wave_ * max_scratch_waves_;
+  const uint64_t dispatch_size = scratch_size_per_wave_ * dispatch_waves_;
+
+  scratch_size_ = std::min(dispatch_size, max_scratch_size);
+
+  if (total_scratch_size_ >= scratch_size_)
     return true;
 
   pr_debug("need realloc scratch buffer, size %x -> %x\n",
-           scratch_size_, scratch_size);
+           total_scratch_size_, scratch_size_);
 
   GpuMemoryCreateInfo create_info{};
-  create_info.size = scratch_size;
+  create_info.size = scratch_size_;
   create_info.domain = thunk_proxy::kLocal;
   GpuMemory *gpu_mem = nullptr;
   auto code = device->CreateGpuMemory(create_info, &gpu_mem);
@@ -461,8 +530,7 @@ bool ComputeQueue::UpdateScratch(uint32_t private_segment_size, bool wave32) {
     delete scratch_gpu_mem;
   }
 
-  scratch_size_per_wave_ = scratch_size_per_wave;
-  scratch_size_ = scratch_size;
+  total_scratch_size_ = scratch_size_;
   scratch_base_ = reinterpret_cast<void *>(gpu_mem->GpuAddress());
   scratch_mem_ = gpu_mem->GetGpuMemoryHandle();
 
@@ -647,7 +715,9 @@ ComputeQueue::KernelDispatchAqlToPm4(char *cpu, hsa_kernel_dispatch_packet_t *pa
 		     AMD_KERNEL_CODE_PROPERTIES_ENABLE_WAVEFRONT_SIZE32);
 
   assert(packet->private_segment_size >= kernel_object->workitem_private_segment_byte_size);
-  UpdateScratch(packet->private_segment_size, wave32);
+
+  if (packet->private_segment_size != 0)
+    UpdateScratch(packet, wave32);
 
   amd_signal_t *signal = (amd_signal_t *)packet->completion_signal.handle;