From 552dcead937e565e0fbc86d477bebbe1476bfc7c Mon Sep 17 00:00:00 2001
From: Sean Keely <Sean.Keely@amd.com>
Date: Mon, 28 Feb 2022 21:03:49 -0600
Subject: [PATCH] Correct scratch allocation logic to account for asymmetric
 harvest.

With asym. harvest hw does not issue groups equally to each SE,
occasionally hw will skip an SE so that the distribution reflects
each SE's CU count.  Scratch resources must be allocated to reflect
this asymmetric distribution of groups.

Change-Id: I65e26206500483ea18e6e8796e65ecba5354b029
---
 .../core/runtime/amd_aql_queue.cpp            | 22 +++++++++++++++----
 runtime/hsa-runtime/core/util/utils.h         |  3 +--
 2 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp b/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp
index 6b89400f46..e66ac355a1 100644
--- a/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp
+++ b/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp
@@ -831,8 +831,11 @@ bool AqlQueue::DynamicScratchHandler(hsa_signal_value_t error_code, void* arg) {
       assert((scratch_request != 0) &&
              "Scratch memory request from packet with no scratch demand.  Possible bad kernel code object.");
 
+      // Get the hw maximum scratch slot count taking into consideration asymmetric harvest.
+      const uint32_t engines = queue->agent_->properties().NumShaderBanks;
+      const uint32_t cu_count = queue->amd_queue_.max_cu_id + 1;
       const uint32_t MaxScratchSlots =
-          (queue->amd_queue_.max_cu_id + 1) * queue->agent_->properties().MaxSlotsScratchCU;
+          AlignUp(cu_count, engines) * queue->agent_->properties().MaxSlotsScratchCU;
 
       scratch.size_per_thread = scratch_request;
       scratch.lanes_per_wave = (error_code & 0x400) ? 32 : 64;
@@ -840,6 +843,9 @@ bool AqlQueue::DynamicScratchHandler(hsa_signal_value_t error_code, void* arg) {
       scratch.size_per_thread = AlignUp(scratch.size_per_thread, 1024 / scratch.lanes_per_wave);
       scratch.size = scratch.size_per_thread * MaxScratchSlots * scratch.lanes_per_wave;
 
+      // Smaller dispatches may not need to reach full device occupancy.
+      // For these we need to ensure that the scratch we give doesn't restrict the dispatch even
+      // though it does not fill the device. Figure the total requested dispatch size.
       uint64_t lanes_per_group =
           (uint64_t(pkt.dispatch.workgroup_size_x) * pkt.dispatch.workgroup_size_y) *
           pkt.dispatch.workgroup_size_z;
@@ -854,9 +860,17 @@ bool AqlQueue::DynamicScratchHandler(hsa_signal_value_t error_code, void* arg) {
           ((uint64_t(pkt.dispatch.grid_size_z) + pkt.dispatch.workgroup_size_z - 1) /
            pkt.dispatch.workgroup_size_z);
 
-      // Assign an equal number of groups to each engine, clipping to capacity limits
-      const uint32_t engines = queue->agent_->properties().NumShaderBanks;
-      groups = ((groups + engines - 1) / engines) * engines;
+      // Find the maximum number of groups assigned to any engine.
+      const uint32_t symmetric_cus = AlignDown(cu_count, engines);
+      const uint32_t asymmetryPerRound = cu_count - symmetric_cus;
+      const uint64_t rounds = groups / cu_count;
+      const uint64_t asymmetricGroups = rounds * asymmetryPerRound;
+      const uint64_t symmetricGroups = groups - asymmetricGroups;
+      const uint64_t maxGroupsPerEngine =
+          ((symmetricGroups + engines - 1) / engines) + (asymmetryPerRound ? rounds : 0);
+
+      // Populate all engines at max group occupancy, then clip down to device limits.
+      groups = maxGroupsPerEngine * engines;
       scratch.wanted_slots = groups * waves_per_group;
       scratch.wanted_slots = Min(scratch.wanted_slots, uint64_t(MaxScratchSlots));
       scratch.dispatch_size =
diff --git a/runtime/hsa-runtime/core/util/utils.h b/runtime/hsa-runtime/core/util/utils.h
index b1901b6520..ab536ba796 100644
--- a/runtime/hsa-runtime/core/util/utils.h
+++ b/runtime/hsa-runtime/core/util/utils.h
@@ -237,8 +237,7 @@ static __forceinline bool IsPowerOfTwo(T val) {
 /// @return: T.
 template <typename T>
 static __forceinline T AlignDown(T value, size_t alignment) {
-  assert(IsPowerOfTwo(alignment));
-  return (T)(value & ~(alignment - 1));
+  return (T)((value / alignment) * alignment);
 }
 
 /// @brief: Same as previous one, but first parameter becomes pointer, for more