From 761653fa00fd4fb35ebd13bf5ef418c521e267ca Mon Sep 17 00:00:00 2001
From: Sean Keely <Sean.Keely@amd.com>
Date: Fri, 22 Apr 2022 20:31:31 -0500
Subject: [PATCH] Handle scratch interleave per SE for gfx10+

On gfx10+ a minimum count of active lanes or groups must be issued
to each engine before ADC moves on.  Ensure that scratch allocations
attempt to reach this minimum.

Occupancy throttling due to an out-of-memory (OOM) condition may still
drop below this minimum.

Change-Id: I0edf2e40fbe1a95e9a262564cebd2b6a82501a0b


[ROCm/ROCR-Runtime commit: 2eedf953f3cc9d79526cbab62ded127e2b48f8b5]
---
 .../runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp
index 683fcd864a..55feab0c29 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp
+++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp
@@ -866,9 +866,17 @@ bool AqlQueue::DynamicScratchHandler(hsa_signal_value_t error_code, void* arg) {
       const uint64_t rounds = groups / cu_count;
       const uint64_t asymmetricGroups = rounds * asymmetryPerRound;
       const uint64_t symmetricGroups = groups - asymmetricGroups;
-      const uint64_t maxGroupsPerEngine =
+      uint64_t maxGroupsPerEngine =
           ((symmetricGroups + engines - 1) / engines) + (asymmetryPerRound ? rounds : 0);
 
+      // For gfx10+ devices we must attempt to assign the smaller of 256 lanes or 16 groups to each
+      // engine.
+      if (queue->agent_->isa()->GetMajorVersion() >= 10 && maxGroupsPerEngine < 16 &&
+          lanes_per_group * maxGroupsPerEngine < 256) {
+        uint64_t groups_per_interleave = (256 + lanes_per_group - 1) / lanes_per_group;
+        maxGroupsPerEngine = Min(groups_per_interleave, 16ul);
+      }
+
       // Populate all engines at max group occupancy, then clip down to device limits.
       groups = maxGroupsPerEngine * engines;
       scratch.wanted_slots = groups * waves_per_group;