Handle scratch interleave per SE for gfx10+

On gfx10+ we need to issue a minimum count of active lanes or groups before the ADC moves on. Ensure that scratch allocations attempt to reach this limit. Occupancy may still drop below this limit when it is throttled due to an OOM condition. Change-Id: I0edf2e40fbe1a95e9a262564cebd2b6a82501a0b [ROCm/ROCR-Runtime commit: 2eedf953f3]
2022-04-22 20:31:31 -05:00
@@ -866,9 +866,17 @@ bool AqlQueue::DynamicScratchHandler(hsa_signal_value_t error_code, void* arg) {
      const uint64_t rounds = groups / cu_count;
      const uint64_t asymmetricGroups = rounds * asymmetryPerRound;
      const uint64_t symmetricGroups = groups - asymmetricGroups;
-      const uint64_t maxGroupsPerEngine =
+      uint64_t maxGroupsPerEngine =
          ((symmetricGroups + engines - 1) / engines) + (asymmetryPerRound ? rounds : 0);

+      // For gfx10+ devices we must attempt to assign the smaller of 256 lanes or 16 groups to each
+      // engine.
+      if (queue->agent_->isa()->GetMajorVersion() >= 10 && maxGroupsPerEngine < 16 &&
+          lanes_per_group * maxGroupsPerEngine < 256) {
+        uint64_t groups_per_interleave = (256 + lanes_per_group - 1) / lanes_per_group;
+        maxGroupsPerEngine = Min(groups_per_interleave, 16ul);
+      }
+
      // Populate all engines at max group occupancy, then clip down to device limits.
      groups = maxGroupsPerEngine * engines;
      scratch.wanted_slots = groups * waves_per_group;