From 761653fa00fd4fb35ebd13bf5ef418c521e267ca Mon Sep 17 00:00:00 2001 From: Sean Keely Date: Fri, 22 Apr 2022 20:31:31 -0500 Subject: [PATCH] Handle scratch interleave per SE for gfx10+ On gfx10+ we need to issue a minimum count of active lanes or groups before ADC moves on. Ensure that scratch allocations attempt to reach this limit. Occupancy throttling due to OOM condition may still drop below this limit. Change-Id: I0edf2e40fbe1a95e9a262564cebd2b6a82501a0b [ROCm/ROCR-Runtime commit: 2eedf953f3cc9d79526cbab62ded127e2b48f8b5] --- .../runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp index 683fcd864a..55feab0c29 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp @@ -866,9 +866,17 @@ bool AqlQueue::DynamicScratchHandler(hsa_signal_value_t error_code, void* arg) { const uint64_t rounds = groups / cu_count; const uint64_t asymmetricGroups = rounds * asymmetryPerRound; const uint64_t symmetricGroups = groups - asymmetricGroups; - const uint64_t maxGroupsPerEngine = + uint64_t maxGroupsPerEngine = ((symmetricGroups + engines - 1) / engines) + (asymmetryPerRound ? rounds : 0); + // For gfx10+ devices we must attempt to assign the smaller of 256 lanes or 16 groups to each + // engine. + if (queue->agent_->isa()->GetMajorVersion() >= 10 && maxGroupsPerEngine < 16 && + lanes_per_group * maxGroupsPerEngine < 256) { + uint64_t groups_per_interleave = (256 + lanes_per_group - 1) / lanes_per_group; + maxGroupsPerEngine = Min(groups_per_interleave, 16ul); + } + // Populate all engines at max group occupancy, then clip down to device limits. groups = maxGroupsPerEngine * engines; scratch.wanted_slots = groups * waves_per_group;