diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp index 683fcd864a..55feab0c29 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp @@ -866,9 +866,17 @@ bool AqlQueue::DynamicScratchHandler(hsa_signal_value_t error_code, void* arg) { const uint64_t rounds = groups / cu_count; const uint64_t asymmetricGroups = rounds * asymmetryPerRound; const uint64_t symmetricGroups = groups - asymmetricGroups; - const uint64_t maxGroupsPerEngine = + uint64_t maxGroupsPerEngine = ((symmetricGroups + engines - 1) / engines) + (asymmetryPerRound ? rounds : 0); + // For gfx10+ devices we must attempt to assign the smaller of 256 lanes or 16 groups to each + // engine. + if (queue->agent_->isa()->GetMajorVersion() >= 10 && maxGroupsPerEngine < 16 && + lanes_per_group * maxGroupsPerEngine < 256) { + uint64_t groups_per_interleave = (256 + lanes_per_group - 1) / lanes_per_group; + maxGroupsPerEngine = Min(groups_per_interleave, 16ul); + } + // Populate all engines at max group occupancy, then clip down to device limits. groups = maxGroupsPerEngine * engines; scratch.wanted_slots = groups * waves_per_group;