From 53b5692d07cf3975df9f6efea3370c23b24da4d7 Mon Sep 17 00:00:00 2001 From: Graham Sider Date: Fri, 12 May 2023 14:31:48 -0400 Subject: [PATCH] Fix scratch allocation occupancy reduction loop If the required scratch allocation is too large, ROCr will attempt to reduce it by lowering the dispatch's targeted occupancy. The reduction loop, however, was prone to overflow if waves_per_cu was not a multiple of waves_per_group. Ensure no overflow by aligning waves_per_cu to waves_per_group. On GC 9.4.3 dGPU, dispatches with a large grid size and a waves_per_group of e.g. 16 may require reducing occupancy such that waves_per_cu is less than waves_per_group to ensure the allocation size is small enough. Allow this while also ensuring the tmpring scratch wave count is kept divisible by the number of SEs per XCC. Signed-off-by: Graham Sider Change-Id: Ie4016dcd8166a9ae69e9decc26a3eec882b49480 [ROCm/ROCR-Runtime commit: bd63e5045c363404c1e9cfd0250705d7d28bce65] --- .../hsa-runtime/core/runtime/amd_gpu_agent.cpp | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp index 2f8b9b1257..c695ba4116 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp @@ -1486,8 +1486,11 @@ void GpuAgent::AcquireQueueScratch(ScratchInfo& scratch) { debug_print("Failed to map requested scratch (%ld) - reducing queue occupancy.\n", scratch.size); const uint64_t num_cus = properties_.NumFComputeCores / properties_.NumSIMDPerCU; + const uint64_t se_per_xcc = properties_.NumShaderBanks / properties_.NumXcc; + const uint64_t total_waves = scratch.size / size_per_wave; - uint64_t waves_per_cu = total_waves / num_cus; + uint64_t waves_per_cu = AlignUp(total_waves / num_cus, scratch.waves_per_group); + while (waves_per_cu != 
0) { size_t size = waves_per_cu * num_cus * size_per_wave; void* base = scratch_pool_.alloc_high(size); @@ -1507,7 +1510,14 @@ void GpuAgent::AcquireQueueScratch(ScratchInfo& scratch) { return; } scratch_pool_.free(base); - waves_per_cu = waves_per_cu - scratch.waves_per_group; + + // Wave count must be divisible by #SEs in an XCC. If occupancy must be reduced + // such that waves_per_cu < waves_per_group, continue reducing by #SEs per XCC + // (only allowed if waves_per_group is a multiple of #SEs per XCC). + waves_per_cu -= (waves_per_cu <= scratch.waves_per_group && + se_per_xcc < scratch.waves_per_group && + scratch.waves_per_group % se_per_xcc == 0) ? + se_per_xcc : scratch.waves_per_group; } // Failed to allocate minimal scratch