diff --git a/projects/hip/api/hip/hip_platform.cpp b/projects/hip/api/hip/hip_platform.cpp index d284570322..1b8a7b5c96 100644 --- a/projects/hip/api/hip/hip_platform.cpp +++ b/projects/hip/api/hip/hip_platform.cpp @@ -492,9 +492,19 @@ hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks, // Find threads accupancy per CU => simd_per_cu * GPR usage constexpr size_t MaxWavesPerSimd = 8; // Limited by SPI 32 per CU, hence 8 per SIMD - size_t alu_accupancy = device->info().simdPerCU_ * - std::min(MaxWavesPerSimd, (wrkGrpInfo->availableVGPRs_ / amd::alignUp(wrkGrpInfo->usedVGPRs_, 4))); + size_t VgprWaves = wrkGrpInfo->availableVGPRs_ / amd::alignUp(wrkGrpInfo->usedVGPRs_, 4); + size_t GprWaves; + if (wrkGrpInfo->usedSGPRs_ > 0) { + const size_t maxSGPRs = (device->info().gfxipVersion_ < 800) ? 512 : 800; + size_t SgprWaves = maxSGPRs / amd::alignUp(wrkGrpInfo->usedSGPRs_, 16); + GprWaves = std::min(VgprWaves, SgprWaves); + } + else { + GprWaves = VgprWaves; + } + + size_t alu_accupancy = device->info().simdPerCU_ * std::min(MaxWavesPerSimd, GprWaves); alu_accupancy *= wrkGrpInfo->wavefrontSize_; // Calculate blocks occupancy per CU *numBlocks = alu_accupancy / amd::alignUp(blockSize, wrkGrpInfo->wavefrontSize_);