diff --git a/src/hip_module.cpp b/src/hip_module.cpp index d98b98a378..b3afdd4ffe 100644 --- a/src/hip_module.cpp +++ b/src/hip_module.cpp @@ -1326,17 +1326,18 @@ hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor( size_t numWavefronts = (blockSize + wavefrontSize - 1) / wavefrontSize; size_t availableVGPRs = (prop.regsPerBlock / wavefrontSize / simdPerCU); - size_t vgprs_alu_occupancy = simdPerCU * std::min(maxWavesPerSimd, availableVGPRs / usedVGPRS); + size_t vgprs_alu_occupancy = simdPerCU * (usedVGPRS == 0 ? maxWavesPerSimd + : std::min(maxWavesPerSimd, availableVGPRs / usedVGPRS)); // Calculate blocks occupancy per CU based on VGPR usage *numBlocks = vgprs_alu_occupancy / numWavefronts; const size_t availableSGPRs = (prop.gcnArch < 800) ? 512 : 800; - size_t sgprs_alu_occupancy = simdPerCU * ((usedSGPRS == 0) ? maxWavesPerSimd + size_t sgprs_alu_occupancy = simdPerCU * (usedSGPRS == 0 ? maxWavesPerSimd : std::min(maxWavesPerSimd, availableSGPRs / usedSGPRS)); // Calculate blocks occupancy per CU based on SGPR usage - *numBlocks = std::min(*numBlocks, (uint32_t) (sgprs_alu_occupancy / numWavefronts)); + *numBlocks = std::min(*numBlocks, (uint32_t) (sgprs_alu_occupancy / numWavefronts)); size_t total_used_lds = usedLDS + dynSharedMemPerBlk; if (total_used_lds != 0) {