From db6de71b54bdbb283ef310f7f91ea5f542bb9a35 Mon Sep 17 00:00:00 2001
From: Joseph Greathouse <Joseph.Greathouse@amd.com>
Date: Wed, 27 May 2020 14:39:30 -0500
Subject: [PATCH] Fix occupancy calculation functions in ROCclr path

The hipOccupancyMaxPotentialBlockSize API is meant to return the
number of threads for the highest-occupancy workgroup, and the number
of those workgroups. It was previously calculating the number of
maximum-sized workgroups that would fit on a single CU, which is
a mixture of the API we wanted (to calculate the max potential block
size) and the hipOccupancyMaxActiveBlocksPerMultiprocessor function.

This patch fixes up the internal occupancy calculation function so
that it works for two uses. The first is the traditional calculation
of the maximum blocks per multiprocessor when a user passes in a fixed
block size (used for hipOccupancyMaxActiveBlocksPerMultiprocessor
style functions). The second is calculating the size of a block that
would lead to maximum occupancy, and how many blocks of that size
would be needed to fill the whole GPU (used for
hipOccupancyMaxPotentialBlockSize style functions).

This also updates the occupancy calculation function to prepare for
gfx10, which does not have SGPR-based occupancy limits.

Change-Id: Ie007b3f9d5ebc4e166b50a3a051498af35650f35


[ROCm/clr commit: 90453b68d3f2fb3d8fe26a05665f205ffe740da3]
---
 projects/clr/hipamd/rocclr/hip_module.cpp   |   7 +-
 projects/clr/hipamd/rocclr/hip_platform.cpp | 126 +++++++++++++-------
 projects/clr/hipamd/rocclr/hip_platform.hpp |   4 +-
 3 files changed, 89 insertions(+), 48 deletions(-)

diff --git a/projects/clr/hipamd/rocclr/hip_module.cpp b/projects/clr/hipamd/rocclr/hip_module.cpp
index 8ae9ad799c..8f3d4ca936 100755
--- a/projects/clr/hipamd/rocclr/hip_module.cpp
+++ b/projects/clr/hipamd/rocclr/hip_module.cpp
@@ -392,11 +392,12 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f,
       return hipErrorLaunchFailure;
     }
     int num_blocks = 0;
-    int num_grids = 0;
+    int max_blocks_per_grid = 0;
+    int best_block_size = 0;
     int block_size = blockDimX * blockDimY * blockDimZ;
     hip_impl::ihipOccupancyMaxActiveBlocksPerMultiprocessor(
-      &num_blocks, &num_grids, device, f, block_size, sharedMemBytes, true);
-    if (((gridDimX * gridDimY * gridDimZ) / block_size) > unsigned(num_grids)) {
+      &num_blocks, &max_blocks_per_grid, &best_block_size, device, f, block_size, sharedMemBytes, true);
+    if (((gridDimX * gridDimY * gridDimZ) / block_size) > unsigned(max_blocks_per_grid)) {
       return hipErrorCooperativeLaunchTooLarge;
     }
   }
diff --git a/projects/clr/hipamd/rocclr/hip_platform.cpp b/projects/clr/hipamd/rocclr/hip_platform.cpp
index 0be928f9a4..5f5d1e74c9 100755
--- a/projects/clr/hipamd/rocclr/hip_platform.cpp
+++ b/projects/clr/hipamd/rocclr/hip_platform.cpp
@@ -855,31 +855,37 @@ hipError_t ihipCreateGlobalVarObj(const char* name, hipModule_t hmod, amd::Memor
 
 namespace hip_impl {
 hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor(
-    int* numBlocks, int* numGrids,
-    const amd::Device& device, hipFunction_t func, int  blockSize,
+    int* maxBlocksPerCU, int* numBlocksPerGrid, int* bestBlockSize,
+    const amd::Device& device, hipFunction_t func, int inputBlockSize,
     size_t dynamicSMemSize, bool bCalcPotentialBlkSz)
 {
   hip::Function* function = hip::Function::asFunction(func);
   const amd::Kernel& kernel = *function->function_;
 
   const device::Kernel::WorkGroupInfo* wrkGrpInfo = kernel.getDeviceKernel(device)->workGroupInfo();
-  if (blockSize == 0) {
-    if (bCalcPotentialBlkSz == false){
+  if (bCalcPotentialBlkSz == false) {
+    if (inputBlockSize == 0) {
       return hipErrorInvalidValue;
     }
-    else {
-      blockSize = device.info().maxWorkGroupSize_; // maxwavefrontperblock
+    *bestBlockSize = 0;
+    // Make sure the requested block size is smaller than max supported
+    if (inputBlockSize > int(device.info().maxWorkGroupSize_)) {
+        *maxBlocksPerCU = 0;
+        *numBlocksPerGrid = 0;
+        return hipSuccess;
     }
   }
-
-  // Make sure the requested block size is smaller than max supported
-  if (blockSize > int(device.info().maxWorkGroupSize_)) {
-    numBlocks = 0;
-    numGrids = 0;
-    return hipSuccess;
+  else {
+    if (inputBlockSize > device.info().maxWorkGroupSize_ ||
+            inputBlockSize == 0) {
+      // The caller either allows a workgroup size larger than this
+      // hardware supports, or passed 0 to indicate no preference.
+      // In both cases, clamp the limit to the hardware's maximum
+      // workgroup size.
+      inputBlockSize = device.info().maxWorkGroupSize_;
+    }
   }
-
-  // Find threads accupancy per CU => simd_per_cu * GPR usage
+  // Find wave occupancy per CU => simd_per_cu * GPR usage
   constexpr size_t MaxWavesPerSimd = 8;  // Limited by SPI 32 per CU, hence 8 per SIMD
   size_t VgprWaves = MaxWavesPerSimd;
   if (wrkGrpInfo->usedVGPRs_ > 0) {
@@ -888,26 +894,49 @@ hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor(
 
   size_t GprWaves = VgprWaves;
   if (wrkGrpInfo->usedSGPRs_ > 0) {
-    const size_t maxSGPRs = (device.info().gfxipVersion_ < 800) ? 512 : 800;
-    size_t SgprWaves = maxSGPRs / amd::alignUp(wrkGrpInfo->usedSGPRs_, 16);
+    size_t maxSGPRs;
+    if (device.info().gfxipVersion_ < 800) {
+      maxSGPRs = 512;
+    }
+    else if (device.info().gfxipVersion_ < 1000) {
+      maxSGPRs = 800;
+    }
+    else {
+      maxSGPRs = SIZE_MAX; // gfx10+ does not share SGPRs between waves
+    }
+    const size_t SgprWaves = maxSGPRs / amd::alignUp(wrkGrpInfo->usedSGPRs_, 16);
     GprWaves = std::min(VgprWaves, SgprWaves);
   }
 
-  size_t alu_accupancy = device.info().simdPerCU_ * std::min(MaxWavesPerSimd, GprWaves);
-  alu_accupancy *= wrkGrpInfo->wavefrontSize_;
-  // Calculate blocks occupancy per CU
-  *numBlocks = alu_accupancy / amd::alignUp(blockSize, wrkGrpInfo->wavefrontSize_);
+  const size_t alu_occupancy = device.info().simdPerCU_ * std::min(MaxWavesPerSimd, GprWaves);
+  const int alu_limited_threads = alu_occupancy * wrkGrpInfo->wavefrontSize_;
 
-  size_t total_used_lds = wrkGrpInfo->usedLDSSize_ + dynamicSMemSize;
+  int lds_occupancy_wgs = INT_MAX;
+  const size_t total_used_lds = wrkGrpInfo->usedLDSSize_ + dynamicSMemSize;
   if (total_used_lds != 0) {
-    // Calculate LDS occupancy per CU. lds_per_cu / (static_lsd + dynamic_lds)
-    int lds_occupancy = static_cast<int>(device.info().localMemSize_ / total_used_lds);
-    *numBlocks = std::min(*numBlocks, lds_occupancy);
+    lds_occupancy_wgs = static_cast<int>(device.info().localMemSize_ / total_used_lds);
   }
+  // Calculate how many blocks of inputBlockSize we can fit per CU
+  // Need to align with hardware wavefront size. If they want 65 threads, but
+  // waves are 64, then we need 128 threads per block.
+  // So this calculates how many blocks we can fit.
+  *maxBlocksPerCU = alu_limited_threads / amd::alignUp(inputBlockSize, wrkGrpInfo->wavefrontSize_);
+  // Unless those blocks are further constrained by LDS size.
+  *maxBlocksPerCU = std::min(*maxBlocksPerCU, lds_occupancy_wgs);
 
-  if (bCalcPotentialBlkSz) {
-    *numGrids = *numBlocks * device.info().numRTCUs_;
-  }
+  // Some callers of this function want the block size, in threads, that leads
+  // to the maximum occupancy. In that case, inputBlockSize is the maximum
+  // workgroup size the user wants to allow, or that the hardware can allow.
+  // The best block size is the smaller of the number of threads that occupancy
+  // limits us to and that maximum block size (rounded up to whole wavefronts).
+  // e.g., if the user indicates the maximum block size is 64 threads, but we
+  // calculate that 128 threads can fit in each CU, we have to return 64.
+  *bestBlockSize = std::min(alu_limited_threads, amd::alignUp(inputBlockSize, wrkGrpInfo->wavefrontSize_));
+  // If the best block size is smaller than the block size used to fit the maximum,
+  // then we need to make the grid bigger for full occupancy.
+  const int bestBlocksPerCU = alu_limited_threads / (*bestBlockSize);
+  // Unless those blocks are further constrained by LDS size.
+  *numBlocksPerGrid = device.info().maxComputeUnits_ * std::min(bestBlocksPerCU, lds_occupancy_wgs);
 
   return hipSuccess;
 }
@@ -927,13 +956,14 @@ hipError_t hipOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize,
     return HIP_RETURN(hipErrorInvalidValue);
   }
   const amd::Device& device = *hip::getCurrentDevice()->devices()[0];
-  int num_grids = 0;
+  int max_blocks_per_grid = 0;
   int num_blocks = 0;
+  int best_block_size = 0;
   hipError_t ret = hip_impl::ihipOccupancyMaxActiveBlocksPerMultiprocessor(
-    &num_blocks, &num_grids, device, func, 0, dynSharedMemPerBlk,true);
+    &num_blocks, &max_blocks_per_grid, &best_block_size, device, func, blockSizeLimit, dynSharedMemPerBlk,true);
   if (ret == hipSuccess) {
-    *blockSize = num_blocks;
-    *gridSize = num_grids;
+    *blockSize = best_block_size;
+    *gridSize = max_blocks_per_grid;
   }
   HIP_RETURN(ret);
 }
@@ -947,13 +977,14 @@ hipError_t hipModuleOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize
     return HIP_RETURN(hipErrorInvalidValue);
   }
   const amd::Device& device = *hip::getCurrentDevice()->devices()[0];
-  int num_grids = 0;
+  int max_blocks_per_grid = 0;
   int num_blocks = 0;
+  int best_block_size = 0;
   hipError_t ret = hip_impl::ihipOccupancyMaxActiveBlocksPerMultiprocessor(
-    &num_blocks, &num_grids, device, f, 0, dynSharedMemPerBlk,true);
+    &num_blocks, &max_blocks_per_grid, &best_block_size, device, f, blockSizeLimit, dynSharedMemPerBlk,true);
   if (ret == hipSuccess) {
-    *blockSize = num_blocks;
-    *gridSize = num_grids;
+    *blockSize = best_block_size;
+    *gridSize = max_blocks_per_grid;
   }
   HIP_RETURN(ret);
 }
@@ -967,13 +998,14 @@ hipError_t hipModuleOccupancyMaxPotentialBlockSizeWithFlags(int* gridSize, int*
     return HIP_RETURN(hipErrorInvalidValue);
   }
   const amd::Device& device = *hip::getCurrentDevice()->devices()[0];
-  int num_grids = 0;
+  int max_blocks_per_grid = 0;
   int num_blocks = 0;
+  int best_block_size = 0;
   hipError_t ret = hip_impl::ihipOccupancyMaxActiveBlocksPerMultiprocessor(
-    &num_blocks, &num_grids, device, f, 0, dynSharedMemPerBlk,true);
+    &num_blocks, &max_blocks_per_grid, &best_block_size, device, f, blockSizeLimit, dynSharedMemPerBlk,true);
   if (ret == hipSuccess) {
-    *blockSize = num_blocks;
-    *gridSize = num_grids;
+    *blockSize = best_block_size;
+    *gridSize = max_blocks_per_grid;
   }
   HIP_RETURN(ret);
 }
@@ -988,8 +1020,10 @@ hipError_t hipModuleOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks,
   const amd::Device& device = *hip::getCurrentDevice()->devices()[0];
 
   int num_blocks = 0;
+  int max_blocks_per_grid = 0;
+  int best_block_size = 0;
   hipError_t ret = hip_impl::ihipOccupancyMaxActiveBlocksPerMultiprocessor(
-    &num_blocks, nullptr, device, f, blockSize, dynSharedMemPerBlk, false);
+    &num_blocks, &max_blocks_per_grid, &best_block_size, device, f, blockSize, dynSharedMemPerBlk, false);
   *numBlocks = num_blocks;
   HIP_RETURN(ret);
 }
@@ -1005,8 +1039,10 @@ hipError_t hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int* numB
   const amd::Device& device = *hip::getCurrentDevice()->devices()[0];
 
   int num_blocks = 0;
+  int max_blocks_per_grid = 0;
+  int best_block_size = 0;
   hipError_t ret = hip_impl::ihipOccupancyMaxActiveBlocksPerMultiprocessor(
-    &num_blocks, nullptr, device, f, blockSize, dynSharedMemPerBlk, false);
+    &num_blocks, &max_blocks_per_grid, &best_block_size, device, f, blockSize, dynSharedMemPerBlk, false);
   *numBlocks = num_blocks;
   HIP_RETURN(ret);
 }
@@ -1027,8 +1063,10 @@ hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks,
   const amd::Device& device = *hip::getCurrentDevice()->devices()[0];
 
   int num_blocks = 0;
+  int max_blocks_per_grid = 0;
+  int best_block_size = 0;
   hipError_t ret = hip_impl::ihipOccupancyMaxActiveBlocksPerMultiprocessor(
-    &num_blocks, nullptr, device, func, blockSize, dynamicSMemSize, false);
+    &num_blocks, &max_blocks_per_grid, &best_block_size, device, func, blockSize, dynamicSMemSize, false);
   *numBlocks = num_blocks;
   HIP_RETURN(ret);
 }
@@ -1050,8 +1088,10 @@ hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int* numBlocks,
   const amd::Device& device = *hip::getCurrentDevice()->devices()[0];
 
   int num_blocks = 0;
+  int max_blocks_per_grid = 0;
+  int best_block_size = 0;
   hipError_t ret = hip_impl::ihipOccupancyMaxActiveBlocksPerMultiprocessor(
-    &num_blocks, nullptr, device, func, blockSize, dynamicSMemSize, false);
+    &num_blocks, &max_blocks_per_grid, &best_block_size, device, func, blockSize, dynamicSMemSize, false);
   *numBlocks = num_blocks;
   HIP_RETURN(ret);
 }
diff --git a/projects/clr/hipamd/rocclr/hip_platform.hpp b/projects/clr/hipamd/rocclr/hip_platform.hpp
index 8e5eaa191f..fcbfb53bbb 100644
--- a/projects/clr/hipamd/rocclr/hip_platform.hpp
+++ b/projects/clr/hipamd/rocclr/hip_platform.hpp
@@ -23,7 +23,7 @@
 
 namespace hip_impl {
 hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor(
-    int* numBlocks, int* numGrids,
+    int* maxBlocksPerCU, int* numBlocksPerGrid, int* bestBlockSize,
     const amd::Device& device, hipFunction_t func, int  blockSize,
     size_t dynamicSMemSize, bool bCalcPotentialBlkSz);
-}
\ No newline at end of file
+}