diff --git a/projects/clr/hipamd/rocclr/hip_module.cpp b/projects/clr/hipamd/rocclr/hip_module.cpp
index 8ae9ad799c..8f3d4ca936 100755
--- a/projects/clr/hipamd/rocclr/hip_module.cpp
+++ b/projects/clr/hipamd/rocclr/hip_module.cpp
@@ -392,11 +392,12 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f,
       return hipErrorLaunchFailure;
     }
     int num_blocks = 0;
-    int num_grids = 0;
+    int max_blocks_per_grid = 0;
+    int best_block_size = 0;
     int block_size = blockDimX * blockDimY * blockDimZ;
     hip_impl::ihipOccupancyMaxActiveBlocksPerMultiprocessor(
-      &num_blocks, &num_grids, device, f, block_size, sharedMemBytes, true);
-    if (((gridDimX * gridDimY * gridDimZ) / block_size) > unsigned(num_grids)) {
+      &num_blocks, &max_blocks_per_grid, &best_block_size, device, f, block_size, sharedMemBytes, true);
+    if (((gridDimX * gridDimY * gridDimZ) / block_size) > unsigned(max_blocks_per_grid)) {
       return hipErrorCooperativeLaunchTooLarge;
     }
   }
diff --git a/projects/clr/hipamd/rocclr/hip_platform.cpp b/projects/clr/hipamd/rocclr/hip_platform.cpp
index 0be928f9a4..5f5d1e74c9 100755
--- a/projects/clr/hipamd/rocclr/hip_platform.cpp
+++ b/projects/clr/hipamd/rocclr/hip_platform.cpp
@@ -855,31 +855,37 @@ hipError_t ihipCreateGlobalVarObj(const char* name, hipModule_t hmod, amd::Memor
 
 namespace hip_impl {
 hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor(
-    int* numBlocks, int* numGrids,
-    const amd::Device& device, hipFunction_t func, int  blockSize,
+    int* maxBlocksPerCU, int* numBlocksPerGrid, int* bestBlockSize,
+    const amd::Device& device, hipFunction_t func, int inputBlockSize,
     size_t dynamicSMemSize, bool bCalcPotentialBlkSz)
 {
   hip::Function* function = hip::Function::asFunction(func);
   const amd::Kernel& kernel = *function->function_;
 
   const device::Kernel::WorkGroupInfo* wrkGrpInfo = kernel.getDeviceKernel(device)->workGroupInfo();
-  if (blockSize == 0) {
-    if (bCalcPotentialBlkSz == false){
+  if (bCalcPotentialBlkSz == false) {
+    if (inputBlockSize == 0) {
       return hipErrorInvalidValue;
     }
-    else {
-      blockSize = device.info().maxWorkGroupSize_; // maxwavefrontperblock
+    *bestBlockSize = 0;
+    // Make sure the requested block size is smaller than max supported
+    if (inputBlockSize > int(device.info().maxWorkGroupSize_)) {
+        *maxBlocksPerCU = 0;
+        *numBlocksPerGrid = 0;
+        return hipSuccess;
     }
   }
-
-  // Make sure the requested block size is smaller than max supported
-  if (blockSize > int(device.info().maxWorkGroupSize_)) {
-    numBlocks = 0;
-    numGrids = 0;
-    return hipSuccess;
+  else {
+    if (inputBlockSize > device.info().maxWorkGroupSize_ ||
+            inputBlockSize == 0) {
+      // The user wrote the kernel to work with a workgroup size
+      // bigger than this hardware can support. Or they do not care
+      // about the size So just assume its maximum size is
+      // constrained by hardware
+      inputBlockSize = device.info().maxWorkGroupSize_;
+    }
   }
-
-  // Find threads accupancy per CU => simd_per_cu * GPR usage
+  // Find wave occupancy per CU => simd_per_cu * GPR usage
   constexpr size_t MaxWavesPerSimd = 8;  // Limited by SPI 32 per CU, hence 8 per SIMD
   size_t VgprWaves = MaxWavesPerSimd;
   if (wrkGrpInfo->usedVGPRs_ > 0) {
@@ -888,26 +894,49 @@ hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor(
 
   size_t GprWaves = VgprWaves;
   if (wrkGrpInfo->usedSGPRs_ > 0) {
-    const size_t maxSGPRs = (device.info().gfxipVersion_ < 800) ? 512 : 800;
-    size_t SgprWaves = maxSGPRs / amd::alignUp(wrkGrpInfo->usedSGPRs_, 16);
+    size_t maxSGPRs;
+    if (device.info().gfxipVersion_ < 800) {
+      maxSGPRs = 512;
+    }
+    else if (device.info().gfxipVersion_ < 1000) {
+      maxSGPRs = 800;
+    }
+    else {
+      maxSGPRs = SIZE_MAX; // gfx10+ does not share SGPRs between waves
+    }
+    const size_t SgprWaves = maxSGPRs / amd::alignUp(wrkGrpInfo->usedSGPRs_, 16);
     GprWaves = std::min(VgprWaves, SgprWaves);
   }
 
-  size_t alu_accupancy = device.info().simdPerCU_ * std::min(MaxWavesPerSimd, GprWaves);
-  alu_accupancy *= wrkGrpInfo->wavefrontSize_;
-  // Calculate blocks occupancy per CU
-  *numBlocks = alu_accupancy / amd::alignUp(blockSize, wrkGrpInfo->wavefrontSize_);
+  const size_t alu_occupancy = device.info().simdPerCU_ * std::min(MaxWavesPerSimd, GprWaves);
+  const int alu_limited_threads = alu_occupancy * wrkGrpInfo->wavefrontSize_;
 
-  size_t total_used_lds = wrkGrpInfo->usedLDSSize_ + dynamicSMemSize;
+  int lds_occupancy_wgs = INT_MAX;
+  const size_t total_used_lds = wrkGrpInfo->usedLDSSize_ + dynamicSMemSize;
   if (total_used_lds != 0) {
-    // Calculate LDS occupancy per CU. lds_per_cu / (static_lsd + dynamic_lds)
-    int lds_occupancy = static_cast<int>(device.info().localMemSize_ / total_used_lds);
-    *numBlocks = std::min(*numBlocks, lds_occupancy);
+    lds_occupancy_wgs = static_cast<int>(device.info().localMemSize_ / total_used_lds);
   }
+  // Calculate how many blocks of inputBlockSize we can fit per CU
+  // Need to align with hardware wavefront size. If they want 65 threads, but
+  // waves are 64, then we need 128 threads per block.
+  // So this calculates how many blocks we can fit.
+  *maxBlocksPerCU = alu_limited_threads / amd::alignUp(inputBlockSize, wrkGrpInfo->wavefrontSize_);
+  // Unless those blocks are further constrained by LDS size.
+  *maxBlocksPerCU = std::min(*maxBlocksPerCU, lds_occupancy_wgs);
 
-  if (bCalcPotentialBlkSz) {
-    *numGrids = *numBlocks * device.info().numRTCUs_;
-  }
+  // Some callers of this function want to return the block size, in threads, that
+  // leads to the maximum occupancy. In that case, inputBlockSize is the maximum
+  // workgroup size the user wants to allow, or that the hardware can allow.
+  // It is either the number of threads that we are limited to due to occupancy, or
+  // the maximum available block size for this kernel, which could have come from the
+  // user. e.g., if the user indicates the maximum block size is 64 threads, but we
+  // calculate that 128 threads can fit in each CU, we have to give up and return 64.
+  *bestBlockSize = std::min(alu_limited_threads, amd::alignUp(inputBlockSize, wrkGrpInfo->wavefrontSize_));
+  // If the best block size is smaller than the block size used to fit the maximum,
+  // then we need to make the grid bigger for full occupancy.
+  const int bestBlocksPerCU = alu_limited_threads / (*bestBlockSize);
+  // Unless those blocks are further constrained by LDS size.
+  *numBlocksPerGrid = device.info().maxComputeUnits_ * std::min(bestBlocksPerCU, lds_occupancy_wgs);
 
   return hipSuccess;
 }
@@ -927,13 +956,14 @@ hipError_t hipOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize,
     return HIP_RETURN(hipErrorInvalidValue);
   }
   const amd::Device& device = *hip::getCurrentDevice()->devices()[0];
-  int num_grids = 0;
+  int max_blocks_per_grid = 0;
   int num_blocks = 0;
+  int best_block_size = 0;
   hipError_t ret = hip_impl::ihipOccupancyMaxActiveBlocksPerMultiprocessor(
-    &num_blocks, &num_grids, device, func, 0, dynSharedMemPerBlk,true);
+    &num_blocks, &max_blocks_per_grid, &best_block_size, device, func, blockSizeLimit, dynSharedMemPerBlk,true);
   if (ret == hipSuccess) {
-    *blockSize = num_blocks;
-    *gridSize = num_grids;
+    *blockSize = best_block_size;
+    *gridSize = max_blocks_per_grid;
   }
   HIP_RETURN(ret);
 }
@@ -947,13 +977,14 @@ hipError_t hipModuleOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize
     return HIP_RETURN(hipErrorInvalidValue);
   }
   const amd::Device& device = *hip::getCurrentDevice()->devices()[0];
-  int num_grids = 0;
+  int max_blocks_per_grid = 0;
   int num_blocks = 0;
+  int best_block_size = 0;
   hipError_t ret = hip_impl::ihipOccupancyMaxActiveBlocksPerMultiprocessor(
-    &num_blocks, &num_grids, device, f, 0, dynSharedMemPerBlk,true);
+    &num_blocks, &max_blocks_per_grid, &best_block_size, device, f, blockSizeLimit, dynSharedMemPerBlk,true);
   if (ret == hipSuccess) {
-    *blockSize = num_blocks;
-    *gridSize = num_grids;
+    *blockSize = best_block_size;
+    *gridSize = max_blocks_per_grid;
   }
   HIP_RETURN(ret);
 }
@@ -967,13 +998,14 @@ hipError_t hipModuleOccupancyMaxPotentialBlockSizeWithFlags(int* gridSize, int*
     return HIP_RETURN(hipErrorInvalidValue);
   }
   const amd::Device& device = *hip::getCurrentDevice()->devices()[0];
-  int num_grids = 0;
+  int max_blocks_per_grid = 0;
   int num_blocks = 0;
+  int best_block_size = 0;
   hipError_t ret = hip_impl::ihipOccupancyMaxActiveBlocksPerMultiprocessor(
-    &num_blocks, &num_grids, device, f, 0, dynSharedMemPerBlk,true);
+    &num_blocks, &max_blocks_per_grid, &best_block_size, device, f, blockSizeLimit, dynSharedMemPerBlk,true);
   if (ret == hipSuccess) {
-    *blockSize = num_blocks;
-    *gridSize = num_grids;
+    *blockSize = best_block_size;
+    *gridSize = max_blocks_per_grid;
   }
   HIP_RETURN(ret);
 }
@@ -988,8 +1020,10 @@ hipError_t hipModuleOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks,
   const amd::Device& device = *hip::getCurrentDevice()->devices()[0];
 
   int num_blocks = 0;
+  int max_blocks_per_grid = 0;
+  int best_block_size = 0;
   hipError_t ret = hip_impl::ihipOccupancyMaxActiveBlocksPerMultiprocessor(
-    &num_blocks, nullptr, device, f, blockSize, dynSharedMemPerBlk, false);
+    &num_blocks, &max_blocks_per_grid, &best_block_size, device, f, blockSize, dynSharedMemPerBlk, false);
   *numBlocks = num_blocks;
   HIP_RETURN(ret);
 }
@@ -1005,8 +1039,10 @@ hipError_t hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int* numB
   const amd::Device& device = *hip::getCurrentDevice()->devices()[0];
 
   int num_blocks = 0;
+  int max_blocks_per_grid = 0;
+  int best_block_size = 0;
   hipError_t ret = hip_impl::ihipOccupancyMaxActiveBlocksPerMultiprocessor(
-    &num_blocks, nullptr, device, f, blockSize, dynSharedMemPerBlk, false);
+    &num_blocks, &max_blocks_per_grid, &best_block_size, device, f, blockSize, dynSharedMemPerBlk, false);
   *numBlocks = num_blocks;
   HIP_RETURN(ret);
 }
@@ -1027,8 +1063,10 @@ hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks,
   const amd::Device& device = *hip::getCurrentDevice()->devices()[0];
 
   int num_blocks = 0;
+  int max_blocks_per_grid = 0;
+  int best_block_size = 0;
   hipError_t ret = hip_impl::ihipOccupancyMaxActiveBlocksPerMultiprocessor(
-    &num_blocks, nullptr, device, func, blockSize, dynamicSMemSize, false);
+    &num_blocks, &max_blocks_per_grid, &best_block_size, device, func, blockSize, dynamicSMemSize, false);
   *numBlocks = num_blocks;
   HIP_RETURN(ret);
 }
@@ -1050,8 +1088,10 @@ hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int* numBlocks,
   const amd::Device& device = *hip::getCurrentDevice()->devices()[0];
 
   int num_blocks = 0;
+  int max_blocks_per_grid = 0;
+  int best_block_size = 0;
   hipError_t ret = hip_impl::ihipOccupancyMaxActiveBlocksPerMultiprocessor(
-    &num_blocks, nullptr, device, func, blockSize, dynamicSMemSize, false);
+    &num_blocks, &max_blocks_per_grid, &best_block_size, device, func, blockSize, dynamicSMemSize, false);
   *numBlocks = num_blocks;
   HIP_RETURN(ret);
 }
diff --git a/projects/clr/hipamd/rocclr/hip_platform.hpp b/projects/clr/hipamd/rocclr/hip_platform.hpp
index 8e5eaa191f..fcbfb53bbb 100644
--- a/projects/clr/hipamd/rocclr/hip_platform.hpp
+++ b/projects/clr/hipamd/rocclr/hip_platform.hpp
@@ -23,7 +23,7 @@
 
 namespace hip_impl {
 hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor(
-    int* numBlocks, int* numGrids,
+    int* maxBlocksPerCU, int* numBlocksPerGrid, int* bestBlockSize,
     const amd::Device& device, hipFunction_t func, int  blockSize,
     size_t dynamicSMemSize, bool bCalcPotentialBlkSz);
-}
\ No newline at end of file
+}