diff --git a/include/hip/hcc_detail/functional_grid_launch.hpp b/include/hip/hcc_detail/functional_grid_launch.hpp index 111a6e2b70..f502fddf42 100644 --- a/include/hip/hcc_detail/functional_grid_launch.hpp +++ b/include/hip/hcc_detail/functional_grid_launch.hpp @@ -151,6 +151,20 @@ void hipOccupancyMaxPotentialBlockSize(uint32_t* gridSize, uint32_t* blockSize, dynSharedMemPerBlk, blockSizeLimit); } +template +inline +void hipOccupancyMaxActiveBlocksPerMultiprocessor(uint32_t* numBlocks, F kernel, + uint32_t blockSize, size_t dynSharedMemPerBlk) { + + using namespace hip_impl; + + hip_impl::hip_init(); + auto f = get_program_state().kernel_descriptor(reinterpret_cast(kernel), + target_agent(0)); + + hipOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, f, blockSize, dynSharedMemPerBlk); +} + template inline void hipLaunchKernelGGL(F kernel, const dim3& numBlocks, const dim3& dimBlocks, diff --git a/include/hip/hcc_detail/hip_runtime_api.h b/include/hip/hcc_detail/hip_runtime_api.h index 979efebe4d..7e887d1172 100644 --- a/include/hip/hcc_detail/hip_runtime_api.h +++ b/include/hip/hcc_detail/hip_runtime_api.h @@ -2749,7 +2749,7 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi * * @param [out] gridSize minimum grid size for maximum potential occupancy * @param [out] blockSize block size for maximum potential occupancy - * @param [in] f kernel to launch + * @param [in] f kernel function for which occupancy is calulated * @param [in] dynSharedMemPerBlk dynamic shared memory usage (in bytes) intended for each block * @param [in] blockSizeLimit the maximum block size for the kernel, use 0 for no limit * @@ -2765,10 +2765,10 @@ hipError_t hipOccupancyMaxPotentialBlockSize(uint32_t* gridSize, uint32_t* block * @param [out] numBlocks Returned occupancy * @param [in] func Kernel function for which occupancy is calulated * @param [in] blockSize Block size the kernel is intended to be launched with - * @param [in] dynamicSMemSize Per - block dynamic shared memory usage intended, in bytes + * @param [in] dynSharedMemPerBlk dynamic shared memory usage (in bytes) intended for each block */ hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor( - int* numBlocks, const void* f, int blockSize, size_t dynamicSMemSize); + uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk); /** * @brief Returns occupancy for a device function. @@ -2776,11 +2776,11 @@ hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor( * @param [out] numBlocks Returned occupancy * @param [in] func Kernel function for which occupancy is calulated * @param [in] blockSize Block size the kernel is intended to be launched with - * @param [in] dynamicSMemSize Per - block dynamic shared memory usage intended, in bytes + * @param [in] dynSharedMemPerBlk dynamic shared memory usage (in bytes) intended for each block * @param [in] flags Extra flags for occupancy calculation (currently ignored) */ hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( - int* numBlocks, const void* f, int blockSize, size_t dynamicSMemSize, unsigned int flags); + uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk, unsigned int flags); /** * @brief Launches kernels on multiple devices and guarantees all specified kernels are dispatched @@ -3123,19 +3123,6 @@ hipError_t hipBindTextureToMipmappedArray(const texture& tex, return hipSuccess; } -template -inline hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor( - int* numBlocks, T f, int blockSize, size_t dynamicSMemSize) { - return hipOccupancyMaxActiveBlocksPerMultiprocessor( - numBlocks, reinterpret_cast(f), blockSize, dynamicSMemSize); -} - -template -inline hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( - int* numBlocks, T f, int blockSize, size_t dynamicSMemSize, unsigned int flags) { - return hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( - numBlocks, reinterpret_cast(f), blockSize, dynamicSMemSize, flags); -} template inline hipError_t hipLaunchCooperativeKernel(T f, dim3 gridDim, dim3 blockDim, diff --git a/src/hip_module.cpp b/src/hip_module.cpp index 2d3d052036..9e972da246 100644 --- a/src/hip_module.cpp +++ b/src/hip_module.cpp @@ -879,6 +879,30 @@ hipError_t hipModuleGetTexRef(textureReference** texRef, hipModule_t hmod, const return ihipLogStatus(hipSuccess); } +void getGprsLdsUsage(hipFunction_t f, size_t* usedVGPRS, size_t* usedSGPRS, size_t* usedLDS) +{ + bool is_code_object_v3 = f->_name.find(".kd") != std::string::npos; + if (is_code_object_v3) { + const auto header = reinterpret_cast(f->_header); + // GRANULATED_WAVEFRONT_VGPR_COUNT is specified in 0:5 bits of COMPUTE_PGM_RSRC1 + // the granularity for gfx6-gfx9 is max(0, ceil(vgprs_used / 4) - 1) + *usedVGPRS = ((header->compute_pgm_rsrc1 & 0x3F) + 1) << 2; + // GRANULATED_WAVEFRONT_SGPR_COUNT is specified in 6:9 bits of COMPUTE_PGM_RSRC1 + // the granularity for gfx9+ is 2 * max(0, ceil(sgprs_used / 16) - 1) + *usedSGPRS = ((((header->compute_pgm_rsrc1 & 0x3C0) >> 6) >> 1) + 1) << 4; + *usedLDS = header->group_segment_fixed_size; + } + else { + const auto header = f->_header; + // VGPRs granularity is 4 + *usedVGPRS = ((header->workitem_vgpr_count + 3) >> 2) << 2; + // adding 2 to take into account the 2 VCC registers & handle the granularity of 16 + *usedSGPRS = header->wavefront_sgpr_count + 2; + *usedSGPRS = ((*usedSGPRS + 15) >> 4) << 4; + *usedLDS = header->workgroup_group_segment_byte_size; + } +} + hipError_t ihipOccupancyMaxPotentialBlockSize(uint32_t* gridSize, uint32_t* blockSize, hipFunction_t f, size_t dynSharedMemPerBlk, uint32_t blockSizeLimit) @@ -886,10 +910,8 @@ hipError_t ihipOccupancyMaxPotentialBlockSize(uint32_t* gridSize, uint32_t* bloc using namespace hip_impl; auto ctx = ihipGetTlsDefaultCtx(); - hipError_t ret = hipSuccess; - if (ctx == nullptr) { - ret = hipErrorInvalidDevice; + return hipErrorInvalidDevice; } hipDeviceProp_t prop{}; @@ -900,26 +922,7 @@ hipError_t ihipOccupancyMaxPotentialBlockSize(uint32_t* gridSize, uint32_t* bloc size_t usedVGPRS = 0; size_t usedSGPRS = 0; size_t usedLDS = 0; - bool is_code_object_v3 = f->_name.find(".kd") != std::string::npos; - if (is_code_object_v3) { - const auto header = reinterpret_cast(f->_header); - // GRANULATED_WAVEFRONT_VGPR_COUNT is specified in 0:5 bits of COMPUTE_PGM_RSRC1 - // the granularity for gfx6-gfx9 is max(0, ceil(vgprs_used / 4) - 1) - usedVGPRS = ((header->compute_pgm_rsrc1 & 0x3F) + 1) << 2; - // GRANULATED_WAVEFRONT_SGPR_COUNT is specified in 6:9 bits of COMPUTE_PGM_RSRC1 - // the granularity for gfx9+ is 2 * max(0, ceil(sgprs_used / 16) - 1) - usedSGPRS = ((((header->compute_pgm_rsrc1 & 0x3C0) >> 6) >> 1) + 1) << 4; - usedLDS = header->group_segment_fixed_size; - } - else { - const auto header = f->_header; - // VGPRs granularity is 4 - usedVGPRS = ((header->workitem_vgpr_count + 3) >> 2) << 2; - // adding 2 to take into account the 2 VCC registers & handle the granularity of 16 - usedSGPRS = header->wavefront_sgpr_count + 2; - usedSGPRS = ((usedSGPRS + 15) >> 4) << 4; - usedLDS = header->workgroup_group_segment_byte_size; - } + getGprsLdsUsage(f, &usedVGPRS, &usedSGPRS, &usedLDS); // try different workgroup sizes to find the maximum potential occupancy // based on the usage of VGPRs and LDS @@ -1009,10 +1012,9 @@ hipError_t ihipOccupancyMaxPotentialBlockSize(uint32_t* gridSize, uint32_t* bloc *blockSize = maxWavefronts * wavefrontSize; *gridSize = min((maxThreadsCnt + *blockSize - 1) / *blockSize, prop.multiProcessorCount); - return ret; + return hipSuccess; } - hipError_t hipOccupancyMaxPotentialBlockSize(uint32_t* gridSize, uint32_t* blockSize, hipFunction_t f, size_t dynSharedMemPerBlk, uint32_t blockSizeLimit) @@ -1022,3 +1024,74 @@ hipError_t hipOccupancyMaxPotentialBlockSize(uint32_t* gridSize, uint32_t* block return ihipLogStatus(ihipOccupancyMaxPotentialBlockSize( gridSize, blockSize, f, dynSharedMemPerBlk, blockSizeLimit)); } + +hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor( + uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk) +{ + using namespace hip_impl; + + auto ctx = ihipGetTlsDefaultCtx(); + if (ctx == nullptr) { + return hipErrorInvalidDevice; + } + + hipDeviceProp_t prop{}; + ihipGetDeviceProperties(&prop, ihipGetTlsDefaultCtx()->getDevice()->_deviceId); + + prop.regsPerBlock = prop.regsPerBlock ? prop.regsPerBlock : 64 * 1024; + + size_t usedVGPRS = 0; + size_t usedSGPRS = 0; + size_t usedLDS = 0; + getGprsLdsUsage(f, &usedVGPRS, &usedSGPRS, &usedLDS); + + // Due to SPI and private memory limitations, the max of wavefronts per CU in 32 + size_t wavefrontSize = prop.warpSize; + size_t maxWavefrontsPerCU = min(prop.maxThreadsPerMultiProcessor / wavefrontSize, 32); + + const size_t simdPerCU = 4; + const size_t maxWavesPerSimd = maxWavefrontsPerCU / simdPerCU; + + size_t numWavefronts = (blockSize + wavefrontSize - 1) / wavefrontSize; + + size_t availableVGPRs = (prop.regsPerBlock / wavefrontSize / simdPerCU); + size_t vgprs_alu_occupancy = simdPerCU * std::min(maxWavesPerSimd, availableVGPRs / usedVGPRS); + + // Calculate blocks occupancy per CU based on VGPR usage + *numBlocks = vgprs_alu_occupancy / numWavefronts; + + const size_t availableSGPRs = (prop.gcnArch < 800) ? 512 : 800; + size_t sgprs_alu_occupancy = simdPerCU * ((usedSGPRS == 0) ? maxWavesPerSimd + : std::min(maxWavesPerSimd, availableSGPRs / usedSGPRS)); + + // Calculate blocks occupancy per CU based on SGPR usage + *numBlocks = std::min(*numBlocks, (uint32_t) (sgprs_alu_occupancy / numWavefronts)); + + size_t total_used_lds = usedLDS + dynSharedMemPerBlk; + if (total_used_lds != 0) { + // Calculate LDS occupacy per CU. lds_per_cu / (static_lsd + dynamic_lds) + size_t lds_occupancy = prop.maxSharedMemoryPerMultiProcessor / total_used_lds; + *numBlocks = std::min(*numBlocks, (uint32_t) lds_occupancy); + } + + return hipSuccess; +} + +hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor( + uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk) +{ + HIP_INIT_API(hipOccupancyMaxActiveBlocksPerMultiprocessor, numBlocks, f, blockSize, dynSharedMemPerBlk); + + return ihipLogStatus(ihipOccupancyMaxActiveBlocksPerMultiprocessor( + numBlocks, f, blockSize, dynSharedMemPerBlk)); +} + +hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( + uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk, + unsigned int flags) +{ + HIP_INIT_API(hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags, numBlocks, f, blockSize, dynSharedMemPerBlk, flags); + + return ihipLogStatus(ihipOccupancyMaxActiveBlocksPerMultiprocessor( + numBlocks, f, blockSize, dynSharedMemPerBlk)); +} diff --git a/tests/src/runtimeApi/module/hipOccupancyMaxActiveBlocksPerMultiprocessor.cpp b/tests/src/runtimeApi/module/hipOccupancyMaxActiveBlocksPerMultiprocessor.cpp new file mode 100644 index 0000000000..ebf656b72f --- /dev/null +++ b/tests/src/runtimeApi/module/hipOccupancyMaxActiveBlocksPerMultiprocessor.cpp @@ -0,0 +1,78 @@ +/* +Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ +// Test the Grid_Launch syntax. + +/* HIT_START + * BUILD: %t %s ../../test_common.cpp EXCLUDE_HIP_PLATFORM nvcc + * TEST: %t + * HIT_END + */ + +#include "hip/hip_runtime.h" +#include "test_common.h" + +#define fileName "vcpy_kernel.code" +#define kernel_name "hello_world" + + +__global__ void f1(float *a) { *a = 1.0; } + +template +__global__ void f2(T *a) { *a = 1; } + + + +int main(int argc, char* argv[]) { + + // test case for using kernel function pointer + uint32_t gridSize = 0; + uint32_t blockSize = 0; + hipOccupancyMaxPotentialBlockSize(&gridSize, &blockSize, f1, 0, 0); + assert(gridSize != 0 && blockSize != 0); + + uint32_t numBlock = 0; + hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlock, f1, blockSize, 0); + assert(numBlock != 0); + + + // test case for using kernel function pointer with template + gridSize = 0; + blockSize = 0; + hipOccupancyMaxPotentialBlockSize(&gridSize, &blockSize, f2, 0, 0); + assert(gridSize != 0 && blockSize != 0); + + numBlock = 0; + hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlock, f2, blockSize, 0); + assert(numBlock != 0); + + + // test case for using kernel with hipFunction_t type + numBlock = 0; + hipModule_t Module; + hipFunction_t Function; + HIPCHECK(hipModuleLoad(&Module, fileName)); + HIPCHECK(hipModuleGetFunction(&Function, Module, kernel_name)); + HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlock, Function, blockSize, 0)); + assert(numBlock != 0); + + passed(); +}