diff --git a/projects/hip/include/hip/hcc_detail/functional_grid_launch.hpp b/projects/hip/include/hip/hcc_detail/functional_grid_launch.hpp index 76a04fa355..9eb738cf04 100644 --- a/projects/hip/include/hip/hcc_detail/functional_grid_launch.hpp +++ b/projects/hip/include/hip/hcc_detail/functional_grid_launch.hpp @@ -154,20 +154,6 @@ hipError_t hipOccupancyMaxPotentialBlockSize(uint32_t* gridSize, uint32_t* block dynSharedMemPerBlk, blockSizeLimit); } -template -inline -hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(uint32_t* numBlocks, F kernel, - uint32_t blockSize, size_t dynSharedMemPerBlk) { - - using namespace hip_impl; - - hip_impl::hip_init(); - auto f = get_program_state().kernel_descriptor(reinterpret_cast(kernel), - target_agent(0)); - - return hipOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, f, blockSize, dynSharedMemPerBlk); -} - template inline void hipLaunchKernelGGL(F kernel, const dim3& numBlocks, const dim3& dimBlocks, diff --git a/projects/hip/include/hip/hcc_detail/hip_runtime_api.h b/projects/hip/include/hip/hcc_detail/hip_runtime_api.h index 0712db17f9..659a6c3c3a 100644 --- a/projects/hip/include/hip/hcc_detail/hip_runtime_api.h +++ b/projects/hip/include/hip/hcc_detail/hip_runtime_api.h @@ -2948,7 +2948,7 @@ hipError_t hipOccupancyMaxPotentialBlockSize(uint32_t* gridSize, uint32_t* block * @param [in] dynSharedMemPerBlk dynamic shared memory usage (in bytes) intended for each block */ hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor( - uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk); + int* numBlocks, const void* f, int blockSize, size_t dynSharedMemPerBlk); /** * @brief Returns occupancy for a device function. @@ -2960,7 +2960,7 @@ hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor( * @param [in] flags Extra flags for occupancy calculation (currently ignored) */ hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( - uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk, unsigned int flags); + int* numBlocks, const void* f, int blockSize, size_t dynSharedMemPerBlk, unsigned int flags); #if __HIP_VDI__ && !defined(__HCC__) /** @@ -3230,21 +3230,6 @@ hipError_t hipLaunchKernel(const void* function_address, } /* extern "c" */ #endif -#if defined(__cplusplus) && !defined(__HCC__) && defined(__clang__) && defined(__HIP__) -template -static hipError_t __host__ inline hipOccupancyMaxActiveBlocksPerMultiprocessor( - uint32_t* numBlocks, F func, uint32_t blockSize, size_t dynSharedMemPerBlk) { - return ::hipOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, (hipFunction_t)func, blockSize, - dynSharedMemPerBlk); -} -template -static hipError_t __host__ inline hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( - uint32_t* numBlocks, F func, uint32_t blockSize, size_t dynSharedMemPerBlk, unsigned int flags) { - return ::hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( - numBlocks, (hipFunction_t)func, blockSize, dynSharedMemPerBlk, flags); -} -#endif // defined(__cplusplus) && !defined(__HCC__) && defined(__clang__) && defined(__HIP__) - #if USE_PROF_API #include #endif @@ -3385,6 +3370,20 @@ hipError_t hipBindTextureToMipmappedArray(const texture& tex, return hipSuccess; } +template +inline hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor( + int* numBlocks, T f, int blockSize, size_t dynSharedMemPerBlk) { + return hipOccupancyMaxActiveBlocksPerMultiprocessor( + numBlocks, reinterpret_cast(f), blockSize, dynSharedMemPerBlk); +} + +template +inline hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( + int* numBlocks, T f, int blockSize, size_t dynSharedMemPerBlk, unsigned int flags) { + return hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( + numBlocks, reinterpret_cast(f), blockSize, dynSharedMemPerBlk, flags); +} + #if __HIP_VDI__ && !defined(__HCC__) template inline hipError_t hipLaunchCooperativeKernel(T f, dim3 gridDim, dim3 blockDim, diff --git a/projects/hip/samples/2_Cookbook/13_occupancy/occupancy.cpp b/projects/hip/samples/2_Cookbook/13_occupancy/occupancy.cpp index a9f4e198b0..605c7724b2 100644 --- a/projects/hip/samples/2_Cookbook/13_occupancy/occupancy.cpp +++ b/projects/hip/samples/2_Cookbook/13_occupancy/occupancy.cpp @@ -86,8 +86,8 @@ void launchKernel(float* C, float* A, float* B, bool manual){ printf("kernel Execution time = %6.3fms\n", eventMs); //Calculate Occupancy - uint32_t numBlock = 0; - HIP_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlock, multiply, blockSize, 0)); + int numBlock = 0; + HIP_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlock, multiply,(int)blockSize, 0)); if(devProp.maxThreadsPerMultiProcessor){ std::cout << "Theoretical Occupancy is " << (double)numBlock* blockSize/devProp.maxThreadsPerMultiProcessor * 100 << "%" << std::endl; diff --git a/projects/hip/src/hip_module.cpp b/projects/hip/src/hip_module.cpp index b11197703f..a8255ea725 100644 --- a/projects/hip/src/hip_module.cpp +++ b/projects/hip/src/hip_module.cpp @@ -1368,7 +1368,7 @@ hipError_t hipOccupancyMaxPotentialBlockSize(uint32_t* gridSize, uint32_t* block } hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor( - TlsData *tls, uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk) + TlsData *tls, int* numBlocks, hipFunction_t f, int blockSize, size_t dynSharedMemPerBlk) { using namespace hip_impl; @@ -1408,35 +1408,41 @@ hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor( : std::min(maxWavesPerSimd, availableSGPRs / usedSGPRS)); // Calculate blocks occupancy per CU based on SGPR usage - *numBlocks = std::min(*numBlocks, (uint32_t) (sgprs_alu_occupancy / numWavefronts)); + *numBlocks = std::min(*numBlocks, (int) (sgprs_alu_occupancy / numWavefronts)); size_t total_used_lds = usedLDS + dynSharedMemPerBlk; if (total_used_lds != 0) { // Calculate LDS occupacy per CU. lds_per_cu / (static_lsd + dynamic_lds) size_t lds_occupancy = prop.maxSharedMemoryPerMultiProcessor / total_used_lds; - *numBlocks = std::min(*numBlocks, (uint32_t) lds_occupancy); + *numBlocks = std::min(*numBlocks, (int) lds_occupancy); } return hipSuccess; } hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor( - uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk) + int* numBlocks, const void* f, int blockSize, size_t dynSharedMemPerBlk) { HIP_INIT_API(hipOccupancyMaxActiveBlocksPerMultiprocessor, numBlocks, f, blockSize, dynSharedMemPerBlk); + + auto F = hip_impl::get_program_state().kernel_descriptor((std::uintptr_t)(f), + hip_impl::target_agent(0)); return ihipLogStatus(ihipOccupancyMaxActiveBlocksPerMultiprocessor( - tls, numBlocks, f, blockSize, dynSharedMemPerBlk)); + tls, numBlocks, F, blockSize, dynSharedMemPerBlk)); } hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( - uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk, + int* numBlocks, const void* f, int blockSize, size_t dynSharedMemPerBlk, unsigned int flags) { HIP_INIT_API(hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags, numBlocks, f, blockSize, dynSharedMemPerBlk, flags); + auto F = hip_impl::get_program_state().kernel_descriptor((std::uintptr_t)(f), + hip_impl::target_agent(0)); + return ihipLogStatus(ihipOccupancyMaxActiveBlocksPerMultiprocessor( - tls, numBlocks, f, blockSize, dynSharedMemPerBlk)); + tls, numBlocks, F, blockSize, dynSharedMemPerBlk)); } hipError_t hipLaunchKernel( diff --git a/projects/hip/tests/src/runtimeApi/module/hipLaunchCooperativeKernel.cpp b/projects/hip/tests/src/runtimeApi/module/hipLaunchCooperativeKernel.cpp index c76685fa89..896738892d 100644 --- a/projects/hip/tests/src/runtimeApi/module/hipLaunchCooperativeKernel.cpp +++ b/projects/hip/tests/src/runtimeApi/module/hipLaunchCooperativeKernel.cpp @@ -116,7 +116,7 @@ int main() { dimBlock.x = workgroups[i]; // Calculate the device occupancy to know how many blocks can be run concurrently - hipOccupancyMaxActiveBlocksPerMultiprocessor(reinterpret_cast(&numBlocks), + hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks, test_gws, dimBlock.x * dimBlock.y * dimBlock.z, dimBlock.x * sizeof(long)); dimGrid.x = deviceProp.multiProcessorCount * std::min(numBlocks, 32); diff --git a/projects/hip/tests/src/runtimeApi/module/hipOccupancyMaxActiveBlocksPerMultiprocessor.cpp b/projects/hip/tests/src/runtimeApi/occupancy/hipOccupancyMaxActiveBlocksPerMultiprocessor.cpp similarity index 80% rename from projects/hip/tests/src/runtimeApi/module/hipOccupancyMaxActiveBlocksPerMultiprocessor.cpp rename to projects/hip/tests/src/runtimeApi/occupancy/hipOccupancyMaxActiveBlocksPerMultiprocessor.cpp index ebf656b72f..8e0dd033bc 100644 --- a/projects/hip/tests/src/runtimeApi/module/hipOccupancyMaxActiveBlocksPerMultiprocessor.cpp +++ b/projects/hip/tests/src/runtimeApi/occupancy/hipOccupancyMaxActiveBlocksPerMultiprocessor.cpp @@ -30,10 +30,6 @@ THE SOFTWARE. #include "hip/hip_runtime.h" #include "test_common.h" -#define fileName "vcpy_kernel.code" -#define kernel_name "hello_world" - - __global__ void f1(float *a) { *a = 1.0; } template @@ -49,11 +45,10 @@ int main(int argc, char* argv[]) { hipOccupancyMaxPotentialBlockSize(&gridSize, &blockSize, f1, 0, 0); assert(gridSize != 0 && blockSize != 0); - uint32_t numBlock = 0; - hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlock, f1, blockSize, 0); + int numBlock = 0; + hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlock, f1, (int)blockSize, 0); assert(numBlock != 0); - // test case for using kernel function pointer with template gridSize = 0; blockSize = 0; @@ -61,17 +56,7 @@ int main(int argc, char* argv[]) { assert(gridSize != 0 && blockSize != 0); numBlock = 0; - hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlock, f2, blockSize, 0); - assert(numBlock != 0); - - - // test case for using kernel with hipFunction_t type - numBlock = 0; - hipModule_t Module; - hipFunction_t Function; - HIPCHECK(hipModuleLoad(&Module, fileName)); - HIPCHECK(hipModuleGetFunction(&Function, Module, kernel_name)); - HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlock, Function, blockSize, 0)); + hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlock, f2, (int)blockSize, 0); assert(numBlock != 0); passed(); diff --git a/projects/hip/tests/src/runtimeApi/module/hipOccupancyMaxPotentialBlockSize.cpp b/projects/hip/tests/src/runtimeApi/occupancy/hipOccupancyMaxPotentialBlockSize.cpp similarity index 99% rename from projects/hip/tests/src/runtimeApi/module/hipOccupancyMaxPotentialBlockSize.cpp rename to projects/hip/tests/src/runtimeApi/occupancy/hipOccupancyMaxPotentialBlockSize.cpp index a81862952d..d29100d9a9 100644 --- a/projects/hip/tests/src/runtimeApi/module/hipOccupancyMaxPotentialBlockSize.cpp +++ b/projects/hip/tests/src/runtimeApi/occupancy/hipOccupancyMaxPotentialBlockSize.cpp @@ -33,7 +33,6 @@ THE SOFTWARE. #define fileName "vcpy_kernel.code" #define kernel_name "hello_world" - __global__ void f1(float *a) { *a = 1.0; } template