diff --git a/include/hip/hcc_detail/functional_grid_launch.hpp b/include/hip/hcc_detail/functional_grid_launch.hpp index 5abe1095df..8f07e48d46 100644 --- a/include/hip/hcc_detail/functional_grid_launch.hpp +++ b/include/hip/hcc_detail/functional_grid_launch.hpp @@ -140,10 +140,10 @@ void hipLaunchKernelGGLImpl( } // Namespace hip_impl. -template +template inline -hipError_t hipOccupancyMaxPotentialBlockSize(uint32_t* gridSize, uint32_t* blockSize, - F kernel, size_t dynSharedMemPerBlk, uint32_t blockSizeLimit) { +hipError_t hipOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize, + T kernel, size_t dynSharedMemPerBlk = 0, int blockSizeLimit = 0) { using namespace hip_impl; @@ -151,22 +151,24 @@ hipError_t hipOccupancyMaxPotentialBlockSize(uint32_t* gridSize, uint32_t* block auto f = get_program_state().kernel_descriptor(reinterpret_cast(kernel), target_agent(0)); - return hipOccupancyMaxPotentialBlockSize(gridSize, blockSize, f, + return hipModuleOccupancyMaxPotentialBlockSize(gridSize, blockSize, f, dynSharedMemPerBlk, blockSizeLimit); } -template +template inline -hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(uint32_t* numBlocks, F kernel, - uint32_t blockSize, size_t dynSharedMemPerBlk) { +hipError_t hipOccupancyMaxPotentialBlockSizeWithFlags(int* gridSize, int* blockSize, + T kernel, size_t dynSharedMemPerBlk = 0, int blockSizeLimit = 0, unsigned int flags = 0 ) { using namespace hip_impl; hip_impl::hip_init(); + if(flags != hipOccupancyDefault) return hipErrorNotSupported; auto f = get_program_state().kernel_descriptor(reinterpret_cast(kernel), target_agent(0)); - return hipOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, f, blockSize, dynSharedMemPerBlk); + return hipModuleOccupancyMaxPotentialBlockSize(gridSize, blockSize, f, + dynSharedMemPerBlk, blockSizeLimit); } template diff --git a/include/hip/hcc_detail/hip_runtime_api.h b/include/hip/hcc_detail/hip_runtime_api.h index 9103b7e3ff..12fd9b7a91 100644 --- a/include/hip/hcc_detail/hip_runtime_api.h +++ b/include/hip/hcc_detail/hip_runtime_api.h @@ -266,7 +266,6 @@ typedef enum hipSharedMemConfig { ///< when adjacent threads access data 4 bytes apart. } hipSharedMemConfig; - /** * Struct for data in 3D * @@ -2940,9 +2939,28 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi * * @returns hipSuccess, hipInvalidDevice, hipErrorInvalidValue */ -hipError_t hipOccupancyMaxPotentialBlockSize(uint32_t* gridSize, uint32_t* blockSize, + +//TODO - Match CUoccupancyB2DSize +hipError_t hipModuleOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize, hipFunction_t f, size_t dynSharedMemPerBlk, - uint32_t blockSizeLimit); + int blockSizeLimit); + +/** + * @brief determine the grid and block sizes to achieves maximum occupancy for a kernel + * + * @param [out] gridSize minimum grid size for maximum potential occupancy + * @param [out] blockSize block size for maximum potential occupancy + * @param [in] f kernel function for which occupancy is calulated + * @param [in] dynSharedMemPerBlk dynamic shared memory usage (in bytes) intended for each block + * @param [in] blockSizeLimit the maximum block size for the kernel, use 0 for no limit + * @param [in] flags Extra flags for occupancy calculation (only default supported) + * + * @returns hipSuccess, hipInvalidDevice, hipErrorInvalidValue + */ +//TODO - Match CUoccupancyB2DSize +hipError_t hipModuleOccupancyMaxPotentialBlockSizeWithFlags(int* gridSize, int* blockSize, + hipFunction_t f, size_t dynSharedMemPerBlk, + int blockSizeLimit, unsigned int flags); /** * @brief Returns occupancy for a device function. @@ -2953,7 +2971,7 @@ hipError_t hipOccupancyMaxPotentialBlockSize(uint32_t* gridSize, uint32_t* block * @param [in] dynSharedMemPerBlk dynamic shared memory usage (in bytes) intended for each block */ hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor( - uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk); + int* numBlocks, const void* f, int blockSize, size_t dynSharedMemPerBlk); /** * @brief Returns occupancy for a device function. @@ -2963,7 +2981,7 @@ hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor( * @param [in] blockSize Block size the kernel is intended to be launched with * @param [in] dynSharedMemPerBlk dynamic shared memory usage (in bytes) intended for each block */ -hipError_t hipDrvOccupancyMaxActiveBlocksPerMultiprocessor( +hipError_t hipModuleOccupancyMaxActiveBlocksPerMultiprocessor( int* numBlocks, hipFunction_t f, int blockSize, size_t dynSharedMemPerBlk); /** @@ -2976,7 +2994,7 @@ hipError_t hipDrvOccupancyMaxActiveBlocksPerMultiprocessor( * @param [in] flags Extra flags for occupancy calculation (currently ignored) */ hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( - uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk, unsigned int flags __dparm(hipOccupancyDefault)); + int* numBlocks, const void* f, int blockSize, size_t dynSharedMemPerBlk, unsigned int flags __dparm(hipOccupancyDefault)); /** * @brief Returns occupancy for a device function. @@ -2985,9 +3003,9 @@ hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( * @param [in] f Kernel function(hipFunction_t) for which occupancy is calulated * @param [in] blockSize Block size the kernel is intended to be launched with * @param [in] dynSharedMemPerBlk dynamic shared memory usage (in bytes) intended for each block - * @param [in] flags Extra flags for occupancy calculation (currently ignored) + * @param [in] flags Extra flags for occupancy calculation (only default supported) */ -hipError_t hipDrvOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( +hipError_t hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( int* numBlocks, hipFunction_t f, int blockSize, size_t dynSharedMemPerBlk, unsigned int flags); #if __HIP_VDI__ && !defined(__HCC__) @@ -3258,21 +3276,6 @@ hipError_t hipLaunchKernel(const void* function_address, } /* extern "c" */ #endif -#if defined(__cplusplus) && !defined(__HCC__) && defined(__clang__) && defined(__HIP__) -template -static hipError_t __host__ inline hipOccupancyMaxActiveBlocksPerMultiprocessor( - uint32_t* numBlocks, F func, uint32_t blockSize, size_t dynSharedMemPerBlk) { - return ::hipOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, (hipFunction_t)func, blockSize, - dynSharedMemPerBlk); -} -template -static hipError_t __host__ inline hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( - uint32_t* numBlocks, F func, uint32_t blockSize, size_t dynSharedMemPerBlk, unsigned int flags) { - return ::hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( - numBlocks, (hipFunction_t)func, blockSize, dynSharedMemPerBlk, flags); -} -#endif // defined(__cplusplus) && !defined(__HCC__) && defined(__clang__) && defined(__HIP__) - #if USE_PROF_API #include #endif @@ -3295,6 +3298,20 @@ const char* hipKernelNameRef(const hipFunction_t f); #ifdef __cplusplus +template +inline hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor( + int* numBlocks, T f, int blockSize, size_t dynSharedMemPerBlk) { + return hipOccupancyMaxActiveBlocksPerMultiprocessor( + numBlocks, reinterpret_cast(f), blockSize, dynSharedMemPerBlk); +} + +template +inline hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( + int* numBlocks, T f, int blockSize, size_t dynSharedMemPerBlk, unsigned int flags) { + return hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( + numBlocks, reinterpret_cast(f), blockSize, dynSharedMemPerBlk, flags); +} + class TlsData; hipError_t hipBindTexture(size_t* offset, textureReference* tex, const void* devPtr, diff --git a/include/hip/nvcc_detail/hip_runtime_api.h b/include/hip/nvcc_detail/hip_runtime_api.h index d9eb3e4146..3890028950 100644 --- a/include/hip/nvcc_detail/hip_runtime_api.h +++ b/include/hip/nvcc_detail/hip_runtime_api.h @@ -1295,10 +1295,50 @@ inline static hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(int* numBl const void* func, int blockSize, size_t dynamicSMemSize) { - cudaError_t cerror; - cerror = - cudaOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, func, blockSize, dynamicSMemSize); - return hipCUDAErrorTohipError(cerror); + return hipCUDAErrorTohipError(cudaOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, func, + blockSize, dynamicSMemSize)); +} + +inline static hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int* numBlocks, + const void* func, + int blockSize, + size_t dynamicSMemSize, + unsigned int flags) { + return hipCUDAErrorTohipError(cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(numBlocks, func, + blockSize, dynamicSMemSize, flags)); +} + +inline static hipError_t hipModuleOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks, + hipFunction_t f, + int blockSize, + size_t dynamicSMemSize ){ + return hipCUResultTohipError(cuOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, f, + blockSize, dynamicSMemSize)); +} + +inline static hipError_t hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int* numBlocks, + hipFunction_t f, + int blockSize, + size_t dynamicSMemSize, + unsigned int flags ) { + return hipCUResultTohipError(cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(numBlocks,f, + blockSize, dynamicSMemSize, flags)); +} + +//TODO - Match CUoccupancyB2DSize +inline static hipError_t hipModuleOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize, + hipFunction_t f, size_t dynSharedMemPerBlk, + int blockSizeLimit){ + return hipCUResultTohipError(cuOccupancyMaxPotentialBlockSize(gridSize, blockSize, f, NULL, + dynSharedMemPerBlk, blockSizeLimit)); +} + +//TODO - Match CUoccupancyB2DSize +inline static hipError_t hipModuleOccupancyMaxPotentialBlockSizeWithFlags(int* gridSize, int* blockSize, + hipFunction_t f, size_t dynSharedMemPerBlk, + int blockSizeLimit, unsigned int flags){ + return hipCUResultTohipError(cuOccupancyMaxPotentialBlockSizeWithFlags(gridSize, blockSize, f, NULL, + dynSharedMemPerBlk, blockSizeLimit, flags)); } inline static hipError_t hipPointerGetAttributes(hipPointerAttribute_t* attributes, const void* ptr) { @@ -1713,19 +1753,31 @@ inline static hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(int* numBl T func, int blockSize, size_t dynamicSMemSize) { - cudaError_t cerror; - cerror = - cudaOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, func, blockSize, dynamicSMemSize); - return hipCUDAErrorTohipError(cerror); + return hipCUDAErrorTohipError(cudaOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, func, + blockSize, dynamicSMemSize)); } template inline static hipError_t hipOccupancyMaxPotentialBlockSize(int* minGridSize, int* blockSize, T func, size_t dynamicSMemSize = 0, int blockSizeLimit = 0) { - cudaError_t cerror; - cerror = cudaOccupancyMaxPotentialBlockSize(minGridSize, blockSize, func, dynamicSMemSize, blockSizeLimit); - return hipCUDAErrorTohipError(cerror); + return hipCUDAErrorTohipError(cudaOccupancyMaxPotentialBlockSize(minGridSize, blockSize, func, + dynamicSMemSize, blockSizeLimit)); +} + +template +inline static hipError_t hipOccupancyMaxPotentialBlockSizeWithFlags(int* minGridSize, int* blockSize, T func, + size_t dynamicSMemSize = 0, + int blockSizeLimit = 0, unsigned int flags = 0) { + return hipCUDAErrorTohipError(cudaOccupancyMaxPotentialBlockSize(minGridSize, blockSize, func, + dynamicSMemSize, blockSizeLimit, flags)); +} + +template +inline static hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( int* numBlocks, T func, + int blockSize, size_t dynamicSMemSize,unsigned int flags) { + return hipCUDAErrorTohipError(cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(numBlocks, func, + blockSize, dynamicSMemSize, flags)); } template diff --git a/samples/2_Cookbook/13_occupancy/occupancy.cpp b/samples/2_Cookbook/13_occupancy/occupancy.cpp index a9f4e198b0..01fa7aafed 100644 --- a/samples/2_Cookbook/13_occupancy/occupancy.cpp +++ b/samples/2_Cookbook/13_occupancy/occupancy.cpp @@ -56,9 +56,9 @@ void launchKernel(float* C, float* A, float* B, bool manual){ const unsigned threadsperblock = 32; const unsigned blocks = (NUM/threadsperblock)+1; - uint32_t mingridSize = 0; - uint32_t gridSize = 0; - uint32_t blockSize = 0; + int mingridSize = 0; + int gridSize = 0; + int blockSize = 0; if (manual){ blockSize = threadsperblock; @@ -86,7 +86,7 @@ void launchKernel(float* C, float* A, float* B, bool manual){ printf("kernel Execution time = %6.3fms\n", eventMs); //Calculate Occupancy - uint32_t numBlock = 0; + int numBlock = 0; HIP_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlock, multiply, blockSize, 0)); if(devProp.maxThreadsPerMultiProcessor){ diff --git a/src/hip_module.cpp b/src/hip_module.cpp index 9692876695..d0ec0df9de 100644 --- a/src/hip_module.cpp +++ b/src/hip_module.cpp @@ -421,7 +421,7 @@ void getGprsLdsUsage(hipFunction_t f, size_t* usedVGPRS, size_t* usedSGPRS, size } static hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor( - TlsData *tls, uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk) + TlsData *tls, int* numBlocks, hipFunction_t f, int blockSize, size_t dynSharedMemPerBlk) { using namespace hip_impl; @@ -469,13 +469,13 @@ static hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor( : std::min(maxWavesPerSimd, availableSGPRs / usedSGPRS)); // Calculate blocks occupancy per CU based on SGPR usage - *numBlocks = std::min(*numBlocks, (uint32_t) (sgprs_alu_occupancy / numWavefronts)); + *numBlocks = std::min(*numBlocks, (int) (sgprs_alu_occupancy / numWavefronts)); size_t total_used_lds = usedLDS + dynSharedMemPerBlk; if (total_used_lds != 0) { // Calculate LDS occupacy per CU. lds_per_cu / (static_lsd + dynamic_lds) size_t lds_occupancy = prop.maxSharedMemoryPerMultiProcessor / total_used_lds; - *numBlocks = std::min(*numBlocks, (uint32_t) lds_occupancy); + *numBlocks = std::min(*numBlocks, (int) lds_occupancy); } return hipSuccess; @@ -545,7 +545,7 @@ hipError_t ihipLaunchCooperativeKernel(const void* f, dim3 gridDim, std::pair>*>(kargs.getHandle()); GET_TLS(); - uint32_t numBlocksPerSm = 0; + int numBlocksPerSm = 0; result = ihipOccupancyMaxActiveBlocksPerMultiprocessor(tls, &numBlocksPerSm, kd, blockDim.x * blockDim.y * blockDim.z, sharedMemBytes); if (result != hipSuccess) { @@ -712,7 +712,7 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL kds[i]->_kernarg_layout = *reinterpret_cast>*>( kargs.getHandle()); - uint32_t numBlocksPerSm = 0; + int numBlocksPerSm = 0; result = ihipOccupancyMaxActiveBlocksPerMultiprocessor(tls, &numBlocksPerSm, kds[i], lp.blockDim.x * lp.blockDim.y * lp.blockDim.z, lp.sharedMem); if (result != hipSuccess) { @@ -1481,9 +1481,9 @@ hipError_t hipModuleGetTexRef(textureReference** texRef, hipModule_t hmod, const return ihipLogStatus(hipSuccess); } -hipError_t ihipOccupancyMaxPotentialBlockSize(TlsData *tls, uint32_t* gridSize, uint32_t* blockSize, +hipError_t ihipOccupancyMaxPotentialBlockSize(TlsData *tls, int* gridSize, int* blockSize, hipFunction_t f, size_t dynSharedMemPerBlk, - uint32_t blockSizeLimit) + int blockSizeLimit) { using namespace hip_impl; @@ -1593,51 +1593,66 @@ hipError_t ihipOccupancyMaxPotentialBlockSize(TlsData *tls, uint32_t* gridSize, return hipSuccess; } -hipError_t hipOccupancyMaxPotentialBlockSize(uint32_t* gridSize, uint32_t* blockSize, +hipError_t hipModuleOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize, hipFunction_t f, size_t dynSharedMemPerBlk, - uint32_t blockSizeLimit) + int blockSizeLimit) { - HIP_INIT_API(hipOccupancyMaxPotentialBlockSize, gridSize, blockSize, f, dynSharedMemPerBlk, blockSizeLimit); + HIP_INIT_API(hipModuleOccupancyMaxPotentialBlockSize, gridSize, blockSize, f, dynSharedMemPerBlk, blockSizeLimit); + return ihipLogStatus(ihipOccupancyMaxPotentialBlockSize(tls, + gridSize, blockSize, f, dynSharedMemPerBlk, blockSizeLimit)); +} + +hipError_t hipModuleOccupancyMaxPotentialBlockSizeWithFlags(int* gridSize, int* blockSize, + hipFunction_t f, size_t dynSharedMemPerBlk, + int blockSizeLimit, unsigned int flags) +{ + HIP_INIT_API(hipModuleOccupancyMaxPotentialBlockSizeWithFlags, gridSize, blockSize, f, dynSharedMemPerBlk, + blockSizeLimit, flags); + if(flags != hipOccupancyDefault) return ihipLogStatus(hipErrorNotSupported); return ihipLogStatus(ihipOccupancyMaxPotentialBlockSize(tls, gridSize, blockSize, f, dynSharedMemPerBlk, blockSizeLimit)); } hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor( - uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk) + int* numBlocks, const void* f, int blockSize, size_t dynSharedMemPerBlk) { HIP_INIT_API(hipOccupancyMaxActiveBlocksPerMultiprocessor, numBlocks, f, blockSize, dynSharedMemPerBlk); + auto F = hip_impl::get_program_state().kernel_descriptor((std::uintptr_t)(f), + hip_impl::target_agent(0)); + return ihipLogStatus(ihipOccupancyMaxActiveBlocksPerMultiprocessor( + tls, numBlocks, F, blockSize, dynSharedMemPerBlk)); +} + +hipError_t hipModuleOccupancyMaxActiveBlocksPerMultiprocessor( + int* numBlocks, hipFunction_t f, int blockSize, size_t dynSharedMemPerBlk) +{ + HIP_INIT_API(hipModuleOccupancyMaxActiveBlocksPerMultiprocessor, numBlocks, f, blockSize, dynSharedMemPerBlk); return ihipLogStatus(ihipOccupancyMaxActiveBlocksPerMultiprocessor( tls, numBlocks, f, blockSize, dynSharedMemPerBlk)); } -hipError_t hipDrvOccupancyMaxActiveBlocksPerMultiprocessor( - int* numBlocks, hipFunction_t f, int blockSize, size_t dynSharedMemPerBlk) -{ - HIP_INIT_API(hipDrvOccupancyMaxActiveBlocksPerMultiprocessor, numBlocks, f, blockSize, dynSharedMemPerBlk); - - return ihipLogStatus(ihipOccupancyMaxActiveBlocksPerMultiprocessor( - tls, (uint32_t*) numBlocks, f, blockSize, dynSharedMemPerBlk)); -} - hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( - uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk, + int* numBlocks, const void* f, int blockSize, size_t dynSharedMemPerBlk, unsigned int flags) { HIP_INIT_API(hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags, numBlocks, f, blockSize, dynSharedMemPerBlk, flags); - + if(flags != hipOccupancyDefault) return ihipLogStatus(hipErrorNotSupported); + auto F = hip_impl::get_program_state().kernel_descriptor((std::uintptr_t)(f), + hip_impl::target_agent(0)); return ihipLogStatus(ihipOccupancyMaxActiveBlocksPerMultiprocessor( - tls, numBlocks, f, blockSize, dynSharedMemPerBlk)); + tls, numBlocks, F, blockSize, dynSharedMemPerBlk)); } -hipError_t hipDrvOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( +hipError_t hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( int* numBlocks, hipFunction_t f, int blockSize, size_t dynSharedMemPerBlk, unsigned int flags) { - HIP_INIT_API(hipDrvOccupancyMaxActiveBlocksPerMultiprocessorWithFlags, numBlocks, f, blockSize, dynSharedMemPerBlk, flags); + HIP_INIT_API(hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags, numBlocks, f, blockSize, dynSharedMemPerBlk, flags); + if(flags != hipOccupancyDefault) return ihipLogStatus(hipErrorNotSupported); return ihipLogStatus(ihipOccupancyMaxActiveBlocksPerMultiprocessor( - tls, (uint32_t*) numBlocks, f, blockSize, dynSharedMemPerBlk)); + tls, numBlocks, f, blockSize, dynSharedMemPerBlk)); } hipError_t hipLaunchKernel( diff --git a/tests/src/runtimeApi/module/hipLaunchCoopMultiKernel.cpp b/tests/src/runtimeApi/module/hipLaunchCoopMultiKernel.cpp index b4d57a7693..c565426f2d 100644 --- a/tests/src/runtimeApi/module/hipLaunchCoopMultiKernel.cpp +++ b/tests/src/runtimeApi/module/hipLaunchCoopMultiKernel.cpp @@ -101,7 +101,6 @@ int main() { uint* dA[MaxGPUs]; long* dB[MaxGPUs]; long* dC; - hipModule_t Module; hipStream_t stream[MaxGPUs]; uint32_t* init = new uint32_t[BufferSizeInDwords]; @@ -156,8 +155,8 @@ int main() { for (int i = 0; i < nGpu; i++) { HIPCHECK(hipSetDevice(i)); dimBlock.x = workgroups[set]; - HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(reinterpret_cast(&numBlocks), - (hipFunction_t)test_gws, dimBlock.x * dimBlock.y * dimBlock.z, dimBlock.x * sizeof(long))); + HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks, + test_gws, dimBlock.x * dimBlock.y * dimBlock.z, dimBlock.x * sizeof(long))); std::cout << "GPU(" << i << ") Block size: " << dimBlock.x << " Num blocks per CU: " << numBlocks << "\n"; diff --git a/tests/src/runtimeApi/module/hipLaunchCooperativeKernel.cpp b/tests/src/runtimeApi/module/hipLaunchCooperativeKernel.cpp index c76685fa89..896738892d 100644 --- a/tests/src/runtimeApi/module/hipLaunchCooperativeKernel.cpp +++ b/tests/src/runtimeApi/module/hipLaunchCooperativeKernel.cpp @@ -116,7 +116,7 @@ int main() { dimBlock.x = workgroups[i]; // Calculate the device occupancy to know how many blocks can be run concurrently - hipOccupancyMaxActiveBlocksPerMultiprocessor(reinterpret_cast(&numBlocks), + hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks, test_gws, dimBlock.x * dimBlock.y * dimBlock.z, dimBlock.x * sizeof(long)); dimGrid.x = deviceProp.multiProcessorCount * std::min(numBlocks, 32); diff --git a/tests/src/runtimeApi/module/hipModuleOccupancyMaxPotentialActiveBlockSize.cpp b/tests/src/runtimeApi/module/hipModuleOccupancyMaxPotentialActiveBlockSize.cpp new file mode 100644 index 0000000000..f6935d0d68 --- /dev/null +++ b/tests/src/runtimeApi/module/hipModuleOccupancyMaxPotentialActiveBlockSize.cpp @@ -0,0 +1,54 @@ +/* +Copyright (c) 2019 - prsent Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +/* HIT_START + * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 + * TEST: %t + * HIT_END + */ + +#include "hip/hip_runtime.h" +#include "test_common.h" + +#define fileName "vcpy_kernel.code" +#define kernel_name "hello_world" + +int main(int argc, char* argv[]) { + + int gridSize = 0; + int blockSize = 0; + int numBlock = 0; + HIPCHECK(hipInit(0)); + + hipDevice_t device; + hipCtx_t context; + HIPCHECK(hipDeviceGet(&device, 0)); + HIPCHECK(hipCtxCreate(&context, 0, device)); + + hipModule_t Module; + hipFunction_t Function; + HIPCHECK(hipModuleLoad(&Module, fileName)); + HIPCHECK(hipModuleGetFunction(&Function, Module, kernel_name)); + HIPCHECK(hipModuleOccupancyMaxPotentialBlockSize(&gridSize, &blockSize, Function, 0, 0)); + assert(gridSize != 0 && blockSize != 0); + HIPCHECK(hipModuleOccupancyMaxActiveBlocksPerMultiprocessor(&numBlock, Function, blockSize, 0)); + assert(numBlock != 0); + HIPCHECK(hipCtxDestroy(context)); + passed(); +} diff --git a/tests/src/runtimeApi/module/hipOccupancyMaxActiveBlocksPerMultiprocessor.cpp b/tests/src/runtimeApi/occupancy/hipOccupancyMaxActiveBlocksPerMultiprocessor.cpp similarity index 77% rename from tests/src/runtimeApi/module/hipOccupancyMaxActiveBlocksPerMultiprocessor.cpp rename to tests/src/runtimeApi/occupancy/hipOccupancyMaxActiveBlocksPerMultiprocessor.cpp index ebf656b72f..33ca8263e1 100644 --- a/tests/src/runtimeApi/module/hipOccupancyMaxActiveBlocksPerMultiprocessor.cpp +++ b/tests/src/runtimeApi/occupancy/hipOccupancyMaxActiveBlocksPerMultiprocessor.cpp @@ -22,7 +22,7 @@ THE SOFTWARE. // Test the Grid_Launch syntax. /* HIT_START - * BUILD: %t %s ../../test_common.cpp EXCLUDE_HIP_PLATFORM nvcc + * BUILD: %t %s ../../test_common.cpp * TEST: %t * HIT_END */ @@ -30,10 +30,6 @@ THE SOFTWARE. #include "hip/hip_runtime.h" #include "test_common.h" -#define fileName "vcpy_kernel.code" -#define kernel_name "hello_world" - - __global__ void f1(float *a) { *a = 1.0; } template @@ -44,12 +40,12 @@ __global__ void f2(T *a) { *a = 1; } int main(int argc, char* argv[]) { // test case for using kernel function pointer - uint32_t gridSize = 0; - uint32_t blockSize = 0; + int gridSize = 0; + int blockSize = 0; hipOccupancyMaxPotentialBlockSize(&gridSize, &blockSize, f1, 0, 0); assert(gridSize != 0 && blockSize != 0); - uint32_t numBlock = 0; + int numBlock = 0; hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlock, f1, blockSize, 0); assert(numBlock != 0); @@ -64,15 +60,5 @@ int main(int argc, char* argv[]) { hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlock, f2, blockSize, 0); assert(numBlock != 0); - - // test case for using kernel with hipFunction_t type - numBlock = 0; - hipModule_t Module; - hipFunction_t Function; - HIPCHECK(hipModuleLoad(&Module, fileName)); - HIPCHECK(hipModuleGetFunction(&Function, Module, kernel_name)); - HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlock, Function, blockSize, 0)); - assert(numBlock != 0); - passed(); } diff --git a/tests/src/runtimeApi/module/hipOccupancyMaxPotentialBlockSize.cpp b/tests/src/runtimeApi/occupancy/hipOccupancyMaxPotentialBlockSize.cpp similarity index 75% rename from tests/src/runtimeApi/module/hipOccupancyMaxPotentialBlockSize.cpp rename to tests/src/runtimeApi/occupancy/hipOccupancyMaxPotentialBlockSize.cpp index a81862952d..fc8538df26 100644 --- a/tests/src/runtimeApi/module/hipOccupancyMaxPotentialBlockSize.cpp +++ b/tests/src/runtimeApi/occupancy/hipOccupancyMaxPotentialBlockSize.cpp @@ -22,7 +22,7 @@ THE SOFTWARE. // Test the Grid_Launch syntax. /* HIT_START - * BUILD: %t %s ../../test_common.cpp EXCLUDE_HIP_PLATFORM nvcc + * BUILD: %t %s ../../test_common.cpp * TEST: %t * HIT_END */ @@ -30,10 +30,6 @@ THE SOFTWARE. #include "hip/hip_runtime.h" #include "test_common.h" -#define fileName "vcpy_kernel.code" -#define kernel_name "hello_world" - - __global__ void f1(float *a) { *a = 1.0; } template @@ -44,8 +40,8 @@ __global__ void f2(T *a) { *a = 1; } int main(int argc, char* argv[]) { // test case for using kernel function pointer - uint32_t gridSize = 0; - uint32_t blockSize = 0; + int gridSize = 0; + int blockSize = 0; hipOccupancyMaxPotentialBlockSize(&gridSize, &blockSize, f1, 0, 0); assert(gridSize != 0 && blockSize != 0); @@ -54,16 +50,5 @@ int main(int argc, char* argv[]) { blockSize = 0; hipOccupancyMaxPotentialBlockSize(&gridSize, &blockSize, f2, 0, 0); assert(gridSize != 0 && blockSize != 0); - - // test case for using kernel with hipFunction_t type - gridSize = 0; - blockSize = 0; - hipModule_t Module; - hipFunction_t Function; - HIPCHECK(hipModuleLoad(&Module, fileName)); - HIPCHECK(hipModuleGetFunction(&Function, Module, kernel_name)); - HIPCHECK(hipOccupancyMaxPotentialBlockSize(&gridSize, &blockSize, Function, 0, 0)); - assert(gridSize != 0 && blockSize != 0); - passed(); }