From ee95b284c35d5dd240eb6078adf5e3bfe574613c Mon Sep 17 00:00:00 2001 From: ROCm CI Service Account <66695075+rocm-ci@users.noreply.github.com> Date: Sun, 9 Jul 2023 08:55:01 +0530 Subject: [PATCH] SWDEV-380340 - [catch2][dtest] DynamicLoading tests migrated from direct to catch2 (#346) Change-Id: I5f82ddd565d4e86270498d2d8b155200ce568849 --- catch/include/hip_test_defgroups.hh | 7 + catch/unit/CMakeLists.txt | 3 +- catch/unit/dynamicLoading/CMakeLists.txt | 49 ++++ .../dynamicLoading/bit_extract_kernel.cpp | 32 +++ .../complex_loading_behavior.cc | 147 ++++++++++++ .../unit/dynamicLoading/hipApiDynamicLoad.cc | 169 ++++++++++++++ catch/unit/dynamicLoading/liblazyLoad.cc | 212 ++++++++++++++++++ 7 files changed, 618 insertions(+), 1 deletion(-) create mode 100644 catch/unit/dynamicLoading/CMakeLists.txt create mode 100644 catch/unit/dynamicLoading/bit_extract_kernel.cpp create mode 100644 catch/unit/dynamicLoading/complex_loading_behavior.cc create mode 100644 catch/unit/dynamicLoading/hipApiDynamicLoad.cc create mode 100644 catch/unit/dynamicLoading/liblazyLoad.cc diff --git a/catch/include/hip_test_defgroups.hh b/catch/include/hip_test_defgroups.hh index 83d2c4610b..d8fdc5b26d 100644 --- a/catch/include/hip_test_defgroups.hh +++ b/catch/include/hip_test_defgroups.hh @@ -101,6 +101,13 @@ THE SOFTWARE. * @} */ +/** + * @defgroup DynamicLoadingTest Kernel Loading Management + * @{ + * This section describes the different kernel launch approaches. + * @} + */ + /** * @defgroup MemoryTest memory Management APIs * @{ diff --git a/catch/unit/CMakeLists.txt b/catch/unit/CMakeLists.txt index ae5c866e32..5b3bbf66b6 100644 --- a/catch/unit/CMakeLists.txt +++ b/catch/unit/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2023 Advanced Micro Devices, Inc. All Rights Reserved. +# Copyright (c) 2021-2023 Advanced Micro Devices, Inc. All Rights Reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -36,6 +36,7 @@ add_subdirectory(compiler) add_subdirectory(errorHandling) add_subdirectory(cooperativeGrps) add_subdirectory(context) +add_subdirectory(dynamicLoading) add_subdirectory(g++) add_subdirectory(module) diff --git a/catch/unit/dynamicLoading/CMakeLists.txt b/catch/unit/dynamicLoading/CMakeLists.txt new file mode 100644 index 0000000000..d58f4ec436 --- /dev/null +++ b/catch/unit/dynamicLoading/CMakeLists.txt @@ -0,0 +1,49 @@ +# Copyright (c) 2023 Advanced Micro Devices, Inc. All Rights Reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +if(UNIX) + set(TEST_SRC ${TEST_SRC} + complex_loading_behavior.cc) + set(AMD_TEST_SRC + hipApiDynamicLoad.cc) + +if(HIP_PLATFORM MATCHES "amd") + set(TEST_SRC ${TEST_SRC} ${AMD_TEST_SRC}) +endif() + +hip_add_exe_to_target(NAME dynamicLoading + TEST_SRC ${TEST_SRC} + TEST_TARGET_NAME build_tests) + +if(HIP_PLATFORM MATCHES "amd") +add_custom_target(libLazyLoad.so COMMAND ${CMAKE_CXX_COMPILER} -fPIC -lpthread -shared ${OFFLOAD_ARCH_STR} ${CMAKE_CURRENT_SOURCE_DIR}/liblazyLoad.cc -I${CMAKE_CURRENT_SOURCE_DIR}/../../include -I${CMAKE_CURRENT_SOURCE_DIR}/../../external/Catch2 -o libLazyLoad.so) +elseif(HIP_PLATFORM MATCHES "nvidia") +add_custom_target(libLazyLoad.so COMMAND ${CMAKE_CXX_COMPILER} -Xcompiler -fPIC -lpthread -shared ${CMAKE_CURRENT_SOURCE_DIR}/liblazyLoad.cc -I${CMAKE_CURRENT_SOURCE_DIR}/../../include -I${CMAKE_CURRENT_SOURCE_DIR}/../../external/Catch2 -o libLazyLoad.so) +endif() + +add_custom_target(bit_extract_kernel.code COMMAND ${CMAKE_CXX_COMPILER} --genco ${OFFLOAD_ARCH_STR} ${CMAKE_CURRENT_SOURCE_DIR}/bit_extract_kernel.cpp -o ${CMAKE_CURRENT_BINARY_DIR}/../dynamicLoading/bit_extract_kernel.code -I${CMAKE_CURRENT_SOURCE_DIR}/../../../../include/ -I${CMAKE_CURRENT_SOURCE_DIR}/../../include) + +if(HIP_PLATFORM MATCHES "amd") +hip_add_exe_to_target(NAME Dynamic + TEST_SRC ${LINUX_TEST_SRC} + TEST_TARGET_NAME build_tests + LINKER_LIBS ${CMAKE_DL_LIBS}) +endif() +add_dependencies(build_tests bit_extract_kernel.code libLazyLoad.so) +endif() diff --git a/catch/unit/dynamicLoading/bit_extract_kernel.cpp b/catch/unit/dynamicLoading/bit_extract_kernel.cpp new file mode 100644 index 0000000000..7eff993665 --- /dev/null +++ b/catch/unit/dynamicLoading/bit_extract_kernel.cpp @@ -0,0 +1,32 @@ +/* +Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ +#include +extern "C" __global__ void bit_extract_kernel(uint32_t* C_d, const uint32_t* + A_d, size_t N) { + size_t offset = (blockIdx.x * blockDim.x + threadIdx.x); + size_t stride = blockDim.x * gridDim.x; + for (size_t i = offset; i < N; i += stride) { +#if HT_AMD + C_d[i] = __bitextract_u32(A_d[i], 8, 4); +#else /* defined __HIP_PLATFORM_NVIDIA__ or other path */ + C_d[i] = ((A_d[i] & 0xf00) >> 8); +#endif + } +} + diff --git a/catch/unit/dynamicLoading/complex_loading_behavior.cc b/catch/unit/dynamicLoading/complex_loading_behavior.cc new file mode 100644 index 0000000000..c1c412052f --- /dev/null +++ b/catch/unit/dynamicLoading/complex_loading_behavior.cc @@ -0,0 +1,147 @@ +/* +Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ +#include +#include +#include +/** +* @addtogroup hipLaunchKernelGGL hipLaunchCooperativeKernel +* @{ +* @ingroup DynamicLoading +* `hipLaunchKernelGGL(F kernel, const dim3& numBlocks, const dim3& dimBlocks, + std::uint32_t sharedMemBytes, hipStream_t stream, Args... args)` - +* launches Kernel with launch parameters and shared memory on stream with arguments passed +* `hipError_t hipLaunchCooperativeKernel(const void* f, dim3 gridDim, dim3 blockDimX, + void** kernelParams, unsigned int sharedMemBytes, + hipStream_t stream))` - +* launches kernel f with launch parameters and shared memory on stream with arguments passed +* to kernelparams or extra, where thread blocks can cooperate and synchronize as they execute +*/ + +/** + * Test Description + * ------------------------ + * - Test case to verify locally loaded kernels and dynamically loaded kernels from the library. + + * Test source + * ------------------------ + * - catch/unit/dynamicLoading/complex_loading_behavior.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.6 + */ + +__global__ static void vector_add(float* C, float* A, float* B, size_t N) { + size_t offset = blockIdx.x * blockDim.x + threadIdx.x; + size_t stride = blockDim.x * gridDim.x; + for (size_t i = offset; i < N; i += stride) { + C[i] = A[i] + B[i]; + } +} + +static bool launch_local_kernel() { + bool testResult = true; + float *A_d, *B_d, *C_d; + float *A_h, *B_h, *C_h; + size_t N = 1000000; + size_t Nbytes = N * sizeof(float); + static int device = 0; + + HIPCHECK(hipSetDevice(device)); + hipDeviceProp_t props; + HIPCHECK(hipGetDeviceProperties(&props, device)); + + A_h = reinterpret_cast(malloc(Nbytes)); + HIPCHECK(A_h == nullptr ? hipErrorOutOfMemory : hipSuccess); + B_h = reinterpret_cast(malloc(Nbytes)); + HIPCHECK(B_h == nullptr ? hipErrorOutOfMemory : hipSuccess); + C_h = reinterpret_cast(malloc(Nbytes)); + HIPCHECK(C_h == nullptr ? hipErrorOutOfMemory : hipSuccess); + + // Fill with Phi + i + for (size_t i = 0; i < N; i++) { + A_h[i] = 1.618f + i; + B_h[i] = 1.618f + i; + } + + HIPCHECK(hipMalloc(&A_d, Nbytes)); + HIPCHECK(hipMalloc(&B_d, Nbytes)); + HIPCHECK(hipMalloc(&C_d, Nbytes)); + HIPCHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice)); + HIPCHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice)); + + const unsigned blocks = 512; + const unsigned threadsPerBlock = 256; + hipLaunchKernelGGL(vector_add, dim3(blocks), dim3(threadsPerBlock), + 0, 0, C_d, A_d, B_d, N); + HIPCHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost)); + + for (size_t i=0; i < N ; i++) { + if (C_h[i] != (A_h[i] + B_h[i])) { + testResult = false; + break; + } + } + + HIPCHECK(hipFree(A_d)); + HIPCHECK(hipFree(B_d)); + HIPCHECK(hipFree(C_d)); + + free(A_h); + free(B_h); + free(C_h); + return testResult; +} + +static bool launch_dynamically_loaded_kernel() { + bool testResult = true; + int ret = 1; + + void* handle = dlopen("./libLazyLoad.so", RTLD_LAZY); + if (!handle) { + INFO("dlopen Error: " << dlerror() << "\n"); + testResult = false; + return testResult; + } + void* sym = dlsym(handle, "lazyLoad"); + if (!sym) { + INFO("unable to locate lazyLoad within lazyLoad.so\n"); + dlclose(handle); + testResult = false; + return testResult; + } + + int(*fp)() = reinterpret_cast(sym); + ret = fp(); + + if (ret == 0) { + testResult = false; + } + + dlclose(handle); + return testResult; +} + +TEST_CASE("Unit_dynamic_loading_device_kernels_from_library") { + bool testResult = true; + + testResult &= launch_local_kernel(); + testResult &= launch_dynamically_loaded_kernel(); + + REQUIRE(testResult == true); +} diff --git a/catch/unit/dynamicLoading/hipApiDynamicLoad.cc b/catch/unit/dynamicLoading/hipApiDynamicLoad.cc new file mode 100644 index 0000000000..e583f4a3d1 --- /dev/null +++ b/catch/unit/dynamicLoading/hipApiDynamicLoad.cc @@ -0,0 +1,169 @@ +/* +Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ +#include +#include +#include +#include +#include +#include +#include + +#define fileName "bit_extract_kernel.code" + +#define LEN 64 +#define SIZE LEN * sizeof(float) + +/** +* @addtogroup dyn_hipModuleLoad dyn_hipModuleGetFunction dyn_hipModuleLaunchKernel +* @{ +* @ingroup DynamicLoading +* ` hipError_t (*dyn_hipModuleLoad)(hipModule_t*, const char*) = reinterpret_cast + (sym_hipModuleLoad)` - +* Loads code object from file into a module the currrent context +* `hipError_t (*dyn_hipModuleGetFunction)(hipFunction_t*, hipModule_t, + const char*) = reinterpret_cast < hipError_t (*)(hipFunction_t*, + hipModule_t, const char*)>(sym_hipModuleGetFunction)` - +* Function with kernelname will be extracted if present in module +* `hipError_t (*dyn_hipModuleLaunchKernel)(hipFunction_t, unsigned int, + unsigned int, unsigned int, unsigned int, unsigned int, + unsigned int, unsigned int, hipStream_t, void**, void**) + = reinterpret_cast(sym_hipModuleLaunchKernel)` - +* launches Kernel with launch parameters and shared memory on stream with arguments passed +*/ + +/** + * Test Description + * ------------------------ + * - Test is to load hip runtime using dlopen and get function pointer using dlsym for hip apis. + + * Test source + * ------------------------ + * - catch/unit/dynamicLoading/hipApiDynamicLoad.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.6 + */ + +TEST_CASE("Unit_hipApiDynamicLoad") { + uint32_t *A_d, *C_d; + uint32_t *A_h, *C_h; + size_t N = 1000000; + size_t Nbytes = N * sizeof(uint32_t); + + void* handle = dlopen("libamdhip64.so", RTLD_LAZY); + REQUIRE(handle != NULL); + + void* sym_hipGetDevice = dlsym(handle, "hipGetDevice"); + void* sym_hipMalloc = dlsym(handle, "hipMalloc"); + void* sym_hipMemcpyHtoD = dlsym(handle, "hipMemcpyHtoD"); + void* sym_hipMemcpyDtoH = dlsym(handle, "hipMemcpyDtoH"); + void* sym_hipModuleLoad = dlsym(handle, "hipModuleLoad"); + void* sym_hipGetDeviceProperties = dlsym(handle, "hipGetDeviceProperties"); + void* sym_hipModuleGetFunction = dlsym(handle, "hipModuleGetFunction"); + void* sym_hipModuleLaunchKernel = dlsym(handle, "hipModuleLaunchKernel"); + + dlclose(handle); + hipError_t (*dyn_hipGetDevice)(hipDevice_t*, int) = reinterpret_cast + (sym_hipGetDevice); + + hipError_t (*dyn_hipMalloc)(void**, uint32_t) = reinterpret_cast + (sym_hipMalloc); + + hipError_t (*dyn_hipMemcpyHtoD)(hipDeviceptr_t, void*, size_t) = + reinterpret_cast(sym_hipMemcpyHtoD); + + hipError_t (*dyn_hipMemcpyDtoH)(void*, hipDeviceptr_t, size_t) = + reinterpret_cast(sym_hipMemcpyDtoH); + + hipError_t (*dyn_hipModuleLoad)(hipModule_t*, const char*) = reinterpret_cast + (sym_hipModuleLoad); + + hipError_t (*dyn_hipGetDeviceProperties)(hipDeviceProp_t*, int) = + reinterpret_cast + (sym_hipGetDeviceProperties); + + hipError_t (*dyn_hipModuleGetFunction)(hipFunction_t*, hipModule_t, + const char*) = reinterpret_cast < hipError_t (*)(hipFunction_t*, + hipModule_t, const char*)>(sym_hipModuleGetFunction); + + hipError_t (*dyn_hipModuleLaunchKernel)(hipFunction_t, unsigned int, + unsigned int, unsigned int, unsigned int, unsigned int, + unsigned int, unsigned int, hipStream_t, void**, void**) + = reinterpret_cast(sym_hipModuleLaunchKernel); + + hipDevice_t device; + HIPCHECK(dyn_hipGetDevice(&device, 0)); + + hipDeviceProp_t props; + HIPCHECK(dyn_hipGetDeviceProperties(&props, device)); + A_h = reinterpret_cast(malloc(Nbytes)); + REQUIRE(A_h != NULL); + C_h = reinterpret_cast(malloc(Nbytes)); + REQUIRE(C_h != NULL); + + for (size_t i = 0; i < N; i++) { + A_h[i] = i; + } + + HIPCHECK(dyn_hipMalloc(reinterpret_cast(&A_d), Nbytes)); + HIPCHECK(dyn_hipMalloc(reinterpret_cast(&C_d), Nbytes)); + + HIPCHECK(dyn_hipMemcpyHtoD((hipDeviceptr_t)(A_d), A_h, Nbytes)); + + struct { + void* _Cd; + void* _Ad; + size_t _N; + } args; + args._Cd = reinterpret_cast (C_d); + args._Ad = reinterpret_cast (A_d); + args._N = static_cast (N); + size_t size = sizeof(args); + + void* config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, &args, + HIP_LAUNCH_PARAM_BUFFER_SIZE, &size, HIP_LAUNCH_PARAM_END}; + + hipModule_t Module; + HIPCHECK(dyn_hipModuleLoad(&Module, fileName)); + + hipFunction_t Function; + HIPCHECK(dyn_hipModuleGetFunction(&Function, Module, "bit_extract_kernel")); + + HIPCHECK(dyn_hipModuleLaunchKernel(Function, 1, 1, 1, LEN, 1, 1, 0, 0, NULL, + reinterpret_cast(&config))); + + HIPCHECK(dyn_hipMemcpyDtoH(C_h, (hipDeviceptr_t)(C_d), Nbytes)); + + for (size_t i = 0; i < N; i++) { + unsigned Agold = ((A_h[i] & 0xf00) >> 8); + REQUIRE(C_h[i] == Agold); + } + HIPCHECK(hipFree(A_d)); + HIPCHECK(hipFree(C_d)); + free(A_h); + free(C_h); +} diff --git a/catch/unit/dynamicLoading/liblazyLoad.cc b/catch/unit/dynamicLoading/liblazyLoad.cc new file mode 100644 index 0000000000..54e18bbcab --- /dev/null +++ b/catch/unit/dynamicLoading/liblazyLoad.cc @@ -0,0 +1,212 @@ +/* +Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ +#include +#include + +#define CHECK_RET_VAL(cmd) \ +{ \ + hipError_t error = cmd;\ + if (error != hipSuccess) {\ + fprintf(stderr, "error: '%s'(%d) at %s:%d\n", hipGetErrorString(error), \ + error, __FILE__, __LINE__);\ + exit(EXIT_FAILURE);\ + }\ +} + +__global__ static void addition(float* C, float* A, float* B, size_t N) { + size_t offset = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + size_t stride = hipBlockDim_x * hipGridDim_x; + + for (size_t i = offset; i < N; i += stride) { + C[i] = A[i] + B[i]; + } +} + +static int vectorAddKernelTest() { + int testResult = 1; + float *A_d, *B_d, *C_d; + float *A_h, *B_h, *C_h; + size_t N = 1000000; + size_t Nbytes = N * sizeof(float); + static int device = 0; + + CHECK_RET_VAL(hipSetDevice(device)); + hipDeviceProp_t props; + CHECK_RET_VAL(hipGetDeviceProperties(&props, device)); + A_h = reinterpret_cast(malloc(Nbytes)); + CHECK_RET_VAL(A_h == nullptr ? hipErrorOutOfMemory : hipSuccess); + B_h = reinterpret_cast(malloc(Nbytes)); + CHECK_RET_VAL(B_h == nullptr ? hipErrorOutOfMemory : hipSuccess); + C_h = reinterpret_cast(malloc(Nbytes)); + CHECK_RET_VAL(C_h == nullptr ? hipErrorOutOfMemory : hipSuccess); + + // Fill with Phi + i + for (size_t i = 0; i < N; i++) { + A_h[i] = 1.618f + i; + B_h[i] = 1.618f + i; + } + + CHECK_RET_VAL(hipMalloc(&A_d, Nbytes)); + CHECK_RET_VAL(hipMalloc(&B_d, Nbytes)); + CHECK_RET_VAL(hipMalloc(&C_d, Nbytes)); + CHECK_RET_VAL(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice)); + CHECK_RET_VAL(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice)); + + const unsigned blocks = 512; + const unsigned threadsPerBlock = 256; + hipLaunchKernelGGL(addition, dim3(blocks), dim3(threadsPerBlock), + 0, 0, C_d, A_d, B_d, N); + CHECK_RET_VAL(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost)); + + for (size_t i=0; i < N ; i++) { + if (C_h[i] != (A_h[i] + B_h[i])) { + testResult = 0; + break; + } + } + + CHECK_RET_VAL(hipFree(A_d)); + CHECK_RET_VAL(hipFree(B_d)); + CHECK_RET_VAL(hipFree(C_d)); + + free(A_h); + free(B_h); + free(C_h); + return testResult; +} + +#include "hip/hip_cooperative_groups.h" + +namespace cg = cooperative_groups; + +static const uint BufferSizeInDwords = 448 * 1024 * 1024; + +__global__ static void test_gws(uint* buf, uint bufSize, + int64_t* tmpBuf, int64_t* result) { + extern __shared__ int64_t tmp[]; + uint offset = blockIdx.x * blockDim.x + threadIdx.x; + uint stride = gridDim.x * blockDim.x; + cg::grid_group gg = cg::this_grid(); + + int64_t sum = 0; + + for (uint i = offset; i < bufSize; i += stride) { + sum += buf[i]; + } + + tmp[threadIdx.x] = sum; + __syncthreads(); + + if (threadIdx.x == 0) { + sum = 0; + for (uint i = 0; i < blockDim.x; i++) { + sum += tmp[i]; + } + tmpBuf[blockIdx.x] = sum; + } + + gg.sync(); + + if (offset == 0) { + for (uint i = 1; i < gridDim.x; ++i) { + sum += tmpBuf[i]; + } + *result = sum; + } +} + +static int cooperativeKernelTest() { + int testResult = 1; + uint* dA; + int64_t* dB; + int64_t* dC; + int64_t* Ah; + + hipDeviceProp_t deviceProp; + hipGetDeviceProperties(&deviceProp, 0); + + if (!deviceProp.cooperativeLaunch) { + return testResult; + } + + uint32_t* init = new uint32_t[BufferSizeInDwords]; + + for (uint32_t i = 0; i < BufferSizeInDwords; ++i) { + init[i] = i; + } + size_t SIZE = BufferSizeInDwords * sizeof(uint); + + CHECK_RET_VAL(hipMalloc(reinterpret_cast(&dA), SIZE)); + CHECK_RET_VAL(hipMalloc(reinterpret_cast(&dC), sizeof(int64_t))); + CHECK_RET_VAL(hipMemcpy(dA, init, SIZE, hipMemcpyHostToDevice)); + Ah = reinterpret_cast(malloc(sizeof(int64_t))); + + hipStream_t stream; + CHECK_RET_VAL(hipStreamCreate(&stream)); + + dim3 dimBlock = dim3(1); + dim3 dimGrid = dim3(1); + + int numBlocks = 0; + uint workgroups[4] = {32, 64, 128, 256}; + + for (uint i = 0; i < 4; ++i) { + dimBlock.x = workgroups[i]; + /* Calculate the device occupancy to know how many blocks can be + run concurrently */ + hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks, + test_gws, dimBlock.x * dimBlock.y * dimBlock.z, dimBlock.x * + sizeof(int64_t)); + dimGrid.x = deviceProp.multiProcessorCount * std::min(numBlocks, 32); + CHECK_RET_VAL(hipMalloc(reinterpret_cast(&dB), + dimGrid.x * sizeof(int64_t))); + + void *params[4]; + params[0] = reinterpret_cast(&dA); + params[1] = (void*)(&BufferSizeInDwords); // NOLINT + params[2] = reinterpret_cast(&dB); + params[3] = reinterpret_cast(&dC); + + CHECK_RET_VAL(hipLaunchCooperativeKernel(reinterpret_cast(test_gws), + dimGrid, dimBlock, params, + dimBlock.x * sizeof(int64_t), stream)); + + CHECK_RET_VAL(hipMemcpy(Ah, dC, sizeof(int64_t), hipMemcpyDeviceToHost)); + + if (*Ah != (((int64_t)(BufferSizeInDwords) * (BufferSizeInDwords - 1)) + / 2)) { + CHECK_RET_VAL(hipFree(dB)); + testResult = 0; + break; + + } else { + CHECK_RET_VAL(hipFree(dB)); + } + } + CHECK_RET_VAL(hipStreamDestroy(stream)); + CHECK_RET_VAL(hipFree(dC)); + CHECK_RET_VAL(hipFree(dA)); + delete [] init; + free(Ah); + return testResult; +} + +extern "C" int lazyLoad() { + return vectorAddKernelTest() & cooperativeKernelTest(); +}