From e112d1e13f0680d56dc1ac3f7c22d74b8190ad1f Mon Sep 17 00:00:00 2001 From: Sudheer Kumar Date: Thu, 27 Aug 2020 01:29:32 +0530 Subject: [PATCH] [dtest] Tests for loading hip apis dynamically SWDEV-238517 for enhancing hip unit tests Change-Id: I9b6ef41db2f02ee3679360d0fae709a404a0ea4f --- .../src/dynamicLoading/bit_extract_kernel.cpp | 34 ++ .../complex_loading_behavior.cpp | 411 ++++++++++++------ .../src/dynamicLoading/hipApiDynamicLoad.cpp | 163 +++++++ 3 files changed, 481 insertions(+), 127 deletions(-) create mode 100644 tests/src/dynamicLoading/bit_extract_kernel.cpp create mode 100644 tests/src/dynamicLoading/hipApiDynamicLoad.cpp diff --git a/tests/src/dynamicLoading/bit_extract_kernel.cpp b/tests/src/dynamicLoading/bit_extract_kernel.cpp new file mode 100644 index 0000000000..54b33bdcf1 --- /dev/null +++ b/tests/src/dynamicLoading/bit_extract_kernel.cpp @@ -0,0 +1,34 @@ +/* +Copyright (c) 2020-present Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include "hip/hip_runtime.h" + +extern "C" __global__ void bit_extract_kernel(uint32_t* C_d, const uint32_t* + A_d, size_t N) { + size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); + size_t stride = hipBlockDim_x * hipGridDim_x; + + for (size_t i = offset; i < N; i += stride) { +#ifdef __HIP_PLATFORM_HCC__ + C_d[i] = __bitextract_u32(A_d[i], 8, 4); +#else /* defined __HIP_PLATFORM_NVCC__ or other path */ + C_d[i] = ((A_d[i] & 0xf00) >> 8); +#endif + } +} diff --git a/tests/src/dynamicLoading/complex_loading_behavior.cpp b/tests/src/dynamicLoading/complex_loading_behavior.cpp index 8288e15723..6b7a09dbc1 100644 --- a/tests/src/dynamicLoading/complex_loading_behavior.cpp +++ b/tests/src/dynamicLoading/complex_loading_behavior.cpp @@ -1,6 +1,5 @@ /* -Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. - +Copyright (c) 2020-present Advanced Micro Devices, Inc. All rights reserved. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights @@ -20,10 +19,13 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +/* Test for loading device kernels from a library created with extern "C" function + */ + /* HIT_START - * BUILD_CMD: libfoo_amd %hc %S/%s -o libfoo.so -Xcompiler -fPIC -lpthread -shared -DTEST_SHARED_LIBRARY EXCLUDE_HIP_PLATFORM nvcc - * BUILD_CMD: libfoo_nvidia %hc %S/%s -o libfoo.so -Xcompiler -fPIC -lpthread -shared -DTEST_SHARED_LIBRARY EXCLUDE_HIP_PLATFORM hcc rocclr - * BUILD_CMD: %t %hc %S/%s -o %T/%t -ldl + * BUILD_CMD: libLazyLoad_amd %hc %S/%s -o liblazyLoad.so -I%S/.. -fPIC -lpthread -shared -DTEST_SHARED_LIBRARY EXCLUDE_HIP_PLATFORM nvcc + * BUILD_CMD: libLazyLoad_nvidia %hc %S/%s --std=c++11 -o liblazyLoad.so -I%S/.. -Xcompiler -fPIC -lpthread -shared -DTEST_SHARED_LIBRARY EXCLUDE_HIP_PLATFORM hcc rocclr + * BUILD_CMD: %t %hc %S/%s --std=c++11 -o %T/%t -I%S/.. -ldl * TEST: %t * HIT_END */ @@ -32,17 +34,7 @@ THE SOFTWARE. #include #include -#include - -#define CHECK(cmd) \ - { \ - hipError_t error = cmd; \ - if (error != hipSuccess) { \ - fprintf(stderr, "error: '%s'(%d) at %s:%d\n", hipGetErrorString(error), error, \ - __FILE__, __LINE__); \ - return (EXIT_FAILURE); \ - } \ - } +#include "test_common.h" __global__ void vector_add(float* C, float* A, float* B, size_t N) { size_t offset = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; @@ -52,160 +44,325 @@ __global__ void vector_add(float* C, float* A, float* B, size_t N) { } } -int launch_local_kernel() { - float *A_d, *B_d, *C_d; - float *A_h, *B_h, *C_h; - size_t N = 1000000; - size_t Nbytes = N * sizeof(float); - static int device = 0; - CHECK(hipSetDevice(device)); - hipDeviceProp_t props; - CHECK(hipGetDeviceProperties(&props, device /*deviceID*/)); - A_h = (float*)malloc(Nbytes); - CHECK(A_h == 0 ? hipErrorOutOfMemory : hipSuccess); - B_h = (float*)malloc(Nbytes); - CHECK(B_h == 0 ? hipErrorOutOfMemory : hipSuccess); - C_h = (float*)malloc(Nbytes); - CHECK(C_h == 0 ? hipErrorOutOfMemory : hipSuccess); - // Fill with Phi + i - for (size_t i = 0; i < N; i++) { - A_h[i] = 1.618f + i; - B_h[i] = 1.618f + i; +bool launch_local_kernel() { + bool testResult = true; + float *A_d, *B_d, *C_d; + float *A_h, *B_h, *C_h; + size_t N = 1000000; + size_t Nbytes = N * sizeof(float); + static int device = 0; + + HIPCHECK(hipSetDevice(device)); + hipDeviceProp_t props; + HIPCHECK(hipGetDeviceProperties(&props, device /*deviceID*/)); + + A_h = reinterpret_cast(malloc(Nbytes)); + HIPCHECK(A_h == nullptr ? hipErrorOutOfMemory : hipSuccess); + B_h = reinterpret_cast(malloc(Nbytes)); + HIPCHECK(B_h == nullptr ? hipErrorOutOfMemory : hipSuccess); + C_h = reinterpret_cast(malloc(Nbytes)); + HIPCHECK(C_h == nullptr ? hipErrorOutOfMemory : hipSuccess); + + // Fill with Phi + i + for (size_t i = 0; i < N; i++) { + A_h[i] = 1.618f + i; + B_h[i] = 1.618f + i; + } + + HIPCHECK(hipMalloc(&A_d, Nbytes)); + HIPCHECK(hipMalloc(&B_d, Nbytes)); + HIPCHECK(hipMalloc(&C_d, Nbytes)); + HIPCHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice)); + HIPCHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice)); + + const unsigned blocks = 512; + const unsigned threadsPerBlock = 256; + hipLaunchKernelGGL(vector_add, dim3(blocks), dim3(threadsPerBlock), + 0, 0, C_d, A_d, B_d, N); + HIPCHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost)); + + for (size_t i=0; i < N ; i++) { + if (C_h[i] != (A_h[i] + B_h[i])) { + printf("data mismatch. Local kernel failed"); + testResult = false; + break; } + } - CHECK(hipMalloc(&A_d, Nbytes)); - CHECK(hipMalloc(&B_d, Nbytes)); - CHECK(hipMalloc(&C_d, Nbytes)); - CHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice)); - CHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice)); + HIPCHECK(hipFree(A_d)); + HIPCHECK(hipFree(B_d)); + HIPCHECK(hipFree(C_d)); - const unsigned blocks = 512; - const unsigned threadsPerBlock = 256; - hipLaunchKernelGGL(vector_add, dim3(blocks), dim3(threadsPerBlock), 0, 0, C_d, A_d, B_d, N); - CHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost)); + free(A_h); + free(B_h); + free(C_h); - CHECK(hipFree(A_d)); - CHECK(hipFree(B_d)); - CHECK(hipFree(C_d)); - - free(A_h); - free(B_h); - free(C_h); - - std::cout << "PASSED!\n"; - return 0; + std::cout << "Local kernel executed successfully\n"; + return testResult; } -int launch_dynamically_loaded_kernel() { - void* handle = dlopen("./libfoo.so", RTLD_LAZY); +bool launch_dynamically_loaded_kernel() { + bool testResult = true; + int ret = 1; + + void* handle = dlopen("./liblazyLoad.so", RTLD_LAZY); + if (!handle) { std::cout << dlerror() << "\n"; - return -1; + testResult = false; + return testResult; } - std::cout << "loaded libfoo.so\n"; - void* sym = dlsym(handle, "foo"); + std::cout << "loaded liblazyLoad.so\n"; + + void* sym = dlsym(handle, "lazyLoad"); if (!sym) { - std::cout << "unable to locate foo within libfoo.so\n"; + std::cout << "unable to locate lazyLoad within lazyLoad.so\n"; std::cout << dlerror() << "\n"; dlclose(handle); - return -1; + testResult = false; + return testResult; } int(*fp)() = reinterpret_cast(sym); - int ret = fp(); - if (ret) { + ret = fp(); + + if (ret == 0) { std::cout << "dynamic launch failed\n"; + testResult = false; } else { std::cout << "dynamic launch succeeded\n"; } dlclose(handle); - return ret; + return testResult; } int main() { - int ret = 0; - ret = launch_local_kernel(); - if (ret) { - return ret; - } + bool testResult = true; - ret = launch_dynamically_loaded_kernel(); - if (ret) { - return ret; - } + testResult &= launch_local_kernel(); + testResult &= launch_dynamically_loaded_kernel(); - return 0; + if (testResult == true) { + passed(); + } else { + failed("One or more tests failed"); + } } -#else // !defined(TEST_SHARED_LIBRARY) +#else // !defined(TEST_SHARED_LIBRARY) -#include #include -#include +#include "test_common.h" -#define CHECK(cmd) \ - { \ - hipError_t error = cmd; \ - if (error != hipSuccess) { \ - fprintf(stderr, "error: '%s'(%d) at %s:%d\n", hipGetErrorString(error), error, \ - __FILE__, __LINE__); \ - return (EXIT_FAILURE); \ - } \ - } - -__global__ void vadd(float* C, float* A, float* B, size_t N) { +__global__ void vAdd(float* C, float* A, float* B, size_t N) { size_t offset = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; size_t stride = hipBlockDim_x * hipGridDim_x; + for (size_t i = offset; i < N; i += stride) { C[i] = A[i] + B[i]; } } -extern "C" int foo() { - float *A_d, *B_d, *C_d; - float *A_h, *B_h, *C_h; - size_t N = 1000000; - size_t Nbytes = N * sizeof(float); - static int device = 0; - CHECK(hipSetDevice(device)); - hipDeviceProp_t props; - CHECK(hipGetDeviceProperties(&props, device /*deviceID*/)); - A_h = (float*)malloc(Nbytes); - CHECK(A_h == 0 ? hipErrorOutOfMemory : hipSuccess); - B_h = (float*)malloc(Nbytes); - CHECK(B_h == 0 ? hipErrorOutOfMemory : hipSuccess); - C_h = (float*)malloc(Nbytes); - CHECK(C_h == 0 ? hipErrorOutOfMemory : hipSuccess); - // Fill with Phi + i - for (size_t i = 0; i < N; i++) { - A_h[i] = 1.618f + i; - B_h[i] = 1.618f + i; +int vectorAddKernelTest() { + int testResult = 1; + float *A_d, *B_d, *C_d; + float *A_h, *B_h, *C_h; + size_t N = 1000000; + size_t Nbytes = N * sizeof(float); + static int device = 0; + + HIPCHECK(hipSetDevice(device)); + hipDeviceProp_t props; + HIPCHECK(hipGetDeviceProperties(&props, device /*deviceID*/)); + A_h = reinterpret_cast(malloc(Nbytes)); + HIPCHECK(A_h == nullptr ? hipErrorOutOfMemory : hipSuccess); + B_h = reinterpret_cast(malloc(Nbytes)); + HIPCHECK(B_h == nullptr ? hipErrorOutOfMemory : hipSuccess); + C_h = reinterpret_cast(malloc(Nbytes)); + HIPCHECK(C_h == nullptr ? hipErrorOutOfMemory : hipSuccess); + + // Fill with Phi + i + for (size_t i = 0; i < N; i++) { + A_h[i] = 1.618f + i; + B_h[i] = 1.618f + i; + } + + HIPCHECK(hipMalloc(&A_d, Nbytes)); + HIPCHECK(hipMalloc(&B_d, Nbytes)); + HIPCHECK(hipMalloc(&C_d, Nbytes)); + HIPCHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice)); + HIPCHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice)); + + const unsigned blocks = 512; + const unsigned threadsPerBlock = 256; + + std::cout << "info: Launching vAdd kernel\n"; + hipLaunchKernelGGL(vAdd, dim3(blocks), dim3(threadsPerBlock), + 0, 0, C_d, A_d, B_d, N); + HIPCHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost)); + + for (size_t i=0; i < N ; i++) { + if (C_h[i] != (A_h[i] + B_h[i])) { + printf("info: data mismatch. vAdd kernel failed"); + testResult = 0; + break; } + } - CHECK(hipMalloc(&A_d, Nbytes)); - CHECK(hipMalloc(&B_d, Nbytes)); - CHECK(hipMalloc(&C_d, Nbytes)); - CHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice)); - CHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice)); + if (testResult) { + std::cout << "info: vAdd kernel executed fine\n"; + } - const unsigned blocks = 512; - const unsigned threadsPerBlock = 256; - std::cout << "Launch vadd\n"; - hipLaunchKernelGGL(vadd, dim3(blocks), dim3(threadsPerBlock), 0, 0, C_d, A_d, B_d, N); - CHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost)); + HIPCHECK(hipFree(A_d)); + HIPCHECK(hipFree(B_d)); + HIPCHECK(hipFree(C_d)); - CHECK(hipFree(A_d)); - CHECK(hipFree(B_d)); - CHECK(hipFree(C_d)); - - free(A_h); - free(B_h); - free(C_h); - - return 0; + free(A_h); + free(B_h); + free(C_h); + return testResult; } -#endif // !defined(TEST_SHARED_LIBRARY) +#include "hip/hip_runtime.h" +#include "hip/hip_runtime_api.h" +#include "hip/hip_cooperative_groups.h" + +namespace cg = cooperative_groups; + +static const uint BufferSizeInDwords = 448 * 1024 * 1024; + +__global__ void test_gws(uint* buf, uint bufSize, + long* tmpBuf, long* result) { + extern __shared__ long tmp[]; + uint offset = blockIdx.x * blockDim.x + threadIdx.x; + uint stride = gridDim.x * blockDim.x; + cg::grid_group gg = cg::this_grid(); + + long sum = 0; + + for (uint i = offset; i < bufSize; i += stride) { + sum += buf[i]; + } + + tmp[threadIdx.x] = sum; + __syncthreads(); + + if (threadIdx.x == 0) { + sum = 0; + for (uint i = 0; i < blockDim.x; i++) { + sum += tmp[i]; + } + tmpBuf[blockIdx.x] = sum; + } + + gg.sync(); + + if (offset == 0) { + for (uint i = 1; i < gridDim.x; ++i) { + sum += tmpBuf[i]; + } + *result = sum; + } +} + +int cooperativeKernelTest() { + int testResult = 1; + uint* dA; + long* dB; + long* dC; + long* Ah; + + hipDeviceProp_t deviceProp; + hipGetDeviceProperties(&deviceProp, 0); + + if (!deviceProp.cooperativeLaunch) { + std::cout << "info: Device doesn't support cooperative launch!" + "skipping the test!\n"; + return testResult; + } + + uint32_t* init = new uint32_t[BufferSizeInDwords]; + + for (uint32_t i = 0; i < BufferSizeInDwords; ++i) { + init[i] = i; + } + + std::cout << "info: Launch kernel to test hipLaunchCooperativeKernel api\n"; + std::cout << "info: running on bus 0x" << deviceProp.pciBusID << " " << + deviceProp.name << "\n"; + + size_t SIZE = BufferSizeInDwords * sizeof(uint); + + HIPCHECK(hipMalloc(reinterpret_cast(&dA), SIZE)); + HIPCHECK(hipMalloc(reinterpret_cast(&dC), sizeof(long))); + HIPCHECK(hipMemcpy(dA, init, SIZE, hipMemcpyHostToDevice)); + Ah = reinterpret_cast(malloc(sizeof(long))); + + hipStream_t stream; + HIPCHECK(hipStreamCreate(&stream)); + + dim3 dimBlock = dim3(1); + dim3 dimGrid = dim3(1); + + int numBlocks = 0; + uint workgroups[4] = {32, 64, 128, 256}; + + for (uint i = 0; i < 4; ++i) { + dimBlock.x = workgroups[i]; + /* Calculate the device occupancy to know how many blocks can be + run concurrently */ + hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks, + test_gws, dimBlock.x * dimBlock.y * dimBlock.z, dimBlock.x * sizeof(long)); + dimGrid.x = deviceProp.multiProcessorCount * std::min(numBlocks, 32); + HIPCHECK(hipMalloc(reinterpret_cast(&dB), + dimGrid.x * sizeof(long))); + + void *params[4]; + params[0] = reinterpret_cast(&dA); + params[1] = (void*)&BufferSizeInDwords; + params[2] = reinterpret_cast(&dB); + params[3] = reinterpret_cast(&dC); + + std::cout << "Testing with grid size = " << dimGrid.x << + " and block size = " << dimBlock.x << "\n"; + + HIPCHECK(hipLaunchCooperativeKernel(reinterpret_cast(test_gws), + dimGrid, dimBlock, params, + dimBlock.x * sizeof(long), stream)); + + HIPCHECK(hipMemcpy(Ah, dC, sizeof(long), hipMemcpyDeviceToHost)); + + if (*Ah != (((long)(BufferSizeInDwords) * (BufferSizeInDwords - 1)) / 2)) { + std::cout << "Data validation failed for grid size = " << dimGrid.x << + " and block size = " << dimBlock.x << "\n"; + HIPCHECK(hipFree(dB)); + std::cout << "Test failed! \n"; + testResult = 0; + break; + + } else { + std::cout << "info: data validated!\n"; + HIPCHECK(hipFree(dB)); + } + } + + if (testResult) { + std::cout <<"info: hipLaunchCooperativeKernel api executed fine\n"; + } + + HIPCHECK(hipStreamDestroy(stream)); + HIPCHECK(hipFree(dC)); + HIPCHECK(hipFree(dA)); + delete [] init; + free(Ah); + return testResult; +} + +extern "C" int lazyLoad() { + return vectorAddKernelTest() & cooperativeKernelTest(); +} + +#endif // !defined(TEST_SHARED_LIBRARY) diff --git a/tests/src/dynamicLoading/hipApiDynamicLoad.cpp b/tests/src/dynamicLoading/hipApiDynamicLoad.cpp new file mode 100644 index 0000000000..f46e94f7f9 --- /dev/null +++ b/tests/src/dynamicLoading/hipApiDynamicLoad.cpp @@ -0,0 +1,163 @@ +/* +Copyright (c) 2020-present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +/* Test is to load hip runtime using dlopen and get function pointer + * using dlsym for hip apis using dlsym() + * */ + +/* HIT_START + * BUILD_CMD: bit_extract_kernel.code %hc --genco %S/bit_extract_kernel.cpp -o bit_extract_kernel.code EXCLUDE_HIP_PLATFORM nvcc + * BUILD_CMD: %t %hc %S/%s -I%S/.. -o %T/%t -ldl EXCLUDE_HIP_PLATFORM nvcc + * TEST: %t + * HIT_END + */ + +#include "test_common.h" +#include +#include +#include +#include +#include + +#define fileName "bit_extract_kernel.code" + +#define LEN 64 +#define SIZE LEN * sizeof(float) + +int main(int argc, char* argv[]) { + uint32_t *A_d, *C_d; + uint32_t *A_h, *C_h; + size_t N = 1000000; + size_t Nbytes = N * sizeof(uint32_t); + + void* handle = dlopen("libamdhip64.so", RTLD_LAZY); + if (!handle) { + std::cout << dlerror() << "\n"; + failed("hip runtime failed to load from dlopen\n"); + } + + std::cout << "hip runtime loaded using dlopen\n"; + + void* sym_hipGetDevice = dlsym(handle, "hipGetDevice"); + void* sym_hipMalloc = dlsym(handle, "hipMalloc"); + void* sym_hipMemcpyHtoD = dlsym(handle, "hipMemcpyHtoD"); + void* sym_hipMemcpyDtoH = dlsym(handle, "hipMemcpyDtoH"); + void* sym_hipModuleLoad = dlsym(handle, "hipModuleLoad"); + void* sym_hipGetDeviceProperties = dlsym(handle, "hipGetDeviceProperties"); + void* sym_hipModuleGetFunction = dlsym(handle, "hipModuleGetFunction"); + void* sym_hipModuleLaunchKernel = dlsym(handle, "hipModuleLaunchKernel"); + + dlclose(handle); + hipError_t (*dyn_hipGetDevice)(hipDevice_t*, int) = reinterpret_cast + (sym_hipGetDevice); + + hipError_t (*dyn_hipMalloc)(void**, uint32_t) = reinterpret_cast + (sym_hipMalloc); + + hipError_t (*dyn_hipMemcpyHtoD)(hipDeviceptr_t, void*, size_t) = reinterpret_cast + (sym_hipMemcpyHtoD); + + hipError_t (*dyn_hipMemcpyDtoH)(void*, hipDeviceptr_t, size_t) = reinterpret_cast + (sym_hipMemcpyDtoH); + + hipError_t (*dyn_hipModuleLoad)(hipModule_t*, const char*) = reinterpret_cast + (sym_hipModuleLoad); + + hipError_t (*dyn_hipGetDeviceProperties)(hipDeviceProp_t*, int) = reinterpret_cast + (sym_hipGetDeviceProperties); + + hipError_t (*dyn_hipModuleGetFunction)(hipFunction_t*, hipModule_t, const char*) = + reinterpret_cast + (sym_hipModuleGetFunction); + + hipError_t (*dyn_hipModuleLaunchKernel)(hipFunction_t, unsigned int, unsigned int, + unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, + hipStream_t, void**, void**) = reinterpret_cast + (sym_hipModuleLaunchKernel); + + hipDevice_t device; + HIPCHECK(dyn_hipGetDevice(&device, 0)); + + hipDeviceProp_t props; + HIPCHECK(dyn_hipGetDeviceProperties(&props, device)); + printf("info: running on device #%d %s\n", device, props.name); + printf("info: allocate host mem (%6.2f MB)\n", 2 * Nbytes / 1024.0 / 1024.0); + A_h = reinterpret_cast(malloc(Nbytes)); + HIPASSERT(A_h != NULL); + C_h = reinterpret_cast(malloc(Nbytes)); + HIPASSERT(C_h != NULL); + + for (size_t i = 0; i < N; i++) { + A_h[i] = i; + } + + printf("info: allocate device mem (%6.2f MB)\n", 2 * Nbytes / 1024.0 / 1024.0); + HIPCHECK(dyn_hipMalloc(reinterpret_cast(&A_d), Nbytes)); + HIPCHECK(dyn_hipMalloc(reinterpret_cast(&C_d), Nbytes)); + + printf("info: copy Host2Device\n"); + HIPCHECK(dyn_hipMemcpyHtoD((hipDeviceptr_t)(A_d), A_h, Nbytes)); + + printf("info: launch 'bit_extract_kernel' \n"); + + struct { + void* _Cd; + void* _Ad; + size_t _N; + } args; + args._Cd = reinterpret_cast (C_d); + args._Ad = reinterpret_cast (A_d); + args._N = (size_t) N; + size_t size = sizeof(args); + + void* config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, &args, + HIP_LAUNCH_PARAM_BUFFER_SIZE, &size, HIP_LAUNCH_PARAM_END}; + + hipModule_t Module; + HIPCHECK(dyn_hipModuleLoad(&Module, fileName)); + + hipFunction_t Function; + HIPCHECK(dyn_hipModuleGetFunction(&Function, Module, "bit_extract_kernel")); + + HIPCHECK(dyn_hipModuleLaunchKernel(Function, 1, 1, 1, LEN, 1, 1, 0, 0, NULL, + reinterpret_cast(&config))); + + printf("info: copy Device2Host\n"); + HIPCHECK(dyn_hipMemcpyDtoH(C_h, (hipDeviceptr_t)(C_d), Nbytes)); + + printf("info: check result\n"); + for (size_t i = 0; i < N; i++) { + unsigned Agold = ((A_h[i] & 0xf00) >> 8); + if (C_h[i] != Agold) { + fprintf(stderr, "mismatch detected.\n"); + printf("%zu: %08x =? %08x (Ain=%08x)\n", i, C_h[i], Agold, A_h[i]); + failed("Test failed\n"); + } + } + HIPCHECK(hipFree(A_d)); + HIPCHECK(hipFree(C_d)); + free(A_h); + free(C_h); + passed(); +}