/* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ /** * @addtogroup hipGetFuncBySymbol hipGetFuncBySymbol * @{ * @ingroup KernelTest * `hipError_t hipGetFuncBySymbol (hipFunction_t* functionPtr, * const void* symbolPtr * )` - * Gets pointer to device entry function that matches entry function symbolPtr. */ #include #include #include #include #define LEN 64 #define SIZE LEN * sizeof(float) #define ARR_SIZE (32 * 32) #define SIZE_BYTES (ARR_SIZE * sizeof(int)) extern "C" __global__ void bit_extract_kernel(uint32_t* C_d, const uint32_t* A_d, size_t N) { size_t offset = (blockIdx.x * blockDim.x + threadIdx.x); size_t stride = blockDim.x * gridDim.x; for (size_t i = offset; i < N; i += stride) { #if HT_AMD C_d[i] = __bitextract_u32(A_d[i], 8, 4); #else /* defined __HIP_PLATFORM_NVIDIA__ or other path */ C_d[i] = ((A_d[i] & 0xf00) >> 8); #endif } } /** * Host Function to check for negative case. */ __host__ void hostFunction() { printf("hostFunction\n"); } /** * Sample Kernel to be used for functional test cases */ __global__ void hipKernel(int* a) { int offset = blockDim.x * blockIdx.x + threadIdx.x; int stride = blockDim.x * gridDim.x; for (int i = offset; i < ARR_SIZE; i += stride) { a[i] += a[i]; } } /** * Local Function to validate the result */ bool verifyResult(int* a, int* output_ref, int arrSize) { for (int i = 0; i < arrSize; i++) { if (a[i] != output_ref[i]) { return false; } } return true; } /** * Test Description * ------------------------ * - Test is to get function ptr (hipFunction_t) using hipGetFuncBySymbol and launch * bit_extract kernel. Verify the output. * Test source * ------------------------ * - catch/unit/module/hipGetFuncBySymbol.cc * Test requirements * ------------------------ * - HIP_VERSION >= 6.2 */ TEST_CASE("Unit_hipGetFuncBySymbol_PositiveTest") { uint32_t *A_d, *C_d; uint32_t *A_h, *C_h; size_t N = 1000000; size_t Nbytes = N * sizeof(uint32_t); hipDevice_t device; HIPCHECK(hipGetDevice(&device)); hipDeviceProp_t props; HIPCHECK(hipGetDeviceProperties(&props, device)); A_h = reinterpret_cast(malloc(Nbytes)); REQUIRE(A_h != NULL); C_h = reinterpret_cast(malloc(Nbytes)); REQUIRE(C_h != NULL); for (size_t i = 0; i < N; i++) { A_h[i] = i; } HIPCHECK(hipMalloc(reinterpret_cast(&A_d), Nbytes)); HIPCHECK(hipMalloc(reinterpret_cast(&C_d), Nbytes)); HIPCHECK(hipMemcpyHtoD((hipDeviceptr_t)(A_d), A_h, Nbytes)); struct { void* _Cd; void* _Ad; size_t _N; } args; args._Cd = reinterpret_cast(C_d); args._Ad = reinterpret_cast(A_d); args._N = static_cast(N); size_t size = sizeof(args); void* config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, &args, HIP_LAUNCH_PARAM_BUFFER_SIZE, &size, HIP_LAUNCH_PARAM_END}; hipFunction_t Function; HIPCHECK(hipGetFuncBySymbol(&Function, reinterpret_cast(bit_extract_kernel))); HIPCHECK(hipModuleLaunchKernel(Function, 1, 1, 1, LEN, 1, 1, 0, 0, NULL, reinterpret_cast(&config))); HIPCHECK(hipMemcpyDtoH(C_h, (hipDeviceptr_t)(C_d), Nbytes)); for (size_t i = 0; i < N; i++) { unsigned Agold = ((A_h[i] & 0xf00) >> 8); REQUIRE(C_h[i] == Agold); } HIPCHECK(hipFree(A_d)); HIPCHECK(hipFree(C_d)); free(A_h); free(C_h); } /** * Test Description * ------------------------ * - Pass the NULL as a symbolPtr it should return * hipErrorInvalidDeviceFunction * - Pass a host function as a symbolPtr it should return * hipErrorInvalidDeviceFunction * Test source * ------------------------ * - catch/unit/module/hipGetFuncBySymbol.cc * Test requirements * ------------------------ * - HIP_VERSION >= 6.2 */ TEST_CASE("Unit_hipGetFuncBySymbol_NegativeTests") { hipFunction_t funcPointer; // Passing NULL as second parameter REQUIRE(hipGetFuncBySymbol(&funcPointer, NULL) != hipSuccess); // Passing hostFunction as second parameter REQUIRE(hipGetFuncBySymbol(&funcPointer, reinterpret_cast(hostFunction))); } /** * Test Description * ------------------------ * - Create a child process and pass the __global__ function as a symbolPtr * it should return hipSuccess, and kernel launch and execution with * functionPtr should success. * Test source * ------------------------ * - catch/unit/module/hipGetFuncBySymbol.cc * Test requirements * ------------------------ * - HIP_VERSION >= 6.2 */ TEST_CASE("Unit_hipGetFuncBySymbol_InChildProcess") { hip::SpawnProc proc("hipGetFuncBySymbol_exe", true); REQUIRE(proc.run() == 0); } /** * Test Description * ------------------------ * - For all the GPU devices in the system and pass the __global__ function * as a symbolPtr it should return hipSuccess, and kernel launch and * execution with functionPtr should success. * Test source * ------------------------ * - catch/unit/module/hipGetFuncBySymbol.cc * Test requirements * ------------------------ * - HIP_VERSION >= 6.2 */ TEST_CASE("Unit_hipGetFuncBySymbol_MultiDev", "[multigpu]") { int deviceCount = 0; HIP_CHECK(hipGetDeviceCount(&deviceCount)); if (deviceCount < 2) { HipTest::HIP_SKIP_TEST("Skipping because this machine has total GPUs < 2"); return; } hipFunction_t funcPointer; for (int deviceId = 0; deviceId < deviceCount; deviceId++) { HIP_CHECK(hipSetDevice(deviceId)); REQUIRE(hipGetFuncBySymbol(&funcPointer, reinterpret_cast(hipKernel)) == hipSuccess); int* h_a = reinterpret_cast(malloc(SIZE_BYTES)); REQUIRE(h_a != nullptr); int* output_ref = reinterpret_cast(malloc(SIZE_BYTES)); REQUIRE(output_ref != nullptr); for (int i = 0; i < ARR_SIZE; i++) { h_a[i] = 2; output_ref[i] = 4; } int* d_a = nullptr; HIP_CHECK(hipMalloc(&d_a, SIZE_BYTES)); REQUIRE(d_a != nullptr); HIP_CHECK(hipMemcpy(d_a, h_a, SIZE_BYTES, hipMemcpyHostToDevice)); dim3 blocksPerGrid(1, 1, 1); dim3 threadsPerBlock(1, 1, 64); void* kernelParam[] = {d_a}; auto size = sizeof(kernelParam); void* kernel_parameter[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, &kernelParam, HIP_LAUNCH_PARAM_BUFFER_SIZE, &size, HIP_LAUNCH_PARAM_END}; REQUIRE(hipModuleLaunchKernel(funcPointer, blocksPerGrid.x, blocksPerGrid.y, blocksPerGrid.z, threadsPerBlock.x, threadsPerBlock.y, threadsPerBlock.z, 0, 0, nullptr, kernel_parameter) == hipSuccess); HIP_CHECK(hipMemcpy(h_a, d_a, SIZE_BYTES, hipMemcpyDeviceToHost)); REQUIRE(verifyResult(h_a, output_ref, ARR_SIZE) == true); free(h_a); free(output_ref); HIP_CHECK(hipFree(d_a)); } } /** * Local function useful to create stream and memory copy and launch kernel */ void MultiThreadMultiDevFunc(int DevId) { HIP_CHECK(hipSetDevice(DevId)); int* h_a = reinterpret_cast(malloc(SIZE_BYTES)); REQUIRE(h_a != nullptr); int* output_ref = reinterpret_cast(malloc(SIZE_BYTES)); REQUIRE(output_ref != nullptr); for (int i = 0; i < ARR_SIZE; i++) { h_a[i] = 2; output_ref[i] = 4; } hipStream_t stream; HIP_CHECK(hipSetDevice(DevId)); HIP_CHECK(hipStreamCreate(&stream)); int* d_a = nullptr; HIP_CHECK(hipMalloc(&d_a, SIZE_BYTES)); REQUIRE(d_a != nullptr); HIP_CHECK(hipMemcpyAsync(d_a, h_a, SIZE_BYTES, hipMemcpyHostToDevice, stream)); dim3 blocksPerGrid(1, 1, 1); dim3 threadsPerBlock(1, 1, 64); hipFunction_t funcPointer; REQUIRE(hipGetFuncBySymbol(&funcPointer, reinterpret_cast(hipKernel)) == hipSuccess); void* kernelParam[] = {d_a}; auto size = sizeof(kernelParam); void* kernel_parameter[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, &kernelParam, HIP_LAUNCH_PARAM_BUFFER_SIZE, &size, HIP_LAUNCH_PARAM_END}; REQUIRE(hipModuleLaunchKernel(funcPointer, blocksPerGrid.x, blocksPerGrid.y, blocksPerGrid.z, threadsPerBlock.x, threadsPerBlock.y, threadsPerBlock.z, 0, stream, nullptr, kernel_parameter) == hipSuccess); HIP_CHECK(hipMemcpyAsync(h_a, d_a, SIZE_BYTES, hipMemcpyDeviceToHost, stream)); REQUIRE(verifyResult(h_a, output_ref, ARR_SIZE) == true); free(h_a); free(output_ref); HIP_CHECK(hipStreamDestroy(stream)); HIP_CHECK(hipFree(d_a)); } /** * Test Description * ------------------------ * - Create number of threads equals to number of devices and in each devices * pass the __global__ function as a symbolPtr it should return hipSuccess, * and kernel launch and execution with functionPtr should success. * Test source * ------------------------ * - catch/unit/module/hipGetFuncBySymbol.cc * Test requirements * ------------------------ * - HIP_VERSION >= 6.2 */ TEST_CASE("Unit_hipGetFuncBySymbol_MultiDevMultiThread", "[multigpu]") { int deviceCount = 0; HIP_CHECK(hipGetDeviceCount(&deviceCount)); if (deviceCount < 2) { HipTest::HIP_SKIP_TEST("Skipping because this machine has total GPUs < 2"); return; } ::std::vector< ::std::thread> threads; for (int dev = 0; dev < deviceCount; dev++) { threads.push_back(::std::thread(MultiThreadMultiDevFunc, dev)); } for (int dev = 0; (dev < deviceCount) && (dev < threads.size()); dev++) { threads[dev].join(); } }