diff --git a/projects/hip-tests/catch/unit/deviceLib/DynamicAllocas.cc b/projects/hip-tests/catch/unit/deviceLib/DynamicAllocas.cc new file mode 100644 index 0000000000..af0eef9500 --- /dev/null +++ b/projects/hip-tests/catch/unit/deviceLib/DynamicAllocas.cc @@ -0,0 +1,753 @@ +/* +Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ +// Testcase Description: +// Verifies working of following Dyanmic Allocations: +// 1. Dynamic Allocas in kernels, +// 2. Dynamic Allocas in device functions, +// 3. Dynamic Allocas with over-alignment, +// 4. Multiple Dynamic Allocas in a function, +// 5. Dynamic Allocas in functions with parameter passing + +#include + +//////////////////////////////////////////////////////////////////////////////// +// Auto-Verification Code +//////////////////////////////////////////////////////////////////////////////// + +// Auto-verification API +template bool verifyResult(T* output, T* expected, int N) { + for (int i = 0; i < N; ++i) + if (expected[i] != output[i]) return false; + return true; +} + +// Host device memory allocator +enum BufferIndex { align = 0, ret_1, ret_2, ret_3, ret_align, h0, h1, h2, h3, h4, h5, h6, h7 }; + +static std::vector> allocate_host_memory(int num_arrays, int num_elements) { + std::vector> buffers(num_arrays); + + for (auto& buf : buffers) buf = std::make_unique(num_elements); + + for (size_t i = 0; i < num_elements; ++i) { + buffers[h0][i] = i; + buffers[h1][i] = num_elements - 1 - i; + buffers[h2][i] = 5; + buffers[align][i] = 0; + } + + for (size_t i = 0; i < num_elements; ++i) { + buffers[ret_1][i] = buffers[h0][buffers[h1][i]]; + buffers[ret_2][i] = buffers[h1][buffers[h2][i]]; + buffers[ret_3][i] = buffers[h2][buffers[h0][i]]; + buffers[ret_align][i] = 0; + } + return buffers; +} + +// Kernels exercising dynamic allocations +extern "C" __global__ void test_kernel_uniform_dynamic_alloca(int* a, int* b, int N) { + volatile int* tmp = (int*)__builtin_alloca(N * 4); + for (int i = 0; i < N; i++) { + tmp[i] = a[b[i]]; + } + for (int i = 0; i < N; i++) { + a[i] = tmp[i]; + } +} + +extern "C" __global__ void test_kernel_uniform_dynamic_alloca_over_aligned(int* a, int* b, int N) { + alignas(1024) volatile int tmp[N]; + volatile int* tmp2 = (int*)__builtin_alloca(20 * 4); + uintptr_t addr = reinterpret_cast(tmp); + for (int i = 0; i < N; i++) { + tmp[i] = a[b[i]]; + tmp2[i] = addr % 1024; + } + for (int i = 0; i < N; i++) { + a[i] = tmp[i]; + b[i] = tmp2[i]; + } +} + +static void runKernelUniformDynamicAllocasTest() { + size_t size = 20 * sizeof(int); + std::unique_ptr num_elements = std::make_unique(20); + + // allocate mem for the result on host side + std::vector> host_memory_arrays = allocate_host_memory(10, 20); + + // allocate device memory for result + int* d1 = nullptr; + int* d2 = nullptr; + int* size_kernel; + HIP_CHECK(hipMalloc(&d1, size)); + HIP_CHECK(hipMalloc(&d2, size)); + HIP_CHECK(hipMalloc(&size_kernel, sizeof(int))); + size_t stackSize = 100; + HIP_CHECK(hipDeviceSetLimit(hipLimitStackSize, stackSize)); + + // kernel uniform dynamic alloca + HIP_CHECK(hipMemcpy(d1, host_memory_arrays[h0].get(), size, hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d2, host_memory_arrays[h1].get(), size, hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(size_kernel, num_elements.get(), sizeof(int), hipMemcpyHostToDevice)); + + hipLaunchKernelGGL(test_kernel_uniform_dynamic_alloca, dim3(1), dim3(1), 0, 0, d1, d2, + *size_kernel); + + HIP_CHECK(hipMemcpy(host_memory_arrays[h2].get(), d1, size, hipMemcpyDeviceToHost)); + + REQUIRE(verifyResult(host_memory_arrays[h2].get(), host_memory_arrays[ret_1].get(), 20)); + + // kernel uniform dynamic alloca over-aligned + HIP_CHECK(hipMemcpy(d1, host_memory_arrays[h0].get(), size, hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d2, host_memory_arrays[h1].get(), size, hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(size_kernel, num_elements.get(), sizeof(int), hipMemcpyHostToDevice)); + + hipLaunchKernelGGL(test_kernel_uniform_dynamic_alloca_over_aligned, dim3(1), dim3(1), 0, 0, d1, + d2, *size_kernel); + + HIP_CHECK(hipMemcpy(host_memory_arrays[h2].get(), d1, size, hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(host_memory_arrays[h3].get(), d2, size, hipMemcpyDeviceToHost)); + + REQUIRE(verifyResult(host_memory_arrays[h2].get(), host_memory_arrays[ret_1].get(), 20)); + REQUIRE(verifyResult(host_memory_arrays[h3].get(), host_memory_arrays[ret_align].get(), 20)); + + HIP_CHECK(hipFree(d1)); + HIP_CHECK(hipFree(d2)); + HIP_CHECK(hipFree(size_kernel)); +} + +extern "C" __global__ void test_kernel_divergent_dynamic_alloca(int* a, int* b) { + int N = threadIdx.x + 20; + volatile int* tmp = (int*)__builtin_alloca(N * 4); + for (int i = 0; i < N; i++) { + tmp[i] = a[b[i]]; + } + for (int i = 0; i < N; i++) { + a[i] = tmp[i]; + } +} + +extern "C" __global__ void test_kernel_divergent_dynamic_alloca_over_aligned(int* a, int* b) { + int N = threadIdx.x + 20; + alignas(512) volatile int tmp[N]; + uintptr_t addr = reinterpret_cast(tmp); + for (int i = 0; i < N; i++) { + tmp[i] = a[b[i]]; + b[i] = addr % 512; + } + for (int i = 0; i < N; i++) { + a[i] = tmp[i]; + } +} + +static void runKernelDivergentDynamicAllocasTest() { + size_t size = 20 * sizeof(int); + std::unique_ptr num_elements = std::make_unique(20); + // allocate mem for the result on host side + std::vector> host_memory_arrays = allocate_host_memory(10, 20); + + // allocate device memory for result + int* d1 = nullptr; + int* d2 = nullptr; + int* size_kernel; + HIP_CHECK(hipMalloc(&d1, size)); + HIP_CHECK(hipMalloc(&d2, size)); + HIP_CHECK(hipMalloc(&size_kernel, sizeof(int))); + size_t stackSize = 10; + HIP_CHECK(hipDeviceSetLimit(hipLimitStackSize, stackSize)); + + // kernel divergent dynamic alloca + HIP_CHECK(hipMemcpy(d1, host_memory_arrays[h0].get(), size, hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d2, host_memory_arrays[h1].get(), size, hipMemcpyHostToDevice)); + + hipLaunchKernelGGL(test_kernel_divergent_dynamic_alloca, dim3(1), dim3(1), 0, 0, d1, d2); + + HIP_CHECK(hipMemcpy(host_memory_arrays[h2].get(), d1, size, hipMemcpyDeviceToHost)); + + REQUIRE(verifyResult(host_memory_arrays[h2].get(), host_memory_arrays[ret_1].get(), 20)); + + // kernel divergent dynamic alloca over aligned + HIP_CHECK(hipMemcpy(d1, host_memory_arrays[h0].get(), size, hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d2, host_memory_arrays[h1].get(), size, hipMemcpyHostToDevice)); + + hipLaunchKernelGGL(test_kernel_divergent_dynamic_alloca_over_aligned, dim3(1), dim3(1), 0, 0, d1, + d2); + + HIP_CHECK(hipMemcpy(host_memory_arrays[h2].get(), d1, size, hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(host_memory_arrays[h3].get(), d2, size, hipMemcpyDeviceToHost)); + + REQUIRE(verifyResult(host_memory_arrays[h2].get(), host_memory_arrays[ret_1].get(), 20)); + REQUIRE(verifyResult(host_memory_arrays[h3].get(), host_memory_arrays[ret_align].get(), 20)); + + HIP_CHECK(hipFree(d1)); + HIP_CHECK(hipFree(d2)); + HIP_CHECK(hipFree(size_kernel)); +} + +extern "C" __global__ void test_kernel_multiple_dynamic_alloca(int* a, int* b, int* c, int N) { + int X = threadIdx.x + 20; + int Y = threadIdx.x + N; + volatile int* tmp = (int*)__builtin_alloca(N * 4); + volatile int* tmp2 = (int*)__builtin_alloca(X * 4); + volatile int* tmp3 = (int*)__builtin_alloca(Y * 4); + for (int i = 0; i < N; i++) { + tmp[i] = a[b[i]]; + } + for (int i = 0; i < N; i++) { + a[i] = tmp[i]; + } + for (int i = 0; i < N; i++) { + tmp2[i] = b[c[i]]; + tmp3[i] = c[a[i]]; + } + for (int i = 0; i < N; i++) { + b[i] = tmp2[i]; + c[i] = tmp3[i]; + } +} + +extern "C" __global__ void test_kernel_multiple_dynamic_alloca_over_aligned(int* a, int* b, int* c, + int* d, int N) { + int X = threadIdx.x + 20; + int Y = threadIdx.x + N; + alignas(32) volatile int tmp[N]; + alignas(128) volatile int tmp2[X]; + alignas(512) volatile int tmp3[Y]; + uintptr_t addr = reinterpret_cast(tmp); + uintptr_t addr2 = reinterpret_cast(tmp2); + uintptr_t addr3 = reinterpret_cast(tmp3); + for (int i = 0; i < N; i++) { + tmp[i] = a[b[i]]; + } + for (int i = 0; i < N; i++) { + a[i] = tmp[i]; + } + for (int i = 0; i < N; i++) { + tmp2[i] = b[c[i]]; + tmp3[i] = c[a[i]]; + } + for (int i = 0; i < N; i++) { + b[i] = tmp2[i]; + c[i] = tmp3[i]; + } + for (int i = 0; i < N; i++) { + d[i] = (addr % 32) + (addr2 % 128) + (addr3 % 512); + } +} + +static void runKernelMultipleDynamicAllocasTest() { + size_t size = 20 * sizeof(int); + std::unique_ptr num_elements = std::make_unique(20); + + // allocate mem for the result on host side + std::vector> host_memory_arrays = allocate_host_memory(15, 20); + + // allocate device memory for result + int* d1 = nullptr; + int* d2 = nullptr; + int* d3 = nullptr; + int* d4 = nullptr; + int* size_kernel; + HIP_CHECK(hipMalloc(&d1, size)); + HIP_CHECK(hipMalloc(&d2, size)); + HIP_CHECK(hipMalloc(&d3, size)); + HIP_CHECK(hipMalloc(&d4, size)); + HIP_CHECK(hipMalloc(&size_kernel, sizeof(int))); + size_t stackSize = 10; + HIP_CHECK(hipDeviceSetLimit(hipLimitStackSize, stackSize)); + + // kernel multiple dynamic alloca + HIP_CHECK(hipMemcpy(d1, host_memory_arrays[h0].get(), size, hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d2, host_memory_arrays[h1].get(), size, hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d3, host_memory_arrays[h2].get(), size, hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(size_kernel, num_elements.get(), sizeof(int), hipMemcpyHostToDevice)); + + hipLaunchKernelGGL(test_kernel_multiple_dynamic_alloca, dim3(1), dim3(1), 0, 0, d1, d2, d3, + *size_kernel); + + HIP_CHECK(hipMemcpy(host_memory_arrays[h3].get(), d1, size, hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(host_memory_arrays[h4].get(), d2, size, hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(host_memory_arrays[h5].get(), d3, size, hipMemcpyDeviceToHost)); + + REQUIRE(verifyResult(host_memory_arrays[h3].get(), host_memory_arrays[ret_1].get(), 20)); + REQUIRE(verifyResult(host_memory_arrays[h4].get(), host_memory_arrays[ret_2].get(), 20)); + REQUIRE(verifyResult(host_memory_arrays[h5].get(), host_memory_arrays[ret_3].get(), 20)); + + // kernel multiple dynamic alloca over aligned + HIP_CHECK(hipMemcpy(d1, host_memory_arrays[h0].get(), size, hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d2, host_memory_arrays[h1].get(), size, hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d3, host_memory_arrays[h2].get(), size, hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(size_kernel, num_elements.get(), sizeof(int), hipMemcpyHostToDevice)); + + hipLaunchKernelGGL(test_kernel_multiple_dynamic_alloca_over_aligned, dim3(1), dim3(1), 0, 0, d1, + d2, d3, d4, *size_kernel); + + HIP_CHECK(hipMemcpy(host_memory_arrays[h3].get(), d1, size, hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(host_memory_arrays[h4].get(), d2, size, hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(host_memory_arrays[h5].get(), d3, size, hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(host_memory_arrays[h6].get(), d4, size, hipMemcpyDeviceToHost)); + + REQUIRE(verifyResult(host_memory_arrays[h3].get(), host_memory_arrays[ret_1].get(), 20)); + REQUIRE(verifyResult(host_memory_arrays[h4].get(), host_memory_arrays[ret_2].get(), 20)); + REQUIRE(verifyResult(host_memory_arrays[h5].get(), host_memory_arrays[ret_3].get(), 20)); + REQUIRE(verifyResult(host_memory_arrays[h6].get(), host_memory_arrays[ret_align].get(), 20)); + + HIP_CHECK(hipFree(d1)); + HIP_CHECK(hipFree(d2)); + HIP_CHECK(hipFree(d3)); + HIP_CHECK(hipFree(size_kernel)); +} + +// kernels and device functions exercizing dynamic allocations +__noinline__ extern "C" __device__ void process_array_with_aligned_32_byte_buffer(int* a, int* b, + int* c, int* d, + int X) { + alignas(32) volatile int tmp[X]; + uintptr_t addr = reinterpret_cast(tmp); + for (int i = 0; i < 50; i++) { + tmp[i] = c[a[i]]; + } + for (int i = 0; i < 50; i++) { + c[i] = tmp[i]; + d[i] += addr % 32; + } +} + +__noinline__ extern "C" __device__ void process_array_with_aligned_1024_byte_buffer(int* a, int* b, + int* c, int* d, + int X, int N) { + alignas(1024) volatile int tmp[N]; + uintptr_t addr = reinterpret_cast(tmp); + for (int i = 0; i < 50; i++) { + tmp[i] = a[b[i]]; + } + for (int i = 0; i < 50; i++) { + b[i] = tmp[i]; + d[i] += addr % 1024; + } +} + +extern "C" __global__ void test_multiple_device_functions_nested(int* a, int* b, int* c, int* d, + int N) { + int X = threadIdx.x + 50; + alignas(1024) volatile int tmp[X]; + uintptr_t addr = reinterpret_cast(tmp); + process_array_with_aligned_1024_byte_buffer(a, b, c, d, X, N); + for (int i = 0; i < X; i++) { + tmp[i] = a[b[i]]; + } + for (int i = 0; i < X; i++) { + a[i] = tmp[i]; + d[i] += addr % 1024; + } + process_array_with_aligned_32_byte_buffer(a, b, c, d, X); +} + +static void runDeviceMultipleDynamicAllocasNestedTest() { + size_t size = 50 * sizeof(int); + std::unique_ptr num_elements = std::make_unique(50); + + // allocate mem for the result on host side + std::vector> host_memory_arrays = allocate_host_memory(15, 50); + + // allocate device memory for result + int* d1 = nullptr; + int* d2 = nullptr; + int* d3 = nullptr; + int* d4 = nullptr; + int* size_kernel; + HIP_CHECK(hipMalloc(&d1, size)); + HIP_CHECK(hipMalloc(&d2, size)); + HIP_CHECK(hipMalloc(&d3, size)); + HIP_CHECK(hipMalloc(&d4, size)); + HIP_CHECK(hipMalloc(&size_kernel, sizeof(int))); + size_t stackSize = 100; + HIP_CHECK(hipDeviceSetLimit(hipLimitStackSize, stackSize)); + + // multiple device functions nested + HIP_CHECK(hipMemcpy(d1, host_memory_arrays[h0].get(), size, hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d2, host_memory_arrays[h1].get(), size, hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d3, host_memory_arrays[h2].get(), size, hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d4, host_memory_arrays[align].get(), size, hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(size_kernel, num_elements.get(), sizeof(int), hipMemcpyHostToDevice)); + hipLaunchKernelGGL(test_multiple_device_functions_nested, dim3(1), dim3(1), 0, 0, d1, d2, d3, d4, + *size_kernel); + HIP_CHECK(hipMemcpy(host_memory_arrays[h3].get(), d1, size, hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(host_memory_arrays[h4].get(), d2, size, hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(host_memory_arrays[h5].get(), d3, size, hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(host_memory_arrays[h6].get(), d4, size, hipMemcpyDeviceToHost)); + + REQUIRE(verifyResult(host_memory_arrays[h3].get(), host_memory_arrays[ret_1].get(), 50)); + REQUIRE(verifyResult(host_memory_arrays[h4].get(), host_memory_arrays[ret_1].get(), 50)); + REQUIRE(verifyResult(host_memory_arrays[h5].get(), host_memory_arrays[ret_3].get(), 50)); + REQUIRE(verifyResult(host_memory_arrays[h6].get(), host_memory_arrays[ret_align].get(), 50)); + + HIP_CHECK(hipFree(d1)); + HIP_CHECK(hipFree(d2)); + HIP_CHECK(hipFree(d3)); + HIP_CHECK(hipFree(size_kernel)); +} + +__noinline__ extern "C" __device__ void process_array_with_nested_1024_byte_alignment( + int* a, int* b, int* c, int* d, int X, int N) { + alignas(1024) volatile int tmp[X]; + uintptr_t addr = reinterpret_cast(tmp); + process_array_with_aligned_32_byte_buffer(a, b, c, d, X); + for (int i = 0; i < X; i++) { + tmp[i] = a[b[i]]; + } + for (int i = 0; i < X; i++) { + b[i] = tmp[i]; + d[i] += addr % 1024; + } +} + +extern "C" __global__ void test_multiple_device_functions(int* a, int* b, int* c, int* d, int N) { + int X = threadIdx.x + 50; + alignas(1024) volatile int tmp[X]; + uintptr_t addr = reinterpret_cast(tmp); + process_array_with_nested_1024_byte_alignment(a, b, c, d, X, N); + for (int i = 0; i < X; i++) { + tmp[i] = a[b[i]]; + } + for (int i = 0; i < X; i++) { + a[i] = tmp[i]; + d[i] += addr % 1024; + } +} + +static void runDeviceMultipleDynamicAllocasTest() { + size_t size = 50 * sizeof(int); + std::unique_ptr num_elements = std::make_unique(50); + + // allocate mem for the result on host side + std::vector> host_memory_arrays = allocate_host_memory(15, 50); + + // allocate device memory for result + int* d1 = nullptr; + int* d2 = nullptr; + int* d3 = nullptr; + int* d4 = nullptr; + int* size_kernel; + HIP_CHECK(hipMalloc(&d1, size)); + HIP_CHECK(hipMalloc(&d2, size)); + HIP_CHECK(hipMalloc(&d3, size)); + HIP_CHECK(hipMalloc(&d4, size)); + HIP_CHECK(hipMalloc(&size_kernel, sizeof(int))); + size_t stackSize = 100; + HIP_CHECK(hipDeviceSetLimit(hipLimitStackSize, stackSize)); + + // multiple device functions + HIP_CHECK(hipMemcpy(d1, host_memory_arrays[h0].get(), size, hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d2, host_memory_arrays[h1].get(), size, hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d3, host_memory_arrays[h2].get(), size, hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d4, host_memory_arrays[align].get(), size, hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(size_kernel, num_elements.get(), sizeof(int), hipMemcpyHostToDevice)); + hipLaunchKernelGGL(test_multiple_device_functions, dim3(1), dim3(1), 0, 0, d1, d2, d3, d4, + *size_kernel); + HIP_CHECK(hipMemcpy(host_memory_arrays[h3].get(), d1, size, hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(host_memory_arrays[h4].get(), d2, size, hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(host_memory_arrays[h5].get(), d3, size, hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(host_memory_arrays[h6].get(), d4, size, hipMemcpyDeviceToHost)); + + REQUIRE(verifyResult(host_memory_arrays[h3].get(), host_memory_arrays[ret_1].get(), 50)); + REQUIRE(verifyResult(host_memory_arrays[h4].get(), host_memory_arrays[ret_1].get(), 50)); + REQUIRE(verifyResult(host_memory_arrays[h5].get(), host_memory_arrays[ret_3].get(), 50)); + REQUIRE(verifyResult(host_memory_arrays[h6].get(), host_memory_arrays[ret_align].get(), 50)); + + HIP_CHECK(hipFree(d1)); + HIP_CHECK(hipFree(d2)); + HIP_CHECK(hipFree(d3)); + HIP_CHECK(hipFree(size_kernel)); +} + +__noinline__ extern "C" __device__ void process_array_with_unaligned_buffer( + int* a, int* b, int* c, int X, int a0, int a1, int a2, int a3, int a4, int a5, int a6, int a7, + int a8, int a9, int a10, int a11, int a12, int a13, int a14, int a15, int a16, int a17, int a18, + int a19, int a20, int a21, int a22, int a23, int a24, int a25, int a26, int a27, int a28, + int a29, int a30, int a31, int a32, int a33, int a34, int a35, int a36, int a37, int a38, + int a39, int a40) { + int* tmp = (int*)__builtin_alloca(X * 4); + int sum = 0; + sum += a0 + a1 + a2 + a3 + a4 + a5 + a6 + a7 + a8 + a9; + sum += a10 + a11 + a12 + a13 + a14 + a15 + a16 + a17 + a18 + a19; + sum += a20 + a21 + a22 + a23 + a24 + a25 + a26 + a27 + a28 + a29; + sum += a30 + a31 + a32 + a33 + a34 + a35 + a36 + a37 + a38 + a39 + a40; + for (int i = 0; i < X; i++) { + tmp[i] = c[a[i]] + sum; + } + for (int i = 0; i < X; i++) { + c[i] = tmp[i]; + } +} + +__noinline__ extern "C" __device__ void process_array_with_aligned_64_byte_buffer(int* a, int* b, + int* c, int* d, + int X, int N) { + alignas(64) volatile int tmp[X]; + uintptr_t addr = reinterpret_cast(tmp); + process_array_with_unaligned_buffer(a, b, c, X, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], + a[8], a[9], a[10], a[11], a[12], a[13], a[14], a[15], a[16], + a[17], a[18], a[19], a[20], a[21], a[22], a[23], a[24], a[25], + a[26], a[27], a[28], a[29], a[30], a[31], a[32], a[33], a[34], + a[35], a[36], a[37], a[38], a[39], a[40]); + for (int i = 0; i < X; i++) { + tmp[i] = a[b[i]]; + } + for (int i = 0; i < X; i++) { + b[i] = tmp[i]; + d[i] += addr % 64; + } +} + +extern "C" __global__ void test_function_calls_parameter_passing(int* a, int* b, int* c, int* d, + int N) { + int X = threadIdx.x + 50; + alignas(32) volatile int tmp[X]; + uintptr_t addr = reinterpret_cast(tmp); + for (int i = 0; i < X; i++) { + tmp[i] = a[b[i]]; + } + process_array_with_aligned_64_byte_buffer(a, b, c, d, X, N); + for (int i = 0; i < X; i++) { + a[i] = tmp[i]; + d[i] += addr % 32; + } +} + +__noinline__ extern "C" __device__ void process_array_with_aligned_128_byte_buffer( + int* a, int* b, int* c, int* d, int X, int a0, int a1, int a2, int a3, int a4, int a5, int a6, + int a7, int a8, int a9, int a10, int a11, int a12, int a13, int a14, int a15, int a16, int a17, + int a18, int a19, int a20, int a21, int a22, int a23, int a24, int a25, int a26, int a27, + int a28, int a29, int a30, int a31, int a32, int a33, int a34, int a35, int a36, int a37, + int a38, int a39, int a40) { + alignas(128) volatile int tmp[X]; + uintptr_t addr = reinterpret_cast(tmp); + int sum = 0; + sum += a0 + a1 + a2 + a3 + a4 + a5 + a6 + a7 + a8 + a9; + sum += a10 + a11 + a12 + a13 + a14 + a15 + a16 + a17 + a18 + a19; + sum += a20 + a21 + a22 + a23 + a24 + a25 + a26 + a27 + a28 + a29; + sum += a30 + a31 + a32 + a33 + a34 + a35 + a36 + a37 + a38 + a39 + a40; + for (int i = 0; i < X; i++) { + tmp[i] = c[a[i]] + sum; + } + for (int i = 0; i < X; i++) { + c[i] = tmp[i]; + d[i] += addr % 128; + } +} + +__noinline__ extern "C" __device__ void process_nested_dynamic_allocas(int* a, int* b, int* c, + int* d, int X, int N) { + alignas(64) volatile int tmp[X]; + uintptr_t addr = reinterpret_cast(tmp); + process_array_with_aligned_128_byte_buffer( + a, b, c, d, X, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8], a[9], a[10], a[11], + a[12], a[13], a[14], a[15], a[16], a[17], a[18], a[19], a[20], a[21], a[22], a[23], a[24], + a[25], a[26], a[27], a[28], a[29], a[30], a[31], a[32], a[33], a[34], a[35], a[36], a[37], + a[38], a[39], a[40]); + for (int i = 0; i < X; i++) { + tmp[i] = a[b[i]]; + } + for (int i = 0; i < X; i++) { + b[i] = tmp[i]; + d[i] += addr % 32; + } +} + +extern "C" __global__ void test_function_calls_parameter_passing_over_aligned(int* a, int* b, + int* c, int* d, + int N) { + int X = threadIdx.x + 50; + alignas(32) volatile int tmp[X]; + uintptr_t addr = reinterpret_cast(tmp); + for (int i = 0; i < X; i++) { + tmp[i] = a[b[i]]; + } + process_nested_dynamic_allocas(a, b, c, d, X, N); + for (int i = 0; i < X; i++) { + a[i] = tmp[i]; + d[i] += addr % 32; + } +} + +static void runDeviceMultipleDynamicAllocasParameterPassingTest() { + size_t size = 50 * sizeof(int); + std::unique_ptr num_elements = std::make_unique(50); + + // allocate mem for the result on host side + std::vector> host_memory_arrays = allocate_host_memory(15, 50); + + // allocate device memory for result + int* d1 = nullptr; + int* d2 = nullptr; + int* d3 = nullptr; + int* d4 = nullptr; + int* size_kernel; + HIP_CHECK(hipMalloc(&d1, size)); + HIP_CHECK(hipMalloc(&d2, size)); + HIP_CHECK(hipMalloc(&d3, size)); + HIP_CHECK(hipMalloc(&d4, size)); + HIP_CHECK(hipMalloc(&size_kernel, sizeof(int))); + size_t stackSize = 100; + HIP_CHECK(hipDeviceSetLimit(hipLimitStackSize, stackSize)); + + // function calls with parameter passing + HIP_CHECK(hipMemcpy(d1, host_memory_arrays[h0].get(), size, hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d2, host_memory_arrays[h1].get(), size, hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d3, host_memory_arrays[h2].get(), size, hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d4, host_memory_arrays[align].get(), size, hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(size_kernel, num_elements.get(), sizeof(int), hipMemcpyHostToDevice)); + hipLaunchKernelGGL(test_function_calls_parameter_passing, dim3(1), dim3(1), 0, 0, d1, d2, d3, d4, + *size_kernel); + HIP_CHECK(hipMemcpy(host_memory_arrays[h3].get(), d1, size, hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(host_memory_arrays[h4].get(), d2, size, hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(host_memory_arrays[h5].get(), d3, size, hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(host_memory_arrays[h6].get(), d4, size, hipMemcpyDeviceToHost)); + + int sum = 0; + for (int i = 0; i <= 40; i++) sum += host_memory_arrays[h0][i]; + for (int i = 0; i < 50; i++) host_memory_arrays[h7][i] = host_memory_arrays[ret_3][i] + sum; + + REQUIRE(verifyResult(host_memory_arrays[h3].get(), host_memory_arrays[ret_1].get(), 50)); + REQUIRE(verifyResult(host_memory_arrays[h4].get(), host_memory_arrays[ret_1].get(), 50)); + REQUIRE(verifyResult(host_memory_arrays[h5].get(), host_memory_arrays[h7].get(), 50)); + REQUIRE(verifyResult(host_memory_arrays[h6].get(), host_memory_arrays[ret_align].get(), 50)); + + // function calls with parameter passing over aligned + HIP_CHECK(hipMemcpy(d1, host_memory_arrays[h0].get(), size, hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d2, host_memory_arrays[h1].get(), size, hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d3, host_memory_arrays[h2].get(), size, hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(size_kernel, num_elements.get(), sizeof(int), hipMemcpyHostToDevice)); + hipLaunchKernelGGL(test_function_calls_parameter_passing_over_aligned, dim3(1), dim3(1), 0, 0, d1, + d2, d3, d4, *size_kernel); + HIP_CHECK(hipMemcpy(host_memory_arrays[h3].get(), d1, size, hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(host_memory_arrays[h4].get(), d2, size, hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(host_memory_arrays[h5].get(), d3, size, hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(host_memory_arrays[h6].get(), d4, size, hipMemcpyDeviceToHost)); + + REQUIRE(verifyResult(host_memory_arrays[h3].get(), host_memory_arrays[ret_1].get(), 50)); + REQUIRE(verifyResult(host_memory_arrays[h4].get(), host_memory_arrays[ret_1].get(), 50)); + REQUIRE(verifyResult(host_memory_arrays[h5].get(), host_memory_arrays[h7].get(), 50)); + REQUIRE(verifyResult(host_memory_arrays[h6].get(), host_memory_arrays[ret_align].get(), 50)); + + HIP_CHECK(hipFree(d1)); + HIP_CHECK(hipFree(d2)); + HIP_CHECK(hipFree(d3)); + HIP_CHECK(hipFree(size_kernel)); +} + +extern "C" __global__ void test_function_calls_parameter_passing_sandwiched(int* a, int* b, int* c, + int* d, int N) { + int X = threadIdx.x + 50; + process_array_with_aligned_128_byte_buffer( + a, b, c, d, X, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8], a[9], a[10], a[11], + a[12], a[13], a[14], a[15], a[16], a[17], a[18], a[19], a[20], a[21], a[22], a[23], a[24], + a[25], a[26], a[27], a[28], a[29], a[30], a[31], a[32], a[33], a[34], a[35], a[36], a[37], + a[38], a[39], a[40]); + alignas(32) volatile int tmp[X]; + uintptr_t addr = reinterpret_cast(tmp); + for (int i = 0; i < X; i++) { + tmp[i] = a[b[i]]; + } + process_array_with_aligned_32_byte_buffer(a, b, c, d, X); + for (int i = 0; i < X; i++) { + a[i] = tmp[i]; + d[i] += addr % 32; + } +} + +static void runDeviceMultipleDynamicAllocasParameterPassingSandwichedTest() { + size_t size = 50 * sizeof(int); + std::unique_ptr num_elements = std::make_unique(50); + + // allocate mem for the result on host side + std::vector> host_memory_arrays = allocate_host_memory(15, 50); + + // allocate device memory for result + int* d1 = nullptr; + int* d2 = nullptr; + int* d3 = nullptr; + int* d4 = nullptr; + int* size_kernel; + HIP_CHECK(hipMalloc(&d1, size)); + HIP_CHECK(hipMalloc(&d2, size)); + HIP_CHECK(hipMalloc(&d3, size)); + HIP_CHECK(hipMalloc(&d4, size)); + HIP_CHECK(hipMalloc(&size_kernel, sizeof(int))); + size_t stackSize = 10; + HIP_CHECK(hipDeviceSetLimit(hipLimitStackSize, stackSize)); + + // function calls with parameter passing sandwiched + HIP_CHECK(hipMemcpy(d1, host_memory_arrays[h0].get(), size, hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d2, host_memory_arrays[h1].get(), size, hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d3, host_memory_arrays[h2].get(), size, hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d4, host_memory_arrays[align].get(), size, hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(size_kernel, num_elements.get(), sizeof(int), hipMemcpyHostToDevice)); + hipLaunchKernelGGL(test_function_calls_parameter_passing_sandwiched, dim3(1), dim3(1), 0, 0, d1, + d2, d3, d4, *size_kernel); + HIP_CHECK(hipMemcpy(host_memory_arrays[h3].get(), d1, size, hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(host_memory_arrays[h4].get(), d2, size, hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(host_memory_arrays[h5].get(), d3, size, hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(host_memory_arrays[h6].get(), d4, size, hipMemcpyDeviceToHost)); + + int sum = 0; + for (int i = 0; i <= 40; i++) sum += host_memory_arrays[h0][i]; + for (int i = 0; i < 50; i++) host_memory_arrays[h7][i] = host_memory_arrays[ret_3][i] + sum; + + REQUIRE(verifyResult(host_memory_arrays[h3].get(), host_memory_arrays[ret_1].get(), 50)); + REQUIRE(verifyResult(host_memory_arrays[h4].get(), host_memory_arrays[ret_1].get(), 50)); + REQUIRE(verifyResult(host_memory_arrays[h5].get(), host_memory_arrays[h7].get(), 50)); + REQUIRE(verifyResult(host_memory_arrays[h6].get(), host_memory_arrays[ret_align].get(), 50)); + + HIP_CHECK(hipFree(d1)); + HIP_CHECK(hipFree(d2)); + HIP_CHECK(hipFree(d3)); + HIP_CHECK(hipFree(size_kernel)); +} + +/* +These testcases perform the following scenarios of dynamic allocas + // 1. dyn alloca in kernel - uniform value + // 2. dyn alloca in kernel - uniform value - over aligned + // 3. dyn alloca in kernel - divergent value + // 4. dyn alloca in kernel - divergent value - over aligned + // 5. multiple dyn alloca in kernel + // 6. multiple dyn alloca in kernel - over aligned + // 7. multiple device function calls + // 8. multiple device function calls nested + // 9. function calls with parameter passing + // 10. function calls with parameter passing - over aligned + // 11. function calls with parameter passing - sandwiched +*/ +TEST_CASE("Dynamic_Alloca") { + SECTION("Kernel Uniform Dynamic Allocas") { runKernelUniformDynamicAllocasTest(); } + SECTION("Kernel Divergent Dynamic Allocas") { runKernelDivergentDynamicAllocasTest(); } + SECTION("Kernel Multiple Dynamic Allocas") { runKernelMultipleDynamicAllocasTest(); } + SECTION("Device Multiple Dynamic Allocas") { runDeviceMultipleDynamicAllocasTest(); } + SECTION("Device Multiple Dynamic Allocas Nested") { runDeviceMultipleDynamicAllocasNestedTest(); } + SECTION("Device Multiple Dynamic Allocas Parameter Passing") { + runDeviceMultipleDynamicAllocasParameterPassingTest(); + } + SECTION("Device Multiple Dynamic Allocas Parameter Passing Sandwiched") { + runDeviceMultipleDynamicAllocasParameterPassingSandwichedTest(); + } +}