From 5eade5cfa104e0bc112bbd854aa96bbafe220753 Mon Sep 17 00:00:00 2001 From: Sourabh U Betigeri Date: Fri, 25 Feb 2022 02:56:20 -0800 Subject: [PATCH] SWDEV-294375 - Adding coalesced_groups HIP tests (#2489) Change-Id: I000a402e32d6057bab11829af2b78931fcd67fb5 --- .../coalesced_groups_shfl_down.cpp | 276 +++++++++ .../coalesced_groups_shfl_up.cpp | 260 ++++++++ .../simple_coalesced_groups.cpp | 583 ++++++++++++++++++ 3 files changed, 1119 insertions(+) create mode 100644 tests/src/runtimeApi/cooperativeGrps/coalesced_groups_shfl_down.cpp create mode 100644 tests/src/runtimeApi/cooperativeGrps/coalesced_groups_shfl_up.cpp create mode 100644 tests/src/runtimeApi/cooperativeGrps/simple_coalesced_groups.cpp diff --git a/tests/src/runtimeApi/cooperativeGrps/coalesced_groups_shfl_down.cpp b/tests/src/runtimeApi/cooperativeGrps/coalesced_groups_shfl_down.cpp new file mode 100644 index 0000000000..aa7b00e290 --- /dev/null +++ b/tests/src/runtimeApi/cooperativeGrps/coalesced_groups_shfl_down.cpp @@ -0,0 +1,276 @@ +/* +Copyright (c) 2020 - 2021 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ +// Test Description: +/* This test implements sum reduction kernel, first with each threads own rank + as input and comparing the sum with expected sum output derieved from n(n-1)/2 + formula. + This sample tests functionality of intrinsics provided by thread_block_tile type, + shfl_down and shfl_xor. +*/ + +#include "test_common.h" +#include +#include +#include + +using namespace cooperative_groups; + +#define ASSERT_EQUAL(lhs, rhs) assert(lhs == rhs) +#define WAVE_SIZE 32 + +__device__ int reduction_kernel_shfl_down(coalesced_group const& g, int val) { + int sz = g.size(); + + for (int i = sz / 2; i > 0; i >>= 1) { + val += g.shfl_down(val, i); + } + + // Choose the 0'th indexed thread that holds the reduction value to return + if (g.thread_rank() == 0) { + return val; + } + // Rest of the threads return no useful values + else { + return -1; + } +} + +__global__ void kernel_shfl_down (int * dPtr, int *dResults, int lane_delta, int cg_sizes) { + int id = threadIdx.x + blockIdx.x * blockDim.x; + + if (id % cg_sizes == 0) { + coalesced_group const& g = coalesced_threads(); + int rank = g.thread_rank(); + int val = dPtr[rank]; + dResults[rank] = g.shfl_down(val, lane_delta); + return; + } +} + +__global__ void kernel_cg_group_partition(int* result, unsigned int tileSz, int cg_sizes) { + + int id = threadIdx.x + blockIdx.x * blockDim.x; + if (id % cg_sizes == 0) { + coalesced_group threadBlockCGTy = coalesced_threads(); + int input, outputSum, expectedSum; + + // Choose a leader thread to print the results + if (threadBlockCGTy.thread_rank() == 0) { + printf(" Creating %d groups, of tile size %d threads:\n\n", + (int)threadBlockCGTy.size() / tileSz, tileSz); + } + + threadBlockCGTy.sync(); + + coalesced_group tiledPartition = tiled_partition(threadBlockCGTy, tileSz); + int threadRank = tiledPartition.thread_rank(); + + input = tiledPartition.thread_rank(); + + // (n-1)(n)/2 + expectedSum = ((tileSz - 1) * tileSz / 2); + + outputSum = reduction_kernel_shfl_down(tiledPartition, input); + + if (tiledPartition.thread_rank() == 0) { + printf( + " Sum of all ranks 0..%d in this tiledPartition group using shfl_down is %d (expected " + "%d)\n", + tiledPartition.size() - 1, outputSum, expectedSum); + result[threadBlockCGTy.thread_rank() / (tileSz)] = outputSum; + } + return; + } +} + +void verifyResults(int* ptr, int expectedResult, int numTiles) { + for (int i = 0; i < numTiles; i++) { + if (ptr[i] != expectedResult) { + printf(" Results do not match! "); + } + } +} + +void compareResults(int* cpu, int* gpu, int size) { + for (unsigned int i = 0; i < size / sizeof(int); i++) { + if (cpu[i] != gpu[i]) { + printf(" results do not match."); + } + } +} + +void printResults(int* ptr, int size) { + for (int i = 0; i < size; i++) { + std::cout << ptr[i] << " "; + } + std::cout << '\n'; +} + +static void test_group_partition(unsigned int tileSz) { + hipError_t err; + int blockSize = 1; + int threadsPerBlock = 32; + + std::vector cg_sizes = {1, 2, 3}; + for (auto i : cg_sizes) { + + int numTiles = ((blockSize * threadsPerBlock) / i) / tileSz; + int expectedSum = ((tileSz - 1) * tileSz / 2); + int* expectedResult = new int[numTiles]; + + // numTiles = 0 when partitioning is possible. The below statement is to avoid + // out-of-bounds error and still evaluate failure case. + numTiles = (numTiles == 0) ? 1 : numTiles; + + for (int i = 0; i < numTiles; i++) { + expectedResult[i] = expectedSum; + } + + int* dResult = NULL; + int* hResult = NULL; + + hipHostMalloc(&hResult, numTiles * sizeof(int), hipHostMallocDefault); + memset(hResult, 0, numTiles * sizeof(int)); + + hipMalloc(&dResult, numTiles * sizeof(int)); + + + // Launch Kernel + hipLaunchKernelGGL(kernel_cg_group_partition, blockSize, threadsPerBlock, + threadsPerBlock * sizeof(int), 0, dResult, tileSz, i); + err = hipDeviceSynchronize(); + if (err != hipSuccess) { + fprintf(stderr, "Failed to launch kernel (error code %s)!\n", hipGetErrorString(err)); + } + + + hipMemcpy(hResult, dResult, sizeof(int) * numTiles, hipMemcpyDeviceToHost); + + verifyResults(hResult, expectedSum, numTiles); + + // Free all allocated memory on host and device + hipFree(dResult); + hipFree(hResult); + delete[] expectedResult; + + printf("\n...PASSED.\n\n"); + } +} + +static void test_shfl_down() { + + std::vector cg_sizes = {1, 2, 3}; + for (auto i : cg_sizes) { + + hipError_t err; + int blockSize = 1; + int threadsPerBlock = WAVE_SIZE; + + int totalThreads = blockSize * threadsPerBlock; + int group_size = totalThreads / i; + int group_size_in_bytes = group_size * sizeof(int); + + int* hPtr = NULL; + int* dPtr = NULL; + int* dResults = NULL; + int lane_delta = rand() % group_size; + std::cout << "Testing coalesced_groups shfl_down with lane_delta " << lane_delta << "and group size " + << WAVE_SIZE << '\n' << std::endl; + + int arrSize = blockSize * threadsPerBlock * sizeof(int); + + hipHostMalloc(&hPtr, arrSize); + // Fill up the array + for (int i = 0; i < WAVE_SIZE; i++) { + hPtr[i] = rand() % 1000; + } + + int* cpuResultsArr = (int*)malloc(group_size_in_bytes); + for (int i = 0; i < group_size; i++) { + cpuResultsArr[i] = (i + lane_delta >= group_size) ? hPtr[i] : hPtr[i + lane_delta]; + } + //printf("Array passed to GPU for computation\n"); + //printResults(hPtr, WAVE_SIZE); + hipMalloc(&dPtr, group_size_in_bytes); + hipMalloc(&dResults, group_size_in_bytes); + + hipMemcpy(dPtr, hPtr, group_size_in_bytes, hipMemcpyHostToDevice); + // Launch Kernel + hipLaunchKernelGGL(kernel_shfl_down, blockSize, threadsPerBlock, + threadsPerBlock * sizeof(int), 0, dPtr, dResults, lane_delta, i); + hipMemcpy(hPtr, dResults, group_size_in_bytes, hipMemcpyDeviceToHost); + err = hipDeviceSynchronize(); + if (err != hipSuccess) { + fprintf(stderr, "Failed to launch kernel (error code %s)!\n", hipGetErrorString(err)); + } + //printf("GPU results: \n"); + //printResults(hPtr, WAVE_SIZE); + //printf("Printing cpu to be verified array\n"); + //printResults(cpuResultsArr, WAVE_SIZE); + + compareResults(hPtr, cpuResultsArr, group_size_in_bytes); + std::cout << "Results verified!\n"; + + hipFree(hPtr); + hipFree(dPtr); + free(cpuResultsArr); + } +} + + + +int main() { + // Use default device for validating the test + int deviceId; + ASSERT_EQUAL(hipGetDevice(&deviceId), hipSuccess); + hipDeviceProp_t deviceProperties; + ASSERT_EQUAL(hipGetDeviceProperties(&deviceProperties, deviceId), hipSuccess); + int maxThreadsPerBlock = deviceProperties.maxThreadsPerBlock; + + if (!deviceProperties.cooperativeLaunch) { + std::cout << "info: Device doesn't support cooperative launch! skipping the test!\n"; + if (hip_skip_tests_enabled()) { + return hip_skip_retcode(); + } else { + passed(); + } + return 0; + } + + // Test shfl_down with random group sizes + for (int i = 0; i < 100; i++) { + test_shfl_down(); + } + + std::cout << "Testing static tiled_partition for different tile sizes using shfl_down" + << std::endl; + + int testNo = 1; + std::vector tileSizes = {2, 4, 8, 16, 32}; + for (auto i : tileSizes) { + std::cout << "TEST " << testNo << ":" << '\n' << std::endl; + test_group_partition(i); + testNo++; + } + + passed(); +} diff --git a/tests/src/runtimeApi/cooperativeGrps/coalesced_groups_shfl_up.cpp b/tests/src/runtimeApi/cooperativeGrps/coalesced_groups_shfl_up.cpp new file mode 100644 index 0000000000..62a05ec244 --- /dev/null +++ b/tests/src/runtimeApi/cooperativeGrps/coalesced_groups_shfl_up.cpp @@ -0,0 +1,260 @@ +/* +Copyright (c) 2020 - 2021 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ +// Test Description: +/* This test implements prefix sum(scan) kernel, first with each threads own rank + as input and comparing the sum with expected serial summation output on CPU. + + This sample tests functionality of intrinsics provided by coalesced_group, + shfl_up. +*/ +#include "test_common.h" +#include +#include +#include + +using namespace cooperative_groups; + +#define ASSERT_EQUAL(lhs, rhs) assert(lhs == rhs) +#define WAVE_SIZE 32 +__device__ int prefix_sum_kernel(coalesced_group const& g, int val) { + int sz = g.size(); + for (int i = 1; i < sz; i <<= 1) { + int temp = g.shfl_up(val, i); + + if (g.thread_rank() >= i) { + val += temp; + } + } + return val; +} + +__global__ void kernel_shfl_up (int * dPtr, int *dResults, int lane_delta, int cg_sizes) { + int id = threadIdx.x + blockIdx.x * blockDim.x; + + if (id % cg_sizes == 0) { + coalesced_group g = coalesced_threads(); + int rank = g.thread_rank(); + int val = dPtr[rank]; + dResults[rank] = g.shfl_up(val, lane_delta); + return; + } + +} + +__global__ void kernel_cg_group_partition(int* dPtr, unsigned int tileSz, int cg_sizes) { + + int id = threadIdx.x + blockIdx.x * blockDim.x; + if (id % cg_sizes == 0) { + coalesced_group threadBlockCGTy = coalesced_threads(); + int input, outputSum; + + // we pass its own thread rank as inputs + input = threadBlockCGTy.thread_rank(); + + // Choose a leader thread to print the results + if (threadBlockCGTy.thread_rank() == 0) { + printf(" Creating %d groups, of tile size %d threads:\n\n", + (int)threadBlockCGTy.size() / tileSz, tileSz); + } + + threadBlockCGTy.sync(); + + coalesced_group tiledPartition = tiled_partition(threadBlockCGTy, tileSz); + + input = tiledPartition.thread_rank(); + + outputSum = prefix_sum_kernel(tiledPartition, input); + + // Update the result array with the corresponsing prefix sum + dPtr[threadBlockCGTy.thread_rank()] = outputSum; + return; + } +} + +void serialScan(int* ptr, int size) { + // Fill up the array + for (int i = 0; i < size; i++) { + ptr[i] = i; + } + + int acc = 0; + for (int i = 0; i < size; i++) { + acc = acc + ptr[i]; + ptr[i] = acc; + } +} + +void printResults(int* ptr, int size) { + for (int i = 0; i < size; i++) { + std::cout << ptr[i] << " "; + } + std::cout << '\n'; +} + +void verifyResults(int* cpu, int* gpu, int size) { + for (unsigned int i = 0; i < size / sizeof(int); i++) { + if (cpu[i] != gpu[i]) { + printf(" Results do not match."); + } + } +} + +static void test_group_partition(unsigned tileSz) { + hipError_t err; + int blockSize = 1; + int threadsPerBlock = WAVE_SIZE; + + int* hPtr = NULL; + int* dPtr = NULL; + int* cpuPrefixSum = NULL; + + std::vector cg_sizes = {1, 2, 3}; + for (auto i : cg_sizes) { + + int arrSize = blockSize * threadsPerBlock * sizeof(int); + + hipHostMalloc(&hPtr, arrSize); + hipMalloc(&dPtr, arrSize); + + // Launch Kernel + hipLaunchKernelGGL(kernel_cg_group_partition, blockSize, threadsPerBlock, + threadsPerBlock * sizeof(int), 0, dPtr, tileSz, i); + hipMemcpy(hPtr, dPtr, arrSize, hipMemcpyDeviceToHost); + err = hipDeviceSynchronize(); + if (err != hipSuccess) { + fprintf(stderr, "Failed to launch kernel (error code %s)!\n", hipGetErrorString(err)); + } + + cpuPrefixSum = new int[tileSz]; + serialScan(cpuPrefixSum, tileSz); + //std::cout << "\nPrefix sum results on CPU\n"; + //printResults(cpuPrefixSum, tileSz); + + //std::cout << "\nPrefix sum results on GPU\n"; + //printResults(hPtr, tileSz); + std::cout << "\n"; + verifyResults(hPtr, cpuPrefixSum, tileSz); + std::cout << "Results verified!\n"; + + delete[] cpuPrefixSum; + hipFree(hPtr); + hipFree(dPtr); + } +} + +static void test_shfl_up() { + + std::vector cg_sizes = {1, 2, 3}; + for (auto i : cg_sizes) { + + hipError_t err; + int blockSize = 1; + + int threadsPerBlock = WAVE_SIZE; + int totalThreads = blockSize * threadsPerBlock; + int group_size = totalThreads / i; + int group_size_in_bytes = group_size * sizeof(int); + + int* hPtr = NULL; + int* dPtr = NULL; + int* dResults = NULL; + int lane_delta = (rand() % group_size); + + std::cout << "Testing coalesced_groups shfl_up with lane_delta " << lane_delta + << " and group size " << WAVE_SIZE << '\n' << std::endl; + + int arrSize = blockSize * threadsPerBlock * sizeof(int); + + hipHostMalloc(&hPtr, arrSize); + // Fill up the array + for (int i = 0; i < WAVE_SIZE; i++) { + hPtr[i] = rand() % 1000; + } + //printResults(hPtr, WAVE_SIZE); + + int* cpuResultsArr = (int*)malloc(group_size_in_bytes); + for (int i = 0; i < group_size; i++) { + cpuResultsArr[i] = (i <= (lane_delta - 1)) ? hPtr[i] : hPtr[i - lane_delta]; + } + + //printf("Printing cpu results arr\n"); + //printResults(cpuResultsArr, WAVE_SIZE); + + hipMalloc(&dPtr, group_size_in_bytes); + hipMalloc(&dResults, group_size_in_bytes); + + hipMemcpy(dPtr, hPtr, group_size_in_bytes, hipMemcpyHostToDevice); + // Launch Kernel + hipLaunchKernelGGL(kernel_shfl_up, blockSize, threadsPerBlock, + threadsPerBlock * sizeof(int), 0, dPtr, dResults, lane_delta, i); + hipMemcpy(hPtr, dResults, group_size_in_bytes, hipMemcpyDeviceToHost); + err = hipDeviceSynchronize(); + if (err != hipSuccess) { + fprintf(stderr, "Failed to launch kernel (error code %s)!\n", hipGetErrorString(err)); + } + //printf("GPU computation array :\n"); + //printResults(hPtr, WAVE_SIZE); + + verifyResults(hPtr, cpuResultsArr, group_size_in_bytes); + std::cout << "Results verified!\n"; + + hipFree(hPtr); + hipFree(dPtr); + free(cpuResultsArr); + } +} + +int main() { + // Use default device for validating the test + int deviceId; + ASSERT_EQUAL(hipGetDevice(&deviceId), hipSuccess); + hipDeviceProp_t deviceProperties; + ASSERT_EQUAL(hipGetDeviceProperties(&deviceProperties, deviceId), hipSuccess); + int maxThreadsPerBlock = deviceProperties.maxThreadsPerBlock; + + if (!deviceProperties.cooperativeLaunch) { + std::cout << "info: Device doesn't support cooperative launch! skipping the test!\n"; + if (hip_skip_tests_enabled()) { + return hip_skip_retcode(); + } else { + passed(); + } + return 0; + } + + for (int i = 0; i < 100; i++) { + test_shfl_up(); + } + + std::cout << "Testing coalesced_groups partitioning and shfl_up" << '\n' << std::endl; + + int testNo = 1; + std::vector tileSizes = {2, 4, 8, 16, 32}; + for (auto i : tileSizes) { + std::cout << "TEST " << testNo << ":" << '\n' << std::endl; + test_group_partition(i); + testNo++; + } + passed(); +} + +/* Kogge-Stone algorithm */ \ No newline at end of file diff --git a/tests/src/runtimeApi/cooperativeGrps/simple_coalesced_groups.cpp b/tests/src/runtimeApi/cooperativeGrps/simple_coalesced_groups.cpp new file mode 100644 index 0000000000..9f2775b5dd --- /dev/null +++ b/tests/src/runtimeApi/cooperativeGrps/simple_coalesced_groups.cpp @@ -0,0 +1,583 @@ +/* +Copyright (c) 2020 - 2021 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +// Test Description: +/* This test implements sum reduction kernel, first with each threads own rank + as input and comparing the sum with expected sum output derieved from n(n-1)/2 + formula. The second part, partitions this parent group into child subgroups + a.k.a tiles using using tiled_partition() collective operation. This can be called + with a static tile size, passed in templated non-type variable-tiled_partition, + or in runtime as tiled_partition(thread_group parent, tileSz). This test covers both these + cases. + This test tests functionality of cg group partitioning, (static and dynamic) and its respective + API's size(), thread_rank(), and sync(). +*/ + +#include "test_common.h" +#include +#include +#include + +using namespace cooperative_groups; + +#define ASSERT_EQUAL(lhs, rhs) assert(lhs == rhs) + +#define NUM_ELEMS 10000000 +#define NUM_THREADS_PER_BLOCK 512 +#define WAVE_SIZE 32 + +/* Test coalesced group's functionality. + * + */ + +__device__ int atomicAggInc(int *ptr) { + coalesced_group g = coalesced_threads(); + int prev; + // elect the first active thread to perform atomic add + if (g.thread_rank() == 0) { + prev = atomicAdd(ptr, g.size()); + } + // broadcast previous value within the warp + // and add each active thread’s rank to it + prev = g.thread_rank() + g.shfl(prev, 0); + return prev; +} + +__global__ void kernel_shfl (int * dPtr, int *dResults, int srcLane, int cg_sizes) { + int id = threadIdx.x + blockIdx.x * blockDim.x; + if (id % cg_sizes == 0) { + coalesced_group const& g = coalesced_threads(); + int rank = g.thread_rank(); + int val = dPtr[rank]; + dResults[rank] = g.shfl(val, srcLane); + return; + } +} + +__global__ void kernel_shfl_any_to_any (int *randVal, int *dsrcArr, int *dResults, int cg_sizes) { + + int id = threadIdx.x + blockIdx.x * blockDim.x; + + if (id % cg_sizes == 0) { + coalesced_group const& g = coalesced_threads(); + int rank = g.thread_rank(); + int val = randVal[rank]; + dResults[rank] = g.shfl(val, dsrcArr[rank]); + return; + } + +} + +__global__ void filter_arr(int *dst, int *nres, const int *src, int n) { + int id = threadIdx.x + blockIdx.x * blockDim.x; + + for (int i = id; i < n; i += gridDim.x * blockDim.x) { + if (src[i] > 0) dst[atomicAggInc(nres)] = src[i]; + } +} + +/* Parallel reduce kernel. + * + * Step complexity: O(log n) + * Work complexity: O(n) + * + * Note: This kernel works only with power of 2 input arrays. + */ +__device__ int reduction_kernel(coalesced_group g, int* x, int val) { + int lane = g.thread_rank(); + int sz = g.size(); + + for (int i = g.size() / 2; i > 0; i /= 2) { + // use lds to store the temporary result + x[lane] = val; + // Ensure all the stores are completed. + g.sync(); + + if (lane < i) { + val += x[lane + i]; + } + // It must work on one tiled thread group at a time, + // and it must make sure all memory operations are + // completed before moving to the next stride. + // sync() here just does that. + g.sync(); + } + + // Choose the 0'th indexed thread that holds the reduction value to return + if (g.thread_rank() == 0) { + return val; + } + // Rest of the threads return no useful values + else { + return -1; + } +} + +__global__ void kernel_cg_coalesced_group_partition(unsigned int tileSz, int* result, + bool isGlobalMem, int* globalMem, int cg_sizes) { + + int id = threadIdx.x + blockIdx.x * blockDim.x; + if (id % cg_sizes == 0) { + coalesced_group threadBlockCGTy = coalesced_threads(); + int threadBlockGroupSize = threadBlockCGTy.size(); + + int* workspace = NULL; + + if (isGlobalMem) { + workspace = globalMem; + } else { + // Declare a shared memory + extern __shared__ int sharedMem[]; + workspace = sharedMem; + } + + int input, outputSum, expectedOutput; + + // input to reduction, for each thread, is its' rank in the group + input = threadBlockCGTy.thread_rank(); + + expectedOutput = (threadBlockGroupSize - 1) * threadBlockGroupSize / 2; + + outputSum = reduction_kernel(threadBlockCGTy, workspace, input); + + if (threadBlockCGTy.thread_rank() == 0) { + printf(" Sum of all ranks 0..%d in coalesced_group is %d\n\n", + (int)threadBlockCGTy.size() - 1, outputSum); + printf(" Creating %d groups, of tile size %d threads:\n\n", + (int)threadBlockCGTy.size() / tileSz, tileSz); + } + + threadBlockCGTy.sync(); + + coalesced_group tiledPartition = tiled_partition(threadBlockCGTy, tileSz); + + // This offset allows each group to have its own unique area in the workspace array + int workspaceOffset = threadBlockCGTy.thread_rank() - tiledPartition.thread_rank(); + + outputSum = reduction_kernel(tiledPartition, workspace + workspaceOffset, input); + + if (tiledPartition.thread_rank() == 0) { + printf( + " Sum of all ranks 0..%d in this tiledPartition group is %d. Corresponding parent thread " + "rank: %d\n", + tiledPartition.size() - 1, outputSum, input); + + result[input / (tileSz)] = outputSum; + } + return; + } +} + +__global__ void kernel_coalesced_active_groups() { + thread_block threadBlockCGTy = this_thread_block(); + int threadBlockGroupSize = threadBlockCGTy.size(); + + // input to reduction, for each thread, is its' rank in the group + int input = threadBlockCGTy.thread_rank(); + + if (threadBlockCGTy.thread_rank() == 0) { + printf(" Creating odd and even set of active thread groups based on branch divergence\n\n"); + } + + threadBlockCGTy.sync(); + + // Group all active odd threads + if (threadBlockCGTy.thread_rank() % 2) { + coalesced_group activeOdd = coalesced_threads(); + + if (activeOdd.thread_rank() == 0) { + printf(" ODD: Size of odd set of active threads is %d." + " Corresponding parent thread_rank is %d.\n\n", + activeOdd.size(), threadBlockCGTy.thread_rank()); + } + } + else { // Group all active even threads + coalesced_group activeEven = coalesced_threads(); + + if (activeEven.thread_rank() == 0) { + printf(" EVEN: Size of even set of active threads is %d." + " Corresponding parent thread_rank is %d.", + activeEven.size(), threadBlockCGTy.thread_rank()); + } + } + return; +} + +void printResults(int* ptr, int size) { + for (int i = 0; i < size; i++) { + std::cout << ptr[i] << " "; + } + std::cout << '\n'; +} + +void compareResults(int* cpu, int* gpu, int size) { + for (unsigned int i = 0; i < size / sizeof(int); i++) { + if (cpu[i] != gpu[i]) { + printf(" results do not match."); + } + } +} + +static void test_active_threads_grouping() { + hipError_t err; + int blockSize = 1; + int threadsPerBlock = WAVE_SIZE; + + // Launch Kernel + hipLaunchKernelGGL(kernel_coalesced_active_groups, blockSize, threadsPerBlock, 0, 0); + + err = hipDeviceSynchronize(); + if (err != hipSuccess) { + fprintf(stderr, "Failed to launch kernel (error code %s)!\n", hipGetErrorString(err)); + } + printf("\n...PASSED.\n\n"); +} + +// Search if the sum exists in the expected results array +void verifyResults(int* hPtr, int* dPtr, int size) { + int i = 0, j = 0; + for (i = 0; i < size; i++) { + for (j = 0; j < size; j++) { + if (hPtr[i] == dPtr[j]) { + break; + } + } + if (j == size) { + printf(" Result verification failed!"); + } + } +} + + +static void test_group_partition(unsigned int tileSz, bool useGlobalMem) { + hipError_t err; + int blockSize = 1; + int threadsPerBlock = WAVE_SIZE; + + std::vector cg_sizes = {1, 2, 3}; + for (auto i : cg_sizes) { + + int numTiles = ((blockSize * threadsPerBlock) / i) / tileSz; + + // numTiles = 0 when partitioning is possible. The below statement is to avoid + // out-of-bounds error and still evaluate failure case. + numTiles = (numTiles == 0) ? 1 : numTiles; + + // Build an array of expected reduction sum output on the host + // based on the sum of their respective thread ranks to use for verification + int* expectedSum = new int[numTiles]; + int temp = 0, sum = 0; + for (int i = 1; i <= numTiles; i++) { + sum = temp; + temp = (((tileSz * i) - 1) * (tileSz * i)) / 2; + expectedSum[i-1] = temp - sum; + } + + int* dResult = NULL; + hipMalloc(&dResult, sizeof(int) * numTiles); + + int* globalMem = NULL; + if (useGlobalMem) { + hipMalloc((void**)&globalMem, threadsPerBlock * sizeof(int)); + } + + int* hResult = NULL; + hipHostMalloc(&hResult, numTiles * sizeof(int), hipHostMallocDefault); + memset(hResult, 0, numTiles * sizeof(int)); + + // Launch Kernel + if (useGlobalMem) { + hipLaunchKernelGGL(kernel_cg_coalesced_group_partition, blockSize, threadsPerBlock, 0, 0, tileSz, + dResult, useGlobalMem, globalMem, i); + + err = hipDeviceSynchronize(); + if (err != hipSuccess) { + fprintf(stderr, "Failed to launch kernel (error code %s)!\n", hipGetErrorString(err)); + } + } else { + hipLaunchKernelGGL(kernel_cg_coalesced_group_partition, blockSize, threadsPerBlock, + threadsPerBlock * sizeof(int), 0, tileSz, dResult, useGlobalMem, globalMem, i); + + err = hipDeviceSynchronize(); + if (err != hipSuccess) { + fprintf(stderr, "Failed to launch kernel (error code %s)!\n", hipGetErrorString(err)); + } + } + + hipMemcpy(hResult, dResult, numTiles * sizeof(int), hipMemcpyDeviceToHost); + verifyResults(expectedSum, hResult, numTiles); + // Free all allocated memory on host and device + hipFree(dResult); + hipFree(hResult); + if (useGlobalMem) { + hipFree(globalMem); + } + delete[] expectedSum; + + printf("\n...PASSED.\n\n"); + } +} +static void test_shfl_any_to_any() { + + std::vector cg_sizes = {1, 2, 3}; + for (auto i : cg_sizes) { + + hipError_t err; + int blockSize = 1; + int threadsPerBlock = WAVE_SIZE; + + int totalThreads = blockSize * threadsPerBlock; + int group_size = (totalThreads + i - 1) / i; + int group_size_in_bytes = group_size * sizeof(int); + + int* hPtr = NULL; + int* dPtr = NULL; + int* dsrcArr = NULL; + int* dResults = NULL; + int* srcArr = (int*)malloc(group_size_in_bytes); + int* srcArrCpu = (int*)malloc(group_size_in_bytes); + + std::cout << "Testing coalesced_groups shfl any-to-any\n" < cg_sizes = {1, 2, 3}; + for (auto i : cg_sizes) { + + hipError_t err; + int blockSize = 1; + int threadsPerBlock = WAVE_SIZE; + + int totalThreads = blockSize * threadsPerBlock; + int group_size = (totalThreads + i - 1) / i; + int group_size_in_bytes = group_size * sizeof(int); + + int* hPtr = NULL; + int* dPtr = NULL; + int* dResults = NULL; + int srcLane = rand() % 1000; + int srcLaneCpu = 0; + std::cout << "Testing coalesced_groups shfl with srcLane " << srcLane << '\n' + << " and group size " << i <(malloc(sizeof(int) * NUM_ELEMS)); + + // Generate input data. + for (int i = 0; i < NUM_ELEMS; i++) { + data_to_filter[i] = rand() % numOfBuckets; + } + + + hipMalloc(&d_data_to_filter, sizeof(int) * NUM_ELEMS); + hipMalloc(&d_filtered_data, sizeof(int) * NUM_ELEMS); + hipMalloc(&d_nres, sizeof(int)); + + hipMemcpy(d_data_to_filter, data_to_filter, + sizeof(int) * NUM_ELEMS, hipMemcpyHostToDevice); + hipMemset(d_nres, 0, sizeof(int)); + + dim3 dimBlock(NUM_THREADS_PER_BLOCK, 1, 1); + dim3 dimGrid((NUM_ELEMS / NUM_THREADS_PER_BLOCK) + 1, 1, 1); + + filter_arr<<>>(d_filtered_data, d_nres, d_data_to_filter, + NUM_ELEMS); + + + hipMemcpy(&nres, d_nres, sizeof(int), hipMemcpyDeviceToHost); + + filtered_data = reinterpret_cast(malloc(sizeof(int) * nres)); + + hipMemcpy(filtered_data, d_filtered_data, sizeof(int) * nres, + hipMemcpyDeviceToHost); + + int *host_filtered_data = + reinterpret_cast(malloc(sizeof(int) * NUM_ELEMS)); + + // Generate host output with host filtering code. + int host_flt_count = 0; + for (int i = 0; i < NUM_ELEMS; i++) { + if (data_to_filter[i] > 0) { + host_filtered_data[host_flt_count++] = data_to_filter[i]; + } + } + + printf("\nWarp Aggregated Atomics %s \n", + (host_flt_count == nres) ? "PASSED" : "FAILED"); + + // Now, testing shfl collective + std::cout << "Now testing shfl collective as a broadcast" << '\n' << std::endl; + + for (int i = 0; i < 100; i++) { + test_shfl_broadcast(); + } + + + // Now, testing shfl collective + std::cout << "Now testing shfl operations any-to-any member lanes" << '\n' << std::endl; + + for (int i = 0; i < 100; i++) { + test_shfl_any_to_any(); + } + + // Now, pass a already coalesced_group that was partitioned + /* Test coalesced group partitioning */ + std::cout << "Now testing coalesced_groups partitioning" << '\n' << std::endl; + + int testNo = 1; + for (int memTy = 0; memTy < 2; memTy++) { + std::vector tileSizes = {2, 4, 8, 16, 32}; + for (auto i : tileSizes) { + std::cout << "TEST " << testNo << ":" << '\n' << std::endl; + test_group_partition(i, memTy); + testNo++; + } + } + + std::cout << "Now grouping active threads based on branch divergence" << '\n' << std::endl; + test_active_threads_grouping(); + + passed(); + return 0; +}