SWDEV-294375 - Adding coalesced_groups HIP tests (#2489)

Change-Id: I000a402e32d6057bab11829af2b78931fcd67fb5
This commit is contained in:
Sourabh U Betigeri
2022-02-25 02:56:20 -08:00
کامیت شده توسط GitHub
والد 4ecad956c9
کامیت 5eade5cfa1
3فایلهای تغییر یافته به همراه1119 افزوده شده و 0 حذف شده
@@ -0,0 +1,276 @@
/*
Copyright (c) 2020 - 2021 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
// Test Description:
/* This test implements sum reduction kernel, first with each threads own rank
as input and comparing the sum with expected sum output derieved from n(n-1)/2
formula.
This sample tests functionality of intrinsics provided by thread_block_tile type,
shfl_down and shfl_xor.
*/
#include "test_common.h"
#include <hip/hip_cooperative_groups.h>
#include <stdio.h>
#include <vector>
using namespace cooperative_groups;
#define ASSERT_EQUAL(lhs, rhs) assert(lhs == rhs)
#define WAVE_SIZE 32
__device__ int reduction_kernel_shfl_down(coalesced_group const& g, int val) {
int sz = g.size();
for (int i = sz / 2; i > 0; i >>= 1) {
val += g.shfl_down(val, i);
}
// Choose the 0'th indexed thread that holds the reduction value to return
if (g.thread_rank() == 0) {
return val;
}
// Rest of the threads return no useful values
else {
return -1;
}
}
__global__ void kernel_shfl_down (int * dPtr, int *dResults, int lane_delta, int cg_sizes) {
int id = threadIdx.x + blockIdx.x * blockDim.x;
if (id % cg_sizes == 0) {
coalesced_group const& g = coalesced_threads();
int rank = g.thread_rank();
int val = dPtr[rank];
dResults[rank] = g.shfl_down(val, lane_delta);
return;
}
}
__global__ void kernel_cg_group_partition(int* result, unsigned int tileSz, int cg_sizes) {
int id = threadIdx.x + blockIdx.x * blockDim.x;
if (id % cg_sizes == 0) {
coalesced_group threadBlockCGTy = coalesced_threads();
int input, outputSum, expectedSum;
// Choose a leader thread to print the results
if (threadBlockCGTy.thread_rank() == 0) {
printf(" Creating %d groups, of tile size %d threads:\n\n",
(int)threadBlockCGTy.size() / tileSz, tileSz);
}
threadBlockCGTy.sync();
coalesced_group tiledPartition = tiled_partition(threadBlockCGTy, tileSz);
int threadRank = tiledPartition.thread_rank();
input = tiledPartition.thread_rank();
// (n-1)(n)/2
expectedSum = ((tileSz - 1) * tileSz / 2);
outputSum = reduction_kernel_shfl_down(tiledPartition, input);
if (tiledPartition.thread_rank() == 0) {
printf(
" Sum of all ranks 0..%d in this tiledPartition group using shfl_down is %d (expected "
"%d)\n",
tiledPartition.size() - 1, outputSum, expectedSum);
result[threadBlockCGTy.thread_rank() / (tileSz)] = outputSum;
}
return;
}
}
void verifyResults(int* ptr, int expectedResult, int numTiles) {
for (int i = 0; i < numTiles; i++) {
if (ptr[i] != expectedResult) {
printf(" Results do not match! ");
}
}
}
void compareResults(int* cpu, int* gpu, int size) {
for (unsigned int i = 0; i < size / sizeof(int); i++) {
if (cpu[i] != gpu[i]) {
printf(" results do not match.");
}
}
}
void printResults(int* ptr, int size) {
for (int i = 0; i < size; i++) {
std::cout << ptr[i] << " ";
}
std::cout << '\n';
}
static void test_group_partition(unsigned int tileSz) {
hipError_t err;
int blockSize = 1;
int threadsPerBlock = 32;
std::vector<unsigned int> cg_sizes = {1, 2, 3};
for (auto i : cg_sizes) {
int numTiles = ((blockSize * threadsPerBlock) / i) / tileSz;
int expectedSum = ((tileSz - 1) * tileSz / 2);
int* expectedResult = new int[numTiles];
// numTiles = 0 when partitioning is possible. The below statement is to avoid
// out-of-bounds error and still evaluate failure case.
numTiles = (numTiles == 0) ? 1 : numTiles;
for (int i = 0; i < numTiles; i++) {
expectedResult[i] = expectedSum;
}
int* dResult = NULL;
int* hResult = NULL;
hipHostMalloc(&hResult, numTiles * sizeof(int), hipHostMallocDefault);
memset(hResult, 0, numTiles * sizeof(int));
hipMalloc(&dResult, numTiles * sizeof(int));
// Launch Kernel
hipLaunchKernelGGL(kernel_cg_group_partition, blockSize, threadsPerBlock,
threadsPerBlock * sizeof(int), 0, dResult, tileSz, i);
err = hipDeviceSynchronize();
if (err != hipSuccess) {
fprintf(stderr, "Failed to launch kernel (error code %s)!\n", hipGetErrorString(err));
}
hipMemcpy(hResult, dResult, sizeof(int) * numTiles, hipMemcpyDeviceToHost);
verifyResults(hResult, expectedSum, numTiles);
// Free all allocated memory on host and device
hipFree(dResult);
hipFree(hResult);
delete[] expectedResult;
printf("\n...PASSED.\n\n");
}
}
static void test_shfl_down() {
std::vector<unsigned int> cg_sizes = {1, 2, 3};
for (auto i : cg_sizes) {
hipError_t err;
int blockSize = 1;
int threadsPerBlock = WAVE_SIZE;
int totalThreads = blockSize * threadsPerBlock;
int group_size = totalThreads / i;
int group_size_in_bytes = group_size * sizeof(int);
int* hPtr = NULL;
int* dPtr = NULL;
int* dResults = NULL;
int lane_delta = rand() % group_size;
std::cout << "Testing coalesced_groups shfl_down with lane_delta " << lane_delta << "and group size "
<< WAVE_SIZE << '\n' << std::endl;
int arrSize = blockSize * threadsPerBlock * sizeof(int);
hipHostMalloc(&hPtr, arrSize);
// Fill up the array
for (int i = 0; i < WAVE_SIZE; i++) {
hPtr[i] = rand() % 1000;
}
int* cpuResultsArr = (int*)malloc(group_size_in_bytes);
for (int i = 0; i < group_size; i++) {
cpuResultsArr[i] = (i + lane_delta >= group_size) ? hPtr[i] : hPtr[i + lane_delta];
}
//printf("Array passed to GPU for computation\n");
//printResults(hPtr, WAVE_SIZE);
hipMalloc(&dPtr, group_size_in_bytes);
hipMalloc(&dResults, group_size_in_bytes);
hipMemcpy(dPtr, hPtr, group_size_in_bytes, hipMemcpyHostToDevice);
// Launch Kernel
hipLaunchKernelGGL(kernel_shfl_down, blockSize, threadsPerBlock,
threadsPerBlock * sizeof(int), 0, dPtr, dResults, lane_delta, i);
hipMemcpy(hPtr, dResults, group_size_in_bytes, hipMemcpyDeviceToHost);
err = hipDeviceSynchronize();
if (err != hipSuccess) {
fprintf(stderr, "Failed to launch kernel (error code %s)!\n", hipGetErrorString(err));
}
//printf("GPU results: \n");
//printResults(hPtr, WAVE_SIZE);
//printf("Printing cpu to be verified array\n");
//printResults(cpuResultsArr, WAVE_SIZE);
compareResults(hPtr, cpuResultsArr, group_size_in_bytes);
std::cout << "Results verified!\n";
hipFree(hPtr);
hipFree(dPtr);
free(cpuResultsArr);
}
}
int main() {
// Use default device for validating the test
int deviceId;
ASSERT_EQUAL(hipGetDevice(&deviceId), hipSuccess);
hipDeviceProp_t deviceProperties;
ASSERT_EQUAL(hipGetDeviceProperties(&deviceProperties, deviceId), hipSuccess);
int maxThreadsPerBlock = deviceProperties.maxThreadsPerBlock;
if (!deviceProperties.cooperativeLaunch) {
std::cout << "info: Device doesn't support cooperative launch! skipping the test!\n";
if (hip_skip_tests_enabled()) {
return hip_skip_retcode();
} else {
passed();
}
return 0;
}
// Test shfl_down with random group sizes
for (int i = 0; i < 100; i++) {
test_shfl_down();
}
std::cout << "Testing static tiled_partition for different tile sizes using shfl_down"
<< std::endl;
int testNo = 1;
std::vector<unsigned int> tileSizes = {2, 4, 8, 16, 32};
for (auto i : tileSizes) {
std::cout << "TEST " << testNo << ":" << '\n' << std::endl;
test_group_partition(i);
testNo++;
}
passed();
}
@@ -0,0 +1,260 @@
/*
Copyright (c) 2020 - 2021 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
// Test Description:
/* This test implements prefix sum(scan) kernel, first with each threads own rank
as input and comparing the sum with expected serial summation output on CPU.
This sample tests functionality of intrinsics provided by coalesced_group,
shfl_up.
*/
#include "test_common.h"
#include <hip/hip_cooperative_groups.h>
#include <stdio.h>
#include <vector>
using namespace cooperative_groups;
#define ASSERT_EQUAL(lhs, rhs) assert(lhs == rhs)
#define WAVE_SIZE 32
__device__ int prefix_sum_kernel(coalesced_group const& g, int val) {
int sz = g.size();
for (int i = 1; i < sz; i <<= 1) {
int temp = g.shfl_up(val, i);
if (g.thread_rank() >= i) {
val += temp;
}
}
return val;
}
__global__ void kernel_shfl_up (int * dPtr, int *dResults, int lane_delta, int cg_sizes) {
int id = threadIdx.x + blockIdx.x * blockDim.x;
if (id % cg_sizes == 0) {
coalesced_group g = coalesced_threads();
int rank = g.thread_rank();
int val = dPtr[rank];
dResults[rank] = g.shfl_up(val, lane_delta);
return;
}
}
__global__ void kernel_cg_group_partition(int* dPtr, unsigned int tileSz, int cg_sizes) {
int id = threadIdx.x + blockIdx.x * blockDim.x;
if (id % cg_sizes == 0) {
coalesced_group threadBlockCGTy = coalesced_threads();
int input, outputSum;
// we pass its own thread rank as inputs
input = threadBlockCGTy.thread_rank();
// Choose a leader thread to print the results
if (threadBlockCGTy.thread_rank() == 0) {
printf(" Creating %d groups, of tile size %d threads:\n\n",
(int)threadBlockCGTy.size() / tileSz, tileSz);
}
threadBlockCGTy.sync();
coalesced_group tiledPartition = tiled_partition(threadBlockCGTy, tileSz);
input = tiledPartition.thread_rank();
outputSum = prefix_sum_kernel(tiledPartition, input);
// Update the result array with the corresponsing prefix sum
dPtr[threadBlockCGTy.thread_rank()] = outputSum;
return;
}
}
void serialScan(int* ptr, int size) {
// Fill up the array
for (int i = 0; i < size; i++) {
ptr[i] = i;
}
int acc = 0;
for (int i = 0; i < size; i++) {
acc = acc + ptr[i];
ptr[i] = acc;
}
}
void printResults(int* ptr, int size) {
for (int i = 0; i < size; i++) {
std::cout << ptr[i] << " ";
}
std::cout << '\n';
}
void verifyResults(int* cpu, int* gpu, int size) {
for (unsigned int i = 0; i < size / sizeof(int); i++) {
if (cpu[i] != gpu[i]) {
printf(" Results do not match.");
}
}
}
static void test_group_partition(unsigned tileSz) {
hipError_t err;
int blockSize = 1;
int threadsPerBlock = WAVE_SIZE;
int* hPtr = NULL;
int* dPtr = NULL;
int* cpuPrefixSum = NULL;
std::vector<unsigned int> cg_sizes = {1, 2, 3};
for (auto i : cg_sizes) {
int arrSize = blockSize * threadsPerBlock * sizeof(int);
hipHostMalloc(&hPtr, arrSize);
hipMalloc(&dPtr, arrSize);
// Launch Kernel
hipLaunchKernelGGL(kernel_cg_group_partition, blockSize, threadsPerBlock,
threadsPerBlock * sizeof(int), 0, dPtr, tileSz, i);
hipMemcpy(hPtr, dPtr, arrSize, hipMemcpyDeviceToHost);
err = hipDeviceSynchronize();
if (err != hipSuccess) {
fprintf(stderr, "Failed to launch kernel (error code %s)!\n", hipGetErrorString(err));
}
cpuPrefixSum = new int[tileSz];
serialScan(cpuPrefixSum, tileSz);
//std::cout << "\nPrefix sum results on CPU\n";
//printResults(cpuPrefixSum, tileSz);
//std::cout << "\nPrefix sum results on GPU\n";
//printResults(hPtr, tileSz);
std::cout << "\n";
verifyResults(hPtr, cpuPrefixSum, tileSz);
std::cout << "Results verified!\n";
delete[] cpuPrefixSum;
hipFree(hPtr);
hipFree(dPtr);
}
}
static void test_shfl_up() {
std::vector<unsigned int> cg_sizes = {1, 2, 3};
for (auto i : cg_sizes) {
hipError_t err;
int blockSize = 1;
int threadsPerBlock = WAVE_SIZE;
int totalThreads = blockSize * threadsPerBlock;
int group_size = totalThreads / i;
int group_size_in_bytes = group_size * sizeof(int);
int* hPtr = NULL;
int* dPtr = NULL;
int* dResults = NULL;
int lane_delta = (rand() % group_size);
std::cout << "Testing coalesced_groups shfl_up with lane_delta " << lane_delta
<< " and group size " << WAVE_SIZE << '\n' << std::endl;
int arrSize = blockSize * threadsPerBlock * sizeof(int);
hipHostMalloc(&hPtr, arrSize);
// Fill up the array
for (int i = 0; i < WAVE_SIZE; i++) {
hPtr[i] = rand() % 1000;
}
//printResults(hPtr, WAVE_SIZE);
int* cpuResultsArr = (int*)malloc(group_size_in_bytes);
for (int i = 0; i < group_size; i++) {
cpuResultsArr[i] = (i <= (lane_delta - 1)) ? hPtr[i] : hPtr[i - lane_delta];
}
//printf("Printing cpu results arr\n");
//printResults(cpuResultsArr, WAVE_SIZE);
hipMalloc(&dPtr, group_size_in_bytes);
hipMalloc(&dResults, group_size_in_bytes);
hipMemcpy(dPtr, hPtr, group_size_in_bytes, hipMemcpyHostToDevice);
// Launch Kernel
hipLaunchKernelGGL(kernel_shfl_up, blockSize, threadsPerBlock,
threadsPerBlock * sizeof(int), 0, dPtr, dResults, lane_delta, i);
hipMemcpy(hPtr, dResults, group_size_in_bytes, hipMemcpyDeviceToHost);
err = hipDeviceSynchronize();
if (err != hipSuccess) {
fprintf(stderr, "Failed to launch kernel (error code %s)!\n", hipGetErrorString(err));
}
//printf("GPU computation array :\n");
//printResults(hPtr, WAVE_SIZE);
verifyResults(hPtr, cpuResultsArr, group_size_in_bytes);
std::cout << "Results verified!\n";
hipFree(hPtr);
hipFree(dPtr);
free(cpuResultsArr);
}
}
int main() {
// Use default device for validating the test
int deviceId;
ASSERT_EQUAL(hipGetDevice(&deviceId), hipSuccess);
hipDeviceProp_t deviceProperties;
ASSERT_EQUAL(hipGetDeviceProperties(&deviceProperties, deviceId), hipSuccess);
int maxThreadsPerBlock = deviceProperties.maxThreadsPerBlock;
if (!deviceProperties.cooperativeLaunch) {
std::cout << "info: Device doesn't support cooperative launch! skipping the test!\n";
if (hip_skip_tests_enabled()) {
return hip_skip_retcode();
} else {
passed();
}
return 0;
}
for (int i = 0; i < 100; i++) {
test_shfl_up();
}
std::cout << "Testing coalesced_groups partitioning and shfl_up" << '\n' << std::endl;
int testNo = 1;
std::vector<unsigned int> tileSizes = {2, 4, 8, 16, 32};
for (auto i : tileSizes) {
std::cout << "TEST " << testNo << ":" << '\n' << std::endl;
test_group_partition(i);
testNo++;
}
passed();
}
/* Kogge-Stone algorithm */
@@ -0,0 +1,583 @@
/*
Copyright (c) 2020 - 2021 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
// Test Description:
/* This test implements sum reduction kernel, first with each threads own rank
as input and comparing the sum with expected sum output derieved from n(n-1)/2
formula. The second part, partitions this parent group into child subgroups
a.k.a tiles using using tiled_partition() collective operation. This can be called
with a static tile size, passed in templated non-type variable-tiled_partition<tileSz>,
or in runtime as tiled_partition(thread_group parent, tileSz). This test covers both these
cases.
This test tests functionality of cg group partitioning, (static and dynamic) and its respective
API's size(), thread_rank(), and sync().
*/
#include "test_common.h"
#include <hip/hip_cooperative_groups.h>
#include <stdio.h>
#include <vector>
using namespace cooperative_groups;
#define ASSERT_EQUAL(lhs, rhs) assert(lhs == rhs)
#define NUM_ELEMS 10000000
#define NUM_THREADS_PER_BLOCK 512
#define WAVE_SIZE 32
/* Test coalesced group's functionality.
*
*/
__device__ int atomicAggInc(int *ptr) {
coalesced_group g = coalesced_threads();
int prev;
// elect the first active thread to perform atomic add
if (g.thread_rank() == 0) {
prev = atomicAdd(ptr, g.size());
}
// broadcast previous value within the warp
// and add each active threads rank to it
prev = g.thread_rank() + g.shfl(prev, 0);
return prev;
}
__global__ void kernel_shfl (int * dPtr, int *dResults, int srcLane, int cg_sizes) {
int id = threadIdx.x + blockIdx.x * blockDim.x;
if (id % cg_sizes == 0) {
coalesced_group const& g = coalesced_threads();
int rank = g.thread_rank();
int val = dPtr[rank];
dResults[rank] = g.shfl(val, srcLane);
return;
}
}
__global__ void kernel_shfl_any_to_any (int *randVal, int *dsrcArr, int *dResults, int cg_sizes) {
int id = threadIdx.x + blockIdx.x * blockDim.x;
if (id % cg_sizes == 0) {
coalesced_group const& g = coalesced_threads();
int rank = g.thread_rank();
int val = randVal[rank];
dResults[rank] = g.shfl(val, dsrcArr[rank]);
return;
}
}
__global__ void filter_arr(int *dst, int *nres, const int *src, int n) {
int id = threadIdx.x + blockIdx.x * blockDim.x;
for (int i = id; i < n; i += gridDim.x * blockDim.x) {
if (src[i] > 0) dst[atomicAggInc(nres)] = src[i];
}
}
/* Parallel reduce kernel.
*
* Step complexity: O(log n)
* Work complexity: O(n)
*
* Note: This kernel works only with power of 2 input arrays.
*/
__device__ int reduction_kernel(coalesced_group g, int* x, int val) {
int lane = g.thread_rank();
int sz = g.size();
for (int i = g.size() / 2; i > 0; i /= 2) {
// use lds to store the temporary result
x[lane] = val;
// Ensure all the stores are completed.
g.sync();
if (lane < i) {
val += x[lane + i];
}
// It must work on one tiled thread group at a time,
// and it must make sure all memory operations are
// completed before moving to the next stride.
// sync() here just does that.
g.sync();
}
// Choose the 0'th indexed thread that holds the reduction value to return
if (g.thread_rank() == 0) {
return val;
}
// Rest of the threads return no useful values
else {
return -1;
}
}
__global__ void kernel_cg_coalesced_group_partition(unsigned int tileSz, int* result,
bool isGlobalMem, int* globalMem, int cg_sizes) {
int id = threadIdx.x + blockIdx.x * blockDim.x;
if (id % cg_sizes == 0) {
coalesced_group threadBlockCGTy = coalesced_threads();
int threadBlockGroupSize = threadBlockCGTy.size();
int* workspace = NULL;
if (isGlobalMem) {
workspace = globalMem;
} else {
// Declare a shared memory
extern __shared__ int sharedMem[];
workspace = sharedMem;
}
int input, outputSum, expectedOutput;
// input to reduction, for each thread, is its' rank in the group
input = threadBlockCGTy.thread_rank();
expectedOutput = (threadBlockGroupSize - 1) * threadBlockGroupSize / 2;
outputSum = reduction_kernel(threadBlockCGTy, workspace, input);
if (threadBlockCGTy.thread_rank() == 0) {
printf(" Sum of all ranks 0..%d in coalesced_group is %d\n\n",
(int)threadBlockCGTy.size() - 1, outputSum);
printf(" Creating %d groups, of tile size %d threads:\n\n",
(int)threadBlockCGTy.size() / tileSz, tileSz);
}
threadBlockCGTy.sync();
coalesced_group tiledPartition = tiled_partition(threadBlockCGTy, tileSz);
// This offset allows each group to have its own unique area in the workspace array
int workspaceOffset = threadBlockCGTy.thread_rank() - tiledPartition.thread_rank();
outputSum = reduction_kernel(tiledPartition, workspace + workspaceOffset, input);
if (tiledPartition.thread_rank() == 0) {
printf(
" Sum of all ranks 0..%d in this tiledPartition group is %d. Corresponding parent thread "
"rank: %d\n",
tiledPartition.size() - 1, outputSum, input);
result[input / (tileSz)] = outputSum;
}
return;
}
}
__global__ void kernel_coalesced_active_groups() {
thread_block threadBlockCGTy = this_thread_block();
int threadBlockGroupSize = threadBlockCGTy.size();
// input to reduction, for each thread, is its' rank in the group
int input = threadBlockCGTy.thread_rank();
if (threadBlockCGTy.thread_rank() == 0) {
printf(" Creating odd and even set of active thread groups based on branch divergence\n\n");
}
threadBlockCGTy.sync();
// Group all active odd threads
if (threadBlockCGTy.thread_rank() % 2) {
coalesced_group activeOdd = coalesced_threads();
if (activeOdd.thread_rank() == 0) {
printf(" ODD: Size of odd set of active threads is %d."
" Corresponding parent thread_rank is %d.\n\n",
activeOdd.size(), threadBlockCGTy.thread_rank());
}
}
else { // Group all active even threads
coalesced_group activeEven = coalesced_threads();
if (activeEven.thread_rank() == 0) {
printf(" EVEN: Size of even set of active threads is %d."
" Corresponding parent thread_rank is %d.",
activeEven.size(), threadBlockCGTy.thread_rank());
}
}
return;
}
void printResults(int* ptr, int size) {
for (int i = 0; i < size; i++) {
std::cout << ptr[i] << " ";
}
std::cout << '\n';
}
void compareResults(int* cpu, int* gpu, int size) {
for (unsigned int i = 0; i < size / sizeof(int); i++) {
if (cpu[i] != gpu[i]) {
printf(" results do not match.");
}
}
}
static void test_active_threads_grouping() {
hipError_t err;
int blockSize = 1;
int threadsPerBlock = WAVE_SIZE;
// Launch Kernel
hipLaunchKernelGGL(kernel_coalesced_active_groups, blockSize, threadsPerBlock, 0, 0);
err = hipDeviceSynchronize();
if (err != hipSuccess) {
fprintf(stderr, "Failed to launch kernel (error code %s)!\n", hipGetErrorString(err));
}
printf("\n...PASSED.\n\n");
}
// Search if the sum exists in the expected results array
void verifyResults(int* hPtr, int* dPtr, int size) {
int i = 0, j = 0;
for (i = 0; i < size; i++) {
for (j = 0; j < size; j++) {
if (hPtr[i] == dPtr[j]) {
break;
}
}
if (j == size) {
printf(" Result verification failed!");
}
}
}
static void test_group_partition(unsigned int tileSz, bool useGlobalMem) {
hipError_t err;
int blockSize = 1;
int threadsPerBlock = WAVE_SIZE;
std::vector<unsigned int> cg_sizes = {1, 2, 3};
for (auto i : cg_sizes) {
int numTiles = ((blockSize * threadsPerBlock) / i) / tileSz;
// numTiles = 0 when partitioning is possible. The below statement is to avoid
// out-of-bounds error and still evaluate failure case.
numTiles = (numTiles == 0) ? 1 : numTiles;
// Build an array of expected reduction sum output on the host
// based on the sum of their respective thread ranks to use for verification
int* expectedSum = new int[numTiles];
int temp = 0, sum = 0;
for (int i = 1; i <= numTiles; i++) {
sum = temp;
temp = (((tileSz * i) - 1) * (tileSz * i)) / 2;
expectedSum[i-1] = temp - sum;
}
int* dResult = NULL;
hipMalloc(&dResult, sizeof(int) * numTiles);
int* globalMem = NULL;
if (useGlobalMem) {
hipMalloc((void**)&globalMem, threadsPerBlock * sizeof(int));
}
int* hResult = NULL;
hipHostMalloc(&hResult, numTiles * sizeof(int), hipHostMallocDefault);
memset(hResult, 0, numTiles * sizeof(int));
// Launch Kernel
if (useGlobalMem) {
hipLaunchKernelGGL(kernel_cg_coalesced_group_partition, blockSize, threadsPerBlock, 0, 0, tileSz,
dResult, useGlobalMem, globalMem, i);
err = hipDeviceSynchronize();
if (err != hipSuccess) {
fprintf(stderr, "Failed to launch kernel (error code %s)!\n", hipGetErrorString(err));
}
} else {
hipLaunchKernelGGL(kernel_cg_coalesced_group_partition, blockSize, threadsPerBlock,
threadsPerBlock * sizeof(int), 0, tileSz, dResult, useGlobalMem, globalMem, i);
err = hipDeviceSynchronize();
if (err != hipSuccess) {
fprintf(stderr, "Failed to launch kernel (error code %s)!\n", hipGetErrorString(err));
}
}
hipMemcpy(hResult, dResult, numTiles * sizeof(int), hipMemcpyDeviceToHost);
verifyResults(expectedSum, hResult, numTiles);
// Free all allocated memory on host and device
hipFree(dResult);
hipFree(hResult);
if (useGlobalMem) {
hipFree(globalMem);
}
delete[] expectedSum;
printf("\n...PASSED.\n\n");
}
}
static void test_shfl_any_to_any() {
std::vector<unsigned int> cg_sizes = {1, 2, 3};
for (auto i : cg_sizes) {
hipError_t err;
int blockSize = 1;
int threadsPerBlock = WAVE_SIZE;
int totalThreads = blockSize * threadsPerBlock;
int group_size = (totalThreads + i - 1) / i;
int group_size_in_bytes = group_size * sizeof(int);
int* hPtr = NULL;
int* dPtr = NULL;
int* dsrcArr = NULL;
int* dResults = NULL;
int* srcArr = (int*)malloc(group_size_in_bytes);
int* srcArrCpu = (int*)malloc(group_size_in_bytes);
std::cout << "Testing coalesced_groups shfl any-to-any\n" <<std::endl;
int arrSize = blockSize * threadsPerBlock * sizeof(int);
hipHostMalloc(&hPtr, arrSize);
// Fill up the array
for (int i = 0; i < WAVE_SIZE; i++) {
hPtr[i] = rand() % 1000;
}
// Fill up the random array
for (int i = 0; i < group_size; i++) {
srcArr[i] = rand() % 1000;
srcArrCpu[i] = srcArr[i] % group_size;
}
/* Fill cpu results array so that we can verify with gpu computation */
int* cpuResultsArr = (int*)malloc(group_size_in_bytes);
for(int i = 0; i < group_size; i++) {
cpuResultsArr[i] = hPtr[srcArrCpu[i]];
}
//printf("Array passed to GPU for computation\n");
//printResults(hPtr, WAVE_SIZE);
hipMalloc(&dPtr, group_size_in_bytes);
hipMalloc(&dResults, group_size_in_bytes);
hipMalloc(&dsrcArr, group_size_in_bytes);
hipMemcpy(dsrcArr, srcArr, group_size_in_bytes, hipMemcpyHostToDevice);
hipMemcpy(dPtr, hPtr, group_size_in_bytes, hipMemcpyHostToDevice);
// Launch Kernel
hipLaunchKernelGGL(kernel_shfl_any_to_any, blockSize, threadsPerBlock,
threadsPerBlock * sizeof(int), 0 , dPtr, dsrcArr, dResults, i);
hipMemcpy(hPtr, dResults, group_size_in_bytes, hipMemcpyDeviceToHost);
err = hipDeviceSynchronize();
if (err != hipSuccess) {
fprintf(stderr, "Failed to launch kernel (error code %s)!\n", hipGetErrorString(err));
}
//printf("GPU results: \n");
//printResults(hPtr, group_size);
//printf("Printing cpu to be verified array\n");
//printResults(cpuResultsArr, group_size);
//printf("Printing srcLane array that was passed\n");
//printResults(srcArr, group_size);
//printf("Printing srcLane array on the CPU\n");
//printResults(srcArrCpu, group_size);
compareResults(hPtr, cpuResultsArr, group_size_in_bytes);
std::cout << "Results verified!\n";
hipFree(hPtr);
hipFree(dPtr);
free(srcArr);
free(srcArrCpu);
free(cpuResultsArr);
}
}
static void test_shfl_broadcast() {
std::vector<unsigned int> cg_sizes = {1, 2, 3};
for (auto i : cg_sizes) {
hipError_t err;
int blockSize = 1;
int threadsPerBlock = WAVE_SIZE;
int totalThreads = blockSize * threadsPerBlock;
int group_size = (totalThreads + i - 1) / i;
int group_size_in_bytes = group_size * sizeof(int);
int* hPtr = NULL;
int* dPtr = NULL;
int* dResults = NULL;
int srcLane = rand() % 1000;
int srcLaneCpu = 0;
std::cout << "Testing coalesced_groups shfl with srcLane " << srcLane << '\n'
<< " and group size " << i <<std::endl;
int arrSize = blockSize * threadsPerBlock * sizeof(int);
hipHostMalloc(&hPtr, arrSize);
// Fill up the array
for (int i = 0; i < WAVE_SIZE; i++) {
hPtr[i] = rand() % 1000;
}
/* Fill cpu results array so that we can verify with gpu computation */
srcLaneCpu = hPtr[srcLane % group_size];
int* cpuResultsArr = (int*)malloc(sizeof(int) * group_size);
for (int i = 0; i < group_size; i++) {
cpuResultsArr[i] = srcLaneCpu;
}
printf("Array passed to GPU for computation\n");
printResults(hPtr, WAVE_SIZE);
hipMalloc(&dPtr, group_size_in_bytes);
hipMalloc(&dResults, group_size_in_bytes);
hipMemcpy(dPtr, hPtr, group_size_in_bytes, hipMemcpyHostToDevice);
// Launch Kernel
hipLaunchKernelGGL(kernel_shfl, blockSize, threadsPerBlock,
threadsPerBlock * sizeof(int), 0, dPtr, dResults, srcLane, i);
hipMemcpy(hPtr, dResults, group_size_in_bytes, hipMemcpyDeviceToHost);
err = hipDeviceSynchronize();
if (err != hipSuccess) {
fprintf(stderr, "Failed to launch kernel (error code %s)!\n", hipGetErrorString(err));
}
printf("GPU results: \n");
printResults(hPtr, group_size);
printf("Printing cpu to be verified array\n");
printResults(cpuResultsArr, group_size);
compareResults(hPtr, cpuResultsArr, group_size_in_bytes);
std::cout << "Results verified!\n";
hipFree(hPtr);
hipFree(dPtr);
free(cpuResultsArr);
}
}
int main() {
// Use default device for validating the test
int deviceId;
ASSERT_EQUAL(hipGetDevice(&deviceId), hipSuccess);
hipDeviceProp_t deviceProperties;
ASSERT_EQUAL(hipGetDeviceProperties(&deviceProperties, deviceId), hipSuccess);
int maxThreadsPerBlock = deviceProperties.maxThreadsPerBlock;
if (!deviceProperties.cooperativeLaunch) {
std::cout << "info: Device doesn't support cooperative launch! skipping the test!\n";
if (hip_skip_tests_enabled()) {
return hip_skip_retcode();
} else {
passed();
}
}
std::cout << "Now testing coalesced_groups" << '\n' << std::endl;
int *data_to_filter, *filtered_data, nres = 0;
int *d_data_to_filter, *d_filtered_data, *d_nres;
int numOfBuckets = 5;
data_to_filter = reinterpret_cast<int *>(malloc(sizeof(int) * NUM_ELEMS));
// Generate input data.
for (int i = 0; i < NUM_ELEMS; i++) {
data_to_filter[i] = rand() % numOfBuckets;
}
hipMalloc(&d_data_to_filter, sizeof(int) * NUM_ELEMS);
hipMalloc(&d_filtered_data, sizeof(int) * NUM_ELEMS);
hipMalloc(&d_nres, sizeof(int));
hipMemcpy(d_data_to_filter, data_to_filter,
sizeof(int) * NUM_ELEMS, hipMemcpyHostToDevice);
hipMemset(d_nres, 0, sizeof(int));
dim3 dimBlock(NUM_THREADS_PER_BLOCK, 1, 1);
dim3 dimGrid((NUM_ELEMS / NUM_THREADS_PER_BLOCK) + 1, 1, 1);
filter_arr<<<dimGrid, dimBlock>>>(d_filtered_data, d_nres, d_data_to_filter,
NUM_ELEMS);
hipMemcpy(&nres, d_nres, sizeof(int), hipMemcpyDeviceToHost);
filtered_data = reinterpret_cast<int *>(malloc(sizeof(int) * nres));
hipMemcpy(filtered_data, d_filtered_data, sizeof(int) * nres,
hipMemcpyDeviceToHost);
int *host_filtered_data =
reinterpret_cast<int *>(malloc(sizeof(int) * NUM_ELEMS));
// Generate host output with host filtering code.
int host_flt_count = 0;
for (int i = 0; i < NUM_ELEMS; i++) {
if (data_to_filter[i] > 0) {
host_filtered_data[host_flt_count++] = data_to_filter[i];
}
}
printf("\nWarp Aggregated Atomics %s \n",
(host_flt_count == nres) ? "PASSED" : "FAILED");
// Now, testing shfl collective
std::cout << "Now testing shfl collective as a broadcast" << '\n' << std::endl;
for (int i = 0; i < 100; i++) {
test_shfl_broadcast();
}
// Now, testing shfl collective
std::cout << "Now testing shfl operations any-to-any member lanes" << '\n' << std::endl;
for (int i = 0; i < 100; i++) {
test_shfl_any_to_any();
}
// Now, pass a already coalesced_group that was partitioned
/* Test coalesced group partitioning */
std::cout << "Now testing coalesced_groups partitioning" << '\n' << std::endl;
int testNo = 1;
for (int memTy = 0; memTy < 2; memTy++) {
std::vector<unsigned int> tileSizes = {2, 4, 8, 16, 32};
for (auto i : tileSizes) {
std::cout << "TEST " << testNo << ":" << '\n' << std::endl;
test_group_partition(i, memTy);
testNo++;
}
}
std::cout << "Now grouping active threads based on branch divergence" << '\n' << std::endl;
test_active_threads_grouping();
passed();
return 0;
}