SWDEV-358553 - Fixed all compiling bugs (#214)

Change-Id: I6f5316d76b2fad534f670b18138fc100c313be0d
This commit is contained in:
ROCm CI Service Account
2023-06-21 15:57:11 +05:30
committato da GitHub
parent 9760ccd5dc
commit 6e4bfd4bce
4 ha cambiato i file con 82 aggiunte e 89 eliminazioni
@@ -6,6 +6,9 @@ set(TEST_SRC
hipCGMultiGridGroupType.cc
hipCGMultiGridGroupTypeViaBaseType.cc
hipCGMultiGridGroupTypeViaPublicApi.cc
coalesced_groups_shfl_down.cc
coalesced_groups_shfl_up.cc
simple_coalesced_groups.cc
)
if(HIP_PLATFORM STREQUAL "nvidia")
set_source_files_properties(hipCGMultiGridGroupType.cc PROPERTIES COMPILE_FLAGS "-rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80")
@@ -66,7 +66,7 @@ __global__ void kernel_shfl_down (int * dPtr, int *dResults, int lane_delta, int
}
}
__global__ void kernel_cg_group_partition(int* result, unsigned int tileSz, int cg_sizes) {
__global__ void kernel_cg_group_partition_shfl_down(int* result, unsigned int tileSz, int cg_sizes) {
int id = threadIdx.x + blockIdx.x * blockDim.x;
if (id % cg_sizes == 0) {
@@ -82,7 +82,6 @@ __global__ void kernel_cg_group_partition(int* result, unsigned int tileSz, int
threadBlockCGTy.sync();
coalesced_group tiledPartition = tiled_partition(threadBlockCGTy, tileSz);
int threadRank = tiledPartition.thread_rank();
input = tiledPartition.thread_rank();
@@ -110,7 +109,7 @@ void verifyResults(int* ptr, int expectedResult, int numTiles) {
}
}
void compareResults(int* cpu, int* gpu, int size) {
void compareResultsCoalescedGroupsShflDown(int* cpu, int* gpu, int size) {
for (unsigned int i = 0; i < size / sizeof(int); i++) {
if (cpu[i] != gpu[i]) {
INFO(" results do not match.");
@@ -118,7 +117,7 @@ void compareResults(int* cpu, int* gpu, int size) {
}
}
void printResults(int* ptr, int size) {
void printResultsCoalescedGroupsShflDown(int* ptr, int size) {
for (int i = 0; i < size; i++) {
std::cout << ptr[i] << " ";
}
@@ -148,14 +147,14 @@ static void test_group_partition(unsigned int tileSz) {
int* dResult = NULL;
int* hResult = NULL;
hipHostMalloc(&hResult, numTiles * sizeof(int), hipHostMallocDefault);
HIPCHECK(hipHostMalloc(&hResult, numTiles * sizeof(int), hipHostMallocDefault));
memset(hResult, 0, numTiles * sizeof(int));
hipMalloc(&dResult, numTiles * sizeof(int));
HIPCHECK(hipMalloc(&dResult, numTiles * sizeof(int)));
// Launch Kernel
hipLaunchKernelGGL(kernel_cg_group_partition, blockSize, threadsPerBlock,
hipLaunchKernelGGL(kernel_cg_group_partition_shfl_down, blockSize, threadsPerBlock,
threadsPerBlock * sizeof(int), 0, dResult, tileSz, i);
HIP_CHECK(hipGetLastError());
err = hipDeviceSynchronize();
@@ -164,13 +163,13 @@ static void test_group_partition(unsigned int tileSz) {
}
hipMemcpy(hResult, dResult, sizeof(int) * numTiles, hipMemcpyDeviceToHost);
HIPCHECK(hipMemcpy(hResult, dResult, sizeof(int) * numTiles, hipMemcpyDeviceToHost));
verifyResults(hResult, expectedSum, numTiles);
// Free all allocated memory on host and device
hipFree(dResult);
hipFree(hResult);
HIPCHECK(hipFree(dResult));
HIPCHECK(hipHostFree(hResult));
delete[] expectedResult;
printf("\n...PASSED.\n\n");
@@ -199,7 +198,7 @@ static void test_shfl_down() {
int arrSize = blockSize * threadsPerBlock * sizeof(int);
hipHostMalloc(&hPtr, arrSize);
HIPCHECK(hipHostMalloc(&hPtr, arrSize));
// Fill up the array
for (int i = 0; i < WAVE_SIZE; i++) {
hPtr[i] = rand() % 1000;
@@ -210,30 +209,30 @@ static void test_shfl_down() {
cpuResultsArr[i] = (i + lane_delta >= group_size) ? hPtr[i] : hPtr[i + lane_delta];
}
//printf("Array passed to GPU for computation\n");
//printResults(hPtr, WAVE_SIZE);
hipMalloc(&dPtr, group_size_in_bytes);
hipMalloc(&dResults, group_size_in_bytes);
//printResultsCoalescedGroupsShflDown(hPtr, WAVE_SIZE);
HIPCHECK(hipMalloc(&dPtr, group_size_in_bytes));
HIPCHECK(hipMalloc(&dResults, group_size_in_bytes));
hipMemcpy(dPtr, hPtr, group_size_in_bytes, hipMemcpyHostToDevice);
HIPCHECK(hipMemcpy(dPtr, hPtr, group_size_in_bytes, hipMemcpyHostToDevice));
// Launch Kernel
hipLaunchKernelGGL(kernel_shfl_down, blockSize, threadsPerBlock,
threadsPerBlock * sizeof(int), 0, dPtr, dResults, lane_delta, i);
HIP_CHECK(hipGetLastError());
hipMemcpy(hPtr, dResults, group_size_in_bytes, hipMemcpyDeviceToHost);
HIPCHECK(hipMemcpy(hPtr, dResults, group_size_in_bytes, hipMemcpyDeviceToHost));
err = hipDeviceSynchronize();
if (err != hipSuccess) {
fprintf(stderr, "Failed to launch kernel (error code %s)!\n", hipGetErrorString(err));
}
//printf("GPU results: \n");
//printResults(hPtr, WAVE_SIZE);
//printResultsCoalescedGroupsShflDown(hPtr, WAVE_SIZE);
//printf("Printing cpu to be verified array\n");
//printResults(cpuResultsArr, WAVE_SIZE);
//printResultsCoalescedGroupsShflDown(cpuResultsArr, WAVE_SIZE);
compareResults(hPtr, cpuResultsArr, group_size_in_bytes);
compareResultsCoalescedGroupsShflDown(hPtr, cpuResultsArr, group_size_in_bytes);
std::cout << "Results verified!\n";
hipFree(hPtr);
hipFree(dPtr);
HIPCHECK(hipHostFree(hPtr));
HIPCHECK(hipFree(dPtr));
free(cpuResultsArr);
}
}
@@ -246,7 +245,6 @@ TEST_CASE("Unit_coalesced_groups_shfl_down") {
ASSERT_EQUAL(hipGetDevice(&deviceId), hipSuccess);
hipDeviceProp_t deviceProperties;
ASSERT_EQUAL(hipGetDeviceProperties(&deviceProperties, deviceId), hipSuccess);
int maxThreadsPerBlock = deviceProperties.maxThreadsPerBlock;
// Test shfl_down with random group sizes
for (int i = 0; i < 100; i++) {
@@ -40,7 +40,7 @@ __device__ int prefix_sum_kernel(coalesced_group const& g, int val) {
for (int i = 1; i < sz; i <<= 1) {
int temp = g.shfl_up(val, i);
if (g.thread_rank() >= i) {
if ((int)g.thread_rank() >= i) {
val += temp;
}
}
@@ -60,7 +60,7 @@ __global__ void kernel_shfl_up (int * dPtr, int *dResults, int lane_delta, int c
}
__global__ void kernel_cg_group_partition(int* dPtr, unsigned int tileSz, int cg_sizes) {
__global__ void kernel_cg_group_partition_shfl_up(int* dPtr, unsigned int tileSz, int cg_sizes) {
int id = threadIdx.x + blockIdx.x * blockDim.x;
if (id % cg_sizes == 0) {
@@ -110,7 +110,7 @@ void printResults(int* ptr, int size) {
std::cout << '\n';
}
void verifyResults(int* cpu, int* gpu, int size) {
void verifyResultsCoalescedGroupsShflUp(int* cpu, int* gpu, int size) {
for (unsigned int i = 0; i < size / sizeof(int); i++) {
if (cpu[i] != gpu[i]) {
INFO(" Results do not match.");
@@ -132,14 +132,14 @@ static void test_group_partition(unsigned tileSz) {
int arrSize = blockSize * threadsPerBlock * sizeof(int);
hipHostMalloc(&hPtr, arrSize);
hipMalloc(&dPtr, arrSize);
HIPCHECK(hipHostMalloc(&hPtr, arrSize));
HIPCHECK(hipMalloc(&dPtr, arrSize));
// Launch Kernel
hipLaunchKernelGGL(kernel_cg_group_partition, blockSize, threadsPerBlock,
hipLaunchKernelGGL(kernel_cg_group_partition_shfl_up, blockSize, threadsPerBlock,
threadsPerBlock * sizeof(int), 0, dPtr, tileSz, i);
HIP_CHECK(hipGetLastError());
hipMemcpy(hPtr, dPtr, arrSize, hipMemcpyDeviceToHost);
HIPCHECK(hipMemcpy(hPtr, dPtr, arrSize, hipMemcpyDeviceToHost));
err = hipDeviceSynchronize();
if (err != hipSuccess) {
fprintf(stderr, "Failed to launch kernel (error code %s)!\n", hipGetErrorString(err));
@@ -153,12 +153,12 @@ static void test_group_partition(unsigned tileSz) {
//std::cout << "\nPrefix sum results on GPU\n";
//printResults(hPtr, tileSz);
std::cout << "\n";
verifyResults(hPtr, cpuPrefixSum, tileSz);
verifyResultsCoalescedGroupsShflUp(hPtr, cpuPrefixSum, tileSz);
std::cout << "Results verified!\n";
delete[] cpuPrefixSum;
hipFree(hPtr);
hipFree(dPtr);
HIPCHECK(hipHostFree(hPtr));
HIPCHECK(hipFree(dPtr));
}
}
@@ -185,7 +185,7 @@ static void test_shfl_up() {
int arrSize = blockSize * threadsPerBlock * sizeof(int);
hipHostMalloc(&hPtr, arrSize);
HIPCHECK(hipHostMalloc(&hPtr, arrSize));
// Fill up the array
for (int i = 0; i < WAVE_SIZE; i++) {
hPtr[i] = rand() % 1000;
@@ -200,14 +200,14 @@ static void test_shfl_up() {
//printf("Printing cpu results arr\n");
//printResults(cpuResultsArr, WAVE_SIZE);
hipMalloc(&dPtr, group_size_in_bytes);
hipMalloc(&dResults, group_size_in_bytes);
HIPCHECK(hipMalloc(&dPtr, group_size_in_bytes));
HIPCHECK(hipMalloc(&dResults, group_size_in_bytes));
hipMemcpy(dPtr, hPtr, group_size_in_bytes, hipMemcpyHostToDevice);
HIPCHECK(hipMemcpy(dPtr, hPtr, group_size_in_bytes, hipMemcpyHostToDevice));
// Launch Kernel
hipLaunchKernelGGL(kernel_shfl_up, blockSize, threadsPerBlock,
threadsPerBlock * sizeof(int), 0, dPtr, dResults, lane_delta, i);
hipMemcpy(hPtr, dResults, group_size_in_bytes, hipMemcpyDeviceToHost);
HIPCHECK(hipMemcpy(hPtr, dResults, group_size_in_bytes, hipMemcpyDeviceToHost));
HIP_CHECK(hipGetLastError());
err = hipDeviceSynchronize();
if (err != hipSuccess) {
@@ -216,22 +216,21 @@ static void test_shfl_up() {
//printf("GPU computation array :\n");
//printResults(hPtr, WAVE_SIZE);
verifyResults(hPtr, cpuResultsArr, group_size_in_bytes);
verifyResultsCoalescedGroupsShflUp(hPtr, cpuResultsArr, group_size_in_bytes);
std::cout << "Results verified!\n";
hipFree(hPtr);
hipFree(dPtr);
HIPCHECK(hipHostFree(hPtr));
HIPCHECK(hipFree(dPtr));
free(cpuResultsArr);
}
}
TEST_CASE("Unit_coalesced_groups_shfl_down") {
TEST_CASE("Unit_coalesced_groups_shfl_up") {
// Use default device for validating the test
int deviceId;
ASSERT_EQUAL(hipGetDevice(&deviceId), hipSuccess);
hipDeviceProp_t deviceProperties;
ASSERT_EQUAL(hipGetDeviceProperties(&deviceProperties, deviceId), hipSuccess);
int maxThreadsPerBlock = deviceProperties.maxThreadsPerBlock;
for (int i = 0; i < 100; i++) {
test_shfl_up();
@@ -51,7 +51,7 @@ using namespace cooperative_groups;
__device__ int atomicAggInc(int *ptr) {
coalesced_group g = coalesced_threads();
int prev;
int prev = 0;
// elect the first active thread to perform atomic add
if (g.thread_rank() == 0) {
prev = atomicAdd(ptr, g.size());
@@ -104,7 +104,6 @@ __global__ void filter_arr(int *dst, int *nres, const int *src, int n) {
*/
__device__ int reduction_kernel(coalesced_group g, int* x, int val) {
int lane = g.thread_rank();
int sz = g.size();
for (int i = g.size() / 2; i > 0; i /= 2) {
// use lds to store the temporary result
@@ -138,7 +137,6 @@ __global__ void kernel_cg_coalesced_group_partition(unsigned int tileSz, int* re
int id = threadIdx.x + blockIdx.x * blockDim.x;
if (id % cg_sizes == 0) {
coalesced_group threadBlockCGTy = coalesced_threads();
int threadBlockGroupSize = threadBlockCGTy.size();
int* workspace = NULL;
@@ -150,13 +148,11 @@ __global__ void kernel_cg_coalesced_group_partition(unsigned int tileSz, int* re
workspace = sharedMem;
}
int input, outputSum, expectedOutput;
int input, outputSum;
// input to reduction, for each thread, is its' rank in the group
input = threadBlockCGTy.thread_rank();
expectedOutput = (threadBlockGroupSize - 1) * threadBlockGroupSize / 2;
outputSum = reduction_kernel(threadBlockCGTy, workspace, input);
if (threadBlockCGTy.thread_rank() == 0) {
@@ -189,10 +185,8 @@ __global__ void kernel_cg_coalesced_group_partition(unsigned int tileSz, int* re
__global__ void kernel_coalesced_active_groups() {
thread_block threadBlockCGTy = this_thread_block();
int threadBlockGroupSize = threadBlockCGTy.size();
// input to reduction, for each thread, is its' rank in the group
int input = threadBlockCGTy.thread_rank();
if (threadBlockCGTy.thread_rank() == 0) {
printf(" Creating odd and even set of active thread groups based on branch divergence\n\n");
@@ -222,14 +216,14 @@ __global__ void kernel_coalesced_active_groups() {
return;
}
void printResults(int* ptr, int size) {
void printResultsSimpleCoalescedGroups(int* ptr, int size) {
for (int i = 0; i < size; i++) {
std::cout << ptr[i] << " ";
}
std::cout << '\n';
}
void compareResults(int* cpu, int* gpu, int size) {
void compareResultsSimpleCoalescedGroups(int* cpu, int* gpu, int size) {
for (unsigned int i = 0; i < size / sizeof(int); i++) {
if (cpu[i] != gpu[i]) {
INFO(" results do not match.");
@@ -254,7 +248,7 @@ static void test_active_threads_grouping() {
}
// Search if the sum exists in the expected results array
void verifyResults(int* hPtr, int* dPtr, int size) {
void verifyResultsSimpleCoalescedGroups(int* hPtr, int* dPtr, int size) {
int i = 0, j = 0;
for (i = 0; i < size; i++) {
for (j = 0; j < size; j++) {
@@ -294,15 +288,15 @@ static void test_group_partition(unsigned int tileSz, bool useGlobalMem) {
}
int* dResult = NULL;
hipMalloc(&dResult, sizeof(int) * numTiles);
HIPCHECK(hipMalloc(&dResult, sizeof(int) * numTiles));
int* globalMem = NULL;
if (useGlobalMem) {
hipMalloc((void**)&globalMem, threadsPerBlock * sizeof(int));
HIPCHECK(hipMalloc((void**)&globalMem, threadsPerBlock * sizeof(int)));
}
int* hResult = NULL;
hipHostMalloc(&hResult, numTiles * sizeof(int), hipHostMallocDefault);
HIPCHECK(hipHostMalloc(&hResult, numTiles * sizeof(int), hipHostMallocDefault));
memset(hResult, 0, numTiles * sizeof(int));
// Launch Kernel
@@ -326,13 +320,13 @@ static void test_group_partition(unsigned int tileSz, bool useGlobalMem) {
}
}
hipMemcpy(hResult, dResult, numTiles * sizeof(int), hipMemcpyDeviceToHost);
verifyResults(expectedSum, hResult, numTiles);
HIPCHECK(hipMemcpy(hResult, dResult, numTiles * sizeof(int), hipMemcpyDeviceToHost));
verifyResultsSimpleCoalescedGroups(expectedSum, hResult, numTiles);
// Free all allocated memory on host and device
hipFree(dResult);
hipFree(hResult);
HIPCHECK(hipFree(dResult));
HIPCHECK(hipHostFree(hResult));
if (useGlobalMem) {
hipFree(globalMem);
HIPCHECK(hipFree(globalMem));
}
delete[] expectedSum;
@@ -363,7 +357,7 @@ static void test_shfl_any_to_any() {
int arrSize = blockSize * threadsPerBlock * sizeof(int);
hipHostMalloc(&hPtr, arrSize);
HIPCHECK(hipHostMalloc(&hPtr, arrSize));
// Fill up the array
for (int i = 0; i < WAVE_SIZE; i++) {
hPtr[i] = rand() % 1000;
@@ -382,36 +376,36 @@ static void test_shfl_any_to_any() {
}
//printf("Array passed to GPU for computation\n");
//printResults(hPtr, WAVE_SIZE);
hipMalloc(&dPtr, group_size_in_bytes);
hipMalloc(&dResults, group_size_in_bytes);
//printResultsSimpleCoalescedGroups(hPtr, WAVE_SIZE);
HIPCHECK(hipMalloc(&dPtr, group_size_in_bytes));
HIPCHECK(hipMalloc(&dResults, group_size_in_bytes));
hipMalloc(&dsrcArr, group_size_in_bytes);
hipMemcpy(dsrcArr, srcArr, group_size_in_bytes, hipMemcpyHostToDevice);
HIPCHECK(hipMalloc(&dsrcArr, group_size_in_bytes));
HIPCHECK(hipMemcpy(dsrcArr, srcArr, group_size_in_bytes, hipMemcpyHostToDevice));
hipMemcpy(dPtr, hPtr, group_size_in_bytes, hipMemcpyHostToDevice);
HIPCHECK(hipMemcpy(dPtr, hPtr, group_size_in_bytes, hipMemcpyHostToDevice));
// Launch Kernel
hipLaunchKernelGGL(kernel_shfl_any_to_any, blockSize, threadsPerBlock,
threadsPerBlock * sizeof(int), 0 , dPtr, dsrcArr, dResults, i);
HIP_CHECK(hipGetLastError());
hipMemcpy(hPtr, dResults, group_size_in_bytes, hipMemcpyDeviceToHost);
HIPCHECK(hipMemcpy(hPtr, dResults, group_size_in_bytes, hipMemcpyDeviceToHost));
err = hipDeviceSynchronize();
if (err != hipSuccess) {
fprintf(stderr, "Failed to launch kernel (error code %s)!\n", hipGetErrorString(err));
}
//printf("GPU results: \n");
//printResults(hPtr, group_size);
//printResultsSimpleCoalescedGroups(hPtr, group_size);
//printf("Printing cpu to be verified array\n");
//printResults(cpuResultsArr, group_size);
//printResultsSimpleCoalescedGroups(cpuResultsArr, group_size);
//printf("Printing srcLane array that was passed\n");
//printResults(srcArr, group_size);
//printResultsSimpleCoalescedGroups(srcArr, group_size);
//printf("Printing srcLane array on the CPU\n");
//printResults(srcArrCpu, group_size);
compareResults(hPtr, cpuResultsArr, group_size_in_bytes);
//printResultsSimpleCoalescedGroups(srcArrCpu, group_size);
compareResultsSimpleCoalescedGroups(hPtr, cpuResultsArr, group_size_in_bytes);
std::cout << "Results verified!\n";
hipFree(hPtr);
hipFree(dPtr);
HIPCHECK(hipHostFree(hPtr));
HIPCHECK(hipFree(dPtr));
free(srcArr);
free(srcArrCpu);
free(cpuResultsArr);
@@ -440,7 +434,7 @@ static void test_shfl_broadcast() {
int arrSize = blockSize * threadsPerBlock * sizeof(int);
hipHostMalloc(&hPtr, arrSize);
HIPCHECK(hipHostMalloc(&hPtr, arrSize));
// Fill up the array
for (int i = 0; i < WAVE_SIZE; i++) {
hPtr[i] = rand() % 1000;
@@ -455,30 +449,30 @@ static void test_shfl_broadcast() {
cpuResultsArr[i] = srcLaneCpu;
}
printf("Array passed to GPU for computation\n");
printResults(hPtr, WAVE_SIZE);
hipMalloc(&dPtr, group_size_in_bytes);
hipMalloc(&dResults, group_size_in_bytes);
printResultsSimpleCoalescedGroups(hPtr, WAVE_SIZE);
HIPCHECK(hipMalloc(&dPtr, group_size_in_bytes));
HIPCHECK(hipMalloc(&dResults, group_size_in_bytes));
hipMemcpy(dPtr, hPtr, group_size_in_bytes, hipMemcpyHostToDevice);
HIPCHECK(hipMemcpy(dPtr, hPtr, group_size_in_bytes, hipMemcpyHostToDevice));
// Launch Kernel
hipLaunchKernelGGL(kernel_shfl, blockSize, threadsPerBlock,
threadsPerBlock * sizeof(int), 0, dPtr, dResults, srcLane, i);
HIP_CHECK(hipGetLastError());
hipMemcpy(hPtr, dResults, group_size_in_bytes, hipMemcpyDeviceToHost);
HIPCHECK(hipMemcpy(hPtr, dResults, group_size_in_bytes, hipMemcpyDeviceToHost));
err = hipDeviceSynchronize();
if (err != hipSuccess) {
fprintf(stderr, "Failed to launch kernel (error code %s)!\n", hipGetErrorString(err));
}
printf("GPU results: \n");
printResults(hPtr, group_size);
printResultsSimpleCoalescedGroups(hPtr, group_size);
printf("Printing cpu to be verified array\n");
printResults(cpuResultsArr, group_size);
printResultsSimpleCoalescedGroups(cpuResultsArr, group_size);
compareResults(hPtr, cpuResultsArr, group_size_in_bytes);
compareResultsSimpleCoalescedGroups(hPtr, cpuResultsArr, group_size_in_bytes);
std::cout << "Results verified!\n";
hipFree(hPtr);
hipFree(dPtr);
HIPCHECK(hipHostFree(hPtr));
HIPCHECK(hipFree(dPtr));
free(cpuResultsArr);
}
}
@@ -489,7 +483,6 @@ TEST_CASE("Unit_coalesced_groups") {
HIP_CHECK(hipGetDevice(&deviceId));
hipDeviceProp_t deviceProperties;
HIP_CHECK(hipGetDeviceProperties(&deviceProperties, deviceId));
int maxThreadsPerBlock = deviceProperties.maxThreadsPerBlock;
std::cout << "Now testing coalesced_groups" << '\n' << std::endl;
@@ -512,7 +505,7 @@ TEST_CASE("Unit_coalesced_groups") {
HIP_CHECK(hipMemcpy(d_data_to_filter, data_to_filter,
sizeof(int) * NUM_ELEMS, hipMemcpyHostToDevice));
hipMemset(d_nres, 0, sizeof(int));
HIPCHECK(hipMemset(d_nres, 0, sizeof(int)));
dim3 dimBlock(NUM_THREADS_PER_BLOCK, 1, 1);
dim3 dimGrid((NUM_ELEMS / NUM_THREADS_PER_BLOCK) + 1, 1, 1);