SWDEV-358553 - Fixed all compiling bugs (#214)
Change-Id: I6f5316d76b2fad534f670b18138fc100c313be0d
This commit is contained in:
committato da
GitHub
parent
9760ccd5dc
commit
6e4bfd4bce
@@ -6,6 +6,9 @@ set(TEST_SRC
|
||||
hipCGMultiGridGroupType.cc
|
||||
hipCGMultiGridGroupTypeViaBaseType.cc
|
||||
hipCGMultiGridGroupTypeViaPublicApi.cc
|
||||
coalesced_groups_shfl_down.cc
|
||||
coalesced_groups_shfl_up.cc
|
||||
simple_coalesced_groups.cc
|
||||
)
|
||||
if(HIP_PLATFORM STREQUAL "nvidia")
|
||||
set_source_files_properties(hipCGMultiGridGroupType.cc PROPERTIES COMPILE_FLAGS "-rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80")
|
||||
|
||||
@@ -66,7 +66,7 @@ __global__ void kernel_shfl_down (int * dPtr, int *dResults, int lane_delta, int
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void kernel_cg_group_partition(int* result, unsigned int tileSz, int cg_sizes) {
|
||||
__global__ void kernel_cg_group_partition_shfl_down(int* result, unsigned int tileSz, int cg_sizes) {
|
||||
|
||||
int id = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
if (id % cg_sizes == 0) {
|
||||
@@ -82,7 +82,6 @@ __global__ void kernel_cg_group_partition(int* result, unsigned int tileSz, int
|
||||
threadBlockCGTy.sync();
|
||||
|
||||
coalesced_group tiledPartition = tiled_partition(threadBlockCGTy, tileSz);
|
||||
int threadRank = tiledPartition.thread_rank();
|
||||
|
||||
input = tiledPartition.thread_rank();
|
||||
|
||||
@@ -110,7 +109,7 @@ void verifyResults(int* ptr, int expectedResult, int numTiles) {
|
||||
}
|
||||
}
|
||||
|
||||
void compareResults(int* cpu, int* gpu, int size) {
|
||||
void compareResultsCoalescedGroupsShflDown(int* cpu, int* gpu, int size) {
|
||||
for (unsigned int i = 0; i < size / sizeof(int); i++) {
|
||||
if (cpu[i] != gpu[i]) {
|
||||
INFO(" results do not match.");
|
||||
@@ -118,7 +117,7 @@ void compareResults(int* cpu, int* gpu, int size) {
|
||||
}
|
||||
}
|
||||
|
||||
void printResults(int* ptr, int size) {
|
||||
void printResultsCoalescedGroupsShflDown(int* ptr, int size) {
|
||||
for (int i = 0; i < size; i++) {
|
||||
std::cout << ptr[i] << " ";
|
||||
}
|
||||
@@ -148,14 +147,14 @@ static void test_group_partition(unsigned int tileSz) {
|
||||
int* dResult = NULL;
|
||||
int* hResult = NULL;
|
||||
|
||||
hipHostMalloc(&hResult, numTiles * sizeof(int), hipHostMallocDefault);
|
||||
HIPCHECK(hipHostMalloc(&hResult, numTiles * sizeof(int), hipHostMallocDefault));
|
||||
memset(hResult, 0, numTiles * sizeof(int));
|
||||
|
||||
hipMalloc(&dResult, numTiles * sizeof(int));
|
||||
HIPCHECK(hipMalloc(&dResult, numTiles * sizeof(int)));
|
||||
|
||||
|
||||
// Launch Kernel
|
||||
hipLaunchKernelGGL(kernel_cg_group_partition, blockSize, threadsPerBlock,
|
||||
hipLaunchKernelGGL(kernel_cg_group_partition_shfl_down, blockSize, threadsPerBlock,
|
||||
threadsPerBlock * sizeof(int), 0, dResult, tileSz, i);
|
||||
HIP_CHECK(hipGetLastError());
|
||||
err = hipDeviceSynchronize();
|
||||
@@ -164,13 +163,13 @@ static void test_group_partition(unsigned int tileSz) {
|
||||
}
|
||||
|
||||
|
||||
hipMemcpy(hResult, dResult, sizeof(int) * numTiles, hipMemcpyDeviceToHost);
|
||||
HIPCHECK(hipMemcpy(hResult, dResult, sizeof(int) * numTiles, hipMemcpyDeviceToHost));
|
||||
|
||||
verifyResults(hResult, expectedSum, numTiles);
|
||||
|
||||
// Free all allocated memory on host and device
|
||||
hipFree(dResult);
|
||||
hipFree(hResult);
|
||||
HIPCHECK(hipFree(dResult));
|
||||
HIPCHECK(hipHostFree(hResult));
|
||||
delete[] expectedResult;
|
||||
|
||||
printf("\n...PASSED.\n\n");
|
||||
@@ -199,7 +198,7 @@ static void test_shfl_down() {
|
||||
|
||||
int arrSize = blockSize * threadsPerBlock * sizeof(int);
|
||||
|
||||
hipHostMalloc(&hPtr, arrSize);
|
||||
HIPCHECK(hipHostMalloc(&hPtr, arrSize));
|
||||
// Fill up the array
|
||||
for (int i = 0; i < WAVE_SIZE; i++) {
|
||||
hPtr[i] = rand() % 1000;
|
||||
@@ -210,30 +209,30 @@ static void test_shfl_down() {
|
||||
cpuResultsArr[i] = (i + lane_delta >= group_size) ? hPtr[i] : hPtr[i + lane_delta];
|
||||
}
|
||||
//printf("Array passed to GPU for computation\n");
|
||||
//printResults(hPtr, WAVE_SIZE);
|
||||
hipMalloc(&dPtr, group_size_in_bytes);
|
||||
hipMalloc(&dResults, group_size_in_bytes);
|
||||
//printResultsCoalescedGroupsShflDown(hPtr, WAVE_SIZE);
|
||||
HIPCHECK(hipMalloc(&dPtr, group_size_in_bytes));
|
||||
HIPCHECK(hipMalloc(&dResults, group_size_in_bytes));
|
||||
|
||||
hipMemcpy(dPtr, hPtr, group_size_in_bytes, hipMemcpyHostToDevice);
|
||||
HIPCHECK(hipMemcpy(dPtr, hPtr, group_size_in_bytes, hipMemcpyHostToDevice));
|
||||
// Launch Kernel
|
||||
hipLaunchKernelGGL(kernel_shfl_down, blockSize, threadsPerBlock,
|
||||
threadsPerBlock * sizeof(int), 0, dPtr, dResults, lane_delta, i);
|
||||
HIP_CHECK(hipGetLastError());
|
||||
hipMemcpy(hPtr, dResults, group_size_in_bytes, hipMemcpyDeviceToHost);
|
||||
HIPCHECK(hipMemcpy(hPtr, dResults, group_size_in_bytes, hipMemcpyDeviceToHost));
|
||||
err = hipDeviceSynchronize();
|
||||
if (err != hipSuccess) {
|
||||
fprintf(stderr, "Failed to launch kernel (error code %s)!\n", hipGetErrorString(err));
|
||||
}
|
||||
//printf("GPU results: \n");
|
||||
//printResults(hPtr, WAVE_SIZE);
|
||||
//printResultsCoalescedGroupsShflDown(hPtr, WAVE_SIZE);
|
||||
//printf("Printing cpu to be verified array\n");
|
||||
//printResults(cpuResultsArr, WAVE_SIZE);
|
||||
//printResultsCoalescedGroupsShflDown(cpuResultsArr, WAVE_SIZE);
|
||||
|
||||
compareResults(hPtr, cpuResultsArr, group_size_in_bytes);
|
||||
compareResultsCoalescedGroupsShflDown(hPtr, cpuResultsArr, group_size_in_bytes);
|
||||
std::cout << "Results verified!\n";
|
||||
|
||||
hipFree(hPtr);
|
||||
hipFree(dPtr);
|
||||
HIPCHECK(hipHostFree(hPtr));
|
||||
HIPCHECK(hipFree(dPtr));
|
||||
free(cpuResultsArr);
|
||||
}
|
||||
}
|
||||
@@ -246,7 +245,6 @@ TEST_CASE("Unit_coalesced_groups_shfl_down") {
|
||||
ASSERT_EQUAL(hipGetDevice(&deviceId), hipSuccess);
|
||||
hipDeviceProp_t deviceProperties;
|
||||
ASSERT_EQUAL(hipGetDeviceProperties(&deviceProperties, deviceId), hipSuccess);
|
||||
int maxThreadsPerBlock = deviceProperties.maxThreadsPerBlock;
|
||||
|
||||
// Test shfl_down with random group sizes
|
||||
for (int i = 0; i < 100; i++) {
|
||||
|
||||
@@ -40,7 +40,7 @@ __device__ int prefix_sum_kernel(coalesced_group const& g, int val) {
|
||||
for (int i = 1; i < sz; i <<= 1) {
|
||||
int temp = g.shfl_up(val, i);
|
||||
|
||||
if (g.thread_rank() >= i) {
|
||||
if ((int)g.thread_rank() >= i) {
|
||||
val += temp;
|
||||
}
|
||||
}
|
||||
@@ -60,7 +60,7 @@ __global__ void kernel_shfl_up (int * dPtr, int *dResults, int lane_delta, int c
|
||||
|
||||
}
|
||||
|
||||
__global__ void kernel_cg_group_partition(int* dPtr, unsigned int tileSz, int cg_sizes) {
|
||||
__global__ void kernel_cg_group_partition_shfl_up(int* dPtr, unsigned int tileSz, int cg_sizes) {
|
||||
|
||||
int id = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
if (id % cg_sizes == 0) {
|
||||
@@ -110,7 +110,7 @@ void printResults(int* ptr, int size) {
|
||||
std::cout << '\n';
|
||||
}
|
||||
|
||||
void verifyResults(int* cpu, int* gpu, int size) {
|
||||
void verifyResultsCoalescedGroupsShflUp(int* cpu, int* gpu, int size) {
|
||||
for (unsigned int i = 0; i < size / sizeof(int); i++) {
|
||||
if (cpu[i] != gpu[i]) {
|
||||
INFO(" Results do not match.");
|
||||
@@ -132,14 +132,14 @@ static void test_group_partition(unsigned tileSz) {
|
||||
|
||||
int arrSize = blockSize * threadsPerBlock * sizeof(int);
|
||||
|
||||
hipHostMalloc(&hPtr, arrSize);
|
||||
hipMalloc(&dPtr, arrSize);
|
||||
HIPCHECK(hipHostMalloc(&hPtr, arrSize));
|
||||
HIPCHECK(hipMalloc(&dPtr, arrSize));
|
||||
|
||||
// Launch Kernel
|
||||
hipLaunchKernelGGL(kernel_cg_group_partition, blockSize, threadsPerBlock,
|
||||
hipLaunchKernelGGL(kernel_cg_group_partition_shfl_up, blockSize, threadsPerBlock,
|
||||
threadsPerBlock * sizeof(int), 0, dPtr, tileSz, i);
|
||||
HIP_CHECK(hipGetLastError());
|
||||
hipMemcpy(hPtr, dPtr, arrSize, hipMemcpyDeviceToHost);
|
||||
HIPCHECK(hipMemcpy(hPtr, dPtr, arrSize, hipMemcpyDeviceToHost));
|
||||
err = hipDeviceSynchronize();
|
||||
if (err != hipSuccess) {
|
||||
fprintf(stderr, "Failed to launch kernel (error code %s)!\n", hipGetErrorString(err));
|
||||
@@ -153,12 +153,12 @@ static void test_group_partition(unsigned tileSz) {
|
||||
//std::cout << "\nPrefix sum results on GPU\n";
|
||||
//printResults(hPtr, tileSz);
|
||||
std::cout << "\n";
|
||||
verifyResults(hPtr, cpuPrefixSum, tileSz);
|
||||
verifyResultsCoalescedGroupsShflUp(hPtr, cpuPrefixSum, tileSz);
|
||||
std::cout << "Results verified!\n";
|
||||
|
||||
delete[] cpuPrefixSum;
|
||||
hipFree(hPtr);
|
||||
hipFree(dPtr);
|
||||
HIPCHECK(hipHostFree(hPtr));
|
||||
HIPCHECK(hipFree(dPtr));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -185,7 +185,7 @@ static void test_shfl_up() {
|
||||
|
||||
int arrSize = blockSize * threadsPerBlock * sizeof(int);
|
||||
|
||||
hipHostMalloc(&hPtr, arrSize);
|
||||
HIPCHECK(hipHostMalloc(&hPtr, arrSize));
|
||||
// Fill up the array
|
||||
for (int i = 0; i < WAVE_SIZE; i++) {
|
||||
hPtr[i] = rand() % 1000;
|
||||
@@ -200,14 +200,14 @@ static void test_shfl_up() {
|
||||
//printf("Printing cpu results arr\n");
|
||||
//printResults(cpuResultsArr, WAVE_SIZE);
|
||||
|
||||
hipMalloc(&dPtr, group_size_in_bytes);
|
||||
hipMalloc(&dResults, group_size_in_bytes);
|
||||
HIPCHECK(hipMalloc(&dPtr, group_size_in_bytes));
|
||||
HIPCHECK(hipMalloc(&dResults, group_size_in_bytes));
|
||||
|
||||
hipMemcpy(dPtr, hPtr, group_size_in_bytes, hipMemcpyHostToDevice);
|
||||
HIPCHECK(hipMemcpy(dPtr, hPtr, group_size_in_bytes, hipMemcpyHostToDevice));
|
||||
// Launch Kernel
|
||||
hipLaunchKernelGGL(kernel_shfl_up, blockSize, threadsPerBlock,
|
||||
threadsPerBlock * sizeof(int), 0, dPtr, dResults, lane_delta, i);
|
||||
hipMemcpy(hPtr, dResults, group_size_in_bytes, hipMemcpyDeviceToHost);
|
||||
HIPCHECK(hipMemcpy(hPtr, dResults, group_size_in_bytes, hipMemcpyDeviceToHost));
|
||||
HIP_CHECK(hipGetLastError());
|
||||
err = hipDeviceSynchronize();
|
||||
if (err != hipSuccess) {
|
||||
@@ -216,22 +216,21 @@ static void test_shfl_up() {
|
||||
//printf("GPU computation array :\n");
|
||||
//printResults(hPtr, WAVE_SIZE);
|
||||
|
||||
verifyResults(hPtr, cpuResultsArr, group_size_in_bytes);
|
||||
verifyResultsCoalescedGroupsShflUp(hPtr, cpuResultsArr, group_size_in_bytes);
|
||||
std::cout << "Results verified!\n";
|
||||
|
||||
hipFree(hPtr);
|
||||
hipFree(dPtr);
|
||||
HIPCHECK(hipHostFree(hPtr));
|
||||
HIPCHECK(hipFree(dPtr));
|
||||
free(cpuResultsArr);
|
||||
}
|
||||
}
|
||||
|
||||
TEST_CASE("Unit_coalesced_groups_shfl_down") {
|
||||
TEST_CASE("Unit_coalesced_groups_shfl_up") {
|
||||
// Use default device for validating the test
|
||||
int deviceId;
|
||||
ASSERT_EQUAL(hipGetDevice(&deviceId), hipSuccess);
|
||||
hipDeviceProp_t deviceProperties;
|
||||
ASSERT_EQUAL(hipGetDeviceProperties(&deviceProperties, deviceId), hipSuccess);
|
||||
int maxThreadsPerBlock = deviceProperties.maxThreadsPerBlock;
|
||||
|
||||
for (int i = 0; i < 100; i++) {
|
||||
test_shfl_up();
|
||||
|
||||
@@ -51,7 +51,7 @@ using namespace cooperative_groups;
|
||||
|
||||
__device__ int atomicAggInc(int *ptr) {
|
||||
coalesced_group g = coalesced_threads();
|
||||
int prev;
|
||||
int prev = 0;
|
||||
// elect the first active thread to perform atomic add
|
||||
if (g.thread_rank() == 0) {
|
||||
prev = atomicAdd(ptr, g.size());
|
||||
@@ -104,7 +104,6 @@ __global__ void filter_arr(int *dst, int *nres, const int *src, int n) {
|
||||
*/
|
||||
__device__ int reduction_kernel(coalesced_group g, int* x, int val) {
|
||||
int lane = g.thread_rank();
|
||||
int sz = g.size();
|
||||
|
||||
for (int i = g.size() / 2; i > 0; i /= 2) {
|
||||
// use lds to store the temporary result
|
||||
@@ -138,7 +137,6 @@ __global__ void kernel_cg_coalesced_group_partition(unsigned int tileSz, int* re
|
||||
int id = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
if (id % cg_sizes == 0) {
|
||||
coalesced_group threadBlockCGTy = coalesced_threads();
|
||||
int threadBlockGroupSize = threadBlockCGTy.size();
|
||||
|
||||
int* workspace = NULL;
|
||||
|
||||
@@ -150,13 +148,11 @@ __global__ void kernel_cg_coalesced_group_partition(unsigned int tileSz, int* re
|
||||
workspace = sharedMem;
|
||||
}
|
||||
|
||||
int input, outputSum, expectedOutput;
|
||||
int input, outputSum;
|
||||
|
||||
// input to reduction, for each thread, is its' rank in the group
|
||||
input = threadBlockCGTy.thread_rank();
|
||||
|
||||
expectedOutput = (threadBlockGroupSize - 1) * threadBlockGroupSize / 2;
|
||||
|
||||
outputSum = reduction_kernel(threadBlockCGTy, workspace, input);
|
||||
|
||||
if (threadBlockCGTy.thread_rank() == 0) {
|
||||
@@ -189,10 +185,8 @@ __global__ void kernel_cg_coalesced_group_partition(unsigned int tileSz, int* re
|
||||
|
||||
__global__ void kernel_coalesced_active_groups() {
|
||||
thread_block threadBlockCGTy = this_thread_block();
|
||||
int threadBlockGroupSize = threadBlockCGTy.size();
|
||||
|
||||
// input to reduction, for each thread, is its' rank in the group
|
||||
int input = threadBlockCGTy.thread_rank();
|
||||
|
||||
if (threadBlockCGTy.thread_rank() == 0) {
|
||||
printf(" Creating odd and even set of active thread groups based on branch divergence\n\n");
|
||||
@@ -222,14 +216,14 @@ __global__ void kernel_coalesced_active_groups() {
|
||||
return;
|
||||
}
|
||||
|
||||
void printResults(int* ptr, int size) {
|
||||
void printResultsSimpleCoalescedGroups(int* ptr, int size) {
|
||||
for (int i = 0; i < size; i++) {
|
||||
std::cout << ptr[i] << " ";
|
||||
}
|
||||
std::cout << '\n';
|
||||
}
|
||||
|
||||
void compareResults(int* cpu, int* gpu, int size) {
|
||||
void compareResultsSimpleCoalescedGroups(int* cpu, int* gpu, int size) {
|
||||
for (unsigned int i = 0; i < size / sizeof(int); i++) {
|
||||
if (cpu[i] != gpu[i]) {
|
||||
INFO(" results do not match.");
|
||||
@@ -254,7 +248,7 @@ static void test_active_threads_grouping() {
|
||||
}
|
||||
|
||||
// Search if the sum exists in the expected results array
|
||||
void verifyResults(int* hPtr, int* dPtr, int size) {
|
||||
void verifyResultsSimpleCoalescedGroups(int* hPtr, int* dPtr, int size) {
|
||||
int i = 0, j = 0;
|
||||
for (i = 0; i < size; i++) {
|
||||
for (j = 0; j < size; j++) {
|
||||
@@ -294,15 +288,15 @@ static void test_group_partition(unsigned int tileSz, bool useGlobalMem) {
|
||||
}
|
||||
|
||||
int* dResult = NULL;
|
||||
hipMalloc(&dResult, sizeof(int) * numTiles);
|
||||
HIPCHECK(hipMalloc(&dResult, sizeof(int) * numTiles));
|
||||
|
||||
int* globalMem = NULL;
|
||||
if (useGlobalMem) {
|
||||
hipMalloc((void**)&globalMem, threadsPerBlock * sizeof(int));
|
||||
HIPCHECK(hipMalloc((void**)&globalMem, threadsPerBlock * sizeof(int)));
|
||||
}
|
||||
|
||||
int* hResult = NULL;
|
||||
hipHostMalloc(&hResult, numTiles * sizeof(int), hipHostMallocDefault);
|
||||
HIPCHECK(hipHostMalloc(&hResult, numTiles * sizeof(int), hipHostMallocDefault));
|
||||
memset(hResult, 0, numTiles * sizeof(int));
|
||||
|
||||
// Launch Kernel
|
||||
@@ -326,13 +320,13 @@ static void test_group_partition(unsigned int tileSz, bool useGlobalMem) {
|
||||
}
|
||||
}
|
||||
|
||||
hipMemcpy(hResult, dResult, numTiles * sizeof(int), hipMemcpyDeviceToHost);
|
||||
verifyResults(expectedSum, hResult, numTiles);
|
||||
HIPCHECK(hipMemcpy(hResult, dResult, numTiles * sizeof(int), hipMemcpyDeviceToHost));
|
||||
verifyResultsSimpleCoalescedGroups(expectedSum, hResult, numTiles);
|
||||
// Free all allocated memory on host and device
|
||||
hipFree(dResult);
|
||||
hipFree(hResult);
|
||||
HIPCHECK(hipFree(dResult));
|
||||
HIPCHECK(hipHostFree(hResult));
|
||||
if (useGlobalMem) {
|
||||
hipFree(globalMem);
|
||||
HIPCHECK(hipFree(globalMem));
|
||||
}
|
||||
delete[] expectedSum;
|
||||
|
||||
@@ -363,7 +357,7 @@ static void test_shfl_any_to_any() {
|
||||
|
||||
int arrSize = blockSize * threadsPerBlock * sizeof(int);
|
||||
|
||||
hipHostMalloc(&hPtr, arrSize);
|
||||
HIPCHECK(hipHostMalloc(&hPtr, arrSize));
|
||||
// Fill up the array
|
||||
for (int i = 0; i < WAVE_SIZE; i++) {
|
||||
hPtr[i] = rand() % 1000;
|
||||
@@ -382,36 +376,36 @@ static void test_shfl_any_to_any() {
|
||||
}
|
||||
|
||||
//printf("Array passed to GPU for computation\n");
|
||||
//printResults(hPtr, WAVE_SIZE);
|
||||
hipMalloc(&dPtr, group_size_in_bytes);
|
||||
hipMalloc(&dResults, group_size_in_bytes);
|
||||
//printResultsSimpleCoalescedGroups(hPtr, WAVE_SIZE);
|
||||
HIPCHECK(hipMalloc(&dPtr, group_size_in_bytes));
|
||||
HIPCHECK(hipMalloc(&dResults, group_size_in_bytes));
|
||||
|
||||
hipMalloc(&dsrcArr, group_size_in_bytes);
|
||||
hipMemcpy(dsrcArr, srcArr, group_size_in_bytes, hipMemcpyHostToDevice);
|
||||
HIPCHECK(hipMalloc(&dsrcArr, group_size_in_bytes));
|
||||
HIPCHECK(hipMemcpy(dsrcArr, srcArr, group_size_in_bytes, hipMemcpyHostToDevice));
|
||||
|
||||
hipMemcpy(dPtr, hPtr, group_size_in_bytes, hipMemcpyHostToDevice);
|
||||
HIPCHECK(hipMemcpy(dPtr, hPtr, group_size_in_bytes, hipMemcpyHostToDevice));
|
||||
// Launch Kernel
|
||||
hipLaunchKernelGGL(kernel_shfl_any_to_any, blockSize, threadsPerBlock,
|
||||
threadsPerBlock * sizeof(int), 0 , dPtr, dsrcArr, dResults, i);
|
||||
HIP_CHECK(hipGetLastError());
|
||||
hipMemcpy(hPtr, dResults, group_size_in_bytes, hipMemcpyDeviceToHost);
|
||||
HIPCHECK(hipMemcpy(hPtr, dResults, group_size_in_bytes, hipMemcpyDeviceToHost));
|
||||
err = hipDeviceSynchronize();
|
||||
if (err != hipSuccess) {
|
||||
fprintf(stderr, "Failed to launch kernel (error code %s)!\n", hipGetErrorString(err));
|
||||
}
|
||||
//printf("GPU results: \n");
|
||||
//printResults(hPtr, group_size);
|
||||
//printResultsSimpleCoalescedGroups(hPtr, group_size);
|
||||
//printf("Printing cpu to be verified array\n");
|
||||
//printResults(cpuResultsArr, group_size);
|
||||
//printResultsSimpleCoalescedGroups(cpuResultsArr, group_size);
|
||||
//printf("Printing srcLane array that was passed\n");
|
||||
//printResults(srcArr, group_size);
|
||||
//printResultsSimpleCoalescedGroups(srcArr, group_size);
|
||||
//printf("Printing srcLane array on the CPU\n");
|
||||
//printResults(srcArrCpu, group_size);
|
||||
compareResults(hPtr, cpuResultsArr, group_size_in_bytes);
|
||||
//printResultsSimpleCoalescedGroups(srcArrCpu, group_size);
|
||||
compareResultsSimpleCoalescedGroups(hPtr, cpuResultsArr, group_size_in_bytes);
|
||||
std::cout << "Results verified!\n";
|
||||
|
||||
hipFree(hPtr);
|
||||
hipFree(dPtr);
|
||||
HIPCHECK(hipHostFree(hPtr));
|
||||
HIPCHECK(hipFree(dPtr));
|
||||
free(srcArr);
|
||||
free(srcArrCpu);
|
||||
free(cpuResultsArr);
|
||||
@@ -440,7 +434,7 @@ static void test_shfl_broadcast() {
|
||||
|
||||
int arrSize = blockSize * threadsPerBlock * sizeof(int);
|
||||
|
||||
hipHostMalloc(&hPtr, arrSize);
|
||||
HIPCHECK(hipHostMalloc(&hPtr, arrSize));
|
||||
// Fill up the array
|
||||
for (int i = 0; i < WAVE_SIZE; i++) {
|
||||
hPtr[i] = rand() % 1000;
|
||||
@@ -455,30 +449,30 @@ static void test_shfl_broadcast() {
|
||||
cpuResultsArr[i] = srcLaneCpu;
|
||||
}
|
||||
printf("Array passed to GPU for computation\n");
|
||||
printResults(hPtr, WAVE_SIZE);
|
||||
hipMalloc(&dPtr, group_size_in_bytes);
|
||||
hipMalloc(&dResults, group_size_in_bytes);
|
||||
printResultsSimpleCoalescedGroups(hPtr, WAVE_SIZE);
|
||||
HIPCHECK(hipMalloc(&dPtr, group_size_in_bytes));
|
||||
HIPCHECK(hipMalloc(&dResults, group_size_in_bytes));
|
||||
|
||||
hipMemcpy(dPtr, hPtr, group_size_in_bytes, hipMemcpyHostToDevice);
|
||||
HIPCHECK(hipMemcpy(dPtr, hPtr, group_size_in_bytes, hipMemcpyHostToDevice));
|
||||
// Launch Kernel
|
||||
hipLaunchKernelGGL(kernel_shfl, blockSize, threadsPerBlock,
|
||||
threadsPerBlock * sizeof(int), 0, dPtr, dResults, srcLane, i);
|
||||
HIP_CHECK(hipGetLastError());
|
||||
hipMemcpy(hPtr, dResults, group_size_in_bytes, hipMemcpyDeviceToHost);
|
||||
HIPCHECK(hipMemcpy(hPtr, dResults, group_size_in_bytes, hipMemcpyDeviceToHost));
|
||||
err = hipDeviceSynchronize();
|
||||
if (err != hipSuccess) {
|
||||
fprintf(stderr, "Failed to launch kernel (error code %s)!\n", hipGetErrorString(err));
|
||||
}
|
||||
printf("GPU results: \n");
|
||||
printResults(hPtr, group_size);
|
||||
printResultsSimpleCoalescedGroups(hPtr, group_size);
|
||||
printf("Printing cpu to be verified array\n");
|
||||
printResults(cpuResultsArr, group_size);
|
||||
printResultsSimpleCoalescedGroups(cpuResultsArr, group_size);
|
||||
|
||||
compareResults(hPtr, cpuResultsArr, group_size_in_bytes);
|
||||
compareResultsSimpleCoalescedGroups(hPtr, cpuResultsArr, group_size_in_bytes);
|
||||
std::cout << "Results verified!\n";
|
||||
|
||||
hipFree(hPtr);
|
||||
hipFree(dPtr);
|
||||
HIPCHECK(hipHostFree(hPtr));
|
||||
HIPCHECK(hipFree(dPtr));
|
||||
free(cpuResultsArr);
|
||||
}
|
||||
}
|
||||
@@ -489,7 +483,6 @@ TEST_CASE("Unit_coalesced_groups") {
|
||||
HIP_CHECK(hipGetDevice(&deviceId));
|
||||
hipDeviceProp_t deviceProperties;
|
||||
HIP_CHECK(hipGetDeviceProperties(&deviceProperties, deviceId));
|
||||
int maxThreadsPerBlock = deviceProperties.maxThreadsPerBlock;
|
||||
|
||||
std::cout << "Now testing coalesced_groups" << '\n' << std::endl;
|
||||
|
||||
@@ -512,7 +505,7 @@ TEST_CASE("Unit_coalesced_groups") {
|
||||
|
||||
HIP_CHECK(hipMemcpy(d_data_to_filter, data_to_filter,
|
||||
sizeof(int) * NUM_ELEMS, hipMemcpyHostToDevice));
|
||||
hipMemset(d_nres, 0, sizeof(int));
|
||||
HIPCHECK(hipMemset(d_nres, 0, sizeof(int)));
|
||||
|
||||
dim3 dimBlock(NUM_THREADS_PER_BLOCK, 1, 1);
|
||||
dim3 dimGrid((NUM_ELEMS / NUM_THREADS_PER_BLOCK) + 1, 1, 1);
|
||||
|
||||
Fai riferimento in un nuovo problema
Block a user