diff --git a/catch/unit/cooperativeGrps/CMakeLists.txt b/catch/unit/cooperativeGrps/CMakeLists.txt index 61c567aee4..ce7632fb8b 100644 --- a/catch/unit/cooperativeGrps/CMakeLists.txt +++ b/catch/unit/cooperativeGrps/CMakeLists.txt @@ -6,6 +6,9 @@ set(TEST_SRC hipCGMultiGridGroupType.cc hipCGMultiGridGroupTypeViaBaseType.cc hipCGMultiGridGroupTypeViaPublicApi.cc + coalesced_groups_shfl_down.cc + coalesced_groups_shfl_up.cc + simple_coalesced_groups.cc ) if(HIP_PLATFORM STREQUAL "nvidia") set_source_files_properties(hipCGMultiGridGroupType.cc PROPERTIES COMPILE_FLAGS "-rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80") diff --git a/catch/unit/cooperativeGrps/coalesced_groups_shfl_down.cc b/catch/unit/cooperativeGrps/coalesced_groups_shfl_down.cc index 63d569c0e5..de13e8eda1 100644 --- a/catch/unit/cooperativeGrps/coalesced_groups_shfl_down.cc +++ b/catch/unit/cooperativeGrps/coalesced_groups_shfl_down.cc @@ -66,7 +66,7 @@ __global__ void kernel_shfl_down (int * dPtr, int *dResults, int lane_delta, int } } -__global__ void kernel_cg_group_partition(int* result, unsigned int tileSz, int cg_sizes) { +__global__ void kernel_cg_group_partition_shfl_down(int* result, unsigned int tileSz, int cg_sizes) { int id = threadIdx.x + blockIdx.x * blockDim.x; if (id % cg_sizes == 0) { @@ -82,7 +82,6 @@ __global__ void kernel_cg_group_partition(int* result, unsigned int tileSz, int threadBlockCGTy.sync(); coalesced_group tiledPartition = tiled_partition(threadBlockCGTy, tileSz); - int threadRank = tiledPartition.thread_rank(); input = tiledPartition.thread_rank(); @@ -110,7 +109,7 @@ void verifyResults(int* ptr, int expectedResult, int numTiles) { } } -void compareResults(int* cpu, int* gpu, int size) { +void compareResultsCoalescedGroupsShflDown(int* cpu, int* gpu, int size) { for (unsigned int i = 0; i < size / sizeof(int); i++) { if (cpu[i] != gpu[i]) { INFO(" results do not match."); @@ -118,7 +117,7 @@ void compareResults(int* cpu, int* gpu, int size) { } } -void printResults(int* ptr, int size) { +void printResultsCoalescedGroupsShflDown(int* ptr, int size) { for (int i = 0; i < size; i++) { std::cout << ptr[i] << " "; } @@ -148,14 +147,14 @@ static void test_group_partition(unsigned int tileSz) { int* dResult = NULL; int* hResult = NULL; - hipHostMalloc(&hResult, numTiles * sizeof(int), hipHostMallocDefault); + HIPCHECK(hipHostMalloc(&hResult, numTiles * sizeof(int), hipHostMallocDefault)); memset(hResult, 0, numTiles * sizeof(int)); - hipMalloc(&dResult, numTiles * sizeof(int)); + HIPCHECK(hipMalloc(&dResult, numTiles * sizeof(int))); // Launch Kernel - hipLaunchKernelGGL(kernel_cg_group_partition, blockSize, threadsPerBlock, + hipLaunchKernelGGL(kernel_cg_group_partition_shfl_down, blockSize, threadsPerBlock, threadsPerBlock * sizeof(int), 0, dResult, tileSz, i); HIP_CHECK(hipGetLastError()); err = hipDeviceSynchronize(); @@ -164,13 +163,13 @@ static void test_group_partition(unsigned int tileSz) { } - hipMemcpy(hResult, dResult, sizeof(int) * numTiles, hipMemcpyDeviceToHost); + HIPCHECK(hipMemcpy(hResult, dResult, sizeof(int) * numTiles, hipMemcpyDeviceToHost)); verifyResults(hResult, expectedSum, numTiles); // Free all allocated memory on host and device - hipFree(dResult); - hipFree(hResult); + HIPCHECK(hipFree(dResult)); + HIPCHECK(hipHostFree(hResult)); delete[] expectedResult; printf("\n...PASSED.\n\n"); @@ -199,7 +198,7 @@ static void test_shfl_down() { int arrSize = blockSize * threadsPerBlock * sizeof(int); - hipHostMalloc(&hPtr, arrSize); + HIPCHECK(hipHostMalloc(&hPtr, arrSize)); // Fill up the array for (int i = 0; i < WAVE_SIZE; i++) { hPtr[i] = rand() % 1000; @@ -210,30 +209,30 @@ static void test_shfl_down() { cpuResultsArr[i] = (i + lane_delta >= group_size) ? hPtr[i] : hPtr[i + lane_delta]; } //printf("Array passed to GPU for computation\n"); - //printResults(hPtr, WAVE_SIZE); - hipMalloc(&dPtr, group_size_in_bytes); - hipMalloc(&dResults, group_size_in_bytes); + //printResultsCoalescedGroupsShflDown(hPtr, WAVE_SIZE); + HIPCHECK(hipMalloc(&dPtr, group_size_in_bytes)); + HIPCHECK(hipMalloc(&dResults, group_size_in_bytes)); - hipMemcpy(dPtr, hPtr, group_size_in_bytes, hipMemcpyHostToDevice); + HIPCHECK(hipMemcpy(dPtr, hPtr, group_size_in_bytes, hipMemcpyHostToDevice)); // Launch Kernel hipLaunchKernelGGL(kernel_shfl_down, blockSize, threadsPerBlock, threadsPerBlock * sizeof(int), 0, dPtr, dResults, lane_delta, i); HIP_CHECK(hipGetLastError()); - hipMemcpy(hPtr, dResults, group_size_in_bytes, hipMemcpyDeviceToHost); + HIPCHECK(hipMemcpy(hPtr, dResults, group_size_in_bytes, hipMemcpyDeviceToHost)); err = hipDeviceSynchronize(); if (err != hipSuccess) { fprintf(stderr, "Failed to launch kernel (error code %s)!\n", hipGetErrorString(err)); } //printf("GPU results: \n"); - //printResults(hPtr, WAVE_SIZE); + //printResultsCoalescedGroupsShflDown(hPtr, WAVE_SIZE); //printf("Printing cpu to be verified array\n"); - //printResults(cpuResultsArr, WAVE_SIZE); + //printResultsCoalescedGroupsShflDown(cpuResultsArr, WAVE_SIZE); - compareResults(hPtr, cpuResultsArr, group_size_in_bytes); + compareResultsCoalescedGroupsShflDown(hPtr, cpuResultsArr, group_size_in_bytes); std::cout << "Results verified!\n"; - hipFree(hPtr); - hipFree(dPtr); + HIPCHECK(hipHostFree(hPtr)); + HIPCHECK(hipFree(dPtr)); free(cpuResultsArr); } } @@ -246,7 +245,6 @@ TEST_CASE("Unit_coalesced_groups_shfl_down") { ASSERT_EQUAL(hipGetDevice(&deviceId), hipSuccess); hipDeviceProp_t deviceProperties; ASSERT_EQUAL(hipGetDeviceProperties(&deviceProperties, deviceId), hipSuccess); - int maxThreadsPerBlock = deviceProperties.maxThreadsPerBlock; // Test shfl_down with random group sizes for (int i = 0; i < 100; i++) { diff --git a/catch/unit/cooperativeGrps/coalesced_groups_shfl_up.cc b/catch/unit/cooperativeGrps/coalesced_groups_shfl_up.cc index e3c9d3d9f4..75ca40a6d7 100644 --- a/catch/unit/cooperativeGrps/coalesced_groups_shfl_up.cc +++ b/catch/unit/cooperativeGrps/coalesced_groups_shfl_up.cc @@ -40,7 +40,7 @@ __device__ int prefix_sum_kernel(coalesced_group const& g, int val) { for (int i = 1; i < sz; i <<= 1) { int temp = g.shfl_up(val, i); - if (g.thread_rank() >= i) { + if ((int)g.thread_rank() >= i) { val += temp; } } @@ -60,7 +60,7 @@ __global__ void kernel_shfl_up (int * dPtr, int *dResults, int lane_delta, int c } -__global__ void kernel_cg_group_partition(int* dPtr, unsigned int tileSz, int cg_sizes) { +__global__ void kernel_cg_group_partition_shfl_up(int* dPtr, unsigned int tileSz, int cg_sizes) { int id = threadIdx.x + blockIdx.x * blockDim.x; if (id % cg_sizes == 0) { @@ -110,7 +110,7 @@ void printResults(int* ptr, int size) { std::cout << '\n'; } -void verifyResults(int* cpu, int* gpu, int size) { +void verifyResultsCoalescedGroupsShflUp(int* cpu, int* gpu, int size) { for (unsigned int i = 0; i < size / sizeof(int); i++) { if (cpu[i] != gpu[i]) { INFO(" Results do not match."); @@ -132,14 +132,14 @@ static void test_group_partition(unsigned tileSz) { int arrSize = blockSize * threadsPerBlock * sizeof(int); - hipHostMalloc(&hPtr, arrSize); - hipMalloc(&dPtr, arrSize); + HIPCHECK(hipHostMalloc(&hPtr, arrSize)); + HIPCHECK(hipMalloc(&dPtr, arrSize)); // Launch Kernel - hipLaunchKernelGGL(kernel_cg_group_partition, blockSize, threadsPerBlock, + hipLaunchKernelGGL(kernel_cg_group_partition_shfl_up, blockSize, threadsPerBlock, threadsPerBlock * sizeof(int), 0, dPtr, tileSz, i); HIP_CHECK(hipGetLastError()); - hipMemcpy(hPtr, dPtr, arrSize, hipMemcpyDeviceToHost); + HIPCHECK(hipMemcpy(hPtr, dPtr, arrSize, hipMemcpyDeviceToHost)); err = hipDeviceSynchronize(); if (err != hipSuccess) { fprintf(stderr, "Failed to launch kernel (error code %s)!\n", hipGetErrorString(err)); @@ -153,12 +153,12 @@ static void test_group_partition(unsigned tileSz) { //std::cout << "\nPrefix sum results on GPU\n"; //printResults(hPtr, tileSz); std::cout << "\n"; - verifyResults(hPtr, cpuPrefixSum, tileSz); + verifyResultsCoalescedGroupsShflUp(hPtr, cpuPrefixSum, tileSz); std::cout << "Results verified!\n"; delete[] cpuPrefixSum; - hipFree(hPtr); - hipFree(dPtr); + HIPCHECK(hipHostFree(hPtr)); + HIPCHECK(hipFree(dPtr)); } } @@ -185,7 +185,7 @@ static void test_shfl_up() { int arrSize = blockSize * threadsPerBlock * sizeof(int); - hipHostMalloc(&hPtr, arrSize); + HIPCHECK(hipHostMalloc(&hPtr, arrSize)); // Fill up the array for (int i = 0; i < WAVE_SIZE; i++) { hPtr[i] = rand() % 1000; @@ -200,14 +200,14 @@ static void test_shfl_up() { //printf("Printing cpu results arr\n"); //printResults(cpuResultsArr, WAVE_SIZE); - hipMalloc(&dPtr, group_size_in_bytes); - hipMalloc(&dResults, group_size_in_bytes); + HIPCHECK(hipMalloc(&dPtr, group_size_in_bytes)); + HIPCHECK(hipMalloc(&dResults, group_size_in_bytes)); - hipMemcpy(dPtr, hPtr, group_size_in_bytes, hipMemcpyHostToDevice); + HIPCHECK(hipMemcpy(dPtr, hPtr, group_size_in_bytes, hipMemcpyHostToDevice)); // Launch Kernel hipLaunchKernelGGL(kernel_shfl_up, blockSize, threadsPerBlock, threadsPerBlock * sizeof(int), 0, dPtr, dResults, lane_delta, i); - hipMemcpy(hPtr, dResults, group_size_in_bytes, hipMemcpyDeviceToHost); + HIPCHECK(hipMemcpy(hPtr, dResults, group_size_in_bytes, hipMemcpyDeviceToHost)); HIP_CHECK(hipGetLastError()); err = hipDeviceSynchronize(); if (err != hipSuccess) { @@ -216,22 +216,21 @@ static void test_shfl_up() { //printf("GPU computation array :\n"); //printResults(hPtr, WAVE_SIZE); - verifyResults(hPtr, cpuResultsArr, group_size_in_bytes); + verifyResultsCoalescedGroupsShflUp(hPtr, cpuResultsArr, group_size_in_bytes); std::cout << "Results verified!\n"; - hipFree(hPtr); - hipFree(dPtr); + HIPCHECK(hipHostFree(hPtr)); + HIPCHECK(hipFree(dPtr)); free(cpuResultsArr); } } -TEST_CASE("Unit_coalesced_groups_shfl_down") { +TEST_CASE("Unit_coalesced_groups_shfl_up") { // Use default device for validating the test int deviceId; ASSERT_EQUAL(hipGetDevice(&deviceId), hipSuccess); hipDeviceProp_t deviceProperties; ASSERT_EQUAL(hipGetDeviceProperties(&deviceProperties, deviceId), hipSuccess); - int maxThreadsPerBlock = deviceProperties.maxThreadsPerBlock; for (int i = 0; i < 100; i++) { test_shfl_up(); diff --git a/catch/unit/cooperativeGrps/simple_coalesced_groups.cc b/catch/unit/cooperativeGrps/simple_coalesced_groups.cc index cef1d94e96..b07ccfe96f 100644 --- a/catch/unit/cooperativeGrps/simple_coalesced_groups.cc +++ b/catch/unit/cooperativeGrps/simple_coalesced_groups.cc @@ -51,7 +51,7 @@ using namespace cooperative_groups; __device__ int atomicAggInc(int *ptr) { coalesced_group g = coalesced_threads(); - int prev; + int prev = 0; // elect the first active thread to perform atomic add if (g.thread_rank() == 0) { prev = atomicAdd(ptr, g.size()); @@ -104,7 +104,6 @@ __global__ void filter_arr(int *dst, int *nres, const int *src, int n) { */ __device__ int reduction_kernel(coalesced_group g, int* x, int val) { int lane = g.thread_rank(); - int sz = g.size(); for (int i = g.size() / 2; i > 0; i /= 2) { // use lds to store the temporary result @@ -138,7 +137,6 @@ __global__ void kernel_cg_coalesced_group_partition(unsigned int tileSz, int* re int id = threadIdx.x + blockIdx.x * blockDim.x; if (id % cg_sizes == 0) { coalesced_group threadBlockCGTy = coalesced_threads(); - int threadBlockGroupSize = threadBlockCGTy.size(); int* workspace = NULL; @@ -150,13 +148,11 @@ __global__ void kernel_cg_coalesced_group_partition(unsigned int tileSz, int* re workspace = sharedMem; } - int input, outputSum, expectedOutput; + int input, outputSum; // input to reduction, for each thread, is its' rank in the group input = threadBlockCGTy.thread_rank(); - expectedOutput = (threadBlockGroupSize - 1) * threadBlockGroupSize / 2; - outputSum = reduction_kernel(threadBlockCGTy, workspace, input); if (threadBlockCGTy.thread_rank() == 0) { @@ -189,10 +185,8 @@ __global__ void kernel_cg_coalesced_group_partition(unsigned int tileSz, int* re __global__ void kernel_coalesced_active_groups() { thread_block threadBlockCGTy = this_thread_block(); - int threadBlockGroupSize = threadBlockCGTy.size(); // input to reduction, for each thread, is its' rank in the group - int input = threadBlockCGTy.thread_rank(); if (threadBlockCGTy.thread_rank() == 0) { printf(" Creating odd and even set of active thread groups based on branch divergence\n\n"); @@ -222,14 +216,14 @@ __global__ void kernel_coalesced_active_groups() { return; } -void printResults(int* ptr, int size) { +void printResultsSimpleCoalescedGroups(int* ptr, int size) { for (int i = 0; i < size; i++) { std::cout << ptr[i] << " "; } std::cout << '\n'; } -void compareResults(int* cpu, int* gpu, int size) { +void compareResultsSimpleCoalescedGroups(int* cpu, int* gpu, int size) { for (unsigned int i = 0; i < size / sizeof(int); i++) { if (cpu[i] != gpu[i]) { INFO(" results do not match."); @@ -254,7 +248,7 @@ static void test_active_threads_grouping() { } // Search if the sum exists in the expected results array -void verifyResults(int* hPtr, int* dPtr, int size) { +void verifyResultsSimpleCoalescedGroups(int* hPtr, int* dPtr, int size) { int i = 0, j = 0; for (i = 0; i < size; i++) { for (j = 0; j < size; j++) { @@ -294,15 +288,15 @@ static void test_group_partition(unsigned int tileSz, bool useGlobalMem) { } int* dResult = NULL; - hipMalloc(&dResult, sizeof(int) * numTiles); + HIPCHECK(hipMalloc(&dResult, sizeof(int) * numTiles)); int* globalMem = NULL; if (useGlobalMem) { - hipMalloc((void**)&globalMem, threadsPerBlock * sizeof(int)); + HIPCHECK(hipMalloc((void**)&globalMem, threadsPerBlock * sizeof(int))); } int* hResult = NULL; - hipHostMalloc(&hResult, numTiles * sizeof(int), hipHostMallocDefault); + HIPCHECK(hipHostMalloc(&hResult, numTiles * sizeof(int), hipHostMallocDefault)); memset(hResult, 0, numTiles * sizeof(int)); // Launch Kernel @@ -326,13 +320,13 @@ static void test_group_partition(unsigned int tileSz, bool useGlobalMem) { } } - hipMemcpy(hResult, dResult, numTiles * sizeof(int), hipMemcpyDeviceToHost); - verifyResults(expectedSum, hResult, numTiles); + HIPCHECK(hipMemcpy(hResult, dResult, numTiles * sizeof(int), hipMemcpyDeviceToHost)); + verifyResultsSimpleCoalescedGroups(expectedSum, hResult, numTiles); // Free all allocated memory on host and device - hipFree(dResult); - hipFree(hResult); + HIPCHECK(hipFree(dResult)); + HIPCHECK(hipHostFree(hResult)); if (useGlobalMem) { - hipFree(globalMem); + HIPCHECK(hipFree(globalMem)); } delete[] expectedSum; @@ -363,7 +357,7 @@ static void test_shfl_any_to_any() { int arrSize = blockSize * threadsPerBlock * sizeof(int); - hipHostMalloc(&hPtr, arrSize); + HIPCHECK(hipHostMalloc(&hPtr, arrSize)); // Fill up the array for (int i = 0; i < WAVE_SIZE; i++) { hPtr[i] = rand() % 1000; @@ -382,36 +376,36 @@ static void test_shfl_any_to_any() { } //printf("Array passed to GPU for computation\n"); - //printResults(hPtr, WAVE_SIZE); - hipMalloc(&dPtr, group_size_in_bytes); - hipMalloc(&dResults, group_size_in_bytes); + //printResultsSimpleCoalescedGroups(hPtr, WAVE_SIZE); + HIPCHECK(hipMalloc(&dPtr, group_size_in_bytes)); + HIPCHECK(hipMalloc(&dResults, group_size_in_bytes)); - hipMalloc(&dsrcArr, group_size_in_bytes); - hipMemcpy(dsrcArr, srcArr, group_size_in_bytes, hipMemcpyHostToDevice); + HIPCHECK(hipMalloc(&dsrcArr, group_size_in_bytes)); + HIPCHECK(hipMemcpy(dsrcArr, srcArr, group_size_in_bytes, hipMemcpyHostToDevice)); - hipMemcpy(dPtr, hPtr, group_size_in_bytes, hipMemcpyHostToDevice); + HIPCHECK(hipMemcpy(dPtr, hPtr, group_size_in_bytes, hipMemcpyHostToDevice)); // Launch Kernel hipLaunchKernelGGL(kernel_shfl_any_to_any, blockSize, threadsPerBlock, threadsPerBlock * sizeof(int), 0 , dPtr, dsrcArr, dResults, i); HIP_CHECK(hipGetLastError()); - hipMemcpy(hPtr, dResults, group_size_in_bytes, hipMemcpyDeviceToHost); + HIPCHECK(hipMemcpy(hPtr, dResults, group_size_in_bytes, hipMemcpyDeviceToHost)); err = hipDeviceSynchronize(); if (err != hipSuccess) { fprintf(stderr, "Failed to launch kernel (error code %s)!\n", hipGetErrorString(err)); } //printf("GPU results: \n"); - //printResults(hPtr, group_size); + //printResultsSimpleCoalescedGroups(hPtr, group_size); //printf("Printing cpu to be verified array\n"); - //printResults(cpuResultsArr, group_size); + //printResultsSimpleCoalescedGroups(cpuResultsArr, group_size); //printf("Printing srcLane array that was passed\n"); - //printResults(srcArr, group_size); + //printResultsSimpleCoalescedGroups(srcArr, group_size); //printf("Printing srcLane array on the CPU\n"); - //printResults(srcArrCpu, group_size); - compareResults(hPtr, cpuResultsArr, group_size_in_bytes); + //printResultsSimpleCoalescedGroups(srcArrCpu, group_size); + compareResultsSimpleCoalescedGroups(hPtr, cpuResultsArr, group_size_in_bytes); std::cout << "Results verified!\n"; - hipFree(hPtr); - hipFree(dPtr); + HIPCHECK(hipHostFree(hPtr)); + HIPCHECK(hipFree(dPtr)); free(srcArr); free(srcArrCpu); free(cpuResultsArr); @@ -440,7 +434,7 @@ static void test_shfl_broadcast() { int arrSize = blockSize * threadsPerBlock * sizeof(int); - hipHostMalloc(&hPtr, arrSize); + HIPCHECK(hipHostMalloc(&hPtr, arrSize)); // Fill up the array for (int i = 0; i < WAVE_SIZE; i++) { hPtr[i] = rand() % 1000; @@ -455,30 +449,30 @@ static void test_shfl_broadcast() { cpuResultsArr[i] = srcLaneCpu; } printf("Array passed to GPU for computation\n"); - printResults(hPtr, WAVE_SIZE); - hipMalloc(&dPtr, group_size_in_bytes); - hipMalloc(&dResults, group_size_in_bytes); + printResultsSimpleCoalescedGroups(hPtr, WAVE_SIZE); + HIPCHECK(hipMalloc(&dPtr, group_size_in_bytes)); + HIPCHECK(hipMalloc(&dResults, group_size_in_bytes)); - hipMemcpy(dPtr, hPtr, group_size_in_bytes, hipMemcpyHostToDevice); + HIPCHECK(hipMemcpy(dPtr, hPtr, group_size_in_bytes, hipMemcpyHostToDevice)); // Launch Kernel hipLaunchKernelGGL(kernel_shfl, blockSize, threadsPerBlock, threadsPerBlock * sizeof(int), 0, dPtr, dResults, srcLane, i); HIP_CHECK(hipGetLastError()); - hipMemcpy(hPtr, dResults, group_size_in_bytes, hipMemcpyDeviceToHost); + HIPCHECK(hipMemcpy(hPtr, dResults, group_size_in_bytes, hipMemcpyDeviceToHost)); err = hipDeviceSynchronize(); if (err != hipSuccess) { fprintf(stderr, "Failed to launch kernel (error code %s)!\n", hipGetErrorString(err)); } printf("GPU results: \n"); - printResults(hPtr, group_size); + printResultsSimpleCoalescedGroups(hPtr, group_size); printf("Printing cpu to be verified array\n"); - printResults(cpuResultsArr, group_size); + printResultsSimpleCoalescedGroups(cpuResultsArr, group_size); - compareResults(hPtr, cpuResultsArr, group_size_in_bytes); + compareResultsSimpleCoalescedGroups(hPtr, cpuResultsArr, group_size_in_bytes); std::cout << "Results verified!\n"; - hipFree(hPtr); - hipFree(dPtr); + HIPCHECK(hipHostFree(hPtr)); + HIPCHECK(hipFree(dPtr)); free(cpuResultsArr); } } @@ -489,7 +483,6 @@ TEST_CASE("Unit_coalesced_groups") { HIP_CHECK(hipGetDevice(&deviceId)); hipDeviceProp_t deviceProperties; HIP_CHECK(hipGetDeviceProperties(&deviceProperties, deviceId)); - int maxThreadsPerBlock = deviceProperties.maxThreadsPerBlock; std::cout << "Now testing coalesced_groups" << '\n' << std::endl; @@ -512,7 +505,7 @@ TEST_CASE("Unit_coalesced_groups") { HIP_CHECK(hipMemcpy(d_data_to_filter, data_to_filter, sizeof(int) * NUM_ELEMS, hipMemcpyHostToDevice)); - hipMemset(d_nres, 0, sizeof(int)); + HIPCHECK(hipMemset(d_nres, 0, sizeof(int))); dim3 dimBlock(NUM_THREADS_PER_BLOCK, 1, 1); dim3 dimGrid((NUM_ELEMS / NUM_THREADS_PER_BLOCK) + 1, 1, 1);