SWDEV-358553 - Fixed all compiling bugs (#214)

Change-Id: I6f5316d76b2fad534f670b18138fc100c313be0d
2023-06-21 15:57:11 +05:30
parent 9760ccd5dc
commit 6e4bfd4bce
@@ -6,6 +6,9 @@ set(TEST_SRC
  hipCGMultiGridGroupType.cc
  hipCGMultiGridGroupTypeViaBaseType.cc
  hipCGMultiGridGroupTypeViaPublicApi.cc
+  coalesced_groups_shfl_down.cc
+  coalesced_groups_shfl_up.cc
+  simple_coalesced_groups.cc
 )
 if(HIP_PLATFORM STREQUAL "nvidia")
  set_source_files_properties(hipCGMultiGridGroupType.cc PROPERTIES COMPILE_FLAGS "-rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80")
@@ -66,7 +66,7 @@ __global__ void kernel_shfl_down (int * dPtr, int *dResults, int lane_delta, int
  }
 }

-__global__ void kernel_cg_group_partition(int* result, unsigned int tileSz, int cg_sizes) {
+__global__ void kernel_cg_group_partition_shfl_down(int* result, unsigned int tileSz, int cg_sizes) {

  int id = threadIdx.x + blockIdx.x * blockDim.x;
  if (id % cg_sizes == 0) {
@@ -82,7 +82,6 @@ __global__ void kernel_cg_group_partition(int* result, unsigned int tileSz, int
    threadBlockCGTy.sync();

    coalesced_group tiledPartition = tiled_partition(threadBlockCGTy, tileSz);
-    int threadRank = tiledPartition.thread_rank();

    input = tiledPartition.thread_rank();

@@ -110,7 +109,7 @@ void verifyResults(int* ptr, int expectedResult, int numTiles) {
  }
 }

-void compareResults(int* cpu, int* gpu, int size) {
+void compareResultsCoalescedGroupsShflDown(int* cpu, int* gpu, int size) {
  for (unsigned int i = 0; i < size / sizeof(int); i++) {
    if (cpu[i] != gpu[i]) {
      INFO(" results do not match.");
@@ -118,7 +117,7 @@ void compareResults(int* cpu, int* gpu, int size) {
  }
 }

-void printResults(int* ptr, int size) {
+void printResultsCoalescedGroupsShflDown(int* ptr, int size) {
  for (int i = 0; i < size; i++) {
    std::cout << ptr[i] << " ";
  }
@@ -148,14 +147,14 @@ static void test_group_partition(unsigned int tileSz) {
    int* dResult = NULL;
    int* hResult = NULL;

-    hipHostMalloc(&hResult, numTiles * sizeof(int), hipHostMallocDefault);
+    HIPCHECK(hipHostMalloc(&hResult, numTiles * sizeof(int), hipHostMallocDefault));
    memset(hResult, 0, numTiles * sizeof(int));

-    hipMalloc(&dResult, numTiles * sizeof(int));
+    HIPCHECK(hipMalloc(&dResult, numTiles * sizeof(int)));


    // Launch Kernel
-    hipLaunchKernelGGL(kernel_cg_group_partition, blockSize, threadsPerBlock,
+    hipLaunchKernelGGL(kernel_cg_group_partition_shfl_down, blockSize, threadsPerBlock,
                       threadsPerBlock * sizeof(int), 0, dResult, tileSz, i);
    HIP_CHECK(hipGetLastError()); 
    err = hipDeviceSynchronize();
@@ -164,13 +163,13 @@ static void test_group_partition(unsigned int tileSz) {
    }


-    hipMemcpy(hResult, dResult, sizeof(int) * numTiles, hipMemcpyDeviceToHost);
+    HIPCHECK(hipMemcpy(hResult, dResult, sizeof(int) * numTiles, hipMemcpyDeviceToHost));

    verifyResults(hResult, expectedSum, numTiles);

    // Free all allocated memory on host and device
-    hipFree(dResult);
-    hipFree(hResult);
+    HIPCHECK(hipFree(dResult));
+    HIPCHECK(hipHostFree(hResult));
    delete[] expectedResult;

    printf("\n...PASSED.\n\n");
@@ -199,7 +198,7 @@ static void test_shfl_down() {

    int arrSize = blockSize * threadsPerBlock * sizeof(int);

-    hipHostMalloc(&hPtr, arrSize);
+    HIPCHECK(hipHostMalloc(&hPtr, arrSize));
    // Fill up the array
    for (int i = 0; i < WAVE_SIZE; i++) {
      hPtr[i] = rand() % 1000;
@@ -210,30 +209,30 @@ static void test_shfl_down() {
      cpuResultsArr[i] = (i + lane_delta >= group_size) ? hPtr[i] : hPtr[i + lane_delta];
    }
    //printf("Array passed to GPU for computation\n");
-    //printResults(hPtr, WAVE_SIZE);
-    hipMalloc(&dPtr, group_size_in_bytes);
-    hipMalloc(&dResults, group_size_in_bytes);
+    //printResultsCoalescedGroupsShflDown(hPtr, WAVE_SIZE);
+    HIPCHECK(hipMalloc(&dPtr, group_size_in_bytes));
+    HIPCHECK(hipMalloc(&dResults, group_size_in_bytes));

-    hipMemcpy(dPtr, hPtr, group_size_in_bytes, hipMemcpyHostToDevice);
+    HIPCHECK(hipMemcpy(dPtr, hPtr, group_size_in_bytes, hipMemcpyHostToDevice));
    // Launch Kernel
    hipLaunchKernelGGL(kernel_shfl_down, blockSize, threadsPerBlock,
                       threadsPerBlock * sizeof(int), 0, dPtr, dResults, lane_delta, i);
    HIP_CHECK(hipGetLastError()); 
-    hipMemcpy(hPtr, dResults, group_size_in_bytes, hipMemcpyDeviceToHost);
+    HIPCHECK(hipMemcpy(hPtr, dResults, group_size_in_bytes, hipMemcpyDeviceToHost));
    err = hipDeviceSynchronize();
    if (err != hipSuccess) {
      fprintf(stderr, "Failed to launch kernel (error code %s)!\n", hipGetErrorString(err));
    }
    //printf("GPU results: \n");
-    //printResults(hPtr, WAVE_SIZE);
+    //printResultsCoalescedGroupsShflDown(hPtr, WAVE_SIZE);
    //printf("Printing cpu to be verified array\n");
-    //printResults(cpuResultsArr, WAVE_SIZE);
+    //printResultsCoalescedGroupsShflDown(cpuResultsArr, WAVE_SIZE);

-    compareResults(hPtr, cpuResultsArr, group_size_in_bytes);
+    compareResultsCoalescedGroupsShflDown(hPtr, cpuResultsArr, group_size_in_bytes);
    std::cout << "Results verified!\n";

-    hipFree(hPtr);
-    hipFree(dPtr);
+    HIPCHECK(hipHostFree(hPtr));
+    HIPCHECK(hipFree(dPtr));
    free(cpuResultsArr);
  }
 }
@@ -246,7 +245,6 @@ TEST_CASE("Unit_coalesced_groups_shfl_down") {
  ASSERT_EQUAL(hipGetDevice(&deviceId), hipSuccess);
  hipDeviceProp_t deviceProperties;
  ASSERT_EQUAL(hipGetDeviceProperties(&deviceProperties, deviceId), hipSuccess);
-  int maxThreadsPerBlock = deviceProperties.maxThreadsPerBlock;

  // Test shfl_down with random group sizes
    for (int i = 0; i < 100; i++) {
@@ -40,7 +40,7 @@ __device__ int prefix_sum_kernel(coalesced_group const& g, int val) {
  for (int i = 1; i < sz; i <<= 1) {
    int temp = g.shfl_up(val, i);

-    if (g.thread_rank() >= i) {
+    if ((int)g.thread_rank() >= i) {
      val += temp;
    }
  }
@@ -60,7 +60,7 @@ __global__ void kernel_shfl_up (int * dPtr, int *dResults, int lane_delta, int c

 }

-__global__ void kernel_cg_group_partition(int* dPtr, unsigned int tileSz, int cg_sizes) {
+__global__ void kernel_cg_group_partition_shfl_up(int* dPtr, unsigned int tileSz, int cg_sizes) {

  int id = threadIdx.x + blockIdx.x * blockDim.x;
  if (id % cg_sizes == 0) {
@@ -110,7 +110,7 @@ void printResults(int* ptr, int size) {
  std::cout << '\n';
 }

-void verifyResults(int* cpu, int* gpu, int size) {
+void verifyResultsCoalescedGroupsShflUp(int* cpu, int* gpu, int size) {
  for (unsigned int i = 0; i < size / sizeof(int); i++) {
    if (cpu[i] != gpu[i]) {
      INFO(" Results do not match.");
@@ -132,14 +132,14 @@ static void test_group_partition(unsigned tileSz) {

    int arrSize = blockSize * threadsPerBlock * sizeof(int);

-    hipHostMalloc(&hPtr, arrSize);
-    hipMalloc(&dPtr, arrSize);
+    HIPCHECK(hipHostMalloc(&hPtr, arrSize));
+    HIPCHECK(hipMalloc(&dPtr, arrSize));

    // Launch Kernel
-    hipLaunchKernelGGL(kernel_cg_group_partition, blockSize, threadsPerBlock,
+    hipLaunchKernelGGL(kernel_cg_group_partition_shfl_up, blockSize, threadsPerBlock,
                     threadsPerBlock * sizeof(int), 0, dPtr, tileSz, i);
    HIP_CHECK(hipGetLastError()); 
-    hipMemcpy(hPtr, dPtr, arrSize, hipMemcpyDeviceToHost);
+    HIPCHECK(hipMemcpy(hPtr, dPtr, arrSize, hipMemcpyDeviceToHost));
    err = hipDeviceSynchronize();
    if (err != hipSuccess) {
      fprintf(stderr, "Failed to launch kernel (error code %s)!\n", hipGetErrorString(err));
@@ -153,12 +153,12 @@ static void test_group_partition(unsigned tileSz) {
    //std::cout << "\nPrefix sum results on GPU\n";
    //printResults(hPtr, tileSz);
    std::cout << "\n";
-    verifyResults(hPtr, cpuPrefixSum, tileSz);
+    verifyResultsCoalescedGroupsShflUp(hPtr, cpuPrefixSum, tileSz);
    std::cout << "Results verified!\n";

    delete[] cpuPrefixSum;
-    hipFree(hPtr);
-    hipFree(dPtr);
+    HIPCHECK(hipHostFree(hPtr));
+    HIPCHECK(hipFree(dPtr));
  }
 }

@@ -185,7 +185,7 @@ static void test_shfl_up() {

    int arrSize = blockSize * threadsPerBlock * sizeof(int);

-    hipHostMalloc(&hPtr, arrSize);
+    HIPCHECK(hipHostMalloc(&hPtr, arrSize));
    // Fill up the array
    for (int i = 0; i < WAVE_SIZE; i++) {
      hPtr[i] = rand() % 1000;
@@ -200,14 +200,14 @@ static void test_shfl_up() {
    //printf("Printing cpu results arr\n");
    //printResults(cpuResultsArr, WAVE_SIZE);

-    hipMalloc(&dPtr, group_size_in_bytes);
-    hipMalloc(&dResults, group_size_in_bytes);
+    HIPCHECK(hipMalloc(&dPtr, group_size_in_bytes));
+    HIPCHECK(hipMalloc(&dResults, group_size_in_bytes));

-    hipMemcpy(dPtr, hPtr, group_size_in_bytes, hipMemcpyHostToDevice);
+    HIPCHECK(hipMemcpy(dPtr, hPtr, group_size_in_bytes, hipMemcpyHostToDevice));
    // Launch Kernel
    hipLaunchKernelGGL(kernel_shfl_up, blockSize, threadsPerBlock,
                       threadsPerBlock * sizeof(int), 0, dPtr, dResults, lane_delta, i);
-    hipMemcpy(hPtr, dResults, group_size_in_bytes, hipMemcpyDeviceToHost);
+    HIPCHECK(hipMemcpy(hPtr, dResults, group_size_in_bytes, hipMemcpyDeviceToHost));
    HIP_CHECK(hipGetLastError()); 
    err = hipDeviceSynchronize();
    if (err != hipSuccess) {
@@ -216,22 +216,21 @@ static void test_shfl_up() {
    //printf("GPU computation array :\n");
    //printResults(hPtr, WAVE_SIZE);

-    verifyResults(hPtr, cpuResultsArr, group_size_in_bytes);
+    verifyResultsCoalescedGroupsShflUp(hPtr, cpuResultsArr, group_size_in_bytes);
    std::cout << "Results verified!\n";

-    hipFree(hPtr);
-    hipFree(dPtr);
+    HIPCHECK(hipHostFree(hPtr));
+    HIPCHECK(hipFree(dPtr));
    free(cpuResultsArr);
  }
 }

-TEST_CASE("Unit_coalesced_groups_shfl_down") {
+TEST_CASE("Unit_coalesced_groups_shfl_up") {
  // Use default device for validating the test
  int deviceId;
  ASSERT_EQUAL(hipGetDevice(&deviceId), hipSuccess);
  hipDeviceProp_t deviceProperties;
  ASSERT_EQUAL(hipGetDeviceProperties(&deviceProperties, deviceId), hipSuccess);
-  int maxThreadsPerBlock = deviceProperties.maxThreadsPerBlock;

  for (int i = 0; i < 100; i++) {
      test_shfl_up();
@@ -51,7 +51,7 @@ using namespace cooperative_groups;

 __device__ int atomicAggInc(int *ptr) {
   coalesced_group g = coalesced_threads();
-   int prev;
+   int prev = 0;
   // elect the first active thread to perform atomic add
   if (g.thread_rank() == 0) {
     prev = atomicAdd(ptr, g.size());
@@ -104,7 +104,6 @@ __global__ void filter_arr(int *dst, int *nres, const int *src, int n) {
 */
 __device__ int reduction_kernel(coalesced_group g, int* x, int val) {
  int lane = g.thread_rank();
-  int sz = g.size();

  for (int i = g.size() / 2; i > 0; i /= 2) {
    // use lds to store the temporary result
@@ -138,7 +137,6 @@ __global__ void kernel_cg_coalesced_group_partition(unsigned int tileSz, int* re
  int id = threadIdx.x + blockIdx.x * blockDim.x;
  if (id % cg_sizes == 0) {
    coalesced_group threadBlockCGTy = coalesced_threads();
-    int threadBlockGroupSize = threadBlockCGTy.size();

    int* workspace = NULL;

@@ -150,13 +148,11 @@ __global__ void kernel_cg_coalesced_group_partition(unsigned int tileSz, int* re
      workspace = sharedMem;
    }

-    int input, outputSum, expectedOutput;
+    int input, outputSum;

    // input to reduction, for each thread, is its' rank in the group
    input = threadBlockCGTy.thread_rank();

-    expectedOutput = (threadBlockGroupSize - 1) * threadBlockGroupSize / 2;
-
    outputSum = reduction_kernel(threadBlockCGTy, workspace, input);

    if (threadBlockCGTy.thread_rank() == 0) {
@@ -189,10 +185,8 @@ __global__ void kernel_cg_coalesced_group_partition(unsigned int tileSz, int* re

 __global__ void kernel_coalesced_active_groups() {
  thread_block threadBlockCGTy = this_thread_block();
-  int threadBlockGroupSize = threadBlockCGTy.size();

  // input to reduction, for each thread, is its' rank in the group
-  int input = threadBlockCGTy.thread_rank();

  if (threadBlockCGTy.thread_rank() == 0) {
    printf(" Creating odd and even set of active thread groups based on branch divergence\n\n");
@@ -222,14 +216,14 @@ __global__ void kernel_coalesced_active_groups() {
  return;
 }

-void printResults(int* ptr, int size) {
+void printResultsSimpleCoalescedGroups(int* ptr, int size) {
  for (int i = 0; i < size; i++) {
    std::cout << ptr[i] << " ";
  }
  std::cout << '\n';
 }

-void compareResults(int* cpu, int* gpu, int size) {
+void compareResultsSimpleCoalescedGroups(int* cpu, int* gpu, int size) {
  for (unsigned int i = 0; i < size / sizeof(int); i++) {
    if (cpu[i] != gpu[i]) {
      INFO(" results do not match.");
@@ -254,7 +248,7 @@ static void test_active_threads_grouping() {
 }

 // Search if the sum exists in the expected results array
-void verifyResults(int* hPtr, int* dPtr, int size) {
+void verifyResultsSimpleCoalescedGroups(int* hPtr, int* dPtr, int size) {
  int i = 0, j = 0;
  for (i = 0; i < size; i++) {
    for (j = 0; j < size; j++) {
@@ -294,15 +288,15 @@ static void test_group_partition(unsigned int tileSz, bool useGlobalMem) {
    }

    int* dResult = NULL;
-    hipMalloc(&dResult, sizeof(int) * numTiles);
+    HIPCHECK(hipMalloc(&dResult, sizeof(int) * numTiles));

    int* globalMem = NULL;
    if (useGlobalMem) {
-      hipMalloc((void**)&globalMem, threadsPerBlock * sizeof(int));
+      HIPCHECK(hipMalloc((void**)&globalMem, threadsPerBlock * sizeof(int)));
    }

    int* hResult = NULL;
-    hipHostMalloc(&hResult, numTiles * sizeof(int), hipHostMallocDefault);
+    HIPCHECK(hipHostMalloc(&hResult, numTiles * sizeof(int), hipHostMallocDefault));
    memset(hResult, 0, numTiles * sizeof(int));

    // Launch Kernel
@@ -326,13 +320,13 @@ static void test_group_partition(unsigned int tileSz, bool useGlobalMem) {
      }
    }

-    hipMemcpy(hResult, dResult, numTiles * sizeof(int), hipMemcpyDeviceToHost);
-    verifyResults(expectedSum, hResult, numTiles);
+    HIPCHECK(hipMemcpy(hResult, dResult, numTiles * sizeof(int), hipMemcpyDeviceToHost));
+    verifyResultsSimpleCoalescedGroups(expectedSum, hResult, numTiles);
    // Free all allocated memory on host and device
-    hipFree(dResult);
-    hipFree(hResult);
+    HIPCHECK(hipFree(dResult));
+    HIPCHECK(hipHostFree(hResult));
    if (useGlobalMem) {
-      hipFree(globalMem);
+      HIPCHECK(hipFree(globalMem));
    }
    delete[] expectedSum;

@@ -363,7 +357,7 @@ static void test_shfl_any_to_any() {

    int arrSize = blockSize * threadsPerBlock * sizeof(int);

-    hipHostMalloc(&hPtr, arrSize);
+    HIPCHECK(hipHostMalloc(&hPtr, arrSize));
    // Fill up the array
    for (int i = 0; i < WAVE_SIZE; i++) {
      hPtr[i] = rand() % 1000;
@@ -382,36 +376,36 @@ static void test_shfl_any_to_any() {
    }

    //printf("Array passed to GPU for computation\n");
-    //printResults(hPtr, WAVE_SIZE);
-    hipMalloc(&dPtr, group_size_in_bytes);
-    hipMalloc(&dResults, group_size_in_bytes);
+    //printResultsSimpleCoalescedGroups(hPtr, WAVE_SIZE);
+    HIPCHECK(hipMalloc(&dPtr, group_size_in_bytes));
+    HIPCHECK(hipMalloc(&dResults, group_size_in_bytes));

-    hipMalloc(&dsrcArr, group_size_in_bytes);
-    hipMemcpy(dsrcArr, srcArr, group_size_in_bytes, hipMemcpyHostToDevice);
+    HIPCHECK(hipMalloc(&dsrcArr, group_size_in_bytes));
+    HIPCHECK(hipMemcpy(dsrcArr, srcArr, group_size_in_bytes, hipMemcpyHostToDevice));

-    hipMemcpy(dPtr, hPtr, group_size_in_bytes, hipMemcpyHostToDevice);
+    HIPCHECK(hipMemcpy(dPtr, hPtr, group_size_in_bytes, hipMemcpyHostToDevice));
    // Launch Kernel
    hipLaunchKernelGGL(kernel_shfl_any_to_any, blockSize, threadsPerBlock,
                       threadsPerBlock * sizeof(int), 0 , dPtr, dsrcArr, dResults, i);
    HIP_CHECK(hipGetLastError()); 
-    hipMemcpy(hPtr, dResults, group_size_in_bytes, hipMemcpyDeviceToHost);
+    HIPCHECK(hipMemcpy(hPtr, dResults, group_size_in_bytes, hipMemcpyDeviceToHost));
    err = hipDeviceSynchronize();
    if (err != hipSuccess) {
      fprintf(stderr, "Failed to launch kernel (error code %s)!\n", hipGetErrorString(err));
    }
    //printf("GPU results: \n");
-    //printResults(hPtr, group_size);
+    //printResultsSimpleCoalescedGroups(hPtr, group_size);
    //printf("Printing cpu to be verified array\n");
-    //printResults(cpuResultsArr, group_size);
+    //printResultsSimpleCoalescedGroups(cpuResultsArr, group_size);
    //printf("Printing srcLane array that was passed\n");
-    //printResults(srcArr, group_size);
+    //printResultsSimpleCoalescedGroups(srcArr, group_size);
    //printf("Printing srcLane array on the CPU\n");
-    //printResults(srcArrCpu, group_size);
-    compareResults(hPtr, cpuResultsArr, group_size_in_bytes);
+    //printResultsSimpleCoalescedGroups(srcArrCpu, group_size);
+    compareResultsSimpleCoalescedGroups(hPtr, cpuResultsArr, group_size_in_bytes);
    std::cout << "Results verified!\n";

-    hipFree(hPtr);
-    hipFree(dPtr);
+    HIPCHECK(hipHostFree(hPtr));
+    HIPCHECK(hipFree(dPtr));
    free(srcArr);
    free(srcArrCpu);
    free(cpuResultsArr);
@@ -440,7 +434,7 @@ static void test_shfl_broadcast() {

    int arrSize = blockSize * threadsPerBlock * sizeof(int);

-    hipHostMalloc(&hPtr, arrSize);
+    HIPCHECK(hipHostMalloc(&hPtr, arrSize));
    // Fill up the array
    for (int i = 0; i < WAVE_SIZE; i++) {
      hPtr[i] = rand() % 1000;
@@ -455,30 +449,30 @@ static void test_shfl_broadcast() {
      cpuResultsArr[i] = srcLaneCpu;
    }
    printf("Array passed to GPU for computation\n");
-    printResults(hPtr, WAVE_SIZE);
-    hipMalloc(&dPtr, group_size_in_bytes);
-    hipMalloc(&dResults, group_size_in_bytes);
+    printResultsSimpleCoalescedGroups(hPtr, WAVE_SIZE);
+    HIPCHECK(hipMalloc(&dPtr, group_size_in_bytes));
+    HIPCHECK(hipMalloc(&dResults, group_size_in_bytes));

-    hipMemcpy(dPtr, hPtr, group_size_in_bytes, hipMemcpyHostToDevice);
+    HIPCHECK(hipMemcpy(dPtr, hPtr, group_size_in_bytes, hipMemcpyHostToDevice));
    // Launch Kernel
    hipLaunchKernelGGL(kernel_shfl, blockSize, threadsPerBlock,
                       threadsPerBlock * sizeof(int), 0, dPtr, dResults, srcLane, i);
    HIP_CHECK(hipGetLastError()); 
-    hipMemcpy(hPtr, dResults, group_size_in_bytes, hipMemcpyDeviceToHost);
+    HIPCHECK(hipMemcpy(hPtr, dResults, group_size_in_bytes, hipMemcpyDeviceToHost));
    err = hipDeviceSynchronize();
    if (err != hipSuccess) {
      fprintf(stderr, "Failed to launch kernel (error code %s)!\n", hipGetErrorString(err));
    }
    printf("GPU results: \n");
-    printResults(hPtr, group_size);
+    printResultsSimpleCoalescedGroups(hPtr, group_size);
    printf("Printing cpu to be verified array\n");
-    printResults(cpuResultsArr, group_size);
+    printResultsSimpleCoalescedGroups(cpuResultsArr, group_size);

-    compareResults(hPtr, cpuResultsArr, group_size_in_bytes);
+    compareResultsSimpleCoalescedGroups(hPtr, cpuResultsArr, group_size_in_bytes);
    std::cout << "Results verified!\n";

-    hipFree(hPtr);
-    hipFree(dPtr);
+    HIPCHECK(hipHostFree(hPtr));
+    HIPCHECK(hipFree(dPtr));
    free(cpuResultsArr);
  }
 }
@@ -489,7 +483,6 @@ TEST_CASE("Unit_coalesced_groups") {
  HIP_CHECK(hipGetDevice(&deviceId));
  hipDeviceProp_t deviceProperties;
  HIP_CHECK(hipGetDeviceProperties(&deviceProperties, deviceId));
-  int maxThreadsPerBlock = deviceProperties.maxThreadsPerBlock;

  std::cout << "Now testing coalesced_groups" << '\n' << std::endl;

@@ -512,7 +505,7 @@ TEST_CASE("Unit_coalesced_groups") {

  HIP_CHECK(hipMemcpy(d_data_to_filter, data_to_filter,
                             sizeof(int) * NUM_ELEMS, hipMemcpyHostToDevice));
-  hipMemset(d_nres, 0, sizeof(int));
+  HIPCHECK(hipMemset(d_nres, 0, sizeof(int)));

  dim3 dimBlock(NUM_THREADS_PER_BLOCK, 1, 1);
  dim3 dimGrid((NUM_ELEMS / NUM_THREADS_PER_BLOCK) + 1, 1, 1);