diff --git a/catch/unit/cooperativeGrps/CMakeLists.txt b/catch/unit/cooperativeGrps/CMakeLists.txt
index 61c567aee4..ce7632fb8b 100644
--- a/catch/unit/cooperativeGrps/CMakeLists.txt
+++ b/catch/unit/cooperativeGrps/CMakeLists.txt
@@ -6,6 +6,9 @@ set(TEST_SRC
   hipCGMultiGridGroupType.cc
   hipCGMultiGridGroupTypeViaBaseType.cc
   hipCGMultiGridGroupTypeViaPublicApi.cc
+  coalesced_groups_shfl_down.cc
+  coalesced_groups_shfl_up.cc
+  simple_coalesced_groups.cc
 )
 if(HIP_PLATFORM STREQUAL "nvidia")
   set_source_files_properties(hipCGMultiGridGroupType.cc PROPERTIES COMPILE_FLAGS "-rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80")
diff --git a/catch/unit/cooperativeGrps/coalesced_groups_shfl_down.cc b/catch/unit/cooperativeGrps/coalesced_groups_shfl_down.cc
index 63d569c0e5..de13e8eda1 100644
--- a/catch/unit/cooperativeGrps/coalesced_groups_shfl_down.cc
+++ b/catch/unit/cooperativeGrps/coalesced_groups_shfl_down.cc
@@ -66,7 +66,7 @@ __global__ void kernel_shfl_down (int * dPtr, int *dResults, int lane_delta, int
   }
 }
 
-__global__ void kernel_cg_group_partition(int* result, unsigned int tileSz, int cg_sizes) {
+__global__ void kernel_cg_group_partition_shfl_down(int* result, unsigned int tileSz, int cg_sizes) {
 
   int id = threadIdx.x + blockIdx.x * blockDim.x;
   if (id % cg_sizes == 0) {
@@ -82,7 +82,6 @@ __global__ void kernel_cg_group_partition(int* result, unsigned int tileSz, int
     threadBlockCGTy.sync();
 
     coalesced_group tiledPartition = tiled_partition(threadBlockCGTy, tileSz);
-    int threadRank = tiledPartition.thread_rank();
 
     input = tiledPartition.thread_rank();
 
@@ -110,7 +109,7 @@ void verifyResults(int* ptr, int expectedResult, int numTiles) {
   }
 }
 
-void compareResults(int* cpu, int* gpu, int size) {
+void compareResultsCoalescedGroupsShflDown(int* cpu, int* gpu, int size) {
   for (unsigned int i = 0; i < size / sizeof(int); i++) {
     if (cpu[i] != gpu[i]) {
       INFO(" results do not match.");
@@ -118,7 +117,7 @@ void compareResults(int* cpu, int* gpu, int size) {
   }
 }
 
-void printResults(int* ptr, int size) {
+void printResultsCoalescedGroupsShflDown(int* ptr, int size) {
   for (int i = 0; i < size; i++) {
     std::cout << ptr[i] << " ";
   }
@@ -148,14 +147,14 @@ static void test_group_partition(unsigned int tileSz) {
     int* dResult = NULL;
     int* hResult = NULL;
 
-    hipHostMalloc(&hResult, numTiles * sizeof(int), hipHostMallocDefault);
+    HIPCHECK(hipHostMalloc(&hResult, numTiles * sizeof(int), hipHostMallocDefault));
     memset(hResult, 0, numTiles * sizeof(int));
 
-    hipMalloc(&dResult, numTiles * sizeof(int));
+    HIPCHECK(hipMalloc(&dResult, numTiles * sizeof(int)));
 
 
     // Launch Kernel
-    hipLaunchKernelGGL(kernel_cg_group_partition, blockSize, threadsPerBlock,
+    hipLaunchKernelGGL(kernel_cg_group_partition_shfl_down, blockSize, threadsPerBlock,
                        threadsPerBlock * sizeof(int), 0, dResult, tileSz, i);
     HIP_CHECK(hipGetLastError()); 
     err = hipDeviceSynchronize();
@@ -164,13 +163,13 @@ static void test_group_partition(unsigned int tileSz) {
     }
 
 
-    hipMemcpy(hResult, dResult, sizeof(int) * numTiles, hipMemcpyDeviceToHost);
+    HIPCHECK(hipMemcpy(hResult, dResult, sizeof(int) * numTiles, hipMemcpyDeviceToHost));
 
     verifyResults(hResult, expectedSum, numTiles);
 
     // Free all allocated memory on host and device
-    hipFree(dResult);
-    hipFree(hResult);
+    HIPCHECK(hipFree(dResult));
+    HIPCHECK(hipHostFree(hResult));
     delete[] expectedResult;
 
     printf("\n...PASSED.\n\n");
@@ -199,7 +198,7 @@ static void test_shfl_down() {
 
     int arrSize = blockSize * threadsPerBlock * sizeof(int);
 
-    hipHostMalloc(&hPtr, arrSize);
+    HIPCHECK(hipHostMalloc(&hPtr, arrSize));
     // Fill up the array
     for (int i = 0; i < WAVE_SIZE; i++) {
       hPtr[i] = rand() % 1000;
@@ -210,30 +209,30 @@ static void test_shfl_down() {
       cpuResultsArr[i] = (i + lane_delta >= group_size) ? hPtr[i] : hPtr[i + lane_delta];
     }
     //printf("Array passed to GPU for computation\n");
-    //printResults(hPtr, WAVE_SIZE);
-    hipMalloc(&dPtr, group_size_in_bytes);
-    hipMalloc(&dResults, group_size_in_bytes);
+    //printResultsCoalescedGroupsShflDown(hPtr, WAVE_SIZE);
+    HIPCHECK(hipMalloc(&dPtr, group_size_in_bytes));
+    HIPCHECK(hipMalloc(&dResults, group_size_in_bytes));
 
-    hipMemcpy(dPtr, hPtr, group_size_in_bytes, hipMemcpyHostToDevice);
+    HIPCHECK(hipMemcpy(dPtr, hPtr, group_size_in_bytes, hipMemcpyHostToDevice));
     // Launch Kernel
     hipLaunchKernelGGL(kernel_shfl_down, blockSize, threadsPerBlock,
                        threadsPerBlock * sizeof(int), 0, dPtr, dResults, lane_delta, i);
     HIP_CHECK(hipGetLastError()); 
-    hipMemcpy(hPtr, dResults, group_size_in_bytes, hipMemcpyDeviceToHost);
+    HIPCHECK(hipMemcpy(hPtr, dResults, group_size_in_bytes, hipMemcpyDeviceToHost));
     err = hipDeviceSynchronize();
     if (err != hipSuccess) {
       fprintf(stderr, "Failed to launch kernel (error code %s)!\n", hipGetErrorString(err));
     }
     //printf("GPU results: \n");
-    //printResults(hPtr, WAVE_SIZE);
+    //printResultsCoalescedGroupsShflDown(hPtr, WAVE_SIZE);
     //printf("Printing cpu to be verified array\n");
-    //printResults(cpuResultsArr, WAVE_SIZE);
+    //printResultsCoalescedGroupsShflDown(cpuResultsArr, WAVE_SIZE);
 
-    compareResults(hPtr, cpuResultsArr, group_size_in_bytes);
+    compareResultsCoalescedGroupsShflDown(hPtr, cpuResultsArr, group_size_in_bytes);
     std::cout << "Results verified!\n";
 
-    hipFree(hPtr);
-    hipFree(dPtr);
+    HIPCHECK(hipHostFree(hPtr));
+    HIPCHECK(hipFree(dPtr));
     free(cpuResultsArr);
   }
 }
@@ -246,7 +245,6 @@ TEST_CASE("Unit_coalesced_groups_shfl_down") {
   ASSERT_EQUAL(hipGetDevice(&deviceId), hipSuccess);
   hipDeviceProp_t deviceProperties;
   ASSERT_EQUAL(hipGetDeviceProperties(&deviceProperties, deviceId), hipSuccess);
-  int maxThreadsPerBlock = deviceProperties.maxThreadsPerBlock;
 
   // Test shfl_down with random group sizes
     for (int i = 0; i < 100; i++) {
diff --git a/catch/unit/cooperativeGrps/coalesced_groups_shfl_up.cc b/catch/unit/cooperativeGrps/coalesced_groups_shfl_up.cc
index e3c9d3d9f4..75ca40a6d7 100644
--- a/catch/unit/cooperativeGrps/coalesced_groups_shfl_up.cc
+++ b/catch/unit/cooperativeGrps/coalesced_groups_shfl_up.cc
@@ -40,7 +40,7 @@ __device__ int prefix_sum_kernel(coalesced_group const& g, int val) {
   for (int i = 1; i < sz; i <<= 1) {
     int temp = g.shfl_up(val, i);
 
-    if (g.thread_rank() >= i) {
+    if ((int)g.thread_rank() >= i) {
       val += temp;
     }
   }
@@ -60,7 +60,7 @@ __global__ void kernel_shfl_up (int * dPtr, int *dResults, int lane_delta, int c
 
 }
 
-__global__ void kernel_cg_group_partition(int* dPtr, unsigned int tileSz, int cg_sizes) {
+__global__ void kernel_cg_group_partition_shfl_up(int* dPtr, unsigned int tileSz, int cg_sizes) {
 
   int id = threadIdx.x + blockIdx.x * blockDim.x;
   if (id % cg_sizes == 0) {
@@ -110,7 +110,7 @@ void printResults(int* ptr, int size) {
   std::cout << '\n';
 }
 
-void verifyResults(int* cpu, int* gpu, int size) {
+void verifyResultsCoalescedGroupsShflUp(int* cpu, int* gpu, int size) {
   for (unsigned int i = 0; i < size / sizeof(int); i++) {
     if (cpu[i] != gpu[i]) {
       INFO(" Results do not match.");
@@ -132,14 +132,14 @@ static void test_group_partition(unsigned tileSz) {
 
     int arrSize = blockSize * threadsPerBlock * sizeof(int);
 
-    hipHostMalloc(&hPtr, arrSize);
-    hipMalloc(&dPtr, arrSize);
+    HIPCHECK(hipHostMalloc(&hPtr, arrSize));
+    HIPCHECK(hipMalloc(&dPtr, arrSize));
 
     // Launch Kernel
-    hipLaunchKernelGGL(kernel_cg_group_partition, blockSize, threadsPerBlock,
+    hipLaunchKernelGGL(kernel_cg_group_partition_shfl_up, blockSize, threadsPerBlock,
                      threadsPerBlock * sizeof(int), 0, dPtr, tileSz, i);
     HIP_CHECK(hipGetLastError()); 
-    hipMemcpy(hPtr, dPtr, arrSize, hipMemcpyDeviceToHost);
+    HIPCHECK(hipMemcpy(hPtr, dPtr, arrSize, hipMemcpyDeviceToHost));
     err = hipDeviceSynchronize();
     if (err != hipSuccess) {
       fprintf(stderr, "Failed to launch kernel (error code %s)!\n", hipGetErrorString(err));
@@ -153,12 +153,12 @@ static void test_group_partition(unsigned tileSz) {
     //std::cout << "\nPrefix sum results on GPU\n";
     //printResults(hPtr, tileSz);
     std::cout << "\n";
-    verifyResults(hPtr, cpuPrefixSum, tileSz);
+    verifyResultsCoalescedGroupsShflUp(hPtr, cpuPrefixSum, tileSz);
     std::cout << "Results verified!\n";
 
     delete[] cpuPrefixSum;
-    hipFree(hPtr);
-    hipFree(dPtr);
+    HIPCHECK(hipHostFree(hPtr));
+    HIPCHECK(hipFree(dPtr));
   }
 }
 
@@ -185,7 +185,7 @@ static void test_shfl_up() {
 
     int arrSize = blockSize * threadsPerBlock * sizeof(int);
 
-    hipHostMalloc(&hPtr, arrSize);
+    HIPCHECK(hipHostMalloc(&hPtr, arrSize));
     // Fill up the array
     for (int i = 0; i < WAVE_SIZE; i++) {
       hPtr[i] = rand() % 1000;
@@ -200,14 +200,14 @@ static void test_shfl_up() {
     //printf("Printing cpu results arr\n");
     //printResults(cpuResultsArr, WAVE_SIZE);
 
-    hipMalloc(&dPtr, group_size_in_bytes);
-    hipMalloc(&dResults, group_size_in_bytes);
+    HIPCHECK(hipMalloc(&dPtr, group_size_in_bytes));
+    HIPCHECK(hipMalloc(&dResults, group_size_in_bytes));
 
-    hipMemcpy(dPtr, hPtr, group_size_in_bytes, hipMemcpyHostToDevice);
+    HIPCHECK(hipMemcpy(dPtr, hPtr, group_size_in_bytes, hipMemcpyHostToDevice));
     // Launch Kernel
     hipLaunchKernelGGL(kernel_shfl_up, blockSize, threadsPerBlock,
                        threadsPerBlock * sizeof(int), 0, dPtr, dResults, lane_delta, i);
-    hipMemcpy(hPtr, dResults, group_size_in_bytes, hipMemcpyDeviceToHost);
+    HIPCHECK(hipMemcpy(hPtr, dResults, group_size_in_bytes, hipMemcpyDeviceToHost));
     HIP_CHECK(hipGetLastError()); 
     err = hipDeviceSynchronize();
     if (err != hipSuccess) {
@@ -216,22 +216,21 @@ static void test_shfl_up() {
     //printf("GPU computation array :\n");
     //printResults(hPtr, WAVE_SIZE);
 
-    verifyResults(hPtr, cpuResultsArr, group_size_in_bytes);
+    verifyResultsCoalescedGroupsShflUp(hPtr, cpuResultsArr, group_size_in_bytes);
     std::cout << "Results verified!\n";
 
-    hipFree(hPtr);
-    hipFree(dPtr);
+    HIPCHECK(hipHostFree(hPtr));
+    HIPCHECK(hipFree(dPtr));
     free(cpuResultsArr);
   }
 }
 
-TEST_CASE("Unit_coalesced_groups_shfl_down") {
+TEST_CASE("Unit_coalesced_groups_shfl_up") {
   // Use default device for validating the test
   int deviceId;
   ASSERT_EQUAL(hipGetDevice(&deviceId), hipSuccess);
   hipDeviceProp_t deviceProperties;
   ASSERT_EQUAL(hipGetDeviceProperties(&deviceProperties, deviceId), hipSuccess);
-  int maxThreadsPerBlock = deviceProperties.maxThreadsPerBlock;
 
   for (int i = 0; i < 100; i++) {
       test_shfl_up();
diff --git a/catch/unit/cooperativeGrps/simple_coalesced_groups.cc b/catch/unit/cooperativeGrps/simple_coalesced_groups.cc
index cef1d94e96..b07ccfe96f 100644
--- a/catch/unit/cooperativeGrps/simple_coalesced_groups.cc
+++ b/catch/unit/cooperativeGrps/simple_coalesced_groups.cc
@@ -51,7 +51,7 @@ using namespace cooperative_groups;
 
 __device__ int atomicAggInc(int *ptr) {
    coalesced_group g = coalesced_threads();
-   int prev;
+   int prev = 0;
    // elect the first active thread to perform atomic add
    if (g.thread_rank() == 0) {
      prev = atomicAdd(ptr, g.size());
@@ -104,7 +104,6 @@ __global__ void filter_arr(int *dst, int *nres, const int *src, int n) {
  */
 __device__ int reduction_kernel(coalesced_group g, int* x, int val) {
   int lane = g.thread_rank();
-  int sz = g.size();
 
   for (int i = g.size() / 2; i > 0; i /= 2) {
     // use lds to store the temporary result
@@ -138,7 +137,6 @@ __global__ void kernel_cg_coalesced_group_partition(unsigned int tileSz, int* re
   int id = threadIdx.x + blockIdx.x * blockDim.x;
   if (id % cg_sizes == 0) {
     coalesced_group threadBlockCGTy = coalesced_threads();
-    int threadBlockGroupSize = threadBlockCGTy.size();
 
     int* workspace = NULL;
 
@@ -150,13 +148,11 @@ __global__ void kernel_cg_coalesced_group_partition(unsigned int tileSz, int* re
       workspace = sharedMem;
     }
 
-    int input, outputSum, expectedOutput;
+    int input, outputSum;
 
     // input to reduction, for each thread, is its' rank in the group
     input = threadBlockCGTy.thread_rank();
 
-    expectedOutput = (threadBlockGroupSize - 1) * threadBlockGroupSize / 2;
-
     outputSum = reduction_kernel(threadBlockCGTy, workspace, input);
 
     if (threadBlockCGTy.thread_rank() == 0) {
@@ -189,10 +185,8 @@ __global__ void kernel_cg_coalesced_group_partition(unsigned int tileSz, int* re
 
 __global__ void kernel_coalesced_active_groups() {
   thread_block threadBlockCGTy = this_thread_block();
-  int threadBlockGroupSize = threadBlockCGTy.size();
 
   // input to reduction, for each thread, is its' rank in the group
-  int input = threadBlockCGTy.thread_rank();
 
   if (threadBlockCGTy.thread_rank() == 0) {
     printf(" Creating odd and even set of active thread groups based on branch divergence\n\n");
@@ -222,14 +216,14 @@ __global__ void kernel_coalesced_active_groups() {
   return;
 }
 
-void printResults(int* ptr, int size) {
+void printResultsSimpleCoalescedGroups(int* ptr, int size) {
   for (int i = 0; i < size; i++) {
     std::cout << ptr[i] << " ";
   }
   std::cout << '\n';
 }
 
-void compareResults(int* cpu, int* gpu, int size) {
+void compareResultsSimpleCoalescedGroups(int* cpu, int* gpu, int size) {
   for (unsigned int i = 0; i < size / sizeof(int); i++) {
     if (cpu[i] != gpu[i]) {
       INFO(" results do not match.");
@@ -254,7 +248,7 @@ static void test_active_threads_grouping() {
 }
 
 // Search if the sum exists in the expected results array
-void verifyResults(int* hPtr, int* dPtr, int size) {
+void verifyResultsSimpleCoalescedGroups(int* hPtr, int* dPtr, int size) {
   int i = 0, j = 0;
   for (i = 0; i < size; i++) {
     for (j = 0; j < size; j++) {
@@ -294,15 +288,15 @@ static void test_group_partition(unsigned int tileSz, bool useGlobalMem) {
     }
 
     int* dResult = NULL;
-    hipMalloc(&dResult, sizeof(int) * numTiles);
+    HIPCHECK(hipMalloc(&dResult, sizeof(int) * numTiles));
 
     int* globalMem = NULL;
     if (useGlobalMem) {
-      hipMalloc((void**)&globalMem, threadsPerBlock * sizeof(int));
+      HIPCHECK(hipMalloc((void**)&globalMem, threadsPerBlock * sizeof(int)));
     }
 
     int* hResult = NULL;
-    hipHostMalloc(&hResult, numTiles * sizeof(int), hipHostMallocDefault);
+    HIPCHECK(hipHostMalloc(&hResult, numTiles * sizeof(int), hipHostMallocDefault));
     memset(hResult, 0, numTiles * sizeof(int));
 
     // Launch Kernel
@@ -326,13 +320,13 @@ static void test_group_partition(unsigned int tileSz, bool useGlobalMem) {
       }
     }
 
-    hipMemcpy(hResult, dResult, numTiles * sizeof(int), hipMemcpyDeviceToHost);
-    verifyResults(expectedSum, hResult, numTiles);
+    HIPCHECK(hipMemcpy(hResult, dResult, numTiles * sizeof(int), hipMemcpyDeviceToHost));
+    verifyResultsSimpleCoalescedGroups(expectedSum, hResult, numTiles);
     // Free all allocated memory on host and device
-    hipFree(dResult);
-    hipFree(hResult);
+    HIPCHECK(hipFree(dResult));
+    HIPCHECK(hipHostFree(hResult));
     if (useGlobalMem) {
-      hipFree(globalMem);
+      HIPCHECK(hipFree(globalMem));
     }
     delete[] expectedSum;
 
@@ -363,7 +357,7 @@ static void test_shfl_any_to_any() {
 
     int arrSize = blockSize * threadsPerBlock * sizeof(int);
 
-    hipHostMalloc(&hPtr, arrSize);
+    HIPCHECK(hipHostMalloc(&hPtr, arrSize));
     // Fill up the array
     for (int i = 0; i < WAVE_SIZE; i++) {
       hPtr[i] = rand() % 1000;
@@ -382,36 +376,36 @@ static void test_shfl_any_to_any() {
     }
 
     //printf("Array passed to GPU for computation\n");
-    //printResults(hPtr, WAVE_SIZE);
-    hipMalloc(&dPtr, group_size_in_bytes);
-    hipMalloc(&dResults, group_size_in_bytes);
+    //printResultsSimpleCoalescedGroups(hPtr, WAVE_SIZE);
+    HIPCHECK(hipMalloc(&dPtr, group_size_in_bytes));
+    HIPCHECK(hipMalloc(&dResults, group_size_in_bytes));
 
-    hipMalloc(&dsrcArr, group_size_in_bytes);
-    hipMemcpy(dsrcArr, srcArr, group_size_in_bytes, hipMemcpyHostToDevice);
+    HIPCHECK(hipMalloc(&dsrcArr, group_size_in_bytes));
+    HIPCHECK(hipMemcpy(dsrcArr, srcArr, group_size_in_bytes, hipMemcpyHostToDevice));
 
-    hipMemcpy(dPtr, hPtr, group_size_in_bytes, hipMemcpyHostToDevice);
+    HIPCHECK(hipMemcpy(dPtr, hPtr, group_size_in_bytes, hipMemcpyHostToDevice));
     // Launch Kernel
     hipLaunchKernelGGL(kernel_shfl_any_to_any, blockSize, threadsPerBlock,
                        threadsPerBlock * sizeof(int), 0 , dPtr, dsrcArr, dResults, i);
     HIP_CHECK(hipGetLastError()); 
-    hipMemcpy(hPtr, dResults, group_size_in_bytes, hipMemcpyDeviceToHost);
+    HIPCHECK(hipMemcpy(hPtr, dResults, group_size_in_bytes, hipMemcpyDeviceToHost));
     err = hipDeviceSynchronize();
     if (err != hipSuccess) {
       fprintf(stderr, "Failed to launch kernel (error code %s)!\n", hipGetErrorString(err));
     }
     //printf("GPU results: \n");
-    //printResults(hPtr, group_size);
+    //printResultsSimpleCoalescedGroups(hPtr, group_size);
     //printf("Printing cpu to be verified array\n");
-    //printResults(cpuResultsArr, group_size);
+    //printResultsSimpleCoalescedGroups(cpuResultsArr, group_size);
     //printf("Printing srcLane array that was passed\n");
-    //printResults(srcArr, group_size);
+    //printResultsSimpleCoalescedGroups(srcArr, group_size);
     //printf("Printing srcLane array on the CPU\n");
-    //printResults(srcArrCpu, group_size);
-    compareResults(hPtr, cpuResultsArr, group_size_in_bytes);
+    //printResultsSimpleCoalescedGroups(srcArrCpu, group_size);
+    compareResultsSimpleCoalescedGroups(hPtr, cpuResultsArr, group_size_in_bytes);
     std::cout << "Results verified!\n";
 
-    hipFree(hPtr);
-    hipFree(dPtr);
+    HIPCHECK(hipHostFree(hPtr));
+    HIPCHECK(hipFree(dPtr));
     free(srcArr);
     free(srcArrCpu);
     free(cpuResultsArr);
@@ -440,7 +434,7 @@ static void test_shfl_broadcast() {
 
     int arrSize = blockSize * threadsPerBlock * sizeof(int);
 
-    hipHostMalloc(&hPtr, arrSize);
+    HIPCHECK(hipHostMalloc(&hPtr, arrSize));
     // Fill up the array
     for (int i = 0; i < WAVE_SIZE; i++) {
       hPtr[i] = rand() % 1000;
@@ -455,30 +449,30 @@ static void test_shfl_broadcast() {
       cpuResultsArr[i] = srcLaneCpu;
     }
     printf("Array passed to GPU for computation\n");
-    printResults(hPtr, WAVE_SIZE);
-    hipMalloc(&dPtr, group_size_in_bytes);
-    hipMalloc(&dResults, group_size_in_bytes);
+    printResultsSimpleCoalescedGroups(hPtr, WAVE_SIZE);
+    HIPCHECK(hipMalloc(&dPtr, group_size_in_bytes));
+    HIPCHECK(hipMalloc(&dResults, group_size_in_bytes));
 
-    hipMemcpy(dPtr, hPtr, group_size_in_bytes, hipMemcpyHostToDevice);
+    HIPCHECK(hipMemcpy(dPtr, hPtr, group_size_in_bytes, hipMemcpyHostToDevice));
     // Launch Kernel
     hipLaunchKernelGGL(kernel_shfl, blockSize, threadsPerBlock,
                        threadsPerBlock * sizeof(int), 0, dPtr, dResults, srcLane, i);
     HIP_CHECK(hipGetLastError()); 
-    hipMemcpy(hPtr, dResults, group_size_in_bytes, hipMemcpyDeviceToHost);
+    HIPCHECK(hipMemcpy(hPtr, dResults, group_size_in_bytes, hipMemcpyDeviceToHost));
     err = hipDeviceSynchronize();
     if (err != hipSuccess) {
       fprintf(stderr, "Failed to launch kernel (error code %s)!\n", hipGetErrorString(err));
     }
     printf("GPU results: \n");
-    printResults(hPtr, group_size);
+    printResultsSimpleCoalescedGroups(hPtr, group_size);
     printf("Printing cpu to be verified array\n");
-    printResults(cpuResultsArr, group_size);
+    printResultsSimpleCoalescedGroups(cpuResultsArr, group_size);
 
-    compareResults(hPtr, cpuResultsArr, group_size_in_bytes);
+    compareResultsSimpleCoalescedGroups(hPtr, cpuResultsArr, group_size_in_bytes);
     std::cout << "Results verified!\n";
 
-    hipFree(hPtr);
-    hipFree(dPtr);
+    HIPCHECK(hipHostFree(hPtr));
+    HIPCHECK(hipFree(dPtr));
     free(cpuResultsArr);
   }
 }
@@ -489,7 +483,6 @@ TEST_CASE("Unit_coalesced_groups") {
   HIP_CHECK(hipGetDevice(&deviceId));
   hipDeviceProp_t deviceProperties;
   HIP_CHECK(hipGetDeviceProperties(&deviceProperties, deviceId));
-  int maxThreadsPerBlock = deviceProperties.maxThreadsPerBlock;
 
   std::cout << "Now testing coalesced_groups" << '\n' << std::endl;
 
@@ -512,7 +505,7 @@ TEST_CASE("Unit_coalesced_groups") {
 
   HIP_CHECK(hipMemcpy(d_data_to_filter, data_to_filter,
                              sizeof(int) * NUM_ELEMS, hipMemcpyHostToDevice));
-  hipMemset(d_nres, 0, sizeof(int));
+  HIPCHECK(hipMemset(d_nres, 0, sizeof(int)));
 
   dim3 dimBlock(NUM_THREADS_PER_BLOCK, 1, 1);
   dim3 dimGrid((NUM_ELEMS / NUM_THREADS_PER_BLOCK) + 1, 1, 1);