diff --git a/catch/hipTestMain/main.cc b/catch/hipTestMain/main.cc
index 74096c4af4..bfd52e8d96 100644
--- a/catch/hipTestMain/main.cc
+++ b/catch/hipTestMain/main.cc
@@ -30,8 +30,11 @@ int main(int argc, char** argv) {
     | Opt(cmd_options.progress)
         ["-P"]["--progress"]
         ("Show progress bar when running performance tests")
-    | Opt(cmd_options.cg_extended_run, "cg_extened_run")
-        ["-E"]["--cg-extended-run"]
+    | Opt(cmd_options.extended_run)
+        ["-E"]["--extended-run"]
+        ("TODO: Description goes here")
+    | Opt(cmd_options.cg_iterations, "cg_iterations")
+        ["-C"]["--cg-iterations"]
         ("Number of iterations used for cooperative groups sync tests (default: 5)")
   ;
   // clang-format on
diff --git a/catch/include/cmd_options.hh b/catch/include/cmd_options.hh
index 99e565491b..6caf7a0f48 100644
--- a/catch/include/cmd_options.hh
+++ b/catch/include/cmd_options.hh
@@ -26,8 +26,9 @@ struct CmdOptions {
   int iterations = 10;
   int warmups = 100;
   int cg_extended_run = 5;
+  int cg_iterations = 5;
   bool no_display = false;
   bool progress = false;
 };
 
-extern CmdOptions cmd_options;
\ No newline at end of file
+extern CmdOptions cmd_options;
diff --git a/catch/unit/cooperativeGrps/CMakeLists.txt b/catch/unit/cooperativeGrps/CMakeLists.txt
index 84d7417796..8bf1ce439b 100644
--- a/catch/unit/cooperativeGrps/CMakeLists.txt
+++ b/catch/unit/cooperativeGrps/CMakeLists.txt
@@ -9,9 +9,11 @@ set(TEST_SRC
   hipLaunchCooperativeKernel_old.cc
   hipLaunchCooperativeKernelMultiDevice_old.cc
   multi_grid_group.cc
+  coalesced_groups_shfl_down_old.cc
+  coalesced_groups_shfl_up_old.cc
+  hipCGCoalescedGroups_old.cc
+  coalesced_group.cc
   grid_group.cc
-  coalesced_groups_shfl_down.cc
-  coalesced_groups_shfl_up.cc
   coalesced_tiled_groups_metagrp.cc
 )
 if(HIP_PLATFORM STREQUAL "nvidia")
diff --git a/catch/unit/cooperativeGrps/coalesced_group.cc b/catch/unit/cooperativeGrps/coalesced_group.cc
new file mode 100644
index 0000000000..3c5f3c9c62
--- /dev/null
+++ b/catch/unit/cooperativeGrps/coalesced_group.cc
@@ -0,0 +1,760 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+#include "cooperative_groups_common.hh"
+
+#include <cmd_options.hh>
+#include <cpu_grid.h>
+#include <resource_guards.hh>
+
+/**
+ * @addtogroup coalesced_group coalesced_group
+ * @{
+ * @ingroup DeviceLanguageTest
+ * Contains unit tests for all coalesced_group basic APIs
+ */
+
+namespace cg = cooperative_groups;
+
+template <unsigned int warp_size, typename BaseType = cg::coalesced_group>
+static __global__ void coalesced_group_size_getter(unsigned int* sizes, uint64_t active_mask) {
+  const cg::thread_block_tile<warp_size> tile =
+      cg::tiled_partition<warp_size>(cg::this_thread_block());
+  if (active_mask & (static_cast<uint64_t>(1) << tile.thread_rank())) {
+    BaseType active = cg::coalesced_threads();
+    sizes[thread_rank_in_grid()] = active.size();
+  }
+}
+
+template <unsigned int warp_size, typename BaseType = cg::coalesced_group>
+static __global__ void coalesced_group_thread_rank_getter(unsigned int* thread_ranks,
+                                                          uint64_t active_mask) {
+  const cg::thread_block_tile<warp_size> tile =
+      cg::tiled_partition<warp_size>(cg::this_thread_block());
+  if (active_mask & (static_cast<uint64_t>(1) << tile.thread_rank())) {
+    BaseType active = cg::coalesced_threads();
+    thread_ranks[thread_rank_in_grid()] = active.thread_rank();
+  }
+}
+
+template <unsigned int warp_size>
+static __global__ void coalesced_group_non_member_size_getter(unsigned int* sizes,
+                                                              uint64_t active_mask) {
+  const cg::thread_block_tile<warp_size> tile =
+      cg::tiled_partition<warp_size>(cg::this_thread_block());
+  if (active_mask & (static_cast<uint64_t>(1) << tile.thread_rank())) {
+    cg::coalesced_group active = cg::coalesced_threads();
+    sizes[thread_rank_in_grid()] = cg::group_size(active);
+  }
+}
+
+template <unsigned int warp_size>
+static __global__ void coalesced_group_non_member_thread_rank_getter(unsigned int* thread_ranks,
+                                                                     uint64_t active_mask) {
+  const cg::thread_block_tile<warp_size> tile =
+      cg::tiled_partition<warp_size>(cg::this_thread_block());
+  if (active_mask & (static_cast<uint64_t>(1) << tile.thread_rank())) {
+    cg::coalesced_group active = cg::coalesced_threads();
+    thread_ranks[thread_rank_in_grid()] = cg::thread_rank(active);
+  }
+}
+
+static unsigned int get_active_thread_count(uint64_t active_mask, unsigned int partition_size) {
+  unsigned int active_thread_count = 0;
+  for (int i = 0; i < partition_size; i++) {
+    if (active_mask & (static_cast<uint64_t>(1) << i)) active_thread_count++;
+  }
+  return active_thread_count;
+}
+
+static uint64_t get_active_mask(unsigned int test_case) {
+  uint64_t active_mask = 0;
+  switch (test_case) {
+    case 0:  // 1st thread
+      active_mask = 1;
+      break;
+    case 1:  // last thread
+      active_mask = static_cast<uint64_t>(1) << (kWarpSize - 1);
+      break;
+    case 2:  // all threads
+      active_mask = 0xFFFFFFFFFFFFFFFF;
+      break;
+    case 3:  // every second thread
+      active_mask = 0xAAAAAAAAAAAAAAAA;
+      break;
+    default:  // random
+      static std::mt19937_64 mt(test_case);
+      std::uniform_int_distribution<uint64_t> dist(0, std::numeric_limits<uint64_t>::max());
+      active_mask = dist(mt);
+  }
+  return active_mask;
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Launches kernels that write the return values of size and thread_rank member
+ * functions of coalesced groups, created according to the generated mask, to an output array that
+ * is validated on the host side. The kernels are run sequentially, reusing the output array, to
+ * avoid running out of device memory for large kernel launches
+ * Test source
+ * ------------------------
+ *  - unit/cooperativeGrps/coalesced_group.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Unit_Coalesced_Group_Getters_Positive_Basic") {
+  int device;
+  hipDeviceProp_t device_properties;
+  HIP_CHECK(hipGetDevice(&device));
+  HIP_CHECK(hipGetDeviceProperties(&device_properties, device));
+
+  if (!device_properties.cooperativeLaunch) {
+    HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
+    return;
+  }
+
+  const auto blocks = GenerateBlockDimensionsForShuffle();
+  const auto threads = GenerateThreadDimensionsForShuffle();
+  auto test_case = GENERATE(range(0, 4));
+  uint64_t active_mask = get_active_mask(test_case);
+  INFO("Grid dimensions: x " << blocks.x << ", y " << blocks.y << ", z " << blocks.z);
+  INFO("Block dimensions: x " << threads.x << ", y " << threads.y << ", z " << threads.z);
+  INFO("Coalesced group mask: " << active_mask);
+  const CPUGrid grid(blocks, threads);
+
+  LinearAllocGuard<unsigned int> uint_arr_dev(LinearAllocs::hipMalloc,
+                                              grid.thread_count_ * sizeof(unsigned int));
+  LinearAllocGuard<unsigned int> uint_arr(LinearAllocs::hipHostMalloc,
+                                          grid.thread_count_ * sizeof(unsigned int));
+  HIP_CHECK(hipMemset(uint_arr_dev.ptr(), 0, grid.thread_count_ * sizeof(unsigned int)));
+
+  // Launch Kernel
+  coalesced_group_size_getter<kWarpSize><<<blocks, threads>>>(uint_arr_dev.ptr(), active_mask);
+
+  HIP_CHECK(hipMemcpy(uint_arr.ptr(), uint_arr_dev.ptr(),
+                      grid.thread_count_ * sizeof(*uint_arr.ptr()), hipMemcpyDeviceToHost));
+  HIP_CHECK(hipMemset(uint_arr_dev.ptr(), 0, grid.thread_count_ * sizeof(unsigned int)));
+  HIP_CHECK(hipDeviceSynchronize());
+  coalesced_group_thread_rank_getter<kWarpSize>
+      <<<blocks, threads>>>(uint_arr_dev.ptr(), active_mask);
+
+  // Verify coalesced_group.size() values
+  unsigned int coalesced_size = 0;
+  const auto partitions_in_block = (grid.threads_in_block_count_ + kWarpSize - 1) / kWarpSize;
+  for (int i = 0; i < grid.thread_count_; i++) {
+    const auto rank_in_block = grid.thread_rank_in_block(i).value();
+    const int rank_in_partition = rank_in_block % kWarpSize;
+    // If the number of threads in a block is not a multiple of warp size, the
+    // last warp will have inactive threads and coalesced group size must be recalculated
+    if (rank_in_block == (partitions_in_block - 1) * kWarpSize) {
+      unsigned int partition_size =
+          grid.threads_in_block_count_ - (partitions_in_block - 1) * kWarpSize;
+      coalesced_size = get_active_thread_count(active_mask, partition_size);
+    } else if (rank_in_block == 0) {
+      coalesced_size = get_active_thread_count(active_mask, kWarpSize);
+    }
+    if (active_mask & (static_cast<uint64_t>(1) << rank_in_partition)) {
+      if (uint_arr.ptr()[i] != coalesced_size) {
+        REQUIRE(uint_arr.ptr()[i] == coalesced_size);
+      }
+    }
+  }
+
+  HIP_CHECK(hipMemcpy(uint_arr.ptr(), uint_arr_dev.ptr(),
+                      grid.thread_count_ * sizeof(*uint_arr.ptr()), hipMemcpyDeviceToHost));
+  HIP_CHECK(hipDeviceSynchronize());
+
+  // Verify coalesced_group.thread_rank() values
+  unsigned int coalesced_rank = 0;
+  for (int i = 0; i < grid.thread_count_; i++) {
+    const auto rank_in_block = grid.thread_rank_in_block(i).value();
+    const int rank_in_partition = rank_in_block % kWarpSize;
+    if (rank_in_partition == 0) coalesced_rank = 0;
+
+    if (active_mask & (static_cast<uint64_t>(1) << rank_in_partition)) {
+      if (uint_arr.ptr()[i] != coalesced_rank) {
+        REQUIRE(uint_arr.ptr()[i] == coalesced_rank);
+      }
+      coalesced_rank++;
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Launches kernels that write the return values of size and thread_rank member functions to an
+ * output array that is validated on the host side, while treating the coalesced group, created
+ * according to the generated mask, as a thread group. The kernels are run sequentially, reusing the
+ * output array, to avoid running out of device memory for large kernel launches
+ * Test source
+ * ------------------------
+ *    - unit/cooperativeGrps/coalesced_group.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Unit_Coalesced_Group_Getters_Via_Base_Type_Positive_Basic") {
+  int device;
+  hipDeviceProp_t device_properties;
+  HIP_CHECK(hipGetDevice(&device));
+  HIP_CHECK(hipGetDeviceProperties(&device_properties, device));
+
+  if (!device_properties.cooperativeLaunch) {
+    HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
+    return;
+  }
+
+  const auto blocks = GenerateBlockDimensionsForShuffle();
+  const auto threads = GenerateThreadDimensionsForShuffle();
+  auto test_case = GENERATE(range(0, 4));
+  uint64_t active_mask = get_active_mask(test_case);
+  INFO("Grid dimensions: x " << blocks.x << ", y " << blocks.y << ", z " << blocks.z);
+  INFO("Block dimensions: x " << threads.x << ", y " << threads.y << ", z " << threads.z);
+  INFO("Coalesced group mask: " << active_mask);
+
+  const CPUGrid grid(blocks, threads);
+
+  LinearAllocGuard<unsigned int> uint_arr_dev(LinearAllocs::hipMalloc,
+                                              grid.thread_count_ * sizeof(unsigned int));
+  LinearAllocGuard<unsigned int> uint_arr(LinearAllocs::hipHostMalloc,
+                                          grid.thread_count_ * sizeof(unsigned int));
+  HIP_CHECK(hipMemset(uint_arr_dev.ptr(), 0, grid.thread_count_ * sizeof(unsigned int)));
+
+  // Launch Kernel
+  coalesced_group_size_getter<kWarpSize, cg::thread_group>
+      <<<blocks, threads>>>(uint_arr_dev.ptr(), active_mask);
+
+  HIP_CHECK(hipMemcpy(uint_arr.ptr(), uint_arr_dev.ptr(),
+                      grid.thread_count_ * sizeof(*uint_arr.ptr()), hipMemcpyDeviceToHost));
+  HIP_CHECK(hipMemset(uint_arr_dev.ptr(), 0, grid.thread_count_ * sizeof(unsigned int)));
+  HIP_CHECK(hipDeviceSynchronize());
+  coalesced_group_thread_rank_getter<kWarpSize, cg::thread_group>
+      <<<blocks, threads>>>(uint_arr_dev.ptr(), active_mask);
+
+  // Verify coalesced_group.size() values
+  unsigned int coalesced_size = 0;
+  const auto partitions_in_block = (grid.threads_in_block_count_ + kWarpSize - 1) / kWarpSize;
+  for (int i = 0; i < grid.thread_count_; i++) {
+    const auto rank_in_block = grid.thread_rank_in_block(i).value();
+    const int rank_in_partition = rank_in_block % kWarpSize;
+    // If the number of threads in a block is not a multiple of warp size, the
+    // last warp will have inactive threads and coalesced group size must be recalculated
+    if (rank_in_block == (partitions_in_block - 1) * kWarpSize) {
+      unsigned int partition_size =
+          grid.threads_in_block_count_ - (partitions_in_block - 1) * kWarpSize;
+      coalesced_size = get_active_thread_count(active_mask, partition_size);
+    } else if (rank_in_block == 0) {
+      coalesced_size = get_active_thread_count(active_mask, kWarpSize);
+    }
+    if (active_mask & (static_cast<uint64_t>(1) << rank_in_partition)) {
+      if (uint_arr.ptr()[i] != coalesced_size) {
+        REQUIRE(uint_arr.ptr()[i] == coalesced_size);
+      }
+    }
+  }
+
+  HIP_CHECK(hipMemcpy(uint_arr.ptr(), uint_arr_dev.ptr(),
+                      grid.thread_count_ * sizeof(*uint_arr.ptr()), hipMemcpyDeviceToHost));
+  HIP_CHECK(hipDeviceSynchronize());
+
+  // Verify coalesced_group.thread_rank() values
+  unsigned int coalesced_rank = 0;
+  for (int i = 0; i < grid.thread_count_; i++) {
+    const auto rank_in_block = grid.thread_rank_in_block(i).value();
+    const int rank_in_partition = rank_in_block % kWarpSize;
+    if (rank_in_partition == 0) coalesced_rank = 0;
+
+    if (active_mask & (static_cast<uint64_t>(1) << rank_in_partition)) {
+      if (uint_arr.ptr()[i] != coalesced_rank) {
+        REQUIRE(uint_arr.ptr()[i] == coalesced_rank);
+      }
+      coalesced_rank++;
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Launches kernels that write the return values of size and thread_rank non-member functions
+ * of coalesced groups, created according to the generated mask, to an output array that is
+ * validated on the host side. The kernels are run sequentially, reusing the output array, to avoid
+ * running out of device memory for large kernel launches.
+ * Test source
+ * ------------------------
+ *  - unit/cooperativeGrps/coalesced_group.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Unit_Coalesced_Group_Getters_Via_Non_Member_Functions_Positive_Basic") {
+  int device;
+  hipDeviceProp_t device_properties;
+  HIP_CHECK(hipGetDevice(&device));
+  HIP_CHECK(hipGetDeviceProperties(&device_properties, device));
+
+  if (!device_properties.cooperativeLaunch) {
+    HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
+    return;
+  }
+
+  const auto blocks = GenerateBlockDimensionsForShuffle();
+  const auto threads = GenerateThreadDimensionsForShuffle();
+  auto test_case = GENERATE(range(0, 4));
+  uint64_t active_mask = get_active_mask(test_case);
+  INFO("Grid dimensions: x " << blocks.x << ", y " << blocks.y << ", z " << blocks.z);
+  INFO("Block dimensions: x " << threads.x << ", y " << threads.y << ", z " << threads.z);
+  INFO("Coalesced group mask: " << active_mask);
+
+  const CPUGrid grid(blocks, threads);
+
+  LinearAllocGuard<unsigned int> uint_arr_dev(LinearAllocs::hipMalloc,
+                                              grid.thread_count_ * sizeof(unsigned int));
+  LinearAllocGuard<unsigned int> uint_arr(LinearAllocs::hipHostMalloc,
+                                          grid.thread_count_ * sizeof(unsigned int));
+  HIP_CHECK(hipMemset(uint_arr_dev.ptr(), 0, grid.thread_count_ * sizeof(unsigned int)));
+
+  // Launch Kernel
+  coalesced_group_non_member_size_getter<kWarpSize>
+      <<<blocks, threads>>>(uint_arr_dev.ptr(), active_mask);
+
+  HIP_CHECK(hipMemcpy(uint_arr.ptr(), uint_arr_dev.ptr(),
+                      grid.thread_count_ * sizeof(*uint_arr.ptr()), hipMemcpyDeviceToHost));
+  HIP_CHECK(hipMemset(uint_arr_dev.ptr(), 0, grid.thread_count_ * sizeof(unsigned int)));
+  HIP_CHECK(hipDeviceSynchronize());
+  coalesced_group_non_member_thread_rank_getter<kWarpSize>
+      <<<blocks, threads>>>(uint_arr_dev.ptr(), active_mask);
+
+  // Verify coalesced_group.size() values
+  unsigned int coalesced_size = 0;
+  const auto partitions_in_block = (grid.threads_in_block_count_ + kWarpSize - 1) / kWarpSize;
+  for (int i = 0; i < grid.thread_count_; i++) {
+    const auto rank_in_block = grid.thread_rank_in_block(i).value();
+    const int rank_in_partition = rank_in_block % kWarpSize;
+    // If the number of threads in a block is not a multiple of warp size, the
+    // last warp will have inactive threads and coalesced group size must be recalculated
+    if (rank_in_block == (partitions_in_block - 1) * kWarpSize) {
+      unsigned int partition_size =
+          grid.threads_in_block_count_ - (partitions_in_block - 1) * kWarpSize;
+      coalesced_size = get_active_thread_count(active_mask, partition_size);
+    } else if (rank_in_block == 0) {
+      coalesced_size = get_active_thread_count(active_mask, kWarpSize);
+    }
+    if (active_mask & (static_cast<uint64_t>(1) << rank_in_partition)) {
+      if (uint_arr.ptr()[i] != coalesced_size) {
+        REQUIRE(uint_arr.ptr()[i] == coalesced_size);
+      }
+    }
+  }
+
+  HIP_CHECK(hipMemcpy(uint_arr.ptr(), uint_arr_dev.ptr(),
+                      grid.thread_count_ * sizeof(*uint_arr.ptr()), hipMemcpyDeviceToHost));
+  HIP_CHECK(hipDeviceSynchronize());
+
+  // Verify coalesced_group.thread_rank() values
+  unsigned int coalesced_rank = 0;
+  for (int i = 0; i < grid.thread_count_; i++) {
+    const auto rank_in_block = grid.thread_rank_in_block(i).value();
+    const int rank_in_partition = rank_in_block % kWarpSize;
+    if (rank_in_partition == 0) coalesced_rank = 0;
+
+    if (active_mask & (static_cast<uint64_t>(1) << rank_in_partition)) {
+      if (uint_arr.ptr()[i] != coalesced_rank) {
+        REQUIRE(uint_arr.ptr()[i] == coalesced_rank);
+      }
+      coalesced_rank++;
+    }
+  }
+}
+
+template <typename T, unsigned int warp_size>
+__global__ void coalesced_group_shfl_up(T* const out, const unsigned int delta,
+                                        const uint64_t active_mask) {
+  const cg::thread_block_tile<warp_size> tile =
+      cg::tiled_partition<warp_size>(cg::this_thread_block());
+  if (active_mask & (static_cast<uint64_t>(1) << tile.thread_rank())) {
+    cg::coalesced_group active = cg::coalesced_threads();
+    T var = static_cast<T>(active.thread_rank());
+    out[thread_rank_in_grid()] = active.shfl_up(var, delta);
+  }
+}
+
+template <typename T> void CoalescedGroupShflUpTestImpl() {
+  const auto blocks = GenerateBlockDimensionsForShuffle();
+  const auto threads = GenerateThreadDimensionsForShuffle();
+  auto test_case = GENERATE(range(0, 4));
+  uint64_t active_mask = get_active_mask(test_case);
+  INFO("Grid dimensions: x " << blocks.x << ", y " << blocks.y << ", z " << blocks.z);
+  INFO("Block dimensions: x " << threads.x << ", y " << threads.y << ", z " << threads.z);
+  INFO("Coalesced group mask: " << active_mask);
+  unsigned int active_thread_count = get_active_thread_count(active_mask, kWarpSize);
+
+  auto delta = GENERATE(range(static_cast<size_t>(0), kWarpSize));
+  delta = delta % active_thread_count;
+  INFO("Delta: " << delta);
+  CPUGrid grid(blocks, threads);
+
+  const auto alloc_size = grid.thread_count_ * sizeof(T);
+  LinearAllocGuard<T> arr_dev(LinearAllocs::hipMalloc, alloc_size);
+  LinearAllocGuard<T> arr(LinearAllocs::hipHostMalloc, alloc_size);
+
+  coalesced_group_shfl_up<T, kWarpSize><<<blocks, threads>>>(arr_dev.ptr(), delta, active_mask);
+  HIP_CHECK(hipMemcpy(arr.ptr(), arr_dev.ptr(), alloc_size, hipMemcpyDeviceToHost));
+  HIP_CHECK(hipDeviceSynchronize());
+
+  unsigned int coalesced_rank = 0;
+  for (int i = 0; i < grid.thread_count_; i++) {
+    const auto rank_in_block = grid.thread_rank_in_block(i).value();
+    const int rank_in_partition = rank_in_block % kWarpSize;
+    if (rank_in_partition == 0) coalesced_rank = 0;
+    if (active_mask & (static_cast<uint64_t>(1) << rank_in_partition)) {
+      int target = coalesced_rank - delta;
+      target = target < 0 ? coalesced_rank : target;
+      if (arr.ptr()[i] != target) {
+        REQUIRE(arr.ptr()[i] == target);
+      }
+      coalesced_rank++;
+    }
+  }
+}
+
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Validates the shuffle up behavior of coalesced group, created according to the generated
+ * mask, for various delta values
+ * Test source
+ * ------------------------
+ *    - unit/cooperativeGrps/coalesced_group.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_Coalesced_Group_Shfl_Up_Positive_Basic", "", int, unsigned int, long,
+                   unsigned long, long long, unsigned long long, float, double) {
+  CoalescedGroupShflUpTestImpl<TestType>();
+}
+
+template <typename T, unsigned int warp_size>
+__global__ void coalesced_group_shfl_down(T* const out, const unsigned int delta,
+                                          const uint64_t active_mask) {
+  const cg::thread_block_tile<warp_size> tile =
+      cg::tiled_partition<warp_size>(cg::this_thread_block());
+  if (active_mask & (static_cast<uint64_t>(1) << tile.thread_rank())) {
+    cg::coalesced_group active = cg::coalesced_threads();
+    T var = static_cast<T>(active.thread_rank());
+    out[thread_rank_in_grid()] = active.shfl_down(var, delta);
+  }
+}
+
+template <typename T> void CoalescedGroupShflDownTest() {
+  const auto blocks = GenerateBlockDimensionsForShuffle();
+  const auto threads = GenerateThreadDimensionsForShuffle();
+  auto test_case = GENERATE(range(0, 4));
+  uint64_t active_mask = get_active_mask(test_case);
+  INFO("Grid dimensions: x " << blocks.x << ", y " << blocks.y << ", z " << blocks.z);
+  INFO("Block dimensions: x " << threads.x << ", y " << threads.y << ", z " << threads.z);
+  INFO("Coalesced group mask: " << active_mask);
+  unsigned int active_thread_count = get_active_thread_count(active_mask, kWarpSize);
+
+  auto delta = GENERATE(range(static_cast<size_t>(0), kWarpSize));
+  delta = delta % active_thread_count;
+  INFO("Delta: " << delta);
+  CPUGrid grid(blocks, threads);
+
+  const auto alloc_size = grid.thread_count_ * sizeof(T);
+  LinearAllocGuard<T> arr_dev(LinearAllocs::hipMalloc, alloc_size);
+  LinearAllocGuard<T> arr(LinearAllocs::hipHostMalloc, alloc_size);
+
+  coalesced_group_shfl_down<T, kWarpSize><<<blocks, threads>>>(arr_dev.ptr(), delta, active_mask);
+  HIP_CHECK(hipMemcpy(arr.ptr(), arr_dev.ptr(), alloc_size, hipMemcpyDeviceToHost));
+  HIP_CHECK(hipDeviceSynchronize());
+
+  unsigned int coalesced_rank = 0;
+  unsigned int coalesced_size = 0;
+  const auto partitions_in_block = (grid.threads_in_block_count_ + kWarpSize - 1) / kWarpSize;
+  for (int i = 0; i < grid.thread_count_; i++) {
+    const auto rank_in_block = grid.thread_rank_in_block(i).value();
+    const int rank_in_partition = rank_in_block % kWarpSize;
+    if (rank_in_partition == 0) coalesced_rank = 0;
+    // If the number of threads in a block is not a multiple of warp size, the
+    // last warp will have inactive threads and coalesced group size must be recalculated
+    if (rank_in_block == (partitions_in_block - 1) * kWarpSize) {
+      unsigned int partition_size =
+          grid.threads_in_block_count_ - (partitions_in_block - 1) * kWarpSize;
+      coalesced_size = get_active_thread_count(active_mask, partition_size);
+    } else if (rank_in_block == 0) {
+      coalesced_size = get_active_thread_count(active_mask, kWarpSize);
+    }
+    if (active_mask & (static_cast<uint64_t>(1) << rank_in_partition)) {
+      int target = coalesced_rank + delta;
+      target = target < coalesced_size ? target : coalesced_rank;
+      if (arr.ptr()[i] != target) {
+        REQUIRE(arr.ptr()[i] == target);
+      }
+      coalesced_rank++;
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Validates the shuffle down behavior of coalesced group, created according to the generated
+ * mask, for various delta values
+ * Test source
+ * ------------------------
+ *    - unit/cooperativeGrps/coalesced_group.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_Coalesced_Group_Shfl_Down_Positive_Basic", "", int, unsigned int, long,
+                   unsigned long, long long, unsigned long long, float, double) {
+  CoalescedGroupShflDownTest<TestType>();
+}
+
+template <typename T, unsigned int warp_size>
+__global__ void coalesced_group_shfl(T* const out, uint8_t* target_lanes,
+                                     const uint64_t active_mask) {
+  const cg::thread_block_tile<warp_size> tile =
+      cg::tiled_partition<warp_size>(cg::this_thread_block());
+  if (active_mask & (static_cast<uint64_t>(1) << tile.thread_rank())) {
+    cg::coalesced_group active = cg::coalesced_threads();
+    T var = static_cast<T>(active.thread_rank());
+    out[thread_rank_in_grid()] = active.shfl(var, target_lanes[active.thread_rank()]);
+    ;
+  }
+}
+
+template <typename T> void CoalescedGroupShflTest() {
+  const auto blocks = GenerateBlockDimensionsForShuffle();
+  const auto threads = GenerateThreadDimensionsForShuffle();
+  auto test_case = GENERATE(range(0, 4));
+  uint64_t active_mask = get_active_mask(test_case);
+  INFO("Grid dimensions: x " << blocks.x << ", y " << blocks.y << ", z " << blocks.z);
+  INFO("Block dimensions: x " << threads.x << ", y " << threads.y << ", z " << threads.z);
+  INFO("Coalesced group mask: " << active_mask);
+  unsigned int active_thread_count = get_active_thread_count(active_mask, kWarpSize);
+  CPUGrid grid(blocks, threads);
+
+  const auto alloc_size = grid.thread_count_ * sizeof(T);
+  LinearAllocGuard<T> arr_dev(LinearAllocs::hipMalloc, alloc_size);
+  LinearAllocGuard<T> arr(LinearAllocs::hipHostMalloc, alloc_size);
+
+  LinearAllocGuard<uint8_t> target_lanes_dev(LinearAllocs::hipMalloc,
+                                             active_thread_count * sizeof(uint8_t));
+  LinearAllocGuard<uint8_t> target_lanes(LinearAllocs::hipHostMalloc,
+                                         active_thread_count * sizeof(uint8_t));
+  // Generate a couple different combinations for target lanes
+  for (auto i = 0u; i < active_thread_count; ++i) {
+    target_lanes.ptr()[i] = active_thread_count - 1 - i;
+  }
+
+  HIP_CHECK(hipMemcpy(target_lanes_dev.ptr(), target_lanes.ptr(),
+                      active_thread_count * sizeof(uint8_t), hipMemcpyHostToDevice));
+  coalesced_group_shfl<T, kWarpSize>
+      <<<blocks, threads>>>(arr_dev.ptr(), target_lanes_dev.ptr(), active_mask);
+  HIP_CHECK(hipMemcpy(arr.ptr(), arr_dev.ptr(), alloc_size, hipMemcpyDeviceToHost));
+  HIP_CHECK(hipDeviceSynchronize());
+
+
+  unsigned int coalesced_rank = 0;
+  unsigned int coalesced_size = 0;
+  const auto partitions_in_block = (grid.threads_in_block_count_ + kWarpSize - 1) / kWarpSize;
+  for (int i = 0; i < grid.thread_count_; i++) {
+    const auto rank_in_block = grid.thread_rank_in_block(i).value();
+    const int rank_in_partition = rank_in_block % kWarpSize;
+    if (rank_in_partition == 0) coalesced_rank = 0;
+    // If the number of threads in a block is not a multiple of warp size, the
+    // last warp will have inactive threads and coalesced group size must be recalculated
+    if (rank_in_block == (partitions_in_block - 1) * kWarpSize) {
+      unsigned int partition_size =
+          grid.threads_in_block_count_ - (partitions_in_block - 1) * kWarpSize;
+      coalesced_size = get_active_thread_count(active_mask, partition_size);
+    } else if (rank_in_block == 0) {
+      coalesced_size = get_active_thread_count(active_mask, kWarpSize);
+    }
+    if (active_mask & (static_cast<uint64_t>(1) << rank_in_partition)) {
+      auto target = target_lanes.ptr()[coalesced_rank];
+      if (target >= coalesced_size) target = 0;
+      if (arr.ptr()[i] != target) {
+        REQUIRE(arr.ptr()[i] == target);
+      }
+      coalesced_rank++;
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Validates the shuffle behavior of of coalesced group, created according to the generated
+ * mask, for generated shuffle target lanes
+ * Test source
+ * ------------------------
+ *    - unit/cooperativeGrps/coalesced_group.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_Coalesced_Group_Shfl_Positive_Basic", "", int, unsigned int, long,
+                   unsigned long, long long, unsigned long long, float, double) {
+  CoalescedGroupShflTest<TestType>();
+}
+
+static inline std::mt19937& GetRandomGenerator() {
+  static std::mt19937 mt(11);
+  return mt;
+}
+
+template <typename T> static inline T GenerateRandomInteger(const T min, const T max) {
+  std::uniform_int_distribution<T> dist(min, max);
+  return dist(GetRandomGenerator());
+}
+
+template <bool use_global, size_t warp_size, typename T>
+__global__ void coalesced_group_sync_check(T* global_data, unsigned int* wait_modifiers,
+                                           const uint64_t active_mask) {
+  extern __shared__ uint8_t shared_data[];
+  T* const data = use_global ? global_data : reinterpret_cast<T*>(shared_data);
+  const auto tid = cg::this_grid().thread_rank();
+  const auto block = cg::this_thread_block();
+  const cg::thread_block_tile<warp_size> partition = cg::tiled_partition<warp_size>(block);
+
+  const auto data_idx = [&block](unsigned int i) { return use_global ? i : (i % block.size()); };
+
+  const auto partition_rank = block.thread_rank() / partition.size();
+
+  const auto block_base_idx = tid / block.size() * block.size();
+  const auto tile_base_idx = block_base_idx + partition_rank * partition.size();
+  const auto wait_modifier = wait_modifiers[tid];
+
+  if (active_mask & (static_cast<uint64_t>(1) << partition.thread_rank())) {
+    cg::coalesced_group active = cg::coalesced_threads();
+    busy_wait(wait_modifier);
+    data[data_idx(tid)] = active.thread_rank();
+    active.sync();
+    bool valid = true;
+    for (auto i = 0; i < active.size(); ++i) {
+      const auto expected = (active.thread_rank() + i) % active.size();
+      unsigned int active_count = 0;
+      int offset = -1;
+      while (active_count <= expected) {
+        offset++;
+        if (active_mask & (static_cast<uint64_t>(1) << offset)) active_count++;
+      }
+
+      if (!(valid &= (data[data_idx(tile_base_idx + offset)] == expected))) {
+        break;
+      }
+    }
+    active.sync();
+    data[data_idx(tid)] = valid;
+
+    if constexpr (!use_global) {
+      global_data[tid] = data[data_idx(tid)];
+    }
+  }
+}
+
+template <bool global_memory, typename T> void CoalescedGroupSyncTest() {
+  const auto randomized_run_count = GENERATE(range(0, cmd_options.cg_iterations));
+  const auto blocks = GenerateBlockDimensionsForShuffle();
+  const auto threads = GenerateThreadDimensionsForShuffle();
+  auto test_case = GENERATE(range(0, 4));
+  uint64_t active_mask = get_active_mask(test_case);
+  INFO("Grid dimensions: x " << blocks.x << ", y " << blocks.y << ", z " << blocks.z);
+  INFO("Block dimensions: x " << threads.x << ", y " << threads.y << ", z " << threads.z);
+  INFO("Coalesced group mask: " << active_mask);
+  CPUGrid grid(blocks, threads);
+
+  const auto alloc_size = grid.thread_count_ * sizeof(T);
+  const auto alloc_size_per_block = alloc_size / grid.block_count_;
+
+  int max_shared_mem_per_block = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&max_shared_mem_per_block,
+                                  hipDeviceAttributeMaxSharedMemoryPerBlock, 0));
+  if (!global_memory && (max_shared_mem_per_block < alloc_size_per_block)) {
+    return;
+  }
+
+  LinearAllocGuard<T> arr_dev(LinearAllocs::hipMalloc, alloc_size);
+  LinearAllocGuard<T> arr(LinearAllocs::hipHostMalloc, alloc_size);
+  LinearAllocGuard<unsigned int> wait_modifiers_dev(LinearAllocs::hipMalloc,
+                                                    grid.thread_count_ * sizeof(unsigned int));
+  LinearAllocGuard<unsigned int> wait_modifiers(LinearAllocs::hipHostMalloc,
+                                                grid.thread_count_ * sizeof(unsigned int));
+  if (randomized_run_count != 0) {
+    std::generate(wait_modifiers.ptr(), wait_modifiers.ptr() + grid.thread_count_,
+                  [] { return GenerateRandomInteger(0u, 1500u); });
+  } else {
+    std::fill_n(wait_modifiers.ptr(), grid.thread_count_, 0u);
+  }
+
+  const auto shared_memory_size = global_memory ? 0u : alloc_size_per_block;
+  HIP_CHECK(hipMemcpy(wait_modifiers_dev.ptr(), wait_modifiers.ptr(),
+                      grid.thread_count_ * sizeof(unsigned int), hipMemcpyHostToDevice));
+
+  coalesced_group_sync_check<global_memory, kWarpSize><<<blocks, threads, shared_memory_size>>>(
+      arr_dev.ptr(), wait_modifiers_dev.ptr(), active_mask);
+  HIP_CHECK(hipGetLastError());
+
+  HIP_CHECK(hipMemcpy(arr.ptr(), arr_dev.ptr(), alloc_size, hipMemcpyDeviceToHost));
+  HIP_CHECK(hipDeviceSynchronize());
+
+  for (int i = 0; i < grid.thread_count_; i++) {
+    const auto rank_in_block = grid.thread_rank_in_block(i).value();
+    const int rank_in_partition = rank_in_block % kWarpSize;
+    if (active_mask & (static_cast<uint64_t>(1) << rank_in_partition)) {
+      if (arr.ptr()[i] != 1) {
+        REQUIRE(arr.ptr()[i] == 1);
+      }
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Launches a kernel where blocks are devided into coalesced groups and every thread writes its
+ * intra-tile rank into an array slot determined by its grid-wide linear index. The array is either
+ * in global or dynamic shared memory based on a compile time switch, and the test is run for arrays
+ * of 1, 2, and 4 byte elements. Before the write each thread executes a busy wait loop for a random
+ * amount of clock cycles, the amount being read from an input array. After the write a sync for
+ * active threads is performed and each thread validates that it can read the expected values that
+ * other active threads within the same coalesced group have written to their respective array
+ * slots. Each thread begins the validation from a given offset from its own index. For the first
+ * run of the test, all the offsets are zero, so memory reads should be coalesced as adjacent
+ * threads read from adjacent memory locations. On subsequent runs the offsets are randomized for
+ * each thread, leading to non-coalesced reads and cache thrashing.
+ * Test source
+ * ------------------------
+ *    - unit/cooperativeGrps/coalesced_group.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_Coalesced_Group_Sync_Positive_Basic", "", uint8_t, uint16_t, uint32_t) {
+  SECTION("Global memory") { CoalescedGroupSyncTest<true, TestType>(); }
+  SECTION("Shared memory") { CoalescedGroupSyncTest<false, TestType>(); }
+}
\ No newline at end of file
diff --git a/catch/unit/cooperativeGrps/coalesced_groups_shfl_down.cc b/catch/unit/cooperativeGrps/coalesced_groups_shfl_down_old.cc
similarity index 100%
rename from catch/unit/cooperativeGrps/coalesced_groups_shfl_down.cc
rename to catch/unit/cooperativeGrps/coalesced_groups_shfl_down_old.cc
diff --git a/catch/unit/cooperativeGrps/coalesced_groups_shfl_up.cc b/catch/unit/cooperativeGrps/coalesced_groups_shfl_up_old.cc
similarity index 100%
rename from catch/unit/cooperativeGrps/coalesced_groups_shfl_up.cc
rename to catch/unit/cooperativeGrps/coalesced_groups_shfl_up_old.cc