SWDEV-348282 - fixed intermittent failure fo hipCGMultiGrid* and hipCGThread* tests, porting over to catch2 in the process (#3067)

Change-Id: Id22dfb4ab2ee8171242fbf6a10886ff3e4abc926 [ROCm/hip-tests commit: 74a6e407e7]
2022-11-19 01:44:12 +05:30
@@ -33,6 +33,7 @@ add_subdirectory(kernel)
 add_subdirectory(multiThread)
 add_subdirectory(compiler)
 add_subdirectory(errorHandling)
+add_subdirectory(cooperativeGrps)
 if(HIP_PLATFORM STREQUAL "amd")
 add_subdirectory(clock)
 endif()
@@ -0,0 +1,22 @@
+# Common Tests - Test independent of all platforms
+set(TEST_SRC
+  hipCGThreadBlockType.cc
+  hipCGThreadBlockTypeViaBaseType.cc
+  hipCGThreadBlockTypeViaPublicApi.cc
+  hipCGMultiGridGroupType.cc
+  hipCGMultiGridGroupTypeViaBaseType.cc
+  hipCGMultiGridGroupTypeViaPublicApi.cc
+)
+if(HIP_PLATFORM STREQUAL "nvidia")
+  set_source_files_properties(hipCGMultiGridGroupType.cc PROPERTIES COMPILE_FLAGS "-rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80")
+  set_source_files_properties(hipCGMultiGridGroupTypeViaBaseType.cc PROPERTIES COMPILE_FLAGS "-D_CG_ABI_EXPERIMENTAL -rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80")
+  set_source_files_properties(hipCGMultiGridGroupTypeViaPublicApi.cc PROPERTIES COMPILE_FLAGS "-rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80")
+  hip_add_exe_to_target(NAME coopGrpTest
+                      TEST_SRC ${TEST_SRC}
+                      TEST_TARGET_NAME build_tests
+                      LINKER_LIBS "-rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80")
+else()
+  hip_add_exe_to_target(NAME coopGrpTest
+                      TEST_SRC ${TEST_SRC}
+                      TEST_TARGET_NAME build_tests)
+endif()
@@ -0,0 +1,240 @@
+/*
+Copyright (c) 2020 - 2021 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+
+/* HIT_START
+ * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 -rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80
+ * TEST: %t
+ * HIT_END
+ */
+
+#include <hip_test_common.hh>
+#include <hip/hip_cooperative_groups.h>
+
+#define ASSERT_EQUAL(lhs, rhs) HIPASSERT(lhs == rhs)
+#define ASSERT_LE(lhs, rhs) HIPASSERT(lhs <= rhs)
+#define ASSERT_GE(lhs, rhs) HIPASSERT(lhs >= rhs)
+
+using namespace cooperative_groups;
+constexpr int MaxGPUs = 8;
+
+static __global__
+void kernel_cg_multi_grid_group_type(int* numGridsTestD,
+                                     int* gridRankTestD,
+                                     int *sizeTestD,
+                                     int *thdRankTestD,
+                                     int *isValidTestD,
+                                     int *syncTestD,
+                                     int *syncResultD)
+{
+  multi_grid_group mg = this_multi_grid();
+  int gIdx = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+  // Test num_grids
+  numGridsTestD[gIdx] = mg.num_grids();
+
+  // Test grid_rank
+  gridRankTestD[gIdx] = mg.grid_rank();
+
+  // Test size
+  sizeTestD[gIdx] = mg.size();
+
+  // Test thread_rank
+  thdRankTestD[gIdx] = mg.thread_rank();
+
+  // Test is_valid
+  isValidTestD[gIdx] = mg.is_valid();
+
+  // Test sync
+  //
+  // Eech thread assign 1 to their respective location
+  syncTestD[gIdx] = 1;
+  // Grid level sync
+  this_grid().sync();
+  // Thread 0 from work-group 0 of current grid (gpu) does grid level reduction
+  if (blockIdx.x == 0 && threadIdx.x == 0) {
+    for (uint i = 1; i < gridDim.x * blockDim.x; ++i) {
+      syncTestD[0] += syncTestD[i];
+    }
+    syncResultD[mg.grid_rank() + 1] = syncTestD[0];
+  }
+  // multi-grid level sync
+  mg.sync();
+  // grid (gpu) 0 does final reduction across all grids (gpus)
+  if (mg.grid_rank() == 0 && blockIdx.x == 0 && threadIdx.x == 0) {
+    syncResultD[0] = 0;
+    for (uint i = 1; i <= mg.num_grids(); ++i) {
+      syncResultD[0] += syncResultD[i];
+    }
+  }
+}
+
+static void test_cg_multi_grid_group_type(int blockSize, int nGpu)
+{
+  // Create a stream each device
+  hipStream_t stream[MaxGPUs];
+  for (int i = 0; i < nGpu; i++) {
+    HIPCHECK(hipSetDevice(i));
+    HIPCHECK(hipDeviceSynchronize());  // Make sure work is done on this device
+    HIPCHECK(hipStreamCreate(&stream[i]));
+  }
+
+  // Allocate host and device memory
+  int nBytes = sizeof(int) * 2 * blockSize;
+  int *numGridsTestD[MaxGPUs], *numGridsTestH[MaxGPUs];
+  int *gridRankTestD[MaxGPUs], *gridRankTestH[MaxGPUs];
+  int *sizeTestD[MaxGPUs], *sizeTestH[MaxGPUs];
+  int *thdRankTestD[MaxGPUs], *thdRankTestH[MaxGPUs];
+  int *isValidTestD[MaxGPUs], *isValidTestH[MaxGPUs];
+  int *syncTestD[MaxGPUs], *syncResultD;
+  for (int i = 0; i < nGpu; i++) {
+    HIPCHECK(hipSetDevice(i));
+
+    HIPCHECK(hipMalloc(&numGridsTestD[i], nBytes));
+    HIPCHECK(hipMalloc(&gridRankTestD[i], nBytes));
+    HIPCHECK(hipMalloc(&sizeTestD[i], nBytes));
+    HIPCHECK(hipMalloc(&thdRankTestD[i], nBytes));
+    HIPCHECK(hipMalloc(&isValidTestD[i], nBytes));
+    HIPCHECK(hipMalloc(&syncTestD[i], nBytes));
+
+    HIPCHECK(hipHostMalloc(&numGridsTestH[i], nBytes));
+    HIPCHECK(hipHostMalloc(&gridRankTestH[i], nBytes));
+    HIPCHECK(hipHostMalloc(&sizeTestH[i], nBytes));
+    HIPCHECK(hipHostMalloc(&thdRankTestH[i], nBytes));
+    HIPCHECK(hipHostMalloc(&isValidTestH[i], nBytes));
+
+    if (i == 0) {
+      HIPCHECK(hipHostMalloc(&syncResultD, sizeof(int) * (nGpu + 1), hipHostMallocCoherent));
+    }
+  }
+
+  // Launch Kernel
+  constexpr int NumKernelArgs = 7;
+  hipLaunchParams* launchParamsList = new hipLaunchParams[nGpu];
+  void* args[MaxGPUs * NumKernelArgs];
+  for (int i = 0; i < nGpu; i++) {
+    HIPCHECK(hipSetDevice(i));
+
+    args[i * NumKernelArgs]     = &numGridsTestD[i];
+    args[i * NumKernelArgs + 1] = &gridRankTestD[i];
+    args[i * NumKernelArgs + 2] = &sizeTestD[i];
+    args[i * NumKernelArgs + 3] = &thdRankTestD[i];
+    args[i * NumKernelArgs + 4] = &isValidTestD[i];
+    args[i * NumKernelArgs + 5] = &syncTestD[i];
+    args[i * NumKernelArgs + 6] = &syncResultD;
+
+    launchParamsList[i].func = reinterpret_cast<void*>(kernel_cg_multi_grid_group_type);
+    launchParamsList[i].gridDim = 2;
+    launchParamsList[i].blockDim = blockSize;
+    launchParamsList[i].sharedMem = 0;
+    launchParamsList[i].stream = stream[i];
+    launchParamsList[i].args = &args[i * NumKernelArgs];
+  }
+  HIPCHECK(hipLaunchCooperativeKernelMultiDevice(launchParamsList, nGpu, 0));
+
+  // Copy result from device to host
+  for (int i = 0; i < nGpu; i++) {
+    HIPCHECK(hipSetDevice(i));
+    HIPCHECK(hipMemcpy(numGridsTestH[i], numGridsTestD[i], nBytes, hipMemcpyDeviceToHost));
+    HIPCHECK(hipMemcpy(gridRankTestH[i], gridRankTestD[i], nBytes, hipMemcpyDeviceToHost));
+    HIPCHECK(hipMemcpy(sizeTestH[i], sizeTestD[i], nBytes, hipMemcpyDeviceToHost));
+    HIPCHECK(hipMemcpy(thdRankTestH[i], thdRankTestD[i], nBytes, hipMemcpyDeviceToHost));
+    HIPCHECK(hipMemcpy(isValidTestH[i], isValidTestD[i], nBytes, hipMemcpyDeviceToHost));
+  }
+
+  // Validate results
+  int gridsSeen[MaxGPUs];
+  for (int i = 0; i < nGpu; ++i) {
+    for (int j = 0; j < 2 * blockSize; ++j) {
+      ASSERT_EQUAL(numGridsTestH[i][j], nGpu);
+      ASSERT_GE(gridRankTestH[i][j], 0);
+      ASSERT_LE(gridRankTestH[i][j], nGpu-1);
+      ASSERT_EQUAL(gridRankTestH[i][j], gridRankTestH[i][0]);
+      ASSERT_EQUAL(sizeTestH[i][j], nGpu * 2 * blockSize);
+      int gridRank = gridRankTestH[i][j];
+      ASSERT_EQUAL(thdRankTestH[i][j], (gridRank * 2 * blockSize) + j);
+      ASSERT_EQUAL(isValidTestH[i][j], 1);
+    }
+    ASSERT_EQUAL(syncResultD[i+1],  2 * blockSize);
+
+    // Validate uniqueness property of grid rank
+    gridsSeen[i] = gridRankTestH[i][0];
+    for (int k = 0; k < i; ++k) {
+      if (gridsSeen[k] == gridsSeen[i]) {
+        assert(false && "Grid rank in multi-gpu setup should be unique");
+      }
+    }
+  }
+  ASSERT_EQUAL(syncResultD[0], nGpu * 2 * blockSize);
+
+  // Free host and device memory
+  delete [] launchParamsList;
+  for (int i = 0; i < nGpu; i++) {
+    HIPCHECK(hipSetDevice(i));
+
+    HIPCHECK(hipFree(numGridsTestD[i]));
+    HIPCHECK(hipFree(gridRankTestD[i]));
+    HIPCHECK(hipFree(sizeTestD[i]));
+    HIPCHECK(hipFree(thdRankTestD[i]));
+    HIPCHECK(hipFree(isValidTestD[i]));
+    HIPCHECK(hipFree(syncTestD[i]));
+
+    if (i == 0) {
+      HIPCHECK(hipHostFree(syncResultD));
+    }
+    HIPCHECK(hipHostFree(numGridsTestH[i]));
+    HIPCHECK(hipHostFree(gridRankTestH[i]));
+    HIPCHECK(hipHostFree(sizeTestH[i]));
+    HIPCHECK(hipHostFree(thdRankTestH[i]));
+    HIPCHECK(hipHostFree(isValidTestH[i]));
+  }
+}
+
+TEST_CASE("Unit_hipCGMultiGridGroupType") {
+  int nGpu = 0;
+  HIPCHECK(hipGetDeviceCount(&nGpu));
+  nGpu = min(nGpu, MaxGPUs);
+
+  // Set `maxThreadsPerBlock` by taking minimum among all available devices
+  int maxThreadsPerBlock = INT_MAX;
+  hipDeviceProp_t deviceProperties;
+  for (int i = 0; i < nGpu; i++) {
+    HIPCHECK(hipGetDeviceProperties(&deviceProperties, i));
+    if (!deviceProperties.cooperativeMultiDeviceLaunch) {
+      HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
+      return;
+    }
+    maxThreadsPerBlock = min(maxThreadsPerBlock, deviceProperties.maxThreadsPerBlock);
+  }
+
+  // Test for blockSizes in powers of 2
+  for (int blockSize = 2; blockSize <= maxThreadsPerBlock; blockSize = blockSize*2) {
+    test_cg_multi_grid_group_type(blockSize, nGpu);
+  }
+
+  // Test for random blockSizes, but the sequence is the same every execution
+  srand(0);
+  for (int i = 0; i < 10; i++) {
+    // Test fails for 0 thread per block
+    test_cg_multi_grid_group_type(max(2, rand() % maxThreadsPerBlock), nGpu);
+  }
+}
@@ -0,0 +1,234 @@
+/*
+Copyright (c) 2020 - 2021 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+
+/* HIT_START
+ * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 -D_CG_ABI_EXPERIMENTAL -rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80
+ * TEST: %t
+ * HIT_END
+ */
+
+#include <hip_test_common.hh>
+#include <hip/hip_cooperative_groups.h>
+#include <cmath>
+#include <cstdlib>
+#include <climits>
+
+#define ASSERT_EQUAL(lhs, rhs) HIPASSERT(lhs == rhs)
+#define ASSERT_LE(lhs, rhs) HIPASSERT(lhs <= rhs)
+#define ASSERT_GE(lhs, rhs) HIPASSERT(lhs >= rhs)
+
+using namespace cooperative_groups;
+constexpr int MaxGPUs = 8;
+
+static __global__
+void kernel_cg_multi_grid_group_type_via_base_type(int *sizeTestD,
+                                                   int* gridRankTestD,
+                                                   int *thdRankTestD,
+                                                   int *isValidTestD,
+                                                   int *syncTestD,
+                                                   int *syncResultD)
+{
+  thread_group tg = this_multi_grid();  // This can work if _CG_ABI_EXPERIMENTAL defined on Cuda
+
+  int gIdx = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+  // Test size
+  sizeTestD[gIdx] = tg.size();
+
+  // Test thread_rank
+  gridRankTestD[gIdx] = this_multi_grid().grid_rank();
+  thdRankTestD[gIdx] = tg.thread_rank();
+
+  // Test is_valid
+#ifdef __HIP_PLATFORM_AMD__
+  isValidTestD[gIdx] = tg.is_valid();
+#else
+  // Cuda has no thread_group.is_valid()
+  isValidTestD[gIdx] = true;
+#endif
+  // Test sync
+  //
+  // Eech thread assign 1 to their respective location
+  syncTestD[gIdx] = 1;
+  // Grid level sync
+  this_grid().sync();
+  // Thread 0 from work-group 0 of current grid (gpu) does grid level reduction
+  if (blockIdx.x == 0 && threadIdx.x == 0) {
+    for (uint i = 1; i < gridDim.x * blockDim.x; ++i) {
+      syncTestD[0] += syncTestD[i];
+    }
+    syncResultD[this_multi_grid().grid_rank() + 1] = syncTestD[0];
+  }
+  // multi-grid level sync
+  tg.sync();
+  // grid (gpu) 0 does final reduction across all grids (gpus)
+  if (this_multi_grid().grid_rank() == 0 && blockIdx.x == 0 && threadIdx.x == 0) {
+    syncResultD[0] = 0;
+    for (uint i = 1; i <= this_multi_grid().num_grids(); ++i) {
+      syncResultD[0] += syncResultD[i];
+    }
+  }
+}
+
+static void test_cg_multi_grid_group_type_via_base_type(int blockSize, int nGpu)
+{
+  // Create a stream each device
+  hipStream_t stream[MaxGPUs];
+  for (int i = 0; i < nGpu; i++) {
+    HIPCHECK(hipSetDevice(i));
+    HIPCHECK(hipDeviceSynchronize());  // Make sure work is done on this device
+    HIPCHECK(hipStreamCreate(&stream[i]));
+  }
+
+  // Allocate host and device memory
+  int nBytes = sizeof(int) * 2 * blockSize;
+  int *sizeTestD[MaxGPUs], *sizeTestH[MaxGPUs];
+  int *gridRankTestD[MaxGPUs], *gridRankTestH[MaxGPUs];
+  int *thdRankTestD[MaxGPUs], *thdRankTestH[MaxGPUs];
+  int *isValidTestD[MaxGPUs], *isValidTestH[MaxGPUs];
+  int *syncTestD[MaxGPUs], *syncResultD;
+  for (int i = 0; i < nGpu; i++) {
+    HIPCHECK(hipSetDevice(i));
+
+    HIPCHECK(hipMalloc(&sizeTestD[i], nBytes));
+    HIPCHECK(hipMalloc(&gridRankTestD[i], nBytes));
+    HIPCHECK(hipMalloc(&thdRankTestD[i], nBytes));
+    HIPCHECK(hipMalloc(&isValidTestD[i], nBytes));
+    HIPCHECK(hipMalloc(&syncTestD[i], nBytes));
+
+    HIPCHECK(hipHostMalloc(&sizeTestH[i], nBytes));
+    HIPCHECK(hipHostMalloc(&gridRankTestH[i], nBytes));
+    HIPCHECK(hipHostMalloc(&thdRankTestH[i], nBytes));
+    HIPCHECK(hipHostMalloc(&isValidTestH[i], nBytes));
+
+    if (i == 0) {
+      HIPCHECK(hipHostMalloc(&syncResultD, sizeof(int) * (nGpu + 1), hipHostMallocCoherent));
+    }
+  }
+
+  // Launch Kernel
+  constexpr int NumKernelArgs = 6;
+  hipLaunchParams* launchParamsList = new hipLaunchParams[nGpu];
+  void* args[MaxGPUs * NumKernelArgs];
+  for (int i = 0; i < nGpu; i++) {
+    HIPCHECK(hipSetDevice(i));
+
+    args[i * NumKernelArgs    ] = &sizeTestD[i];
+    args[i * NumKernelArgs + 1] = &gridRankTestD[i];
+    args[i * NumKernelArgs + 2] = &thdRankTestD[i];
+    args[i * NumKernelArgs + 3] = &isValidTestD[i];
+    args[i * NumKernelArgs + 4] = &syncTestD[i];
+    args[i * NumKernelArgs + 5] = &syncResultD;
+
+    launchParamsList[i].func = reinterpret_cast<void*>(kernel_cg_multi_grid_group_type_via_base_type);
+    launchParamsList[i].gridDim = 2;
+    launchParamsList[i].blockDim = blockSize;
+    launchParamsList[i].sharedMem = 0;
+    launchParamsList[i].stream = stream[i];
+    launchParamsList[i].args = &args[i * NumKernelArgs];
+  }
+  HIPCHECK(hipLaunchCooperativeKernelMultiDevice(launchParamsList, nGpu, 0));
+
+  // Copy result from device to host
+  for (int i = 0; i < nGpu; i++) {
+    HIPCHECK(hipSetDevice(i));
+    HIPCHECK(hipMemcpy(sizeTestH[i], sizeTestD[i], nBytes, hipMemcpyDeviceToHost));
+    HIPCHECK(hipMemcpy(gridRankTestH[i], gridRankTestD[i], nBytes, hipMemcpyDeviceToHost));
+    HIPCHECK(hipMemcpy(thdRankTestH[i], thdRankTestD[i], nBytes, hipMemcpyDeviceToHost));
+    HIPCHECK(hipMemcpy(isValidTestH[i], isValidTestD[i], nBytes, hipMemcpyDeviceToHost));
+  }
+
+  // Validate results
+  int gridsSeen[MaxGPUs];
+  for (int i = 0; i < nGpu; ++i) {
+    for (int j = 0; j < 2 * blockSize; ++j) {
+      ASSERT_EQUAL(sizeTestH[i][j], nGpu * 2 * blockSize);
+      ASSERT_GE(gridRankTestH[i][j], 0);
+      ASSERT_LE(gridRankTestH[i][j], nGpu-1);
+      ASSERT_EQUAL(gridRankTestH[i][j], gridRankTestH[i][0]);
+      int gridRank = gridRankTestH[i][j];
+      ASSERT_EQUAL(thdRankTestH[i][j], (gridRank * 2 * blockSize) + j);
+      ASSERT_EQUAL(isValidTestH[i][j], 1);
+    }
+    ASSERT_EQUAL(syncResultD[i+1],  2 * blockSize);
+
+    // Validate uniqueness property of grid rank
+    gridsSeen[i] = gridRankTestH[i][0];
+    for (int k = 0; k < i; ++k) {
+      if (gridsSeen[k] == gridsSeen[i]) {
+        assert (false && "Grid rank in multi-gpu setup should be unique");
+      }
+    }
+  }
+  ASSERT_EQUAL(syncResultD[0], nGpu * 2 * blockSize);
+
+  // Free host and device memory
+  delete [] launchParamsList;
+  for (int i = 0; i < nGpu; i++) {
+    HIPCHECK(hipSetDevice(i));
+
+    HIPCHECK(hipFree(sizeTestD[i]));
+    HIPCHECK(hipFree(gridRankTestD[i]));
+    HIPCHECK(hipFree(thdRankTestD[i]));
+    HIPCHECK(hipFree(isValidTestD[i]));
+    HIPCHECK(hipFree(syncTestD[i]));
+
+    if (i == 0)
+      HIPCHECK(hipHostFree(syncResultD));
+
+    HIPCHECK(hipHostFree(sizeTestH[i]));
+    HIPCHECK(hipHostFree(gridRankTestH[i]));
+    HIPCHECK(hipHostFree(thdRankTestH[i]));
+    HIPCHECK(hipHostFree(isValidTestH[i]));
+  }
+}
+
+TEST_CASE("Unit_hipCGMultiGridGroupType_BaseType") {
+  // Set `maxThreadsPerBlock` by taking minimum among all available devices
+  int nGpu = 0;
+  HIPCHECK(hipGetDeviceCount(&nGpu));
+  nGpu = min(nGpu, MaxGPUs);
+
+  int maxThreadsPerBlock = INT_MAX;
+  hipDeviceProp_t deviceProperties;
+  for (int i = 0; i < nGpu; i++) {
+    HIPCHECK(hipGetDeviceProperties(&deviceProperties, i));
+    if (!deviceProperties.cooperativeMultiDeviceLaunch) {
+      HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
+      return;
+    }
+    maxThreadsPerBlock = min(maxThreadsPerBlock, deviceProperties.maxThreadsPerBlock);
+  }
+
+  // Test for blockSizes in powers of 2
+  for (int blockSize = 2; blockSize <= maxThreadsPerBlock; blockSize = blockSize*2) {
+    test_cg_multi_grid_group_type_via_base_type(blockSize, nGpu);
+  }
+
+  // Test for random blockSizes, but the sequence is the same every execution
+  srand(0);
+  for (int i = 0; i < 10; i++) {
+    // Test fails for 0 thread per block
+    test_cg_multi_grid_group_type_via_base_type(max(2, rand() % maxThreadsPerBlock), nGpu);
+  }
+}
@@ -0,0 +1,230 @@
+/*
+Copyright (c) 2020 - 2021 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+
+/* HIT_START
+ * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 -rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80
+ * TEST: %t
+ * HIT_END
+ */
+
+#include <hip_test_common.hh>
+#include <hip/hip_cooperative_groups.h>
+#include <cmath>
+#include <cstdlib>
+#include <climits>
+
+#define ASSERT_EQUAL(lhs, rhs) HIPASSERT(lhs == rhs)
+#define ASSERT_LE(lhs, rhs) HIPASSERT(lhs <= rhs)
+#define ASSERT_GE(lhs, rhs) HIPASSERT(lhs >= rhs)
+
+using namespace cooperative_groups;
+constexpr int MaxGPUs = 8;
+
+static __global__
+void kernel_cg_multi_grid_group_type_via_public_api(int *sizeTestD,
+                                                    int* gridRankTestD,
+                                                    int *thdRankTestD,
+                                                    int *isValidTestD,
+                                                    int *syncTestD,
+                                                    int *syncResultD)
+{
+  multi_grid_group mg = this_multi_grid();
+  int gIdx = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+  // Test group_size api
+  sizeTestD[gIdx] = group_size(mg);
+
+  // Test thread_rank api
+  gridRankTestD[gIdx] = this_multi_grid().grid_rank();
+  thdRankTestD[gIdx] = thread_rank(mg);
+
+  // Test is_valid api
+  isValidTestD[gIdx] = mg.is_valid();
+
+  // Test sync api
+  //
+  // Eech thread assign 1 to their respective location
+  syncTestD[gIdx] = 1;
+  // Grid level sync
+  sync(this_grid());
+  // Thread 0 from work-group 0 of current grid (gpu) does grid level reduction
+  if (blockIdx.x == 0 && threadIdx.x == 0) {
+    for (uint i = 1; i < gridDim.x * blockDim.x; ++i) {
+      syncTestD[0] += syncTestD[i];
+    }
+    syncResultD[this_multi_grid().grid_rank() + 1] = syncTestD[0];
+  }
+  // multi-grid level sync via public api
+  sync(mg);
+  // grid (gpu) 0 does final reduction across all grids (gpus)
+  if (this_multi_grid().grid_rank() == 0 && blockIdx.x == 0 && threadIdx.x == 0) {
+    syncResultD[0] = 0;
+    for (uint i = 1; i <= this_multi_grid().num_grids(); ++i) {
+      syncResultD[0] += syncResultD[i];
+    }
+  }
+}
+
+static void test_cg_multi_grid_group_type_via_public_api(int blockSize, int nGpu)
+{
+  // Create a stream each device
+  hipStream_t stream[MaxGPUs];
+  for (int i = 0; i < nGpu; i++) {
+    HIPCHECK(hipSetDevice(i));
+    HIPCHECK(hipDeviceSynchronize());  // Make sure work is done on this device
+    HIPCHECK(hipStreamCreate(&stream[i]));
+  }
+
+  // Allocate host and device memory
+  int nBytes = sizeof(int) * 2 * blockSize;
+  int *sizeTestD[MaxGPUs], *sizeTestH[MaxGPUs];
+  int *gridRankTestD[MaxGPUs], *gridRankTestH[MaxGPUs];
+  int *thdRankTestD[MaxGPUs], *thdRankTestH[MaxGPUs];
+  int *isValidTestD[MaxGPUs], *isValidTestH[MaxGPUs];
+  int *syncTestD[MaxGPUs], *syncResultD;
+  for (int i = 0; i < nGpu; i++) {
+    HIPCHECK(hipSetDevice(i));
+
+    HIPCHECK(hipMalloc(&sizeTestD[i], nBytes));
+    HIPCHECK(hipMalloc(&gridRankTestD[i], nBytes));
+    HIPCHECK(hipMalloc(&thdRankTestD[i], nBytes));
+    HIPCHECK(hipMalloc(&isValidTestD[i], nBytes));
+    HIPCHECK(hipMalloc(&syncTestD[i], nBytes));
+
+    HIPCHECK(hipHostMalloc(&sizeTestH[i], nBytes));
+    HIPCHECK(hipHostMalloc(&gridRankTestH[i], nBytes));
+    HIPCHECK(hipHostMalloc(&thdRankTestH[i], nBytes));
+    HIPCHECK(hipHostMalloc(&isValidTestH[i], nBytes));
+
+    if (i == 0) {
+      HIPCHECK(hipHostMalloc(&syncResultD, sizeof(int) * (nGpu + 1), hipHostMallocCoherent));
+    }
+  }
+
+  // Launch Kernel
+  constexpr int NumKernelArgs = 6;
+  hipLaunchParams* launchParamsList = new hipLaunchParams[nGpu];
+  void* args[MaxGPUs * NumKernelArgs];
+  for (int i = 0; i < nGpu; i++) {
+    HIPCHECK(hipSetDevice(i));
+
+    args[i * NumKernelArgs    ] = &sizeTestD[i];
+    args[i * NumKernelArgs + 1] = &gridRankTestD[i];
+    args[i * NumKernelArgs + 2] = &thdRankTestD[i];
+    args[i * NumKernelArgs + 3] = &isValidTestD[i];
+    args[i * NumKernelArgs + 4] = &syncTestD[i];
+    args[i * NumKernelArgs + 5] = &syncResultD;
+
+    launchParamsList[i].func = reinterpret_cast<void*>(kernel_cg_multi_grid_group_type_via_public_api);
+    launchParamsList[i].gridDim = 2;
+    launchParamsList[i].blockDim = blockSize;
+    launchParamsList[i].sharedMem = 0;
+    launchParamsList[i].stream = stream[i];
+    launchParamsList[i].args = &args[i * NumKernelArgs];
+  }
+  HIPCHECK(hipLaunchCooperativeKernelMultiDevice(launchParamsList, nGpu, 0));
+
+  // Copy result from device to host
+  for (int i = 0; i < nGpu; i++) {
+    HIPCHECK(hipSetDevice(i));
+
+    HIPCHECK(hipMemcpy(sizeTestH[i], sizeTestD[i], nBytes, hipMemcpyDeviceToHost));
+    HIPCHECK(hipMemcpy(gridRankTestH[i], gridRankTestD[i], nBytes, hipMemcpyDeviceToHost));
+    HIPCHECK(hipMemcpy(thdRankTestH[i], thdRankTestD[i], nBytes, hipMemcpyDeviceToHost));
+    HIPCHECK(hipMemcpy(isValidTestH[i], isValidTestD[i], nBytes, hipMemcpyDeviceToHost));
+  }
+
+  // Validate results
+  int gridsSeen[MaxGPUs];
+  for (int i = 0; i < nGpu; ++i) {
+    for (int j = 0; j < 2 * blockSize; ++j) {
+      ASSERT_EQUAL(sizeTestH[i][j], nGpu * 2 * blockSize);
+      ASSERT_GE(gridRankTestH[i][j], 0);
+      ASSERT_LE(gridRankTestH[i][j], nGpu-1);
+      ASSERT_EQUAL(gridRankTestH[i][j], gridRankTestH[i][0]);
+      int gridRank = gridRankTestH[i][j];
+      ASSERT_EQUAL(thdRankTestH[i][j], (gridRank * 2 * blockSize) + j);
+      ASSERT_EQUAL(isValidTestH[i][j], 1);
+    }
+    ASSERT_EQUAL(syncResultD[i+1],  2 * blockSize);
+
+    // Validate uniqueness property of grid rank
+    gridsSeen[i] = gridRankTestH[i][0];
+    for (int k = 0; k < i; ++k) {
+      if (gridsSeen[k] == gridsSeen[i]) {
+        assert (false && "Grid rank in multi-gpu setup should be unique");
+      }
+    }
+  }
+  ASSERT_EQUAL(syncResultD[0], nGpu * 2 * blockSize);
+
+  // Free host and device memory
+  delete [] launchParamsList;
+  for (int i = 0; i < nGpu; i++) {
+    HIPCHECK(hipSetDevice(i));
+
+    HIPCHECK(hipFree(sizeTestD[i]));
+    HIPCHECK(hipFree(gridRankTestD[i]));
+    HIPCHECK(hipFree(thdRankTestD[i]));
+    HIPCHECK(hipFree(isValidTestD[i]));
+    HIPCHECK(hipFree(syncTestD[i]));
+
+    if (i == 0)
+      HIPCHECK(hipHostFree(syncResultD));
+
+    HIPCHECK(hipHostFree(sizeTestH[i]));
+    HIPCHECK(hipHostFree(gridRankTestH[i]));
+    HIPCHECK(hipHostFree(thdRankTestH[i]));
+    HIPCHECK(hipHostFree(isValidTestH[i]));
+  }
+}
+
+TEST_CASE("Unit_hipCGMultiGridGroupType_PublicApi") {
+  // Set `maxThreadsPerBlock` by taking minimum among all available devices
+  int nGpu = 0;
+  HIPCHECK(hipGetDeviceCount(&nGpu));
+  nGpu = min(nGpu, MaxGPUs);
+
+  int maxThreadsPerBlock = INT_MAX;
+  hipDeviceProp_t deviceProperties;
+  for (int i = 0; i < nGpu; i++) {
+    HIPCHECK(hipGetDeviceProperties(&deviceProperties, i));
+    if (!deviceProperties.cooperativeMultiDeviceLaunch) {
+      HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
+      return;
+    }
+    maxThreadsPerBlock = min(maxThreadsPerBlock, deviceProperties.maxThreadsPerBlock);
+  }
+
+  // Test for blockSizes in powers of 2
+  for (int blockSize = 2; blockSize <= maxThreadsPerBlock; blockSize = blockSize*2) {
+    test_cg_multi_grid_group_type_via_public_api(blockSize, nGpu);
+  }
+
+  // Test for random blockSizes, but the sequence is the same every execution
+  srand(0);
+  for (int i = 0; i < 10; i++) {
+    // Test fails for 0 thread per block
+    test_cg_multi_grid_group_type_via_public_api(max(2, rand() % maxThreadsPerBlock), nGpu);
+  }
+}
@@ -0,0 +1,164 @@
+/*
+Copyright (c) 2020 - 2021 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+
+/* HIT_START
+ * BUILD: %t %s ../../test_common.cpp
+ * TEST: %t
+ * HIT_END
+ */
+
+#include <hip_test_common.hh>
+#include <hip/hip_cooperative_groups.h>
+#include <cstdlib>
+
+#define ASSERT_EQUAL(lhs, rhs) HIPASSERT(lhs == rhs)
+
+using namespace cooperative_groups;
+
+static __global__
+void kernel_cg_thread_block_type(int *sizeTestD,
+                                 int *thdRankTestD,
+                                 int *syncTestD,
+                                 dim3 *groupIndexTestD,
+                                 dim3 *thdIndexTestD)
+{
+  thread_block tb = this_thread_block();
+  int gIdx = (blockIdx.x * blockDim.x) + threadIdx.x;
+  // Test size
+  sizeTestD[gIdx] = tb.size();
+
+  // Test thread_rank
+  thdRankTestD[gIdx] = tb.thread_rank();
+
+  // Test sync
+  __shared__ int sm[2];
+  if (threadIdx.x == 0)
+    sm[0] = 10;
+  else if (threadIdx.x == 1)
+    sm[1] = 20;
+  tb.sync();
+  syncTestD[gIdx] = sm[1] * sm[0];
+
+  // Test group_index
+  groupIndexTestD[gIdx] = tb.group_index();
+
+  // Test thread_index
+  thdIndexTestD[gIdx] = tb.thread_index();
+}
+
+static void test_cg_thread_block_type(int blockSize)
+{
+  int nBytes = sizeof(int) * 2 * blockSize;
+  int nDim3Bytes = sizeof(dim3) * 2 * blockSize;
+  int *sizeTestD, *sizeTestH;
+  int *thdRankTestD, *thdRankTestH;
+  int *syncTestD, *syncTestH;
+  dim3 *groupIndexTestD, *groupIndexTestH;
+  dim3 *thdIndexTestD, *thdIndexTestH;
+
+  // Allocate device memory
+  HIPCHECK(hipMalloc(&sizeTestD, nBytes));
+  HIPCHECK(hipMalloc(&thdRankTestD, nBytes));
+  HIPCHECK(hipMalloc(&syncTestD, nBytes));
+  HIPCHECK(hipMalloc(&groupIndexTestD, nDim3Bytes));
+  HIPCHECK(hipMalloc(&thdIndexTestD, nDim3Bytes));
+
+  // Allocate host memory
+  HIPCHECK(hipHostMalloc(&sizeTestH, nBytes));
+  HIPCHECK(hipHostMalloc(&thdRankTestH, nBytes));
+  HIPCHECK(hipHostMalloc(&syncTestH, nBytes));
+  HIPCHECK(hipHostMalloc(&groupIndexTestH, nDim3Bytes));
+  HIPCHECK(hipHostMalloc(&thdIndexTestH, nDim3Bytes));
+
+  // Launch Kernel
+  hipLaunchKernelGGL(kernel_cg_thread_block_type,
+                     2,
+                     blockSize,
+                     0,
+                     0,
+                     sizeTestD,
+                     thdRankTestD,
+                     syncTestD,
+                     groupIndexTestD,
+                     thdIndexTestD);
+
+  // Copy result from device to host
+  HIPCHECK(hipMemcpy(sizeTestH, sizeTestD, nBytes, hipMemcpyDeviceToHost));
+  HIPCHECK(hipMemcpy(thdRankTestH, thdRankTestD, nBytes, hipMemcpyDeviceToHost));
+  HIPCHECK(hipMemcpy(syncTestH, syncTestD, nBytes, hipMemcpyDeviceToHost));
+  HIPCHECK(hipMemcpy(groupIndexTestH, groupIndexTestD, nDim3Bytes, hipMemcpyDeviceToHost));
+  HIPCHECK(hipMemcpy(thdIndexTestH, thdIndexTestD, nDim3Bytes, hipMemcpyDeviceToHost));
+
+  // Validate results for both blocks together
+  for (int i = 0; i < 2 * blockSize; ++i) {
+    ASSERT_EQUAL(sizeTestH[i], blockSize);
+    ASSERT_EQUAL(thdRankTestH[i], i % blockSize);
+    ASSERT_EQUAL(syncTestH[i], 200);
+    ASSERT_EQUAL(groupIndexTestH[i].x, (uint) i / blockSize);
+    ASSERT_EQUAL(groupIndexTestH[i].y, 0);
+    ASSERT_EQUAL(groupIndexTestH[i].z, 0);
+    ASSERT_EQUAL(thdIndexTestH[i].x, (uint) i % blockSize);
+    ASSERT_EQUAL(thdIndexTestH[i].y, 0);
+    ASSERT_EQUAL(thdIndexTestH[i].z, 0);
+  }
+
+  // Free device memory
+  HIPCHECK(hipFree(sizeTestD));
+  HIPCHECK(hipFree(thdRankTestD));
+  HIPCHECK(hipFree(syncTestD));
+  HIPCHECK(hipFree(groupIndexTestD));
+  HIPCHECK(hipFree(thdIndexTestD));
+
+  //Free host memory
+  HIPCHECK(hipHostFree(sizeTestH));
+  HIPCHECK(hipHostFree(thdRankTestH));
+  HIPCHECK(hipHostFree(syncTestH));
+  HIPCHECK(hipHostFree(groupIndexTestH));
+  HIPCHECK(hipHostFree(thdIndexTestH));
+}
+
+TEST_CASE("Unit_hipCGThreadBlockType") {
+  // Use default device for validating the test
+  int deviceId;
+  hipDeviceProp_t deviceProperties;
+  HIPCHECK(hipGetDevice(&deviceId));
+  HIPCHECK(hipGetDeviceProperties(&deviceProperties, deviceId));
+
+  if (!deviceProperties.cooperativeLaunch) {
+    HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
+    return;
+  }
+
+  // Test for blockSizes in powers of 2
+  int maxThreadsPerBlock = deviceProperties.maxThreadsPerBlock;
+  for (int blockSize = 2; blockSize <= maxThreadsPerBlock; blockSize = blockSize*2) {
+    test_cg_thread_block_type(blockSize);
+  }
+
+  // Test for random blockSizes, but the sequence is the same every execution
+  srand(0);
+  for (int i = 0; i < 10; i++) {
+    // Test fails for only 1 thread per block
+    test_cg_thread_block_type(max(2, rand() % maxThreadsPerBlock));
+  }
+}
@@ -0,0 +1,136 @@
+/*
+Copyright (c) 2020 - 2021 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+
+/* HIT_START
+ * BUILD: %t %s ../../test_common.cpp
+ * TEST: %t
+ * HIT_END
+ */
+
+#include <hip_test_common.hh>
+#include "hip/hip_cooperative_groups.h"
+#include <cstdlib>
+
+#define ASSERT_EQUAL(lhs, rhs) assert(lhs == rhs)
+
+using namespace cooperative_groups;
+
+static __global__
+void kernel_cg_thread_block_type_via_base_type(int *sizeTestD,
+                                               int *thdRankTestD,
+                                               int *syncTestD)
+{
+  thread_group tg = this_thread_block();
+  int gIdx = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+  // Test size
+  sizeTestD[gIdx] = tg.size();
+
+  // Test thread_rank
+  thdRankTestD[gIdx] = tg.thread_rank();
+
+  // Test sync
+  __shared__ int sm[2];
+  if (threadIdx.x == 0)
+    sm[0] = 10;
+  else if (threadIdx.x == 1)
+    sm[1] = 20;
+  tg.sync();
+  syncTestD[gIdx] = sm[1] * sm[0];
+}
+
+static void test_cg_thread_block_type_via_base_type(int blockSize)
+{
+  int nBytes = sizeof(int) * 2 * blockSize;
+  int *sizeTestD, *sizeTestH;
+  int *thdRankTestD, *thdRankTestH;
+  int *syncTestD, *syncTestH;
+
+  // Allocate device memory
+  HIPCHECK(hipMalloc(&sizeTestD, nBytes));
+  HIPCHECK(hipMalloc(&thdRankTestD, nBytes));
+  HIPCHECK(hipMalloc(&syncTestD, nBytes));
+
+  // Allocate host memory
+  HIPCHECK(hipHostMalloc(&sizeTestH, nBytes));
+  HIPCHECK(hipHostMalloc(&thdRankTestH, nBytes));
+  HIPCHECK(hipHostMalloc(&syncTestH, nBytes));
+
+  // Launch Kernel
+  hipLaunchKernelGGL(kernel_cg_thread_block_type_via_base_type,
+                     2,
+                     blockSize,
+                     0,
+                     0,
+                     sizeTestD,
+                     thdRankTestD,
+                     syncTestD);
+
+  // Copy result from device to host
+  HIPCHECK(hipMemcpy(sizeTestH, sizeTestD, nBytes, hipMemcpyDeviceToHost));
+  HIPCHECK(hipMemcpy(thdRankTestH, thdRankTestD, nBytes, hipMemcpyDeviceToHost));
+  HIPCHECK(hipMemcpy(syncTestH, syncTestD, nBytes, hipMemcpyDeviceToHost));
+
+  // Validate results for both blocks together
+  for (int i = 0; i < 2 * blockSize; ++i) {
+    ASSERT_EQUAL(sizeTestH[i], blockSize);
+    ASSERT_EQUAL(thdRankTestH[i], i % blockSize);
+    ASSERT_EQUAL(syncTestH[i], 200);
+  }
+
+  // Free device memory
+  HIPCHECK(hipFree(sizeTestD));
+  HIPCHECK(hipFree(thdRankTestD));
+  HIPCHECK(hipFree(syncTestD));
+
+  //Free host memory
+  HIPCHECK(hipHostFree(sizeTestH));
+  HIPCHECK(hipHostFree(thdRankTestH));
+  HIPCHECK(hipHostFree(syncTestH));
+}
+
+TEST_CASE("Unit_hipCGThreadBlockType_BaseType") {
+  // Use default device for validating the test
+  int deviceId;
+  hipDeviceProp_t deviceProperties;
+  HIPCHECK(hipGetDevice(&deviceId));
+  HIPCHECK(hipGetDeviceProperties(&deviceProperties, deviceId));
+
+  if (!deviceProperties.cooperativeLaunch) {
+    HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
+    return;
+  }
+
+  // Test for blockSizes in powers of 2
+  int maxThreadsPerBlock = deviceProperties.maxThreadsPerBlock;
+  for (int blockSize = 2; blockSize <= maxThreadsPerBlock; blockSize = blockSize*2) {
+    test_cg_thread_block_type_via_base_type(blockSize);
+  }
+
+  // Test for random blockSizes, but the sequence is the same every execution
+  srand(0);
+  for (int i = 0; i < 10; i++) {
+    // Test fails for only 1 thread per block
+    test_cg_thread_block_type_via_base_type(max(2, rand() % maxThreadsPerBlock));
+  }
+}
@@ -0,0 +1,136 @@
+/*
+Copyright (c) 2020 - 2021 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+
+/* HIT_START
+ * BUILD: %t %s ../../test_common.cpp
+ * TEST: %t
+ * HIT_END
+ */
+
+#include <hip_test_common.hh>
+#include "hip/hip_cooperative_groups.h"
+#include <cstdlib>
+
+#define ASSERT_EQUAL(lhs, rhs) assert(lhs == rhs)
+
+using namespace cooperative_groups;
+
+static __global__
+void kernel_cg_thread_block_type_via_public_api(int *sizeTestD,
+                                                int *thdRankTestD,
+                                                int *syncTestD)
+{
+  thread_block tb = this_thread_block();
+  int gIdx = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+  // Test group_size api
+  sizeTestD[gIdx] = group_size(tb);
+
+  // Test thread_rank api
+  thdRankTestD[gIdx] = thread_rank(tb);
+
+  // Test sync api
+  __shared__ int sm[2];
+  if (threadIdx.x == 0)
+    sm[0] = 10;
+  else if (threadIdx.x == 1)
+    sm[1] = 20;
+  sync(tb);
+  syncTestD[gIdx] = sm[1] * sm[0];
+}
+
+static void test_cg_thread_block_type_via_public_api(int blockSize)
+{
+  int nBytes = sizeof(int) * 2 * blockSize;
+  int *sizeTestD, *sizeTestH;
+  int *thdRankTestD, *thdRankTestH;
+  int *syncTestD, *syncTestH;
+
+  // Allocate device memory
+  HIPCHECK(hipMalloc(&sizeTestD, nBytes));
+  HIPCHECK(hipMalloc(&thdRankTestD, nBytes));
+  HIPCHECK(hipMalloc(&syncTestD, nBytes));
+
+  // Allocate host memory
+  HIPCHECK(hipHostMalloc(&sizeTestH, nBytes));
+  HIPCHECK(hipHostMalloc(&thdRankTestH, nBytes));
+  HIPCHECK(hipHostMalloc(&syncTestH, nBytes));
+
+  // Launch Kernel
+  hipLaunchKernelGGL(kernel_cg_thread_block_type_via_public_api,
+                     2,
+                     blockSize,
+                     0,
+                     0,
+                     sizeTestD,
+                     thdRankTestD,
+                     syncTestD);
+
+  // Copy result from device to host
+  HIPCHECK(hipMemcpy(sizeTestH, sizeTestD, nBytes, hipMemcpyDeviceToHost));
+  HIPCHECK(hipMemcpy(thdRankTestH, thdRankTestD, nBytes, hipMemcpyDeviceToHost));
+  HIPCHECK(hipMemcpy(syncTestH, syncTestD, nBytes, hipMemcpyDeviceToHost));
+
+  // Validate results for both blocks together
+  for (int i = 0; i < 2 * blockSize; ++i) {
+    ASSERT_EQUAL(sizeTestH[i], blockSize);
+    ASSERT_EQUAL(thdRankTestH[i], i % blockSize);
+    ASSERT_EQUAL(syncTestH[i], 200);
+  }
+
+  // Free device memory
+  HIPCHECK(hipFree(sizeTestD));
+  HIPCHECK(hipFree(thdRankTestD));
+  HIPCHECK(hipFree(syncTestD));
+
+  //Free host memory
+  HIPCHECK(hipHostFree(sizeTestH));
+  HIPCHECK(hipHostFree(thdRankTestH));
+  HIPCHECK(hipHostFree(syncTestH));
+}
+
+TEST_CASE("Unit_hipCGThreadBlockType_PublicApi") {
+  // Use default device for validating the test
+  int deviceId;
+  hipDeviceProp_t deviceProperties;
+  HIPCHECK(hipGetDevice(&deviceId));
+  HIPCHECK(hipGetDeviceProperties(&deviceProperties, deviceId));
+
+  if (!deviceProperties.cooperativeLaunch) {
+    HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
+    return;
+  }
+
+  // Test for blockSizes in powers of 2
+  int maxThreadsPerBlock = deviceProperties.maxThreadsPerBlock;
+  for (int blockSize = 2; blockSize <= maxThreadsPerBlock; blockSize = blockSize*2) {
+    test_cg_thread_block_type_via_public_api(blockSize);
+  }
+
+  // Test for random blockSizes, but the sequence is the same every execution
+  srand(0);
+  for (int i = 0; i < 10; i++) {
+    // Test fails for only 1 thread per block
+    test_cg_thread_block_type_via_public_api(max(2, rand() % maxThreadsPerBlock));
+  }
+}