SWDEV-1 - Merge github PRs to amd-staging

- https://github.com/ROCm/hip-tests/pull/194 - https://github.com/ROCm/hip-tests/pull/36 - https://github.com/ROCm/hip-tests/pull/44 - https://github.com/ROCm/hip-tests/pull/47 - https://github.com/ROCm/hip-tests/pull/62 - https://github.com/ROCm/hip-tests/pull/63 - https://github.com/ROCm/hip-tests/pull/64 - https://github.com/ROCm/hip-tests/pull/65 - https://github.com/ROCm/hip-tests/pull/66 - https://github.com/ROCm/hip-tests/pull/67 - https://github.com/ROCm/hip-tests/pull/68 - https://github.com/ROCm/hip-tests/pull/69 - https://github.com/ROCm/hip-tests/pull/142 - https://github.com/ROCm/hip-tests/pull/196 - https://github.com/ROCm/hip-tests/pull/238 Change-Id: I74f7fef76d7d536b1cf89dad3e527c92d1cd21b5 [ROCm/hip-tests commit: 6429ef1b60]
2023-12-20 10:24:27 +00:00
@@ -54,6 +54,8 @@
        "Unit_hipFuncSetAttribute_Positive_MaxDynamicSharedMemorySize_Not_Supported",
        "Unit_hipFuncSetAttribute_Positive_PreferredSharedMemoryCarveout_Not_Supported",
        "Unit_hipOccupancyMaxActiveBlocksPerMultiprocessor_Negative_Parameters",
+        "Unit_hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_Negative_Parameters",
+        "Unit_hipModuleOccupancyMaxPotentialBlockSizeWithFlags_Negative_Parameters",
        "Unit_hipGraphMemcpyNodeSetParamsToSymbol_Positive_Basic",
        "Unit_hipGraphExecMemcpyNodeSetParamsToSymbol_Positive_Basic",
        "Unit_hipGraphExecMemcpyNodeSetParamsFromSymbol_Positive_Basic",
@@ -183,6 +185,62 @@
        "Unit_hipMemUnmap_negative",
        "=== SWDEV-432556,SWDEV-434211:Below test randomly failing in stress test ===",
        "Unit_hipDeviceGetUuid_From_RocmInfo",
+        "=== SWDEV-434171: Below tests took long time to complete in stress test on 17/11/23 ===",
+        "Unit_Warp_Shfl_Positive_Basic - int",
+        "Unit_Warp_Shfl_Positive_Basic - unsigned int",
+        "Unit_Warp_Shfl_Positive_Basic - long",
+        "Unit_Warp_Shfl_Positive_Basic - unsigned long",
+        "Unit_Warp_Shfl_Positive_Basic - long long",
+        "Unit_Warp_Shfl_Positive_Basic - unsigned long long",
+        "Unit_Warp_Shfl_Positive_Basic - float",
+        "Unit_Warp_Shfl_Positive_Basic - double",
+        "Unit_Warp_Shfl_XOR_Positive_Basic - int",
+        "Unit_Warp_Shfl_XOR_Positive_Basic - unsigned int",
+        "Unit_Warp_Shfl_XOR_Positive_Basic - long",
+        "Unit_Warp_Shfl_XOR_Positive_Basic - unsigned long",
+        "Unit_Warp_Shfl_XOR_Positive_Basic - long long",
+        "Unit_Warp_Shfl_XOR_Positive_Basic - unsigned long long",
+        "Unit_Warp_Shfl_XOR_Positive_Basic - float",
+        "Unit_Warp_Shfl_XOR_Positive_Basic - double",
+        "=== SWDEV-434878: Below tests failed in stress test on 24/11/23 ===",
+        "Unit_hipGraphUpload_Negative_Parameters",
+        "Unit_hipModuleOccupancyMaxPotentialBlockSize_Negative_Parameters",
+        "Unit_hipModuleOccupancyMaxPotentialBlockSize_Positive_RangeValidation",
+        "Unit_hipModuleOccupancyMaxPotentialBlockSizeWithFlags_Positive_RangeValidation",
+        "Unit_hipModuleOccupancyMaxActiveBlocksPerMultiprocessor_Negative_Parameters",
+        "Unit_hipModuleOccupancyMaxActiveBlocksPerMultiprocessor_Positive_RangeValidation",
+        "Unit_hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_Positive_RangeValidation",
+        "=== SWDEV-435667: Below tests failing randomly in stress test on 01/12/23 ===",
+        "Unit_atomicExch_Positive_Same_Address_Compile_Time - int",
+        "Unit_atomicExch_Positive_Same_Address_Compile_Time - unsigned int",
+        "Unit_atomicExch_Positive_Same_Address_Compile_Time - unsigned long",
+        "Unit_atomicExch_Positive_Same_Address_Compile_Time - unsigned long long",
+        "Unit_atomicExch_Positive_Same_Address_Compile_Time - float",
+        "Unit_atomicExch_Positive_Same_Address_Compile_Time - double",
+        "Unit_atomicExch_Positive_Multi_Kernel - int",
+        "Unit_atomicExch_Positive_Multi_Kernel - unsigned int",
+        "Unit_atomicExch_Positive_Multi_Kernel - unsigned long",
+        "Unit_atomicExch_Positive_Multi_Kernel - unsigned long long",
+        "Unit_atomicExch_Positive_Multi_Kernel - float",
+        "Unit_atomicExch_Positive_Multi_Kernel - double",
+        "Unit_atomicExch_system_Positive_Peer_GPUs - int",
+        "Unit_atomicExch_system_Positive_Peer_GPUs - unsigned int",
+        "Unit_atomicExch_system_Positive_Peer_GPUs - unsigned long",
+        "Unit_atomicExch_system_Positive_Peer_GPUs - unsigned long long",
+        "Unit_atomicExch_system_Positive_Peer_GPUs - float",
+        "Unit_atomicExch_system_Positive_Peer_GPUs - double",
+        "Unit_atomicExch_system_Positive_Host_And_GPU - int",
+        "Unit_atomicExch_system_Positive_Host_And_GPU - unsigned int",
+        "Unit_atomicExch_system_Positive_Host_And_GPU - unsigned long",
+        "Unit_atomicExch_system_Positive_Host_And_GPU - unsigned long long",
+        "Unit_atomicExch_system_Positive_Host_And_GPU - float",
+        "Unit_atomicExch_system_Positive_Host_And_GPU - double",
+        "Unit_atomicExch_system_Positive_Host_And_Peer_GPUs - int",
+        "Unit_atomicExch_system_Positive_Host_And_Peer_GPUs - unsigned int",
+        "Unit_atomicExch_system_Positive_Host_And_Peer_GPUs - unsigned long",
+        "Unit_atomicExch_system_Positive_Host_And_Peer_GPUs - unsigned long long",
+        "Unit_atomicExch_system_Positive_Host_And_Peer_GPUs - float",
+        "Unit_atomicExch_system_Positive_Host_And_Peer_GPUs - double",
    #endif
    #if defined VEGA20
        "=== SWDEV-419112 Below tests fail in stress test on 29/08/23 ===",
@@ -119,6 +119,8 @@
        "Unit_hipFuncSetAttribute_Positive_MaxDynamicSharedMemorySize_Not_Supported",
        "Unit_hipFuncSetAttribute_Positive_PreferredSharedMemoryCarveout_Not_Supported",
        "Unit_hipOccupancyMaxActiveBlocksPerMultiprocessor_Negative_Parameters",
+        "Unit_hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_Negative_Parameters",
+        "Unit_hipModuleOccupancyMaxPotentialBlockSizeWithFlags_Negative_Parameters",
        "Unit_hipGraphMemcpyNodeSetParamsToSymbol_Positive_Basic",
        "Unit_hipGraphExecMemcpyNodeSetParamsToSymbol_Positive_Basic",
        "Unit_hipGraphMemcpyNodeSetParamsFromSymbol_Positive_Basic",
@@ -282,6 +284,62 @@
        "Unit_hipMemSetAccess_MultiProc",
        "Unit_hipMemSetAccess_negative",
        "Unit_hipMemUnmap_negative",
+        "=== SWDEV-434171: Below tests took long time to complete in stress test on 17/11/23 ===",
+        "Unit_Warp_Shfl_Positive_Basic - int",
+        "Unit_Warp_Shfl_Positive_Basic - unsigned int",
+        "Unit_Warp_Shfl_Positive_Basic - long",
+        "Unit_Warp_Shfl_Positive_Basic - unsigned long",
+        "Unit_Warp_Shfl_Positive_Basic - long long",
+        "Unit_Warp_Shfl_Positive_Basic - unsigned long long",
+        "Unit_Warp_Shfl_Positive_Basic - float",
+        "Unit_Warp_Shfl_Positive_Basic - double",
+        "Unit_Warp_Shfl_XOR_Positive_Basic - int",
+        "Unit_Warp_Shfl_XOR_Positive_Basic - unsigned int",
+        "Unit_Warp_Shfl_XOR_Positive_Basic - long",
+        "Unit_Warp_Shfl_XOR_Positive_Basic - unsigned long",
+        "Unit_Warp_Shfl_XOR_Positive_Basic - long long",
+        "Unit_Warp_Shfl_XOR_Positive_Basic - unsigned long long",
+        "Unit_Warp_Shfl_XOR_Positive_Basic - float",
+        "Unit_Warp_Shfl_XOR_Positive_Basic - double",
+        "=== SWDEV-434878: Below tests failed in stress test on 24/11/23 ===",
+        "Unit_hipGraphUpload_Negative_Parameters",
+        "Unit_hipModuleOccupancyMaxPotentialBlockSize_Negative_Parameters",
+        "Unit_hipModuleOccupancyMaxPotentialBlockSize_Positive_RangeValidation",
+        "Unit_hipModuleOccupancyMaxPotentialBlockSizeWithFlags_Positive_RangeValidation",
+        "Unit_hipModuleOccupancyMaxActiveBlocksPerMultiprocessor_Negative_Parameters",
+        "Unit_hipModuleOccupancyMaxActiveBlocksPerMultiprocessor_Positive_RangeValidation",
+        "Unit_hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_Positive_RangeValidation",
+        "=== SWDEV-435667: Below tests failing randomly in stress test on 01/12/23 ===",
+        "Unit_atomicExch_Positive_Same_Address_Compile_Time - int",
+        "Unit_atomicExch_Positive_Same_Address_Compile_Time - unsigned int",
+        "Unit_atomicExch_Positive_Same_Address_Compile_Time - unsigned long",
+        "Unit_atomicExch_Positive_Same_Address_Compile_Time - unsigned long long",
+        "Unit_atomicExch_Positive_Same_Address_Compile_Time - float",
+        "Unit_atomicExch_Positive_Same_Address_Compile_Time - double",
+        "Unit_atomicExch_Positive_Multi_Kernel - int",
+        "Unit_atomicExch_Positive_Multi_Kernel - unsigned int",
+        "Unit_atomicExch_Positive_Multi_Kernel - unsigned long",
+        "Unit_atomicExch_Positive_Multi_Kernel - unsigned long long",
+        "Unit_atomicExch_Positive_Multi_Kernel - float",
+        "Unit_atomicExch_Positive_Multi_Kernel - double",
+        "Unit_atomicExch_system_Positive_Peer_GPUs - int",
+        "Unit_atomicExch_system_Positive_Peer_GPUs - unsigned int",
+        "Unit_atomicExch_system_Positive_Peer_GPUs - unsigned long",
+        "Unit_atomicExch_system_Positive_Peer_GPUs - unsigned long long",
+        "Unit_atomicExch_system_Positive_Peer_GPUs - float",
+        "Unit_atomicExch_system_Positive_Peer_GPUs - double",
+        "Unit_atomicExch_system_Positive_Host_And_GPU - int",
+        "Unit_atomicExch_system_Positive_Host_And_GPU - unsigned int",
+        "Unit_atomicExch_system_Positive_Host_And_GPU - unsigned long",
+        "Unit_atomicExch_system_Positive_Host_And_GPU - unsigned long long",
+        "Unit_atomicExch_system_Positive_Host_And_GPU - float",
+        "Unit_atomicExch_system_Positive_Host_And_GPU - double",
+        "Unit_atomicExch_system_Positive_Host_And_Peer_GPUs - int",
+        "Unit_atomicExch_system_Positive_Host_And_Peer_GPUs - unsigned int",
+        "Unit_atomicExch_system_Positive_Host_And_Peer_GPUs - unsigned long",
+        "Unit_atomicExch_system_Positive_Host_And_Peer_GPUs - unsigned long long",
+        "Unit_atomicExch_system_Positive_Host_And_Peer_GPUs - float",
+        "Unit_atomicExch_system_Positive_Host_And_Peer_GPUs - double",
    #endif
        "End of json"
    ]
@@ -44,6 +44,14 @@
        "Grid_Group_Getters_Via_Non_Member_Functions_Positive_Basic",
        "Grid_Group_Sync_Positive_Basic",
        "dynamic_loading_device_kernels_from_library",
-        "Unit_tiled_partition"
+        "Unit_tiled_partition",
+        "Unit_atomicExch_Positive_Same_Address_Compile_Time - int",
+        "Unit_atomicExch_Positive_Same_Address_Compile_Time - unsigned int",
+        "Unit_atomicExch_Positive_Same_Address_Compile_Time - unsigned long long",
+        "Unit_atomicExch_Positive_Same_Address_Compile_Time - float",
+        "Unit_atomicExch_system_Positive_Host_And_GPU - int",
+        "Unit_atomicExch_system_Positive_Host_And_GPU - unsigned int",
+        "Unit_atomicExch_system_Positive_Host_And_GPU - unsigned long long",
+        "Unit_atomicExch_system_Positive_Host_And_GPU - float"
    ]
 }
@@ -30,9 +30,9 @@ int main(int argc, char** argv) {
    | Opt(cmd_options.progress)
        ["-P"]["--progress"]
        ("Show progress bar when running performance tests")
-    | Opt(cmd_options.extended_run)
-        ["-E"]["--extended-run"]
-        ("TODO: Description goes here")
+    // The second Opt argument is the value hint; keep it spelled like the option itself.
+    | Opt(cmd_options.cg_extended_run, "cg_extended_run")
+        ["-E"]["--cg-extended-run"]
+        ("Number of iterations used for cooperative groups sync tests (default: 5)")
  ;
  // clang-format on

@@ -23,11 +23,11 @@ THE SOFTWARE.
 #pragma once

 struct CmdOptions {
-  int iterations = 1000;
+  int iterations = 10;  // NOTE(review): presumably the default number of measured iterations - confirm
  int warmups = 100;
+  int cg_extended_run = 5;  // Iterations for cooperative groups sync tests (-E / --cg-extended-run)
  bool no_display = false;
  bool progress = false;
-  bool extended_run = false;
 };

 extern CmdOptions cmd_options;
@@ -78,6 +78,7 @@ struct CPUGrid {
  unsigned int thread_count_;
 };

+/* Generate dimensions for 1D, 2D and 3D blocks of threads */
 inline dim3 GenerateThreadDimensions() {
  hipDeviceProp_t props;
  HIP_CHECK(hipGetDeviceProperties(&props, 0));
@@ -99,6 +100,7 @@ inline dim3 GenerateThreadDimensions() {
      dim3(props.warpSize + 1, 3, 3));
 }

+/* Generate dimensions for 1D, 2D and 3D grids of blocks */
 inline dim3 GenerateBlockDimensions() {
  hipDeviceProp_t props;
  HIP_CHECK(hipGetDeviceProperties(&props, 0));
@@ -116,6 +118,7 @@ inline dim3 GenerateBlockDimensions() {
                       dim3(5, 5, 5));
 }

+/* Generate dimensions for 1D, 2D and 3D blocks of threads - reduced set */
 inline dim3 GenerateThreadDimensionsForShuffle() {
  hipDeviceProp_t props;
  HIP_CHECK(hipGetDeviceProperties(&props, 0));
@@ -136,6 +139,7 @@ inline dim3 GenerateThreadDimensionsForShuffle() {
      dim3(props.warpSize + 1, 3, 3));
 }

+/* Generate dimensions for 1D, 2D and 3D grids of blocks - reduced set */
 inline dim3 GenerateBlockDimensionsForShuffle() {
  hipDeviceProp_t props;
  HIP_CHECK(hipGetDeviceProperties(&props, 0));
@@ -102,6 +102,19 @@ THE SOFTWARE.
    }                                                                                              \
  }

+// Check that an expression, errorExpr, evaluates to the expected hiprtc result, expectedError.
+// errorExpr is evaluated exactly once. expectedError is expanded several times, so it should be
+// a side-effect-free expression (normally an enumerator). Both parameters are parenthesized
+// where operator precedence could otherwise change the meaning of the caller's expression.
+#define HIPRTC_CHECK_ERROR(errorExpr, expectedError)                                               \
+  {                                                                                                \
+    auto localError = (errorExpr);                                                                 \
+    INFO("Matching Errors: "                                                                       \
+         << "\n    Expected Error: " << hiprtcGetErrorString(expectedError)                        \
+         << "\n    Expected Code: " << (expectedError) << '\n'                                     \
+         << "                  Actual Error:   " << hiprtcGetErrorString(localError)               \
+         << "\n    Actual Code:   " << localError << "\nStr: " << #errorExpr                       \
+         << "\n    In File: " << __FILE__ << "\n    At line: " << __LINE__);                       \
+    REQUIRE(localError == (expectedError));                                                        \
+  }
+
 // Although its assert, it will be evaluated at runtime
 #define HIP_ASSERT(x)                                                                              \
  { REQUIRE((x)); }
@@ -20,6 +20,8 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 */

+#include <hip_test_common.hh>
+
 // Test groups are named based on the group names from hip_api_runtime.h, with adding "Test" suffix

 /**
@@ -95,8 +97,46 @@ THE SOFTWARE.

 /**
 * @defgroup KernelTest Kernel Functions Management
+ * @{
+ * This section describes the invocation of the various kernel functions.
+ * @}
+ */
+
+/**
+ * @defgroup AtomicsTest Device Atomics
 * @{
- * This section describes the various kernel functions invocation.
+ * This section describes tests for the Device Atomic APIs.
+ * @}
+ */
+
+/**
+ * @addtogroup atomicExch atomicExch
+ * @{
+ * @ingroup AtomicsTest
+ */
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Compiles atomicExch with invalid parameters.
+ *  - Compiles the source with specialized Python tool.
+ *    -# Utilizes sub-process to invoke compilation of faulty source.
+ *    -# Performs post-processing of compiler output and counts errors.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/CMakeLists.txt
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Unit_atomicExch_Negative_Parameters") {}
+/**
+ * End doxygen group atomicExch.
+ * @}
+ */
+
+/**
+ * End doxygen group AtomicsTest.
 * @}
 */

@@ -115,7 +155,14 @@ THE SOFTWARE.
 * @}
 */

- /**
+/**
+ * @defgroup PerformanceTest Performance tests
+ * @{
+ * This section describes performance tests for the target API groups and use-cases.
+ * @}
+ */
+
+/**
 * @defgroup ShflTest warp shuffle function Management
 * @{
 * This section describes the warp shuffle types & functions of HIP runtime API.
@@ -34,6 +34,7 @@ THE SOFTWARE.
 #include <resource_guards.hh>

 #pragma clang diagnostic ignored "-Wunused-but-set-variable"
+#pragma clang diagnostic ignored "-Wunused-parameter"
 #pragma clang diagnostic ignored "-Wunused-function"

 #if defined(_WIN32)
@@ -29,10 +29,30 @@ enum class LinearAllocs {
  hipHostMalloc,
  hipMalloc,
  hipMallocManaged,
+  noAlloc
 };

+/**
+ * Returns a short human-readable label for a linear allocation kind.
+ * Kinds without a dedicated label yield "unknown alloc type".
+ */
+inline std::string to_string(const LinearAllocs allocation_type) {
+  const char* label = "unknown alloc type";
+  switch (allocation_type) {
+    case LinearAllocs::malloc:
+      label = "host pageable";
+      break;
+    case LinearAllocs::mallocAndRegister:
+      label = "registered";
+      break;
+    case LinearAllocs::hipHostMalloc:
+      label = "host pinned";
+      break;
+    case LinearAllocs::hipMalloc:
+      label = "device malloc";
+      break;
+    case LinearAllocs::hipMallocManaged:
+      label = "managed";
+      break;
+    default:
+      break;
+  }
+  return label;
+}
+
 template <typename T> class LinearAllocGuard {
 public:
+  LinearAllocGuard() = default;
+
  LinearAllocGuard(const LinearAllocs allocation_type, const size_t size,
                   const unsigned int flags = 0u)
      : allocation_type_{allocation_type} {
@@ -55,15 +75,36 @@ template <typename T> class LinearAllocGuard {
      case LinearAllocs::hipMallocManaged:
        HIP_CHECK(hipMallocManaged(reinterpret_cast<void**>(&ptr_), size, flags ? flags : 1u));
        host_ptr_ = ptr_;
+        break;
+      case LinearAllocs::noAlloc:
+        break;
    }
  }

  LinearAllocGuard(const LinearAllocGuard&) = delete;
-  LinearAllocGuard(LinearAllocGuard&&) = delete;
+
+  // Move constructor: adopts the allocation owned by `o`. `o` is reset to the noAlloc state
+  // with null pointers, which the destructor handles as a no-op case. Only plain member
+  // copies are performed, so the operation is noexcept.
+  LinearAllocGuard(LinearAllocGuard&& o) noexcept
+      : allocation_type_{o.allocation_type_}, ptr_{o.ptr_}, host_ptr_{o.host_ptr_} {
+    o.allocation_type_ = LinearAllocs::noAlloc;
+    o.ptr_ = nullptr;
+    o.host_ptr_ = nullptr;
+  }
+
+  // Move assignment: releases whatever this guard currently owns, then adopts the allocation
+  // owned by `o` and leaves `o` in the empty noAlloc state. Self-assignment is a no-op.
+  // Returns a reference to this guard.
+  LinearAllocGuard& operator=(LinearAllocGuard&& o) noexcept {
+    if (this != &o) {
+      // Hand the current allocation to a temporary so that the destructor frees it when this
+      // scope ends. A static_cast to an rvalue reference is used in place of std::move so that
+      // no additional header is required.
+      LinearAllocGuard previous{static_cast<LinearAllocGuard&&>(*this)};
+
+      allocation_type_ = o.allocation_type_;
+      ptr_ = o.ptr_;
+      host_ptr_ = o.host_ptr_;
+
+      o.allocation_type_ = LinearAllocs::noAlloc;
+      o.ptr_ = nullptr;
+      o.host_ptr_ = nullptr;
+    }
+
+    return *this;
+  }

  ~LinearAllocGuard() {
    // No Catch macros, don't want to possibly throw in the destructor
    switch (allocation_type_) {
+      case LinearAllocs::noAlloc:
+        break;
      case LinearAllocs::malloc:
        free(ptr_);
        break;
@@ -85,7 +126,7 @@ template <typename T> class LinearAllocGuard {
  T* host_ptr() const { return host_ptr_; }

 private:
-  const LinearAllocs allocation_type_;
+  LinearAllocs allocation_type_ = LinearAllocs::noAlloc;
  T* ptr_ = nullptr;
  T* host_ptr_ = nullptr;
 };
@@ -200,7 +241,10 @@ enum class Streams { nullstream, perThread, created, withFlags, withPriority };

 class StreamGuard {
 public:
-  StreamGuard(const Streams stream_type, unsigned int flags = hipStreamDefault, int priority = 0) : stream_type_{stream_type}, flags_{flags}, priority_{priority} {
+  StreamGuard() = default;
+
+  StreamGuard(const Streams stream_type, unsigned int flags = hipStreamDefault, int priority = 0)
+      : stream_type_{stream_type}, flags_{flags}, priority_{priority} {
    switch (stream_type_) {
      case Streams::nullstream:
        stream_ = nullptr;
@@ -219,7 +263,28 @@ class StreamGuard {
  }

  StreamGuard(const StreamGuard&) = delete;
-  StreamGuard(StreamGuard&&) = delete;
+
+  // Move constructor: takes over the stream handle and its creation parameters from `o`.
+  // `o` is reset to the nullstream state with a null handle. Only plain member copies are
+  // performed, so the operation is noexcept.
+  StreamGuard(StreamGuard&& o) noexcept
+      : stream_type_{o.stream_type_}, flags_{o.flags_}, priority_{o.priority_}, stream_{o.stream_} {
+    o.stream_type_ = Streams::nullstream;
+    o.flags_ = 0u;
+    o.priority_ = 0;
+    o.stream_ = nullptr;
+  }
+
+  // Move assignment: releases the stream this guard currently holds (according to the
+  // destructor's policy), then takes over the stream and parameters of `o`, leaving `o` in the
+  // nullstream state. Self-assignment is a no-op. Returns a reference to this guard.
+  StreamGuard& operator=(StreamGuard&& o) noexcept {
+    if (this != &o) {
+      // Hand the current stream to a temporary so that the destructor disposes of it when this
+      // scope ends. A static_cast to an rvalue reference is used in place of std::move so that
+      // no additional header is required.
+      StreamGuard previous{static_cast<StreamGuard&&>(*this)};
+
+      stream_type_ = o.stream_type_;
+      flags_ = o.flags_;
+      priority_ = o.priority_;
+      stream_ = o.stream_;
+
+      o.stream_type_ = Streams::nullstream;
+      o.flags_ = 0u;
+      o.priority_ = 0;
+      o.stream_ = nullptr;
+    }
+
+    return *this;
+  }

  ~StreamGuard() {
    if (stream_type_ == Streams::created) {
@@ -230,23 +295,23 @@ class StreamGuard {
  hipStream_t stream() const { return stream_; }

 private:
-  const Streams stream_type_;
-  unsigned int flags_;
-  int priority_;
-  hipStream_t stream_;
+  Streams stream_type_ = Streams::nullstream;
+  unsigned int flags_ = 0u;
+  int priority_ = 0;
+  hipStream_t stream_ = nullptr;
 };

 class EventsGuard {
-public:
+ public:
  EventsGuard(size_t N) : events_(N) {
-    for (auto &e : events_) HIP_CHECK(hipEventCreate(&e));
+    for (auto& e : events_) HIP_CHECK(hipEventCreate(&e));
  }

  EventsGuard(const EventsGuard&) = delete;
  EventsGuard(EventsGuard&&) = delete;

  ~EventsGuard() {
-    for (auto &e : events_) static_cast<void>(hipEventDestroy(e));
+    for (auto& e : events_) static_cast<void>(hipEventDestroy(e));
  }

  hipEvent_t& operator[](int index) { return events_[index]; }
@@ -255,21 +320,21 @@ public:

  std::vector<hipEvent_t>& event_list() { return events_; }

-private:
+ private:
  std::vector<hipEvent_t> events_;
 };

 class StreamsGuard {
-public:
+ public:
  StreamsGuard(size_t N) : streams_(N) {
-    for (auto &s : streams_) HIP_CHECK(hipStreamCreate(&s));
+    for (auto& s : streams_) HIP_CHECK(hipStreamCreate(&s));
  }

  StreamsGuard(const StreamsGuard&) = delete;
  StreamsGuard(StreamsGuard&&) = delete;

  ~StreamsGuard() {
-    for (auto &s : streams_) static_cast<void>(hipStreamDestroy(s));
+    for (auto& s : streams_) static_cast<void>(hipStreamDestroy(s));
  }

  hipStream_t& operator[](int index) { return streams_[index]; }
@@ -278,6 +343,6 @@ public:

  std::vector<hipStream_t>& stream_list() { return streams_; }

-private:
+ private:
  std::vector<hipStream_t> streams_;
 };
@@ -18,5 +18,6 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 # THE SOFTWARE.

+add_subdirectory(stream)
 add_subdirectory(event)
 add_subdirectory(example)
@@ -0,0 +1,63 @@
+# Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+if(HIP_PLATFORM MATCHES "amd")  # AMD: full source list, including hipExt CU-mask and memory pool benchmarks
+set(TEST_SRC
+    hipStreamWaitEvent.cc
+    hipStreamGetFlags.cc
+    hipStreamGetPriority.cc
+    hipExtStreamCreateWithCUMask.cc
+    hipExtStreamGetCUMask.cc
+    hipStreamAddCallback.cc
+    hipStreamWaitValue.cc
+    hipStreamWriteValue.cc
+    hipMallocAsync.cc
+    hipFreeAsync.cc
+    hipMemPoolCreate.cc
+    hipMemPoolDestroy.cc
+    hipMemPoolTrimTo.cc
+    hipMemPoolSetAttribute.cc
+    hipMemPoolGetAttribute.cc
+    hipMemPoolSetAccess.cc
+    hipMallocFromPoolAsync.cc
+    hipMemPoolExportToShareableHandle.cc
+    hipMemPoolImportFromShareableHandle.cc
+    hipMemPoolExportPointer.cc
+    hipMemPoolImportPointer.cc
+    hipStreamBasic.cc
+)
+else()  # Other platforms: same list without the hipExt*, hipMemPool* and hipMallocFromPoolAsync sources
+set(TEST_SRC
+    hipStreamWaitEvent.cc
+    hipStreamGetFlags.cc
+    hipStreamGetPriority.cc
+    hipStreamAddCallback.cc
+    hipStreamWaitValue.cc
+    hipStreamWriteValue.cc
+    hipMallocAsync.cc
+    hipFreeAsync.cc
+    hipStreamBasic.cc
+)
+endif()
+
+hip_add_exe_to_target(NAME StreamPerformance  # helper command defined outside this file
+                      TEST_SRC ${TEST_SRC}
+                      TEST_TARGET_NAME build_tests
+                      COMPILE_OPTIONS -std=c++17)  # sources are compiled as C++17
@@ -0,0 +1,65 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_common.hh>
+#include <performance_common.hh>
+
+/**
+ * @addtogroup stream stream
+ * @{
+ * @ingroup PerformanceTest
+ * Contains performance tests for all stream management HIP APIs.
+ */
+
+class ExtStreamCreateWithCUMaskBenchmark : public Benchmark<ExtStreamCreateWithCUMaskBenchmark> {
+ public:
+  void operator()() {  // Creates one CU-masked stream inside the timed section, then destroys it
+    hipDeviceProp_t props;
+    HIP_CHECK(hipGetDeviceProperties(&props, 0));  // Properties of device 0
+    std::vector<uint32_t> cu_mask(props.multiProcessorCount, 0);  // multiProcessorCount zeroed 32-bit words
+    hipStream_t stream{};
+
+    TIMED_SECTION(kTimerTypeCpu) {  // NOTE(review): presumably times the enclosed call on the CPU - confirm in performance_common.hh
+      HIP_CHECK(hipExtStreamCreateWithCUMask(&stream, cu_mask.size(), cu_mask.data()));
+    }
+
+    HIP_CHECK(hipStreamDestroy(stream));  // Cleanup happens outside the timed section
+  }
+};
+
+// Instantiates the hipExtStreamCreateWithCUMask benchmark and runs it without arguments.
+static void RunBenchmark() {
+  ExtStreamCreateWithCUMaskBenchmark create_with_cu_mask;
+  create_with_cu_mask.Run();
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipExtStreamCreateWithCUMask`.
+ * Test source
+ * ------------------------
+ *  - performance/stream/hipExtStreamCreateWithCUMask.cc
+ * Test requirements
+ * ------------------------
+ *  - Platform specific (AMD)
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipExtStreamCreateWithCUMask") {
+  RunBenchmark();  // Single run; this benchmark takes no parameters
+}
@@ -0,0 +1,67 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_common.hh>
+#include <performance_common.hh>
+
+/**
+ * @addtogroup stream stream
+ * @{
+ * @ingroup PerformanceTest
+ */
+
+class ExtStreamGetCUMaskBenchmark : public Benchmark<ExtStreamGetCUMaskBenchmark> {
+ public:
+  void operator()() {  // Creates a CU-masked stream, then reads its mask back inside the timed section
+    hipDeviceProp_t props;
+    HIP_CHECK(hipGetDeviceProperties(&props, 0));  // Properties of device 0
+    std::vector<uint32_t> cu_mask(props.multiProcessorCount, 0);  // multiProcessorCount zeroed 32-bit words
+    hipStream_t stream{};
+    HIP_CHECK(hipExtStreamCreateWithCUMask(&stream, cu_mask.size(), cu_mask.data()));  // Setup, not timed
+    std::vector<uint32_t> new_cu_mask(cu_mask.size(), 0);  // Destination buffer with the same word count
+
+    TIMED_SECTION(kTimerTypeCpu) {  // NOTE(review): presumably times the enclosed call on the CPU - confirm in performance_common.hh
+      HIP_CHECK(hipExtStreamGetCUMask(stream, new_cu_mask.size(), new_cu_mask.data()));
+    }
+
+    HIP_CHECK(hipStreamDestroy(stream));  // Cleanup happens outside the timed section
+  }
+};
+
+// Instantiates the hipExtStreamGetCUMask benchmark and runs it without arguments.
+static void RunBenchmark() {
+  ExtStreamGetCUMaskBenchmark get_cu_mask;
+  get_cu_mask.Run();
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipExtStreamGetCUMask`.
+ *  - Creates basic mask and gets it into the new one.
+ * Test source
+ * ------------------------
+ *  - performance/stream/hipExtStreamGetCUMask.cc
+ * Test requirements
+ * ------------------------
+ *  - Platform specific (AMD)
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipExtStreamGetCUMask") {
+  RunBenchmark();  // Single run; this benchmark takes no parameters
+}
@@ -0,0 +1,69 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_common.hh>
+#include <performance_common.hh>
+
+/**
+ * @addtogroup stream stream
+ * @{
+ * @ingroup PerformanceTest
+ */
+
+class FreeAsyncBenchmark : public Benchmark<FreeAsyncBenchmark> {
+ public:
+  void operator()(const size_t array_size) {  // array_size: number of float elements to allocate and free
+    const StreamGuard stream_guard{Streams::created};  // Guard constructed with Streams::created
+    const hipStream_t stream = stream_guard.stream();
+    float* dev_ptr{nullptr};
+    HIP_CHECK(hipMallocAsync(reinterpret_cast<void**>(&dev_ptr), array_size * sizeof(float), stream));  // Setup, not timed
+
+    TIMED_SECTION_STREAM(kTimerTypeEvent, stream) {  // NOTE(review): presumably event-based timing on `stream` - confirm in performance_common.hh
+      HIP_CHECK(hipFreeAsync(dev_ptr, stream));
+    }
+
+    HIP_CHECK(hipStreamSynchronize(stream));  // Wait for the stream's work before the guard goes out of scope
+  }
+};
+
+// Runs the hipFreeAsync benchmark for `array_size` elements; the size, rendered as text,
+// is registered as the section name.
+static void RunBenchmark(const size_t array_size) {
+  FreeAsyncBenchmark free_async;
+  free_async.AddSectionName(std::to_string(array_size));
+  free_async.Run(array_size);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipFreeAsync` with created stream:
+ *    -# Allocation size:
+ *      - 4 KB
+ *      - 4 MB
+ *      - 16 MB
+ * Test source
+ * ------------------------
+ *  - performance/stream/hipFreeAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipFreeAsync") {
+  // The benchmark is executed once per generated allocation size.
+  const size_t array_size = GENERATE(4_KB, 4_MB, 16_MB);
+  RunBenchmark(array_size);
+}
@@ -0,0 +1,68 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_common.hh>
+#include <performance_common.hh>
+
+/**
+ * @addtogroup stream stream
+ * @{
+ * @ingroup PerformanceTest
+ */
+
+/**
+ * Benchmark functor for `hipMallocAsync`.
+ *
+ * The `TIMED_SECTION_STREAM` block encloses only the `hipMallocAsync` call; the
+ * stream synchronize and the `hipFree` cleanup happen after it.
+ */
+class MallocAsyncBenchmark : public Benchmark<MallocAsyncBenchmark> {
+ public:
+  /**
+   * @param array_size Number of `float` elements; `array_size * sizeof(float)` bytes are requested.
+   */
+  void operator()(const size_t array_size) {
+    const StreamGuard stream_guard{Streams::created};
+    const hipStream_t stream = stream_guard.stream();
+    float* dev_ptr{nullptr};
+
+    TIMED_SECTION_STREAM(kTimerTypeEvent, stream) {
+      HIP_CHECK(hipMallocAsync(reinterpret_cast<void**>(&dev_ptr), array_size * sizeof(float), stream));
+    }
+    // Let the stream finish before releasing the buffer with the synchronous hipFree.
+    HIP_CHECK(hipStreamSynchronize(stream));
+    HIP_CHECK(hipFree(dev_ptr));
+  }
+};
+
+// Builds a MallocAsyncBenchmark, names its section after the decimal form of
+// array_size, and runs it with that size.
+static void RunBenchmark(const size_t array_size) {
+  MallocAsyncBenchmark malloc_async_bench;
+  malloc_async_bench.AddSectionName(std::to_string(array_size));
+  malloc_async_bench.Run(array_size);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMallocAsync` with created stream:
+ *    -# Array length in `float` elements (the allocation is `sizeof(float)` times this value):
+ *      - 4 KB
+ *      - 4 MB
+ *      - 16 MB
+ * Test source
+ * ------------------------
+ *  - performance/stream/hipMallocAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMallocAsync") {
+  // Catch2 re-enters the test case once per generated size.
+  const size_t element_count = GENERATE(4_KB, 4_MB, 16_MB);
+  RunBenchmark(element_count);
+}
@@ -0,0 +1,82 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "mem_pools_performance_common.hh"
+
+/**
+ * @addtogroup stream stream
+ * @{
+ * @ingroup PerformanceTest
+ */
+
+/**
+ * Benchmark functor for `hipMallocFromPoolAsync`.
+ *
+ * A memory pool is created for device 0 with `hipMemHandleTypeNone` before the
+ * `TIMED_SECTION_STREAM` block; that block encloses only the pool allocation call.
+ */
+class MallocFromPoolAsyncBenchmark : public Benchmark<MallocFromPoolAsyncBenchmark> {
+ public:
+  /**
+   * @param array_size Number of `float` elements; `array_size * sizeof(float)` bytes are requested.
+   */
+  void operator()(const size_t array_size) {
+    const StreamGuard stream_guard{Streams::created};
+    const hipStream_t stream = stream_guard.stream();
+
+    hipMemPool_t mem_pool{nullptr};
+    hipMemPoolProps pool_props = CreateMemPoolProps(0, hipMemHandleTypeNone);
+    HIP_CHECK(hipMemPoolCreate(&mem_pool, &pool_props));
+
+    float* array_ptr{nullptr};
+
+    TIMED_SECTION_STREAM(kTimerTypeEvent, stream) {
+      HIP_CHECK(hipMallocFromPoolAsync(&array_ptr, array_size * sizeof(float), mem_pool, stream));
+    }
+
+    // Sanity check that the call produced a pointer.
+    REQUIRE(array_ptr != nullptr);
+
+    // Cleanup: queue the free, drain the stream, then tear down the pool.
+    HIP_CHECK(hipFreeAsync(array_ptr, stream));
+    HIP_CHECK(hipStreamSynchronize(stream));
+    HIP_CHECK(hipMemPoolDestroy(mem_pool));
+  }
+};
+
+// Builds a MallocFromPoolAsyncBenchmark, names its section after the decimal
+// form of array_size, and runs it with that size.
+static void RunBenchmark(const size_t array_size) {
+  MallocFromPoolAsyncBenchmark pool_malloc_bench;
+  pool_malloc_bench.AddSectionName(std::to_string(array_size));
+  pool_malloc_bench.Run(array_size);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMallocFromPoolAsync`:
+ *    -# Array length in `float` elements (the allocation is `sizeof(float)` times this value):
+ *      - 4 KB
+ *      - 4 MB
+ *      - 16 MB
+ * Test source
+ * ------------------------
+ *  - performance/stream/hipMallocFromPoolAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - Device supports memory pools
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMallocFromPoolAsync") {
+  if (AreMemPoolsSupported(0)) {
+    // Catch2 re-enters the test case once per generated size.
+    const size_t element_count = GENERATE(4_KB, 4_MB, 16_MB);
+    RunBenchmark(element_count);
+  } else {
+    // Device 0 reports no memory pool support: emit the skip marker instead of running.
+    HipTest::HIP_SKIP_TEST("GPU 0 doesn't support hipDeviceAttributeMemoryPoolsSupported "
+                           "attribute. Hence skipping the testing with Pass result.\n");
+  }
+}
@@ -0,0 +1,71 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "mem_pools_performance_common.hh"
+
+/**
+ * @addtogroup stream stream
+ * @{
+ * @ingroup PerformanceTest
+ */
+
+/**
+ * Benchmark functor for `hipMemPoolCreate`.
+ *
+ * The pool properties are prepared up front; the `TIMED_SECTION` block (CPU timer
+ * type) encloses only the create call. The pool is destroyed afterwards.
+ */
+class MemPoolCreateBenchmark : public Benchmark<MemPoolCreateBenchmark> {
+ public:
+  void operator()() {
+    hipMemPool_t mem_pool{nullptr};
+    hipMemPoolProps pool_props = CreateMemPoolProps(0, hipMemHandleTypeNone);
+
+    TIMED_SECTION(kTimerTypeCpu) {
+      HIP_CHECK(hipMemPoolCreate(&mem_pool, &pool_props));
+    }
+
+    // Sanity check that a pool handle was produced before destroying it.
+    REQUIRE(mem_pool != nullptr);
+    HIP_CHECK(hipMemPoolDestroy(mem_pool));
+  }
+};
+
+// Builds a MemPoolCreateBenchmark and runs it; the benchmark takes no arguments.
+static void RunBenchmark() {
+  MemPoolCreateBenchmark pool_create_bench;
+  pool_create_bench.Run();
+}
+
+/**
+ * @warning **MemPool APIs are not fully implemented within the current version
+ *          of HIP and therefore they cannot be appropriately executed on AMD and NVIDIA platforms.
+ *          Therefore, all tests related to MemPool APIs are implemented without formal
+ *          verification and will be verified once HIP fully supports MemPool APIs.**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemPoolCreate`.
+ * Test source
+ * ------------------------
+ *  - performance/stream/hipMemPoolCreate.cc
+ * Test requirements
+ * ------------------------
+ *  - Device supports memory pools
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemPoolCreate") {
+  if (AreMemPoolsSupported(0)) {
+    RunBenchmark();
+  } else {
+    // Device 0 reports no memory pool support: emit the skip marker instead of running.
+    HipTest::HIP_SKIP_TEST("GPU 0 doesn't support hipDeviceAttributeMemoryPoolsSupported "
+                           "attribute. Hence skipping the testing with Pass result.\n");
+  }
+}
@@ -0,0 +1,70 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "mem_pools_performance_common.hh"
+
+/**
+ * @addtogroup stream stream
+ * @{
+ * @ingroup PerformanceTest
+ */
+
+/**
+ * Benchmark functor for `hipMemPoolDestroy`.
+ *
+ * A pool is created for device 0 outside the timed region; the `TIMED_SECTION`
+ * block (CPU timer type) encloses only the destroy call.
+ */
+class MemPoolDestroyBenchmark : public Benchmark<MemPoolDestroyBenchmark> {
+ public:
+  void operator()() {
+    hipMemPool_t mem_pool{nullptr};
+    hipMemPoolProps pool_props = CreateMemPoolProps(0, hipMemHandleTypeNone);
+    HIP_CHECK(hipMemPoolCreate(&mem_pool, &pool_props));
+
+    TIMED_SECTION(kTimerTypeCpu) {
+      HIP_CHECK(hipMemPoolDestroy(mem_pool));
+    }
+  }
+};
+
+// Builds a MemPoolDestroyBenchmark and runs it; the benchmark takes no arguments.
+static void RunBenchmark() {
+  MemPoolDestroyBenchmark pool_destroy_bench;
+  pool_destroy_bench.Run();
+}
+
+/**
+ * @warning **MemPool APIs are not fully implemented within the current version
+ *          of HIP and therefore they cannot be appropriately executed on AMD and NVIDIA platforms.
+ *          Therefore, all tests related to MemPool APIs are implemented without formal
+ *          verification and will be verified once HIP fully supports MemPool APIs.**
+ * Test Description
+ * ------------------------
+ *  - Creates new mem pool.
+ *  - Executes `hipMemPoolDestroy`.
+ * Test source
+ * ------------------------
+ *  - performance/stream/hipMemPoolDestroy.cc
+ * Test requirements
+ * ------------------------
+ *  - Device supports memory pools
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemPoolDestroy") {
+  if (AreMemPoolsSupported(0)) {
+    RunBenchmark();
+  } else {
+    // Device 0 reports no memory pool support: emit the skip marker instead of running.
+    HipTest::HIP_SKIP_TEST("GPU 0 doesn't support hipDeviceAttributeMemoryPoolsSupported "
+                           "attribute. Hence skipping the testing with Pass result.\n");
+  }
+}
@@ -0,0 +1,84 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "mem_pools_performance_common.hh"
+
+/**
+ * @addtogroup stream stream
+ * @{
+ * @ingroup PerformanceTest
+ */
+
+/**
+ * Benchmark functor for `hipMemPoolExportPointer`.
+ *
+ * A pool is created with `kHandleType` and a buffer is allocated from it on the null
+ * stream before the timed region; the `TIMED_SECTION` block (CPU timer type) encloses
+ * only the export call.
+ */
+class MemPoolExportPointerBenchmark : public Benchmark<MemPoolExportPointerBenchmark> {
+ public:
+  /**
+   * @param array_size Number of `float` elements; `array_size * sizeof(float)` bytes are requested.
+   */
+  void operator()(const size_t array_size) {
+    float* device_ptr{nullptr};
+    hipMemPool_t mem_pool{nullptr};
+    // Output of the export call; intentionally left uninitialized here.
+    hipMemPoolPtrExportData exp_data;
+
+    hipMemPoolProps props = CreateMemPoolProps(0, kHandleType);
+    HIP_CHECK(hipMemPoolCreate(&mem_pool, &props));
+    HIP_CHECK(hipMallocFromPoolAsync(&device_ptr, array_size * sizeof(float), mem_pool, nullptr));
+    // Drain the null stream so the allocation is complete before it is exported.
+    HIP_CHECK(hipStreamSynchronize(nullptr));
+
+    TIMED_SECTION(kTimerTypeCpu) {
+      HIP_CHECK(hipMemPoolExportPointer(&exp_data, device_ptr));
+    }
+
+    // NOTE(review): the free is only queued on the null stream and the pool is destroyed
+    // without another synchronize in between - confirm this ordering is intended.
+    HIP_CHECK(hipFreeAsync(device_ptr, nullptr));
+    HIP_CHECK(hipMemPoolDestroy(mem_pool));
+  }
+};
+
+// Builds a MemPoolExportPointerBenchmark, names its section after the decimal
+// form of array_size, and runs it with that size.
+static void RunBenchmark(const size_t array_size) {
+  MemPoolExportPointerBenchmark export_pointer_bench;
+  export_pointer_bench.AddSectionName(std::to_string(array_size));
+  export_pointer_bench.Run(array_size);
+}
+
+/**
+ * @warning **MemPool APIs are not fully implemented within the current version
+ *          of HIP and therefore they cannot be appropriately executed on AMD and NVIDIA platforms.
+ *          Therefore, all tests related to MemPool APIs are implemented without formal
+ *          verification and will be verified once HIP fully supports MemPool APIs.**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemPoolExportPointer`:
+ *    -# Array length in `float` elements (the allocation is `sizeof(float)` times this value):
+ *      - 4 KB
+ *      - 4 MB
+ *      - 16 MB
+ *  - Uses the same process for import and export operations.
+ * Test source
+ * ------------------------
+ *  - performance/stream/hipMemPoolExportPointer.cc
+ * Test requirements
+ * ------------------------
+ *  - Device supports memory pools
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemPoolExportPointer") {
+  if (AreMemPoolsSupported(0)) {
+    // Catch2 re-enters the test case once per generated size.
+    const size_t element_count = GENERATE(4_KB, 4_MB, 16_MB);
+    RunBenchmark(element_count);
+  } else {
+    // Device 0 reports no memory pool support: emit the skip marker instead of running.
+    HipTest::HIP_SKIP_TEST("GPU 0 doesn't support hipDeviceAttributeMemoryPoolsSupported "
+                           "attribute. Hence skipping the testing with Pass result.\n");
+  }
+}
@@ -0,0 +1,74 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "mem_pools_performance_common.hh"
+
+/**
+ * @addtogroup stream stream
+ * @{
+ * @ingroup PerformanceTest
+ */
+
+/**
+ * Benchmark functor for `hipMemPoolExportToShareableHandle`.
+ *
+ * A pool is created with `kHandleType` outside the timed region; the `TIMED_SECTION`
+ * block (CPU timer type) encloses only the export call.
+ */
+class MemPoolExportToShareableHandleBenchmark : public Benchmark<MemPoolExportToShareableHandleBenchmark> {
+ public:
+  void operator()() {
+    hipMemPool_t mem_pool{nullptr};
+    // Receives the exported handle; written by the export call below.
+    // NOTE(review): presumably an OS-level handle matching kHandleType - confirm whether
+    // it needs to be closed after use.
+    int share_handle;
+
+    hipMemPoolProps props = CreateMemPoolProps(0, kHandleType);
+    HIP_CHECK(hipMemPoolCreate(&mem_pool, &props));
+
+    TIMED_SECTION(kTimerTypeCpu) {
+      HIP_CHECK(hipMemPoolExportToShareableHandle(&share_handle, mem_pool, kHandleType, 0));
+    }
+
+    HIP_CHECK(hipMemPoolDestroy(mem_pool));
+  }
+};
+
+// Builds a MemPoolExportToShareableHandleBenchmark and runs it; the benchmark
+// takes no arguments.
+static void RunBenchmark() {
+  MemPoolExportToShareableHandleBenchmark export_handle_bench;
+  export_handle_bench.Run();
+}
+
+/**
+ * @warning **MemPool APIs are not fully implemented within the current version
+ *          of HIP and therefore they cannot be appropriately executed on AMD and NVIDIA platforms.
+ *          Therefore, all tests related to MemPool APIs are implemented without formal
+ *          verification and will be verified once HIP fully supports MemPool APIs.**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemPoolExportToShareableHandle`.
+ *  - Uses the same process for import and export operations.
+ * Test source
+ * ------------------------
+ *  - performance/stream/hipMemPoolExportToShareableHandle.cc
+ * Test requirements
+ * ------------------------
+ *  - Device supports memory pools
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemPoolExportToShareableHandle") {
+  if (AreMemPoolsSupported(0)) {
+    RunBenchmark();
+  } else {
+    // Device 0 reports no memory pool support: emit the skip marker instead of running.
+    HipTest::HIP_SKIP_TEST("GPU 0 doesn't support hipDeviceAttributeMemoryPoolsSupported "
+                           "attribute. Hence skipping the testing with Pass result.\n");
+  }
+}
@@ -0,0 +1,76 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "mem_pools_performance_common.hh"
+
+/**
+ * @addtogroup stream stream
+ * @{
+ * @ingroup PerformanceTest
+ */
+
+/**
+ * Benchmark functor for `hipMemPoolGetAccess`.
+ *
+ * A pool is created for device 0 outside the timed region; the `TIMED_SECTION`
+ * block (CPU timer type) encloses only the access query.
+ */
+class MemPoolGetAccessBenchmark : public Benchmark<MemPoolGetAccessBenchmark> {
+ public:
+  void operator()() {
+    hipMemPool_t mem_pool{nullptr};
+    hipMemPoolProps pool_props = CreateMemPoolProps(0, hipMemHandleTypeNone);
+    HIP_CHECK(hipMemPoolCreate(&mem_pool, &pool_props));
+
+    // Output of the query; starts out as "no access".
+    hipMemAccessFlags flags = hipMemAccessFlagsProtNone;
+    // Location whose access is queried: device type, id 0.
+    hipMemLocation location = {
+      hipMemLocationTypeDevice,
+      0
+    };
+    TIMED_SECTION(kTimerTypeCpu) {
+      HIP_CHECK(hipMemPoolGetAccess(&flags, mem_pool, location));
+    }
+
+    HIP_CHECK(hipMemPoolDestroy(mem_pool));
+  }
+};
+
+// Builds a MemPoolGetAccessBenchmark and runs it; the benchmark takes no arguments.
+static void RunBenchmark() {
+  MemPoolGetAccessBenchmark get_access_bench;
+  get_access_bench.Run();
+}
+
+/**
+ * @warning **MemPool APIs are not fully implemented within the current version
+ *          of HIP and therefore they cannot be appropriately executed on AMD and NVIDIA platforms.
+ *          Therefore, all tests related to MemPool APIs are implemented without formal
+ *          verification and will be verified once HIP fully supports MemPool APIs.**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemPoolGetAccess`.
+ * Test source
+ * ------------------------
+ *  - performance/stream/hipMemPoolGetAccess.cc
+ * Test requirements
+ * ------------------------
+ *  - Device supports memory pools
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemPoolGetAccess") {
+  if (AreMemPoolsSupported(0)) {
+    RunBenchmark();
+  } else {
+    // Device 0 reports no memory pool support: emit the skip marker instead of running.
+    HipTest::HIP_SKIP_TEST("GPU 0 doesn't support hipDeviceAttributeMemoryPoolsSupported "
+                           "attribute. Hence skipping the testing with Pass result.\n");
+  }
+}
@@ -0,0 +1,83 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "mem_pools_performance_common.hh"
+
+/**
+ * @addtogroup stream stream
+ * @{
+ * @ingroup PerformanceTest
+ */
+
+/**
+ * Benchmark functor for `hipMemPoolGetAttribute`.
+ *
+ * A pool is created for device 0 outside the timed region; the `TIMED_SECTION`
+ * block (CPU timer type) encloses only the attribute query.
+ */
+class MemPoolGetAttributeBenchmark : public Benchmark<MemPoolGetAttributeBenchmark> {
+ public:
+  /**
+   * @param attribute Pool attribute to query.
+   */
+  void operator()(const hipMemPoolAttr attribute) {
+    hipMemPool_t mem_pool{nullptr};
+    hipMemPoolProps pool_props = CreateMemPoolProps(0, hipMemHandleTypeNone);
+    HIP_CHECK(hipMemPoolCreate(&mem_pool, &pool_props));
+
+    // 64-bit receiving buffer, used for every attribute passed in.
+    uint64_t value{0};
+
+    TIMED_SECTION(kTimerTypeCpu) {
+      HIP_CHECK(hipMemPoolGetAttribute(mem_pool, attribute, &value));
+    }
+
+    HIP_CHECK(hipMemPoolDestroy(mem_pool));
+  }
+};
+
+// Builds a MemPoolGetAttributeBenchmark, names its section via
+// GetMemPoolAttrSectionName(attribute), and runs it for that attribute.
+static void RunBenchmark(const hipMemPoolAttr attribute) {
+  MemPoolGetAttributeBenchmark get_attribute_bench;
+  get_attribute_bench.AddSectionName(GetMemPoolAttrSectionName(attribute));
+  get_attribute_bench.Run(attribute);
+}
+
+/**
+ * @warning **MemPool APIs are not fully implemented within the current version
+ *          of HIP and therefore they cannot be appropriately executed on AMD and NVIDIA platforms.
+ *          Therefore, all tests related to MemPool APIs are implemented without formal
+ *          verification and will be verified once HIP fully supports MemPool APIs.**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemPoolGetAttribute`:
+ *    -# Supported attributes:
+ *      - `hipMemPoolAttrReleaseThreshold`
+ *      - `hipMemPoolReuseFollowEventDependencies`
+ *      - `hipMemPoolReuseAllowOpportunistic`
+ *      - `hipMemPoolReuseAllowInternalDependencies`
+ * Test source
+ * ------------------------
+ *  - performance/stream/hipMemPoolGetAttribute.cc
+ * Test requirements
+ * ------------------------
+ *  - Device supports memory pools
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemPoolGetAttribute") {
+  if (AreMemPoolsSupported(0)) {
+    // Catch2 re-enters the test case once per generated attribute.
+    const hipMemPoolAttr pool_attribute = GENERATE(hipMemPoolAttrReleaseThreshold,
+                                                   hipMemPoolReuseFollowEventDependencies,
+                                                   hipMemPoolReuseAllowOpportunistic,
+                                                   hipMemPoolReuseAllowInternalDependencies);
+    RunBenchmark(pool_attribute);
+  } else {
+    // Device 0 reports no memory pool support: emit the skip marker instead of running.
+    HipTest::HIP_SKIP_TEST("GPU 0 doesn't support hipDeviceAttributeMemoryPoolsSupported "
+                           "attribute. Hence skipping the testing with Pass result.\n");
+  }
+}
@@ -0,0 +1,75 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "mem_pools_performance_common.hh"
+
+/**
+ * @addtogroup stream stream
+ * @{
+ * @ingroup PerformanceTest
+ */
+
+/**
+ * Benchmark functor for `hipMemPoolImportFromShareableHandle`.
+ *
+ * A pool is created with `kHandleType` and exported to a shareable handle outside the
+ * timed region; the `TIMED_SECTION` block (CPU timer type) encloses only the import call.
+ * Export and import are performed by the same process.
+ */
+class MemPoolImportFromShareableHandleBenchmark : public Benchmark<MemPoolImportFromShareableHandleBenchmark> {
+ public:
+  void operator()() {
+    hipMemPool_t mem_pool{nullptr};
+    // The import result gets its own handle so the exporting pool's handle is not
+    // overwritten and both pools can be released afterwards.
+    hipMemPool_t imported_mem_pool{nullptr};
+    int share_handle;
+
+    hipMemPoolProps props = CreateMemPoolProps(0, kHandleType);
+    HIP_CHECK(hipMemPoolCreate(&mem_pool, &props));
+    HIP_CHECK(hipMemPoolExportToShareableHandle(&share_handle, mem_pool, kHandleType, 0));
+
+    TIMED_SECTION(kTimerTypeCpu) {
+      HIP_CHECK(hipMemPoolImportFromShareableHandle(&imported_mem_pool, &share_handle, kHandleType, 0));
+    }
+
+    // Guard against a runtime that hands back the very same handle for a same-process
+    // import, which would otherwise turn into a double destroy.
+    if (imported_mem_pool != mem_pool) {
+      HIP_CHECK(hipMemPoolDestroy(imported_mem_pool));
+    }
+    HIP_CHECK(hipMemPoolDestroy(mem_pool));
+  }
+};
+
+// Builds a MemPoolImportFromShareableHandleBenchmark and runs it; the benchmark
+// takes no arguments.
+static void RunBenchmark() {
+  MemPoolImportFromShareableHandleBenchmark import_handle_bench;
+  import_handle_bench.Run();
+}
+
+/**
+ * @warning **MemPool APIs are not fully implemented within the current version
+ *          of HIP and therefore they cannot be appropriately executed on AMD and NVIDIA platforms.
+ *          Therefore, all tests related to MemPool APIs are implemented without formal
+ *          verification and will be verified once HIP fully supports MemPool APIs.**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemPoolImportFromShareableHandle`.
+ *  - Uses the same process for import and export operations.
+ * Test source
+ * ------------------------
+ *  - performance/stream/hipMemPoolImportFromShareableHandle.cc
+ * Test requirements
+ * ------------------------
+ *  - Device supports memory pools
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemPoolImportFromShareableHandle") {
+  if (AreMemPoolsSupported(0)) {
+    RunBenchmark();
+  } else {
+    // Device 0 reports no memory pool support: emit the skip marker instead of running.
+    HipTest::HIP_SKIP_TEST("GPU 0 doesn't support hipDeviceAttributeMemoryPoolsSupported "
+                           "attribute. Hence skipping the testing with Pass result.\n");
+  }
+}
@@ -0,0 +1,87 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "mem_pools_performance_common.hh"
+
+/**
+ * @addtogroup stream stream
+ * @{
+ * @ingroup PerformanceTest
+ */
+
+/**
+ * Benchmark functor for `hipMemPoolImportPointer`.
+ *
+ * A pool is created with `kHandleType`, a buffer is allocated from it on the null stream
+ * and exported, all outside the timed region; the `TIMED_SECTION` block (CPU timer type)
+ * encloses only the import call. Export and import are performed by the same process.
+ */
+class MemPoolImportPointerBenchmark : public Benchmark<MemPoolImportPointerBenchmark> {
+ public:
+  /**
+   * @param array_size Number of `float` elements; `array_size * sizeof(float)` bytes are requested.
+   */
+  void operator()(const size_t array_size) {
+    float* device_ptr{nullptr};
+    float* device_ptr_import{nullptr};
+    hipMemPool_t mem_pool{nullptr};
+    hipMemPoolPtrExportData exp_data;
+
+    hipMemPoolProps props = CreateMemPoolProps(0, kHandleType);
+    HIP_CHECK(hipMemPoolCreate(&mem_pool, &props));
+    HIP_CHECK(hipMallocFromPoolAsync(&device_ptr, array_size * sizeof(float), mem_pool, nullptr));
+    // Drain the null stream so the allocation is complete before it is exported.
+    HIP_CHECK(hipStreamSynchronize(nullptr));
+    HIP_CHECK(hipMemPoolExportPointer(&exp_data, device_ptr));
+
+    TIMED_SECTION(kTimerTypeCpu) {
+      // Hand over the ADDRESS of the pointer variable: casting the (null) pointer value
+      // itself would give the API a null output argument.
+      HIP_CHECK(hipMemPoolImportPointer(reinterpret_cast<void**>(&device_ptr_import), mem_pool, &exp_data));
+    }
+
+    // Release the imported pointer before the exported one. Skip it when the runtime
+    // returned the same address for this same-process import, to avoid a double free.
+    if (device_ptr_import != device_ptr) {
+      HIP_CHECK(hipFree(device_ptr_import));
+    }
+    HIP_CHECK(hipFree(device_ptr));
+    HIP_CHECK(hipMemPoolDestroy(mem_pool));
+  }
+};
+
+// Builds a MemPoolImportPointerBenchmark, names its section after the decimal
+// form of array_size, and runs it with that size.
+static void RunBenchmark(const size_t array_size) {
+  MemPoolImportPointerBenchmark import_pointer_bench;
+  import_pointer_bench.AddSectionName(std::to_string(array_size));
+  import_pointer_bench.Run(array_size);
+}
+
+/**
+ * @warning **MemPool APIs are not fully implemented within the current version
+ *          of HIP and therefore they cannot be appropriately executed on AMD and NVIDIA platforms.
+ *          Therefore, all tests related to MemPool APIs are implemented without formal
+ *          verification and will be verified once HIP fully supports MemPool APIs.**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemPoolImportPointer`:
+ *    -# Array length in `float` elements (the allocation is `sizeof(float)` times this value):
+ *      - 4 KB
+ *      - 4 MB
+ *      - 16 MB
+ *  - Uses the same process for import and export operations.
+ * Test source
+ * ------------------------
+ *  - performance/stream/hipMemPoolImportPointer.cc
+ * Test requirements
+ * ------------------------
+ *  - Device supports memory pools
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemPoolImportPointer") {
+  if (AreMemPoolsSupported(0)) {
+    // Catch2 re-enters the test case once per generated size.
+    const size_t element_count = GENERATE(4_KB, 4_MB, 16_MB);
+    RunBenchmark(element_count);
+  } else {
+    // Device 0 reports no memory pool support: emit the skip marker instead of running.
+    HipTest::HIP_SKIP_TEST("GPU 0 doesn't support hipDeviceAttributeMemoryPoolsSupported "
+                           "attribute. Hence skipping the testing with Pass result.\n");
+  }
+}
@@ -0,0 +1,79 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "mem_pools_performance_common.hh"
+
+/**
+ * @addtogroup stream stream
+ * @{
+ * @ingroup PerformanceTest
+ */
+
+/**
+ * Benchmark functor for `hipMemPoolSetAccess`.
+ *
+ * A pool is created for device 0 outside the timed region; the `TIMED_SECTION`
+ * block (CPU timer type) encloses only the set-access call.
+ */
+class MemPoolSetAccessBenchmark : public Benchmark<MemPoolSetAccessBenchmark> {
+ public:
+  void operator()() {
+    hipMemPool_t mem_pool{nullptr};
+    hipMemPoolProps pool_props = CreateMemPoolProps(0, hipMemHandleTypeNone);
+    HIP_CHECK(hipMemPoolCreate(&mem_pool, &pool_props));
+
+    // Single access descriptor: device location with id 0, read-write protection.
+    hipMemAccessDesc desc_list = {
+      {
+        hipMemLocationTypeDevice,
+        0
+      },
+      hipMemAccessFlagsProtReadWrite
+    };
+
+    TIMED_SECTION(kTimerTypeCpu) {
+      // The trailing 1 is the number of descriptors passed.
+      HIP_CHECK(hipMemPoolSetAccess(mem_pool, &desc_list, 1));
+    }
+
+    HIP_CHECK(hipMemPoolDestroy(mem_pool));
+  }
+};
+
+// Builds a MemPoolSetAccessBenchmark and runs it; the benchmark takes no arguments.
+static void RunBenchmark() {
+  MemPoolSetAccessBenchmark set_access_bench;
+  set_access_bench.Run();
+}
+
+/**
+ * @warning **MemPool APIs are not fully implemented within the current version
+ *          of HIP and therefore they cannot be appropriately executed on AMD and NVIDIA platforms.
+ *          Therefore, all tests related to MemPool APIs are implemented without formal
+ *          verification and will be verified once HIP fully supports MemPool APIs.**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemPoolSetAccess` with `hipMemAccessFlagsProtReadWrite`.
+ * Test source
+ * ------------------------
+ *  - performance/stream/hipMemPoolSetAccess.cc
+ * Test requirements
+ * ------------------------
+ *  - Device supports memory pools
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemPoolSetAccess") {
+  if (AreMemPoolsSupported(0)) {
+    RunBenchmark();
+  } else {
+    // Device 0 reports no memory pool support: emit the skip marker instead of running.
+    HipTest::HIP_SKIP_TEST("GPU 0 doesn't support hipDeviceAttributeMemoryPoolsSupported "
+                           "attribute. Hence skipping the testing with Pass result.\n");
+  }
+}
@@ -0,0 +1,83 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "mem_pools_performance_common.hh"
+
+/**
+ * @addtogroup stream stream
+ * @{
+ * @ingroup PerformanceTest
+ */
+
+class MemPoolSetAttributeBenchmark : public Benchmark<MemPoolSetAttributeBenchmark> {
+ public:
+  // One iteration: create a pool, time a single hipMemPoolSetAttribute call that writes a
+  // zero value for `attribute`, then destroy the pool.
+  void operator()(const hipMemPoolAttr attribute) {
+    hipMemPool_t mem_pool{nullptr};
+    hipMemPoolProps pool_props = CreateMemPoolProps(0, hipMemHandleTypeNone);
+    HIP_CHECK(hipMemPoolCreate(&mem_pool, &pool_props));
+
+    // The release threshold is documented as a 64-bit unsigned value, while the reuse
+    // attributes are plain ints. Passing an int for the threshold would let the runtime
+    // read past the end of the object, so hand over storage of the matching width.
+    unsigned long long threshold_value{0};
+    static_assert(sizeof(threshold_value) == 8, "release threshold must be a 64-bit value");
+    int reuse_value{0};
+    void* value = (attribute == hipMemPoolAttrReleaseThreshold)
+        ? static_cast<void*>(&threshold_value)
+        : static_cast<void*>(&reuse_value);
+
+    TIMED_SECTION(kTimerTypeCpu) {
+      HIP_CHECK(hipMemPoolSetAttribute(mem_pool, attribute, value));
+    }
+
+    HIP_CHECK(hipMemPoolDestroy(mem_pool));
+  }
+};
+
+// Runs the hipMemPoolSetAttribute benchmark for one attribute, using the name returned by
+// GetMemPoolAttrSectionName(attribute) as the section name.
+static void RunBenchmark(const hipMemPoolAttr attribute) {
+  MemPoolSetAttributeBenchmark set_attribute_benchmark;
+  set_attribute_benchmark.AddSectionName(GetMemPoolAttrSectionName(attribute));
+  set_attribute_benchmark.Run(attribute);
+}
+
+/**
+ * @warning **MemPool APIs are not fully implemented within the current version
+ *          of HIP and therefore they cannot be appropriately executed on AMD and NVIDIA platforms.
+ *          Therefore, all tests related to MemPool APIs are implemented without formal
+ *          verification and will be verified once HIP fully supports MemPool APIs.**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemPoolSetAttribute`:
+ *    -# Supported attributes:
+ *      - `hipMemPoolAttrReleaseThreshold`
+ *      - `hipMemPoolReuseFollowEventDependencies`
+ *      - `hipMemPoolReuseAllowOpportunistic`
+ *      - `hipMemPoolReuseAllowInternalDependencies`
+ * Test source
+ * ------------------------
+ *  - performance/stream/hipMemPoolSetAttribute.cc
+ * Test requirements
+ * ------------------------
+ *  - Device supports memory pools
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemPoolSetAttribute") {
+  // Bail out with a skip message before generating any attribute when the
+  // AreMemPoolsSupported(0) check fails.
+  const bool pools_supported = AreMemPoolsSupported(0);
+  if (!pools_supported) {
+    HipTest::HIP_SKIP_TEST("GPU 0 doesn't support hipDeviceAttributeMemoryPoolsSupported "
+                           "attribute. Hence skipping the testing with Pass result.\n");
+    return;
+  }
+
+  // One benchmark run per generated attribute.
+  const hipMemPoolAttr attribute = GENERATE(hipMemPoolAttrReleaseThreshold,
+                                            hipMemPoolReuseFollowEventDependencies,
+                                            hipMemPoolReuseAllowOpportunistic,
+                                            hipMemPoolReuseAllowInternalDependencies);
+  RunBenchmark(attribute);
+}
@@ -0,0 +1,77 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "mem_pools_performance_common.hh"
+
+/**
+ * @addtogroup stream stream
+ * @{
+ * @ingroup PerformanceTest
+ */
+
+class MemPoolTrimToBenchmark : public Benchmark<MemPoolTrimToBenchmark> {
+ public:
+  // One iteration: create a pool, time a single hipMemPoolTrimTo call with the requested
+  // minimum byte count, then destroy the pool.
+  void operator()(const size_t min_bytes_to_hold) {
+    hipMemPoolProps props = CreateMemPoolProps(0, hipMemHandleTypeNone);
+    hipMemPool_t pool{nullptr};
+    HIP_CHECK(hipMemPoolCreate(&pool, &props));
+
+    TIMED_SECTION(kTimerTypeCpu) {
+      HIP_CHECK(hipMemPoolTrimTo(pool, min_bytes_to_hold));
+    }
+
+    HIP_CHECK(hipMemPoolDestroy(pool));
+  }
+};
+
+// Runs the hipMemPoolTrimTo benchmark for one byte count; the decimal byte count is used
+// as the section name.
+static void RunBenchmark(const size_t min_bytes_to_hold) {
+  MemPoolTrimToBenchmark trim_benchmark;
+  trim_benchmark.AddSectionName(std::to_string(min_bytes_to_hold));
+  trim_benchmark.Run(min_bytes_to_hold);
+}
+
+/**
+ * @warning **MemPool APIs are not fully implemented within the current version
+ *          of HIP and therefore they cannot be appropriately executed on AMD and NVIDIA platforms.
+ *          Therefore, all tests related to MemPool APIs are implemented without formal
+ *          verification and will be verified once HIP fully supports MemPool APIs.**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemPoolTrimTo`:
+ *    -# Minimum bytes to hold:
+ *      - 4 KB
+ *      - 4 MB
+ *      - 16 MB
+ * Test source
+ * ------------------------
+ *  - performance/stream/hipMemPoolTrimTo.cc
+ * Test requirements
+ * ------------------------
+ *  - Device supports memory pools
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemPoolTrimTo") {
+  // Bail out with a skip message before generating any size when the
+  // AreMemPoolsSupported(0) check fails.
+  const bool pools_supported = AreMemPoolsSupported(0);
+  if (!pools_supported) {
+    HipTest::HIP_SKIP_TEST("GPU 0 doesn't support hipDeviceAttributeMemoryPoolsSupported "
+                           "attribute. Hence skipping the testing with Pass result.\n");
+    return;
+  }
+
+  // One benchmark run per generated byte count.
+  const size_t min_bytes_to_hold = GENERATE(4_KB, 4_MB, 16_MB);
+  RunBenchmark(min_bytes_to_hold);
+}
@@ -0,0 +1,61 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_common.hh>
+#include <performance_common.hh>
+
+/**
+ * @addtogroup stream stream
+ * @{
+ * @ingroup PerformanceTest
+ */
+
+// No-op stream callback handed to hipStreamAddCallback; every argument is ignored.
+void Callback(hipStream_t stream, hipError_t status, void* user_data) {
+  static_cast<void>(stream);
+  static_cast<void>(status);
+  static_cast<void>(user_data);
+}
+
+class StreamAddCallbackBenchmark : public Benchmark<StreamAddCallbackBenchmark> {
+ public:
+  // One iteration: obtain a stream from a StreamGuard built with Streams::created and time
+  // a single hipStreamAddCallback call that registers the no-op Callback on it.
+  void operator()() {
+    const StreamGuard guard{Streams::created};
+    const hipStream_t guarded_stream = guard.stream();
+
+    TIMED_SECTION(kTimerTypeCpu) {
+      HIP_CHECK(hipStreamAddCallback(guarded_stream, Callback, nullptr, 0));
+    }
+  }
+};
+
+// Builds the hipStreamAddCallback benchmark and runs it without any extra section name.
+static void RunBenchmark() {
+  StreamAddCallbackBenchmark add_callback_benchmark;
+  add_callback_benchmark.Run();
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipStreamAddCallback` on the created stream.
+ * Test source
+ * ------------------------
+ *  - performance/stream/hipStreamAddCallback.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipStreamAddCallback") {
+  RunBenchmark();  // single configuration; no generators or skip conditions
+}
@@ -0,0 +1,269 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_common.hh>
+#include <performance_common.hh>
+#include <resource_guards.hh>
+
+/**
+ * @addtogroup stream stream
+ * @{
+ * @ingroup PerformanceTest
+ * Contains performance tests for all hipStream related APIs
+ */
+ 
+class HipDeviceGetStreamPriorityRangeBenchmark
+    : public Benchmark<HipDeviceGetStreamPriorityRangeBenchmark> {
+ public:
+  // One iteration: time a single hipDeviceGetStreamPriorityRange call.
+  // The two returned values are not inspected.
+  void operator()() {
+    int range_min;
+    int range_max;
+    TIMED_SECTION(kTimerTypeCpu) {
+      HIP_CHECK(hipDeviceGetStreamPriorityRange(&range_min, &range_max));
+    }
+  }
+};
+
+class HipStreamQueryBenchmark : public Benchmark<HipStreamQueryBenchmark> {
+ public:
+  // One iteration: create a stream, optionally enqueue an async allocation on it, time a
+  // single hipStreamQuery call, then release the allocation (if any) and the stream.
+  //
+  // perform_work - when true, hipMallocAsync is enqueued before the timed query.
+  void operator()(bool perform_work) {
+    hipStream_t stream;
+    HIP_CHECK(hipStreamCreate(&stream));
+    void* dptr{nullptr};
+
+    if (perform_work) {
+      HIP_CHECK(hipMallocAsync(&dptr, 2048 * 4, stream));
+    }
+
+    hipError_t error{hipSuccess};
+    TIMED_SECTION(kTimerTypeCpu) { error = hipStreamQuery(stream); }
+
+    // The query result was previously discarded. hipErrorNotReady only means work is
+    // still pending on the stream; any other non-success status is a real failure.
+    REQUIRE((error == hipSuccess || error == hipErrorNotReady));
+
+    if (perform_work) {
+      HIP_CHECK(hipFreeAsync(dptr, stream));
+      HIP_CHECK(hipStreamSynchronize(stream));
+    }
+
+    HIP_CHECK(hipStreamDestroy(stream));
+  }
+};
+
+class HipStreamSynchronizeBenchmark : public Benchmark<HipStreamSynchronizeBenchmark> {
+ public:
+  // One iteration: create a stream, time a single hipStreamSynchronize call on it, then
+  // destroy the stream.
+  void operator()() {
+    hipStream_t stream;
+    HIP_CHECK(hipStreamCreate(&stream));
+
+    // The status was previously stored and never examined; check it the same way the
+    // other benchmarks in this file check their timed calls.
+    TIMED_SECTION(kTimerTypeCpu) { HIP_CHECK(hipStreamSynchronize(stream)); }
+
+    HIP_CHECK(hipStreamDestroy(stream));
+  }
+};
+
+class HipStreamDestroyBenchmark : public Benchmark<HipStreamDestroyBenchmark> {
+ public:
+  // One iteration: create a stream outside the timed section, then time its destruction.
+  void operator()() {
+    hipStream_t doomed_stream;
+    HIP_CHECK(hipStreamCreate(&doomed_stream));
+
+    TIMED_SECTION(kTimerTypeCpu) {
+      HIP_CHECK(hipStreamDestroy(doomed_stream));
+    }
+  }
+};
+
+class HipStreamCreateBenchmark : public Benchmark<HipStreamCreateBenchmark> {
+ public:
+  // One iteration: time a single hipStreamCreate call, then destroy the new stream
+  // outside the timed section.
+  void operator()() {
+    hipStream_t new_stream;
+
+    TIMED_SECTION(kTimerTypeCpu) {
+      HIP_CHECK(hipStreamCreate(&new_stream));
+    }
+
+    HIP_CHECK(hipStreamDestroy(new_stream));
+  }
+};
+
+class HipStreamCreateWithPriorityBenchmark
+    : public Benchmark<HipStreamCreateWithPriorityBenchmark> {
+ public:
+  // One iteration: query the stream priority range, time a single
+  // hipStreamCreateWithPriority call using the midpoint of that range, then destroy
+  // the stream.
+  void operator()(unsigned int flag) {
+    int range_min;
+    int range_max;
+    HIP_CHECK(hipDeviceGetStreamPriorityRange(&range_min, &range_max));
+
+    // Integer midpoint of the two reported bounds.
+    const int midpoint = (range_max + range_min) / 2;
+
+    hipStream_t new_stream;
+    TIMED_SECTION(kTimerTypeCpu) {
+      HIP_CHECK(hipStreamCreateWithPriority(&new_stream, flag, midpoint));
+    }
+
+    HIP_CHECK(hipStreamDestroy(new_stream));
+  }
+};
+
+
+
+// Maps a stream creation flag to the text used as a benchmark section name.
+// Any value other than the two named flags yields "flag combination".
+static std::string GetStreamCreateFlagName(unsigned flag) {
+  if (flag == hipStreamDefault) {
+    return "hipStreamDefault";
+  }
+  if (flag == hipStreamNonBlocking) {
+    return "hipStreamNonBlocking";
+  }
+  return "flag combination";
+}
+
+class HipStreamCreateWithFlagsBenchmark : public Benchmark<HipStreamCreateWithFlagsBenchmark> {
+ public:
+  // One iteration: time a single hipStreamCreateWithFlags call with the given flag, then
+  // destroy the new stream outside the timed section.
+  void operator()(unsigned int flag) {
+    hipStream_t new_stream;
+
+    TIMED_SECTION(kTimerTypeCpu) {
+      HIP_CHECK(hipStreamCreateWithFlags(&new_stream, flag));
+    }
+
+    HIP_CHECK(hipStreamDestroy(new_stream));
+  }
+};
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipStreamCreate`:
+ * Test source
+ * ------------------------
+ *  - performance/stream/hipStreamBasic.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipStreamCreate") {
+  // Single configuration: no section name, no generators.
+  HipStreamCreateBenchmark create_benchmark;
+  create_benchmark.Run();
+}
+
+// Runs the hipStreamCreateWithFlags benchmark for one flag, labelling the section with
+// the flag's name.
+static void RunBenchmark(unsigned flag) {
+  HipStreamCreateWithFlagsBenchmark with_flags_benchmark;
+  with_flags_benchmark.AddSectionName(GetStreamCreateFlagName(flag));
+  with_flags_benchmark.Run(flag);
+}
+
+// Runs the hipStreamCreateWithPriority benchmark for one flag, labelling the section with
+// the flag's name.
+static void RunBenchmarkWithPriority(unsigned flag) {
+  HipStreamCreateWithPriorityBenchmark with_priority_benchmark;
+  with_priority_benchmark.AddSectionName(GetStreamCreateFlagName(flag));
+  with_priority_benchmark.Run(flag);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipStreamCreateWithFlags` with all flags:
+ *    -# Flags
+ *      - hipStreamDefault
+ *      - hipStreamNonBlocking
+ * Test source
+ * ------------------------
+ *  - performance/stream/hipStreamBasic.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipStreamCreateWithFlags") {
+  // One benchmark run per generated creation flag.
+  const unsigned creation_flag = GENERATE(hipStreamDefault, hipStreamNonBlocking);
+  RunBenchmark(creation_flag);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipStreamCreateWithPriority` with all flags:
+ *    -# Flags
+ *      - hipStreamDefault
+ *      - hipStreamNonBlocking
+ * Test source
+ * ------------------------
+ *  - performance/stream/hipStreamBasic.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipStreamCreateWithPriority") {
+  // One benchmark run per generated creation flag.
+  const unsigned creation_flag = GENERATE(hipStreamDefault, hipStreamNonBlocking);
+  RunBenchmarkWithPriority(creation_flag);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipStreamDestroy`:
+ * Test source
+ * ------------------------
+ *  - performance/stream/hipStreamBasic.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipStreamDestroy") {
+  // Single configuration: no section name, no generators.
+  HipStreamDestroyBenchmark destroy_benchmark;
+  destroy_benchmark.Run();
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipDeviceGetStreamPriorityRange`:
+ * Test source
+ * ------------------------
+ *  - performance/stream/hipStreamBasic.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipDeviceGetStreamPriorityRange") {
+  // Single configuration: no section name, no generators.
+  HipDeviceGetStreamPriorityRangeBenchmark priority_range_benchmark;
+  priority_range_benchmark.Run();
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipStreamQuery`:
+ * Test source
+ * ------------------------
+ *  - performance/stream/hipStreamBasic.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipStreamQuery") {
+  // Two runs: one with work enqueued before the query, one without.
+  const auto perform_work = GENERATE(true, false);
+  const std::string section_name = perform_work ? "stream with work" : "stream without work";
+
+  HipStreamQueryBenchmark query_benchmark;
+  query_benchmark.AddSectionName(section_name);
+  query_benchmark.Run(perform_work);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipStreamSynchronize`:
+ * Test source
+ * ------------------------
+ *  - performance/stream/hipStreamBasic.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipStreamSynchronize") {
+  // Single configuration: no section name, no generators.
+  HipStreamSynchronizeBenchmark synchronize_benchmark;
+  synchronize_benchmark.Run();
+}
@@ -0,0 +1,75 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_common.hh>
+#include <performance_common.hh>
+
+/**
+ * @addtogroup stream stream
+ * @{
+ * @ingroup PerformanceTest
+ */
+
+class StreamGetFlagsBenchmark : public Benchmark<StreamGetFlagsBenchmark> {
+ public:
+  // One iteration: create a stream with the given flag, time a single hipStreamGetFlags
+  // call on it, then destroy the stream. The returned flags are not compared with
+  // expected_flag.
+  void operator()(unsigned int expected_flag) {
+    hipStream_t flagged_stream;
+    HIP_CHECK(hipStreamCreateWithFlags(&flagged_stream, expected_flag));
+
+    unsigned int queried_flags{};
+    TIMED_SECTION(kTimerTypeCpu) {
+      HIP_CHECK(hipStreamGetFlags(flagged_stream, &queried_flags));
+    }
+
+    HIP_CHECK(hipStreamDestroy(flagged_stream));
+  }
+};
+
+// Runs the hipStreamGetFlags benchmark for one flag. The section name is the flag's
+// identifier, or "unknown flag type" for any other value.
+static void RunBenchmark(unsigned int expected_flag) {
+  StreamGetFlagsBenchmark get_flags_benchmark;
+  if (expected_flag == hipStreamDefault) {
+    get_flags_benchmark.AddSectionName("hipStreamDefault");
+  } else if (expected_flag == hipStreamNonBlocking) {
+    get_flags_benchmark.AddSectionName("hipStreamNonBlocking");
+  } else {
+    get_flags_benchmark.AddSectionName("unknown flag type");
+  }
+  get_flags_benchmark.Run(expected_flag);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipStreamGetFlags`:
+ *    -# Flags:
+ *      - `hipStreamDefault`
+ *      - `hipStreamNonBlocking`
+ * Test source
+ * ------------------------
+ *  - performance/stream/hipStreamGetFlags.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipStreamGetFlags") {
+  // One benchmark run per generated creation flag.
+  const unsigned int expected_flag = GENERATE(hipStreamDefault, hipStreamNonBlocking);
+  RunBenchmark(expected_flag);
+}
@@ -0,0 +1,74 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_common.hh>
+#include <performance_common.hh>
+
+/**
+ * @addtogroup stream stream
+ * @{
+ * @ingroup PerformanceTest
+ */
+
+class StreamGetPriorityBenchmark : public Benchmark<StreamGetPriorityBenchmark> {
+ public:
+  // One iteration: obtain a stream of the requested kind from a StreamGuard and time a
+  // single hipStreamGetPriority call on it. The returned priority is not inspected.
+  void operator()(Streams stream_type) {
+    const StreamGuard guard{stream_type};
+    const hipStream_t guarded_stream = guard.stream();
+    int queried_priority{};
+
+    TIMED_SECTION(kTimerTypeCpu) {
+      HIP_CHECK(hipStreamGetPriority(guarded_stream, &queried_priority));
+    }
+  }
+};
+
+// Runs the hipStreamGetPriority benchmark for one stream kind. Any kind other than
+// nullstream or created is labelled "per thread stream".
+static void RunBenchmark(Streams stream_type) {
+  StreamGetPriorityBenchmark get_priority_benchmark;
+  if (stream_type == Streams::nullstream) {
+    get_priority_benchmark.AddSectionName("null stream");
+  } else if (stream_type == Streams::created) {
+    get_priority_benchmark.AddSectionName("created");
+  } else {
+    get_priority_benchmark.AddSectionName("per thread stream");
+  }
+  get_priority_benchmark.Run(stream_type);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipStreamGetPriority`:
+ *    -# Stream types:
+ *      - `null`
+ *      - created
+ * Test source
+ * ------------------------
+ *  - performance/stream/hipStreamGetPriority.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipStreamGetPriority") {
+  // One benchmark run per generated stream kind.
+  const Streams stream_type = GENERATE(Streams::nullstream, Streams::created);
+  RunBenchmark(stream_type);
+}
@@ -0,0 +1,80 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_common.hh>
+#include <performance_common.hh>
+
+/**
+ * @addtogroup stream stream
+ * @{
+ * @ingroup PerformanceTest
+ */
+
+class StreamWaitEventBenchmark : public Benchmark<StreamWaitEventBenchmark> {
+ public:
+  // One iteration: record a fresh event on the stream, then time hipStreamWaitEvent on
+  // that event together with the following hipStreamSynchronize. The event is destroyed
+  // outside the timed section.
+  void operator()(Streams stream_type) {
+    const StreamGuard guard{stream_type};
+    const hipStream_t guarded_stream = guard.stream();
+
+    hipEvent_t recorded_event{nullptr};
+    HIP_CHECK(hipEventCreate(&recorded_event));
+    REQUIRE(recorded_event != nullptr);
+    HIP_CHECK(hipEventRecord(recorded_event, guarded_stream));
+
+    TIMED_SECTION(kTimerTypeCpu) {
+      HIP_CHECK(hipStreamWaitEvent(guarded_stream, recorded_event, 0));
+      HIP_CHECK(hipStreamSynchronize(guarded_stream));
+    }
+
+    HIP_CHECK(hipEventDestroy(recorded_event));
+  }
+};
+
+// Runs the hipStreamWaitEvent benchmark for one stream kind. Any kind other than
+// nullstream or created is labelled "per thread stream".
+static void RunBenchmark(Streams stream_type) {
+  StreamWaitEventBenchmark wait_event_benchmark{};
+  if (stream_type == Streams::nullstream) {
+    wait_event_benchmark.AddSectionName("null stream");
+  } else if (stream_type == Streams::created) {
+    wait_event_benchmark.AddSectionName("created");
+  } else {
+    wait_event_benchmark.AddSectionName("per thread stream");
+  }
+  wait_event_benchmark.Run(stream_type);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipStreamWaitEvent`:
+ *    -# Stream types:
+ *      - `null`
+ *      - created
+ * Test source
+ * ------------------------
+ *  - performance/stream/hipStreamWaitEvent.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipStreamWaitEvent") {
+  // One benchmark run per generated stream kind.
+  const Streams stream_type = GENERATE(Streams::nullstream, Streams::created);
+  RunBenchmark(stream_type);
+}
@@ -0,0 +1,172 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_common.hh>
+#include <performance_common.hh>
+
+/**
+ * @addtogroup stream stream
+ * @{
+ * @ingroup PerformanceTest
+ */
+
+// Returns the value of the stream-wait-value capability attribute for device_id.
+// The result stays 0 unless the attribute query overwrites it.
+static int IsStreamWaitValueSupported(int device_id) {
+  int supported = 0;
+#if HT_AMD
+  HIP_CHECK(
+      hipDeviceGetAttribute(&supported, hipDeviceAttributeCanUseStreamWaitValue, device_id));
+#else
+  // The status returned by this driver query is not inspected.
+  cuDeviceGetAttribute(&supported, CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS,
+                       device_id);
+#endif
+  return supported;
+}
+
+class StreamWaitValue32Benchmark : public Benchmark<StreamWaitValue32Benchmark> {
+ public:
+  // One iteration: allocate and memset a device buffer of array_size uint32_t elements,
+  // time a single hipStreamWaitValue32 call against it, then free the buffer.
+  // The value used for both the memset and the wait is 1 for hipStreamWaitValueAnd and
+  // 0 for every other flag.
+  void operator()(const size_t array_size, unsigned int flag) {
+    const StreamGuard guard{Streams::created};
+    const hipStream_t guarded_stream = guard.stream();
+
+    const uint32_t value = (flag == hipStreamWaitValueAnd) ? 1 : 0;
+    const size_t buffer_bytes = sizeof(uint32_t) * array_size;
+
+    uint32_t* value_ptr;
+    HIP_CHECK(hipMalloc(&value_ptr, buffer_bytes));
+    HIP_CHECK(hipMemset(value_ptr, value, buffer_bytes));
+
+    TIMED_SECTION(kTimerTypeCpu) {
+      HIP_CHECK(hipStreamWaitValue32(guarded_stream, value_ptr, value, flag));
+    }
+
+    HIP_CHECK(hipFree(value_ptr));
+  }
+};
+
+class StreamWaitValue64Benchmark : public Benchmark<StreamWaitValue64Benchmark> {
+ public:
+  // One iteration: allocate and memset a device buffer of array_size uint64_t elements,
+  // time a single hipStreamWaitValue64 call against it, then free the buffer.
+  // The value used for both the memset and the wait is 1 for hipStreamWaitValueAnd and
+  // 0 for every other flag.
+  void operator()(const size_t array_size, unsigned int flag) {
+    const StreamGuard guard{Streams::created};
+    const hipStream_t guarded_stream = guard.stream();
+
+    const uint64_t value = (flag == hipStreamWaitValueAnd) ? 1 : 0;
+    const size_t buffer_bytes = sizeof(uint64_t) * array_size;
+
+    uint64_t* value_ptr;
+    HIP_CHECK(hipMalloc(&value_ptr, buffer_bytes));
+    HIP_CHECK(hipMemset(value_ptr, value, buffer_bytes));
+
+    TIMED_SECTION(kTimerTypeCpu) {
+      HIP_CHECK(hipStreamWaitValue64(guarded_stream, value_ptr, value, flag));
+    }
+
+    HIP_CHECK(hipFree(value_ptr));
+  }
+};
+
+// Runs WaitValueBenchmark for one (array_size, flag) pair. Two section names are added in
+// order: the decimal array size, then a description of the wait flag ("unknown flag" for
+// any value outside the four named flags).
+template <typename WaitValueBenchmark>
+static void RunBenchmark(const size_t array_size, unsigned int flag) {
+  WaitValueBenchmark wait_value_benchmark;
+  wait_value_benchmark.AddSectionName(std::to_string(array_size));
+
+  if (flag == hipStreamWaitValueGte) {
+    wait_value_benchmark.AddSectionName("greater than or equal");
+  } else if (flag == hipStreamWaitValueEq) {
+    wait_value_benchmark.AddSectionName("equal");
+  } else if (flag == hipStreamWaitValueAnd) {
+    wait_value_benchmark.AddSectionName("logical and");
+  } else if (flag == hipStreamWaitValueNor) {
+    wait_value_benchmark.AddSectionName("logical nor");
+  } else {
+    wait_value_benchmark.AddSectionName("unknown flag");
+  }
+
+  wait_value_benchmark.Run(array_size, flag);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipStreamWaitValue32` for different array sizes:
+ *    -# 4 KB
+ *    -# 4 MB
+ *    -# 16 MB
+ *  - Uses different flag types for wait criteria:
+ *    -# Greater than or equal
+ *    -# Equal
+ *    -# Logical AND
+ *    -# Logical NOR
+ * Test source
+ * ------------------------
+ *  - performance/stream/hipStreamWaitValue.cc
+ * Test requirements
+ * ------------------------
+ *  - Device supports Stream Wait Value operations
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipStreamWaitValue32") {
+// The whole body is compiled only when HT_AMD is set; otherwise the test case is empty.
+#if HT_AMD
+  const int wait_value_supported = IsStreamWaitValueSupported(0);
+  if (!wait_value_supported) {
+    HipTest::HIP_SKIP_TEST(
+        "GPU 0 doesn't support hipStreamWaitValue32() function. "
+        "Hence skipping the testing with Pass result.\n");
+    return;
+  }
+
+  // One benchmark run per (array size, wait flag) combination.
+  const size_t array_size = GENERATE(4_KB, 4_MB, 16_MB);
+  const unsigned int flag = GENERATE(hipStreamWaitValueGte, hipStreamWaitValueEq,
+                                     hipStreamWaitValueAnd, hipStreamWaitValueNor);
+  RunBenchmark<StreamWaitValue32Benchmark>(array_size, flag);
+#endif
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipStreamWaitValue64`:
+ *    -# Allocation size:
+ *      - 4 KB
+ *      - 4 MB
+ *      - 16 MB
+ *    -# Wait type:
+ *      - Greater than or equal
+ *      - Equal
+ *      - Logical AND
+ *      - Logical NOR
+ * Test source
+ * ------------------------
+ *  - performance/stream/hipStreamWaitValue.cc
+ * Test requirements
+ * ------------------------
+ *  - Device supports Stream Wait Value operations
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipStreamWaitValue64") {
+  // Bail out with a skip message before generating any parameters when
+  // IsStreamWaitValueSupported(0) returns 0.
+  const int wait_value_supported = IsStreamWaitValueSupported(0);
+  if (!wait_value_supported) {
+    HipTest::HIP_SKIP_TEST(
+        "GPU 0 doesn't support hipStreamWaitValue64() function. "
+        "Hence skipping the testing with Pass result.\n");
+    return;
+  }
+
+  // One benchmark run per (array size, wait flag) combination.
+  const size_t array_size = GENERATE(4_KB, 4_MB, 16_MB);
+  const unsigned int flag = GENERATE(hipStreamWaitValueGte, hipStreamWaitValueEq,
+                                     hipStreamWaitValueAnd, hipStreamWaitValueNor);
+  RunBenchmark<StreamWaitValue64Benchmark>(array_size, flag);
+}
@@ -0,0 +1,123 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_common.hh>
+#include <performance_common.hh>
+
+/**
+ * @addtogroup stream stream
+ * @{
+ * @ingroup PerformanceTest
+ */
+
+#if HT_NVIDIA
+/**
+ * Reads CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS for the given device.
+ *
+ * @param device_id device handed to cuDeviceGetAttribute
+ * @return the attribute value written by the driver, or 0 if it was left untouched
+ */
+static int IsStreamWriteValueSupported(int device_id) {
+  // Stays 0 unless the driver query below overwrites it.
+  int supports_64bit_mem_ops{0};
+  cuDeviceGetAttribute(&supports_64bit_mem_ops,
+                       CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS, device_id);
+  return supports_64bit_mem_ops;
+}
+#endif
+
+/**
+ * Benchmark functor that times a single `hipStreamWriteValue32` call on a created stream.
+ */
+class StreamWriteValue32Benchmark : public Benchmark<StreamWriteValue32Benchmark> {
+ public:
+  /**
+   * Allocates and zero-fills `array_size` 32-bit elements with hipMalloc/hipMemset, times the
+   * write-value call against the start of that buffer, then frees the buffer.
+   */
+  void operator()(const size_t array_size) {
+    const StreamGuard guard{Streams::created};
+    const hipStream_t write_stream = guard.stream();
+
+    const size_t buffer_bytes = sizeof(uint32_t) * array_size;
+    uint32_t* device_buffer;
+    uint32_t write_value{0};
+    HIP_CHECK(hipMalloc(&device_buffer, buffer_bytes));
+    HIP_CHECK(hipMemset(device_buffer, write_value, buffer_bytes));
+
+    // Only the write-value call itself sits inside the CPU-timed section.
+    TIMED_SECTION(kTimerTypeCpu) {
+      HIP_CHECK(hipStreamWriteValue32(write_stream, device_buffer, write_value, 0));
+    }
+    HIP_CHECK(hipFree(device_buffer));
+  }
+};
+
+/**
+ * Benchmark functor that times a single `hipStreamWriteValue64` call on a created stream.
+ */
+class StreamWriteValue64Benchmark : public Benchmark<StreamWriteValue64Benchmark> {
+ public:
+  /**
+   * Allocates and zero-fills `array_size` 64-bit elements with hipMalloc/hipMemset, times the
+   * write-value call against the start of that buffer, then frees the buffer.
+   */
+  void operator()(const size_t array_size) {
+    const StreamGuard guard{Streams::created};
+    const hipStream_t write_stream = guard.stream();
+
+    const size_t buffer_bytes = sizeof(uint64_t) * array_size;
+    uint64_t* device_buffer;
+    uint64_t write_value{0};
+    HIP_CHECK(hipMalloc(&device_buffer, buffer_bytes));
+    HIP_CHECK(hipMemset(device_buffer, write_value, buffer_bytes));
+
+    // Only the write-value call itself sits inside the CPU-timed section.
+    TIMED_SECTION(kTimerTypeCpu) {
+      HIP_CHECK(hipStreamWriteValue64(write_stream, device_buffer, write_value, 0));
+    }
+    HIP_CHECK(hipFree(device_buffer));
+  }
+};
+
+/**
+ * Instantiates the requested benchmark type, labels it with the array size and runs it.
+ *
+ * @param array_size value used both as the section name and as the argument passed to Run()
+ */
+template <typename WriteValueBenchmark> static void RunBenchmark(const size_t array_size) {
+  WriteValueBenchmark write_benchmark;
+  // The stringified size is what identifies this run in the benchmark's section name.
+  write_benchmark.AddSectionName(std::to_string(array_size));
+  write_benchmark.Run(array_size);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipStreamWriteValue32`:
+ *    -# Allocation size:
+ *      - 4 KB
+ *      - 4 MB
+ *      - 16 MB
+ * Test source
+ * ------------------------
+ *  - performance/stream/hipStreamWriteValue.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipStreamWriteValue32") {
+#if HT_AMD
+  // The 32-bit benchmark body exists only in HT_AMD builds; elsewhere the test is empty.
+  const size_t allocation_size = GENERATE(4_KB, 4_MB, 16_MB);
+  RunBenchmark<StreamWriteValue32Benchmark>(allocation_size);
+#endif
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipStreamWriteValue64`:
+ *    -# Allocation size:
+ *      - 4 KB
+ *      - 4 MB
+ *      - 16 MB
+ * Test source
+ * ------------------------
+ *  - performance/stream/hipStreamWriteValue.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipStreamWriteValue64") {
+#if HT_NVIDIA
+  // Only HT_NVIDIA builds query the driver attribute for device 0 before benchmarking.
+  const auto write_value_supported = IsStreamWriteValueSupported(0);
+  if (!write_value_supported) {
+    HipTest::HIP_SKIP_TEST(
+        "GPU 0 doesn't support hipStreamWriteValue64() function. "
+        "Hence skipping the testing with Pass result.\n");
+    return;
+  }
+#endif
+  const size_t allocation_size = GENERATE(4_KB, 4_MB, 16_MB);
+  RunBenchmark<StreamWriteValue64Benchmark>(allocation_size);
+}
@@ -0,0 +1,74 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#include <hip_test_common.hh>
+#include <performance_common.hh>
+
+#if __linux__  // Linux builds select the POSIX file-descriptor handle type
+  static const hipMemAllocationHandleType kHandleType = hipMemHandleTypePosixFileDescriptor;
+#else  // every other platform selects the Win32 handle type
+  static const hipMemAllocationHandleType kHandleType = hipMemHandleTypeWin32;
+#endif
+
+/**
+ * Queries hipDeviceAttributeMemoryPoolsSupported for one device.
+ *
+ * @param device_id ordinal of the device to query
+ * @return the attribute value reported by hipDeviceGetAttribute (0 when it was not updated)
+ */
+static int AreMemPoolsSupported(int device_id) {
+  int mem_pools_supported = 0;
+  // Query the device the caller asked about; previously device 0 was always queried and the
+  // device_id argument was silently ignored.
+  HIP_CHECK(hipDeviceGetAttribute(&mem_pools_supported,
+                                  hipDeviceAttributeMemoryPoolsSupported, device_id));
+  return mem_pools_supported;
+}
+
+/**
+ * Builds pool properties for pinned allocations located on a device.
+ *
+ * @param device_id   id stored in the location of the returned properties
+ * @param handle_type handle type stored in the returned properties
+ * @return properties with all remaining members zero-initialized
+ */
+static hipMemPoolProps CreateMemPoolProps(const int device_id,
+                                          const hipMemAllocationHandleType handle_type) {
+  // Value-initialization zeroes the security-attributes pointer and the reserved bytes.
+  hipMemPoolProps pool_props{};
+  pool_props.allocType = hipMemAllocationTypePinned;
+  pool_props.handleTypes = handle_type;
+  pool_props.location.type = hipMemLocationTypeDevice;
+  pool_props.location.id = device_id;
+  return pool_props;
+}
+
+/**
+ * Maps a memory pool attribute to the section name used for it.
+ *
+ * @param attribute attribute to name
+ * @return the matching label, or "unknown attribute" when the value is not listed
+ */
+static std::string GetMemPoolAttrSectionName(const hipMemPoolAttr attribute) {
+  struct AttrLabel {
+    hipMemPoolAttr attr;
+    const char* label;
+  };
+  static const AttrLabel kAttrLabels[] = {
+      {hipMemPoolReuseFollowEventDependencies, "ReuseFollowEventDependencies"},
+      {hipMemPoolReuseAllowOpportunistic, "ReuseAllowOpportunistic"},
+      {hipMemPoolReuseAllowInternalDependencies, "ReuseAllowInternalDependencies"},
+      {hipMemPoolAttrReleaseThreshold, "AttrReleaseThreshold"},
+      {hipMemPoolAttrReservedMemCurrent, "AttrReservedMemCurrent"},
+      {hipMemPoolAttrReservedMemHigh, "AttrReservedMemHigh"},
+      {hipMemPoolAttrUsedMemCurrent, "AttrUsedMemCurrent"},
+      {hipMemPoolAttrUsedMemHigh, "AttrUsedMemHigh"},
+  };
+
+  for (const auto& entry : kAttrLabels) {
+    if (entry.attr == attribute) {
+      return entry.label;
+    }
+  }
+  return "unknown attribute";
+}
@@ -36,11 +36,14 @@ add_subdirectory(compiler)
 add_subdirectory(errorHandling)
 add_subdirectory(cooperativeGrps)
 add_subdirectory(context)
+add_subdirectory(warp)
 add_subdirectory(dynamicLoading)
 add_subdirectory(g++)
 add_subdirectory(module)
 add_subdirectory(channelDescriptor)
 add_subdirectory(executionControl)
+add_subdirectory(vector_types)
+add_subdirectory(atomics)
 add_subdirectory(p2p)
 add_subdirectory(gcc)

@@ -49,5 +52,5 @@ add_subdirectory(callback)
 add_subdirectory(clock)
 # Vulkan interop APIs currently undefined for Nvidia
 add_subdirectory(vulkan_interop)
+add_subdirectory(gl_interop) # Disabled on NVIDIA due to defect - EXSWHTEC-246
 endif()
-add_subdirectory(vector_types)
@@ -0,0 +1,48 @@
+# Copyright (c) 2023 Advanced Micro Devices, Inc. All Rights Reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+set(TEST_SRC  # sources compiled into the AtomicsTest executable
+    atomicExch.cc
+    atomicExch_system.cc
+)
+
+if(HIP_PLATFORM MATCHES "nvidia")  # NVIDIA: per-file -rdc/-gencode flags, links nvrtc
+    set_source_files_properties(atomicExch_system.cc PROPERTIES COMPILE_FLAGS "-rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80")
+    hip_add_exe_to_target(NAME AtomicsTest
+                        TEST_SRC ${TEST_SRC}
+                        TEST_TARGET_NAME build_tests
+                        LINKER_LIBS "nvrtc -rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80")
+elseif(HIP_PLATFORM MATCHES "amd")  # AMD: same target, links hiprtc
+    hip_add_exe_to_target(NAME AtomicsTest
+                        TEST_SRC ${TEST_SRC}
+                        TEST_TARGET_NAME build_tests
+                        LINKER_LIBS hiprtc)
+endif()
+
+# SWDEV-435667: Below 2 tests failed in stress test on 01/12/23
+#add_test(NAME Unit_atomicExch_Negative_Parameters
+#         COMMAND python3 ${CMAKE_CURRENT_SOURCE_DIR}/../compileAndCaptureOutput.py
+#         ${CMAKE_CURRENT_SOURCE_DIR} ${HIP_PLATFORM} ${HIP_PATH}
+#         atomicExch_negative_kernels.cc 40)
+#
+#add_test(NAME Unit_atomicExch_system_Negative_Parameters
+#         COMMAND python3 ${CMAKE_CURRENT_SOURCE_DIR}/../compileAndCaptureOutput.py
+#         ${CMAKE_CURRENT_SOURCE_DIR} ${HIP_PLATFORM} ${HIP_PATH}
+#         atomicExch_system_negative_kernels.cc 40)
@@ -0,0 +1,213 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "atomicExch_common.hh"
+#include "atomicExch_negative_kernels_rtc.hh"
+
+/**
+ * @addtogroup atomicExch atomicExch
+ * @{
+ * @ingroup AtomicsTest
+ */
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Executes a kernel wherein all threads will perform an atomic exchange in the same(compile
+ * time deducible) memory location. Each thread will exchange its own grid wide linear index + 1
+ * into the memory location, storing the return value into a separate output array slot
+ * corresponding to it. Once complete, the union of output array and exchange memory is validated to
+ * contain all values in the range [0, number_of_threads].
+ *
+ *    - The test is run for:
+ *      - All overloads of atomicExch
+ *      - hipMalloc, hipMallocManaged, hipHostMalloc and hipHostRegister allocated exchange memory
+ *      - Exchange memory located in shared memory
+ *      - Several grid and block dimension combinations(only one block is used for shared memory)
+ * Test source
+ * ------------------------
+ *    - unit/atomics/atomicExch.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+#if HT_NVIDIA  // HT_NVIDIA builds instantiate the shorter type list
+TEMPLATE_TEST_CASE("Unit_atomicExch_Positive_Same_Address_Compile_Time", "", int, unsigned int,
+                   unsigned long long, float) {
+#else  // other builds additionally cover unsigned long and double
+TEMPLATE_TEST_CASE("Unit_atomicExch_Positive_Same_Address_Compile_Time", "", int, unsigned int,
+                   unsigned long, unsigned long long, float, double) {
+#endif // HT_NVIDIA
+  for (auto current = 0; current < cmd_options.iterations; ++current) {  // repeat cmd_options.iterations times
+    AtomicExchSameAddressTest<TestType, AtomicScopes::device>();
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Executes a single kernel on a single device wherein all threads will perform an atomic
+ * exchange into a runtime determined memory location. Each thread will exchange its own grid wide
+ * linear index + offset into the memory location, storing the return value into a separate output
+ * array slot corresponding to it. Once complete, the union of output array and exchange memory is
+ * validated to contain all values in the range [0, number_of_threads +
+ * number_of_exchange_memory_slots). Several memory access patterns are tested:
+ *      -# All threads exchange to a single memory location
+ *      -# Each thread exchanges into an array containing warp_size elements, using tid % warp_size
+ *         for indexing
+ *      -# Same as the above, but the exchange elements are spread out by L1 cache line size bytes.
+ *
+ *    - The test is run for:
+ *      - All overloads of atomicExch
+ *      - hipMalloc, hipMallocManaged, hipHostMalloc and hipHostRegister allocated exchange memory
+ *      - Exchange memory located in shared memory
+ *      - Several grid and block dimension combinations(only one block is used for shared memory)
+ * Test source
+ * ------------------------
+ *    - unit/atomics/atomicExch.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+#if HT_NVIDIA  // HT_NVIDIA builds instantiate the shorter type list
+TEMPLATE_TEST_CASE("Unit_atomicExch_Positive", "", int, unsigned int,
+                   unsigned long long, float) {
+#else  // other builds additionally cover unsigned long and double
+TEMPLATE_TEST_CASE("Unit_atomicExch_Positive", "", int, unsigned int, unsigned long,
+                   unsigned long long, float, double) {
+#endif // HT_NVIDIA
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));  // queried from device 0
+  const auto cache_line_size = 128u;  // pitch in bytes used by the "Scattered addresses" section
+
+  for (auto current = 0; current < cmd_options.iterations; ++current) {
+    DYNAMIC_SECTION("Same address " << current) {  // width 1: every thread targets one slot
+      AtomicExchSingleDeviceSingleKernelTest<TestType, AtomicScopes::device>(1, sizeof(TestType));
+    }
+
+    DYNAMIC_SECTION("Adjacent addresses " << current) {  // warp_size slots packed sizeof(TestType) apart
+      AtomicExchSingleDeviceSingleKernelTest<TestType, AtomicScopes::device>(warp_size,
+                                                                             sizeof(TestType));
+    }
+
+    DYNAMIC_SECTION("Scattered addresses " << current) {  // warp_size slots cache_line_size bytes apart
+      AtomicExchSingleDeviceSingleKernelTest<TestType, AtomicScopes::device>(warp_size,
+                                                                             cache_line_size);
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Executes a kernel two times concurrently on a single device wherein all threads will perform
+ * an atomic exchange into a runtime determined memory location. Each thread will exchange its own
+ * grid wide linear index + offset into the memory location, storing the return value into a
+ * separate output array slot corresponding to it. Once complete, the union of output array and
+ * exchange memory is validated to contain all values in the range [0, number_of_threads +
+ * number_of_exchange_memory_slots). Several memory access patterns are tested:
+ *      -# All threads exchange to a single memory location
+ *      -# Each thread exchanges into an array containing warp_size elements, using tid % warp_size
+ * for indexing
+ *      -# Same as the above, but the exchange elements are spread out by L1 cache line size bytes.
+ *
+ *    - The test is run for:
+ *      - All overloads of atomicExch
+ *      - hipMalloc, hipMallocManaged, hipHostMalloc and hipHostRegister allocated exchange memory
+ *      - Several grid and block dimension combinations
+ * Test source
+ * ------------------------
+ *    - unit/atomics/atomicExch.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+#if HT_NVIDIA  // HT_NVIDIA builds instantiate the shorter type list
+TEMPLATE_TEST_CASE("Unit_atomicExch_Positive_Multi_Kernel", "", int, unsigned int,
+                   unsigned long long, float) {
+#else  // other builds additionally cover unsigned long and double
+TEMPLATE_TEST_CASE("Unit_atomicExch_Positive_Multi_Kernel", "", int, unsigned int, unsigned long,
+                   unsigned long long, float, double) {
+#endif // HT_NVIDIA
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));  // queried from device 0
+  const auto cache_line_size = 128u;  // pitch in bytes used by the "Scattered addresses" section
+
+  for (auto current = 0; current < cmd_options.iterations; ++current) {
+    DYNAMIC_SECTION("Same address " << current) {  // 2 kernels, width 1
+      AtomicExchSingleDeviceMultipleKernelTest<TestType, AtomicScopes::device>(2, 1,
+                                                                               sizeof(TestType));
+    }
+
+    DYNAMIC_SECTION("Adjacent addresses " << current) {  // 2 kernels, warp_size packed slots
+      AtomicExchSingleDeviceMultipleKernelTest<TestType, AtomicScopes::device>(2, warp_size,
+                                                                               sizeof(TestType));
+    }
+
+    DYNAMIC_SECTION("Scattered addresses " << current) {  // 2 kernels, slots cache_line_size bytes apart
+      AtomicExchSingleDeviceMultipleKernelTest<TestType, AtomicScopes::device>(2, warp_size,
+                                                                               cache_line_size);
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *    - RTCs kernels that pass combinations of arguments of invalid types for all overloads of
+ * atomicExch
+ * Test source
+ * ------------------------
+ *    - unit/atomics/atomicExch.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Unit_atomicExch_Negative_Parameters_RTC") {
+  hiprtcProgram program{};
+
+  const auto program_source = GENERATE(kAtomicExchInt, kAtomicExchUnsignedInt, kAtomicExchULL,
+                                       kAtomicExchFloat, kAtomicExchDouble);
+  HIPRTC_CHECK(
+      hiprtcCreateProgram(&program, program_source, "atomicExch_negative.cc", 0, nullptr, nullptr));
+  // The compile result is checked only after the program has been destroyed.
+  hiprtcResult compile_result{hiprtcCompileProgram(program, 0, nullptr)};
+
+  // Pull the compiler log into a string sized by the runtime.
+  size_t log_size{};
+  HIPRTC_CHECK(hiprtcGetProgramLogSize(program, &log_size));
+  std::string compile_log(log_size, ' ');
+  HIPRTC_CHECK(hiprtcGetProgramLog(program, compile_log.data()));
+
+  // Each occurrence of the marker in the log counts as one compiler error.
+  const std::string error_marker{"error:"};
+  int error_count{0};
+  for (size_t pos = compile_log.find(error_marker, 0); pos != std::string::npos;
+       pos = compile_log.find(error_marker, pos + 1)) {
+    ++error_count;
+  }
+
+  HIPRTC_CHECK(hiprtcDestroyProgram(&program));
+  HIPRTC_CHECK_ERROR(compile_result, HIPRTC_ERROR_COMPILATION);
+
+  const int expected_error_count{8};
+  REQUIRE(error_count == expected_error_count);
+}
@@ -0,0 +1,381 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#include <numeric>
+
+#include <hip_test_common.hh>
+#include <resource_guards.hh>
+#include <hip/hip_cooperative_groups.h>
+#include <cmd_options.hh>
+
+enum class AtomicScopes { device, system };  // device -> atomicExch, system -> atomicExch_system
+
+/**
+ * Performs an atomic exchange using the intrinsic that matches the requested scope.
+ *
+ * @tparam scope AtomicScopes::device selects atomicExch, AtomicScopes::system selects
+ *               atomicExch_system
+ * @param address location to exchange into
+ * @param val     value to store
+ * @return whatever the selected intrinsic returns
+ */
+template <typename T, AtomicScopes scope> __device__ T perform_atomic_exch(T* address, T val) {
+  static_assert(scope == AtomicScopes::device || scope == AtomicScopes::system,
+                "unsupported atomic scope");
+  // Both branches are resolved at compile time, so every instantiation has exactly one return
+  // statement and no path that falls off the end of the function.
+  if constexpr (scope == AtomicScopes::device) {
+    return atomicExch(address, val);
+  } else {
+    return atomicExch_system(address, val);
+  }
+}
+
+template <typename T, bool use_shared_mem, AtomicScopes scope>
+__global__ void atomic_exch_kernel_compile_time(T* const global_mem, T* const old_vals) {  // all threads exchange into one slot
+  __shared__ T shared_mem;
+
+  const auto tid = cooperative_groups::this_grid().thread_rank();  // rank of this thread within the grid
+
+  T* const mem = use_shared_mem ? &shared_mem : global_mem;  // exchange target: shared or global slot
+
+  if constexpr (use_shared_mem) {
+    if (tid == 0) mem[0] = global_mem[0];  // seed the shared slot from global memory
+    __syncthreads();
+  }
+
+  old_vals[tid] = perform_atomic_exch<T, scope>(mem, static_cast<T>(tid + 1));  // store tid + 1, record returned value
+
+  if constexpr (use_shared_mem) {
+    __syncthreads();
+    if (tid == 0) global_mem[0] = mem[0];  // copy the final shared value back to global memory
+  }
+}
+
+/**
+ * Returns the address of element `idx` in a buffer whose elements are `pitch` bytes apart.
+ *
+ * @param ptr   start of the buffer
+ * @param pitch distance in bytes between consecutive elements
+ * @param idx   element index
+ */
+template <typename T>
+__host__ __device__ T* pitched_offset(T* const ptr, const unsigned int pitch,
+                                      const unsigned int idx) {
+  // Step in bytes, then view the resulting address as a T again.
+  uint8_t* const element_bytes = reinterpret_cast<uint8_t*>(ptr) + idx * pitch;
+  return reinterpret_cast<T*>(element_bytes);
+}
+
+template <typename T, bool use_shared_mem, AtomicScopes scope>
+__global__ void atomic_exch_kernel(T* const global_mem, T* const old_vals, const unsigned int width,
+                                   const unsigned pitch, const T base_val = 0) {  // width slots, pitch bytes apart
+  extern __shared__ uint8_t shared_mem[];  // sized at launch; reinterpreted as T slots below
+
+  const auto tid = cooperative_groups::this_grid().thread_rank();  // rank of this thread within the grid
+
+  T* const mem = use_shared_mem ? reinterpret_cast<T*>(shared_mem) : global_mem;
+
+  if constexpr (use_shared_mem) {
+    if (tid < width) {  // threads below width copy one initial slot value into shared memory
+      const auto target = pitched_offset(mem, pitch, tid);
+      *target = *pitched_offset(global_mem, pitch, tid);
+    };
+    __syncthreads();
+  }
+
+  old_vals[tid] = perform_atomic_exch<T, scope>(pitched_offset(mem, pitch, tid % width),
+                                                base_val + static_cast<T>(tid + width));  // stores base_val + tid + width into slot tid % width
+
+  if constexpr (use_shared_mem) {
+    __syncthreads();
+    if (tid < width) {  // threads below width copy one final slot value back to global memory
+      const auto target = pitched_offset(global_mem, pitch, tid);
+      *target = *pitched_offset(mem, pitch, tid);
+    };
+  }
+}
+
+
+/**
+ * Launches atomic_exch_kernel_compile_time so that every thread exchanges into one slot, then
+ * validates the values that were handed back.
+ *
+ * The slot is zeroed first and thread `tid` exchanges in `tid + 1`. The values returned to the
+ * threads, together with the value left in the slot, are sorted and required to be exactly
+ * 0, 1, ..., thread_count.
+ *
+ * @param blocks     grid dimensions of the launch
+ * @param threads    block dimensions of the launch
+ * @param alloc_type allocation type used for the exchange slot
+ */
+template <typename TestType, bool use_shared_mem, AtomicScopes scope>
+void AtomicExchSameAddress(const dim3 blocks, const dim3 threads, const LinearAllocs alloc_type) {
+  LinearAllocGuard<TestType> exchange_slot(alloc_type, sizeof(TestType));
+
+  const auto total_threads = blocks.x * blocks.y * blocks.z * threads.x * threads.y * threads.z;
+  const auto displaced_bytes = total_threads * sizeof(TestType);
+  LinearAllocGuard<TestType> displaced_dev(LinearAllocs::hipMalloc, displaced_bytes);
+  // One extra element at the end receives the value left behind in the exchange slot.
+  std::vector<TestType> collected(total_threads + 1);
+
+  HIP_CHECK(hipMemset(exchange_slot.ptr(), 0, sizeof(TestType)));
+  atomic_exch_kernel_compile_time<TestType, use_shared_mem, scope>
+      <<<blocks, threads>>>(exchange_slot.ptr(), displaced_dev.ptr());
+  HIP_CHECK(
+      hipMemcpy(collected.data(), displaced_dev.ptr(), displaced_bytes, hipMemcpyDeviceToHost));
+  HIP_CHECK(hipMemcpy(collected.data() + total_threads, exchange_slot.ptr(), sizeof(TestType),
+                      hipMemcpyDeviceToHost));
+  HIP_CHECK(hipDeviceSynchronize());
+
+  // After sorting, position k must hold the value k.
+  std::sort(collected.begin(), collected.end());
+  auto expected = 0u;
+  for (const auto& value : collected) {
+    REQUIRE(expected == value);
+    ++expected;
+  }
+}
+
+/**
+ * Runs AtomicExchSameAddress for several block sizes, once against global memory (for each
+ * listed allocation type, 20 blocks) and once against shared memory (single block).
+ *
+ * @tparam scope atomic scope forwarded to AtomicExchSameAddress
+ */
+template <typename TestType, AtomicScopes scope> void AtomicExchSameAddressTest() {
+  const auto threads = GENERATE(dim3(1024), dim3(1023), dim3(511), dim3(17), dim3(31));
+
+  SECTION("Global memory") {
+    const auto blocks = GENERATE(dim3(20));
+    using LA = LinearAllocs;
+    const auto allocation_type =
+        GENERATE(LA::hipMalloc, LA::hipHostMalloc, LA::hipMallocManaged, LA::mallocAndRegister);
+    // Forward the caller's scope; it used to be hard-coded to AtomicScopes::device, which made
+    // the template parameter a no-op.
+    AtomicExchSameAddress<TestType, false, scope>(blocks, threads, allocation_type);
+  }
+
+  SECTION("Shared memory") {
+    const auto blocks = dim3(1);
+    AtomicExchSameAddress<TestType, true, scope>(blocks, threads, LinearAllocs::hipMalloc);
+  }
+}
+
+/**
+ * Launch and layout parameters consumed by AtomicExchCRTP::run.
+ */
+struct AtomicExchParams {
+  dim3 blocks;                            // grid dimensions of each kernel launch
+  dim3 threads;                           // block dimensions of each kernel launch
+  unsigned int num_devices = 1u;          // devices that launch kernels
+  unsigned int kernel_count = 1u;         // kernels launched per device
+  unsigned int width = 1u;                // number of exchange slots
+  unsigned int pitch = 0u;                // distance in bytes between consecutive slots
+  unsigned int host_thread_count = 0u;    // host threads exchanging alongside the kernels
+  // Given a default so a params object is never read with an indeterminate allocation type.
+  LinearAllocs alloc_type = LinearAllocs::hipMalloc;
+};
+
+
+template <typename Derived, typename T, bool use_shared_mem, AtomicScopes scope>
+class AtomicExchCRTP {  // shared test driver; Derived supplies LaunchKernel and ValidateResults
+ public:
+  void run(const AtomicExchParams& p) const {  // allocate, launch, optionally exchange on host, gather, validate
+    const auto thread_count =
+        p.blocks.x * p.blocks.y * p.blocks.z * p.threads.x * p.threads.y * p.threads.z;  // threads per kernel launch
+
+    const auto old_vals_alloc_size = p.kernel_count * thread_count * sizeof(T);  // bytes of results per device
+    std::vector<LinearAllocGuard<T>> old_vals_devs;  // one result buffer per device
+    std::vector<StreamGuard> streams;  // kernel_count streams per device, in device order
+    for (auto i = 0; i < p.num_devices; ++i) {
+      HIP_CHECK(hipSetDevice(i));
+      old_vals_devs.emplace_back(LinearAllocs::hipMalloc, old_vals_alloc_size);
+      for (auto j = 0; j < p.kernel_count; ++j) {
+        streams.emplace_back(Streams::created);
+      }
+    }
+
+    const auto mem_alloc_size = p.width * p.pitch;  // width slots spaced pitch bytes apart
+    LinearAllocGuard<T> mem_dev(p.alloc_type, mem_alloc_size);
+
+    const auto host_iters_per_thread =
+        std::max(p.num_devices * p.kernel_count * thread_count / 20, p.width);  // exchanges done by each host thread
+
+    std::vector<T> old_vals(p.num_devices * p.kernel_count * thread_count + p.width +
+                            p.host_thread_count * host_iters_per_thread);  // device results + final slots + host results
+    std::iota(old_vals.begin(), old_vals.begin() + p.width, 0);  // initial slot values 0..width-1
+
+    HIP_CHECK(hipMemcpy2D(mem_dev.ptr(), p.pitch, old_vals.data(), sizeof(T), sizeof(T), p.width,
+                          hipMemcpyHostToDevice));  // scatter the initial values into the pitched slots
+
+    const auto shared_mem_size = use_shared_mem ? mem_alloc_size : 0u;
+    for (auto i = 0u; i < p.num_devices; ++i) {
+      const auto device_offset = i * p.kernel_count * thread_count;
+      for (auto j = 0u; j < p.kernel_count; ++j) {
+        const auto& stream = streams[i * p.kernel_count + j].stream();
+        const auto kern_offset = j * thread_count;
+        const auto old_vals = old_vals_devs[i].ptr() + kern_offset;  // shadows the host vector inside this loop
+        CastToDerived().LaunchKernel(shared_mem_size, stream, mem_dev.ptr(), old_vals,
+                                     device_offset + kern_offset, p);  // offset sum is passed as the kernel's base value
+      }
+    }
+
+    PerformHostAtomicExchange(p.host_thread_count, host_iters_per_thread, mem_dev.host_ptr(),
+                              old_vals.data(), p);  // returns immediately when host_thread_count is 0
+
+    for (auto i = 0u; i < p.num_devices; ++i) {
+      const auto device_offset = i * p.kernel_count * thread_count;
+      HIP_CHECK(hipMemcpy(old_vals.data() + device_offset, old_vals_devs[i].ptr(),
+                          old_vals_alloc_size, hipMemcpyDeviceToHost));  // gather each device's returned values
+    }
+    HIP_CHECK(hipMemcpy2D(old_vals.data() + p.num_devices * p.kernel_count * thread_count,
+                          sizeof(T), mem_dev.ptr(), p.pitch, sizeof(T), p.width,
+                          hipMemcpyDeviceToHost));  // append the values left in the slots
+
+    CastToDerived().ValidateResults(old_vals);
+  }
+
+ private:
+  const Derived& CastToDerived() const { return static_cast<const Derived&>(*this); }  // CRTP downcast
+
+  static void HostAtomicExchange(const unsigned int iterations, T* mem, T* const old_vals,
+                                 const unsigned int width, const unsigned pitch, T base_val) {  // body of one host thread
+    for (auto i = 0u; i < iterations; ++i) {
+      T new_val = base_val + static_cast<T>(i);
+      T old_val;
+      __atomic_exchange(pitched_offset(mem, pitch, i % width), &new_val, &old_val,
+                        __ATOMIC_RELAXED);  // exchange into slot i % width with relaxed ordering
+      old_vals[i] = old_val;
+    }
+  }
+
+  void PerformHostAtomicExchange(const unsigned int thread_count, const unsigned int iterations,
+                                 T* mem, T* const old_vals, const AtomicExchParams& p) const {  // spawns and joins the host threads
+    if (thread_count == 0) {  // no host threads requested
+      return;
+    }
+    const auto dev_threads =
+        p.blocks.x * p.blocks.y * p.blocks.z * p.threads.x * p.threads.y * p.threads.z;
+    const auto host_base_val = p.num_devices * p.kernel_count * dev_threads + p.width;  // first index/value after the device and slot ranges
+
+    std::vector<std::thread> threads;
+    for (auto i = 0u; i < thread_count; ++i) {
+      const auto thread_base_val = host_base_val + i * iterations;  // used as both output offset and first value
+      threads.push_back(std::thread(HostAtomicExchange, iterations, mem, old_vals + thread_base_val,
+                                    p.width, p.pitch, thread_base_val));
+    }
+
+    for (auto& th : threads) {
+      th.join();
+    }
+  }
+};
+
+/**
+ * Concrete AtomicExchCRTP policy: launches atomic_exch_kernel and checks that the gathered
+ * values form the sequence 0, 1, 2, ... once sorted.
+ */
+template <typename T, bool use_shared_mem, AtomicScopes scope>
+class AtomicExch
+    : public AtomicExchCRTP<AtomicExch<T, use_shared_mem, scope>, T, use_shared_mem, scope> {
+ public:
+  /** Launches one kernel on `stream` using the grid/block dimensions, width and pitch from `p`. */
+  void LaunchKernel(const unsigned int shared_mem_size, const hipStream_t stream, T* const mem,
+                    T* const old_vals, const T base_val, const AtomicExchParams& p) const {
+    atomic_exch_kernel<T, use_shared_mem, scope>
+        <<<p.blocks, p.threads, shared_mem_size, stream>>>(mem, old_vals, p.width, p.pitch,
+                                                           base_val);
+  }
+
+  /** Sorts `old_vals` in place and requires position k to hold the value k. */
+  void ValidateResults(std::vector<T>& old_vals) const {
+    std::sort(old_vals.begin(), old_vals.end());
+    auto expected = 0u;
+    for (const auto& value : old_vals) {
+      REQUIRE(expected == value);
+      ++expected;
+    }
+  }
+};
+
+inline dim3 GenerateAtomicExchThreadDimensions() { return GENERATE(dim3(16), dim3(1024)); }  // block sizes under test: 16 and 1024 threads
+
+/**
+ * Generates the grid sizes under test from device 0's multiprocessor count: one block per
+ * multiprocessor, and one and a half times that many.
+ */
+inline dim3 GenerateAtomicExchBlockDimensions() {
+  int multiprocessor_count{0};
+  HIP_CHECK(
+      hipDeviceGetAttribute(&multiprocessor_count, hipDeviceAttributeMultiprocessorCount, 0));
+  return GENERATE_COPY(dim3(multiprocessor_count),
+                       dim3(multiprocessor_count + multiprocessor_count / 2));
+}
+
+template <typename TestType, AtomicScopes scope>
+void AtomicExchSingleDeviceSingleKernelTest(const unsigned int width, const unsigned int pitch) {  // one device, one kernel
+  AtomicExchParams params;
+  params.num_devices = 1;
+  params.kernel_count = 1;
+  params.threads = GenerateAtomicExchThreadDimensions();
+  params.width = width;  // number of exchange slots
+  params.pitch = pitch;  // distance in bytes between consecutive slots
+
+  SECTION("Global memory") {
+    params.blocks = GenerateAtomicExchBlockDimensions();
+    using LA = LinearAllocs;
+    for (const auto alloc_type :
+         {LA::hipMalloc, LA::hipHostMalloc, LA::hipMallocManaged, LA::mallocAndRegister}) {
+      params.alloc_type = alloc_type;
+      DYNAMIC_SECTION("Allocation type: " << to_string(alloc_type)) {  // one sub-section per allocation type
+        AtomicExch<TestType, false, scope>().run(params);
+      }
+    }
+  }
+
+  SECTION("Shared memory") {
+    params.blocks = dim3(1);  // the shared-memory variant launches a single block
+    params.alloc_type = LinearAllocs::hipMalloc;
+    AtomicExch<TestType, true, scope>().run(params);
+  }
+}
+
+template <typename TestType, AtomicScopes scope>
+void AtomicExchSingleDeviceMultipleKernelTest(const unsigned int kernel_count,
+                                              const unsigned int width, const unsigned int pitch) {  // one device, kernel_count kernels
+  int concurrent_kernels = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&concurrent_kernels, hipDeviceAttributeConcurrentKernels, 0));  // queried from device 0
+  if (!concurrent_kernels) {  // skip when device 0 reports no concurrent kernel support
+    HipTest::HIP_SKIP_TEST("Test requires support for concurrent kernel execution");
+    return;
+  }
+
+  AtomicExchParams params;
+  params.num_devices = 1;
+  params.kernel_count = kernel_count;
+  params.blocks = GenerateAtomicExchBlockDimensions();
+  params.threads = GenerateAtomicExchThreadDimensions();
+  params.width = width;  // number of exchange slots
+  params.pitch = pitch;  // distance in bytes between consecutive slots
+
+  using LA = LinearAllocs;
+  for (const auto alloc_type :
+       {LA::hipMalloc, LA::hipHostMalloc, LA::hipMallocManaged, LA::mallocAndRegister}) {
+    params.alloc_type = alloc_type;
+    DYNAMIC_SECTION("Allocation type: " << to_string(alloc_type)) {  // one sub-section per allocation type
+      AtomicExch<TestType, false, scope>().run(params);  // global-memory variant only
+    }
+  }
+}
+
+/**
+ * Runs the system-scope exchange test across several devices, several kernels per device and,
+ * optionally, host threads.
+ *
+ * The test is skipped when more than one device is requested but fewer are present, or when
+ * more than one kernel per device is requested and any participating device reports no
+ * concurrent kernel support.
+ *
+ * @param num_devices       number of devices that launch kernels
+ * @param kernel_count      kernels launched per device
+ * @param width             number of exchange slots
+ * @param pitch             distance in bytes between consecutive slots
+ * @param host_thread_count host threads that exchange alongside the kernels
+ */
+template <typename TestType>
+void AtomicExchMultipleDeviceMultipleKernelAndHostTest(const unsigned int num_devices,
+                                                       const unsigned int kernel_count,
+                                                       const unsigned int width,
+                                                       const unsigned int pitch,
+                                                       const unsigned int host_thread_count = 0u) {
+  // The device count is only consulted for multi-device runs.
+  if (num_devices > 1 && HipTest::getDeviceCount() < num_devices) {
+    const std::string skip_reason = std::to_string(num_devices) + " devices are required";
+    HipTest::HIP_SKIP_TEST(skip_reason.c_str());
+    return;
+  }
+
+  if (kernel_count > 1) {
+    for (auto device = 0u; device < num_devices; ++device) {
+      int supports_concurrency = 0;
+      HIP_CHECK(hipDeviceGetAttribute(&supports_concurrency, hipDeviceAttributeConcurrentKernels,
+                                      device));
+      if (!supports_concurrency) {
+        HipTest::HIP_SKIP_TEST("Test requires support for concurrent kernel execution");
+        return;
+      }
+    }
+  }
+
+  AtomicExchParams params;
+  params.num_devices = num_devices;
+  params.kernel_count = kernel_count;
+  params.blocks = GenerateAtomicExchBlockDimensions();
+  params.threads = GenerateAtomicExchThreadDimensions();
+  params.width = width;
+  params.pitch = pitch;
+  params.host_thread_count = host_thread_count;
+
+  // Plain hipMalloc is not part of this list.
+  using LA = LinearAllocs;
+  for (const auto allocation : {LA::hipHostMalloc, LA::hipMallocManaged, LA::mallocAndRegister}) {
+    params.alloc_type = allocation;
+    DYNAMIC_SECTION("Allocation type: " << to_string(allocation)) {
+      AtomicExch<TestType, false, AtomicScopes::system>().run(params);
+    }
+  }
+}
@@ -0,0 +1,94 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_common.hh>
+
+struct Dummy {  // Empty device-side class used below as an invalid atomicExch argument type.
+  __device__ Dummy() {}
+  __device__ ~Dummy() {}
+};
+
+/* int atomicExch(int*, int)
+ * Negative-parameter kernels for the overload above. Each kernel makes one mismatched call:
+ *   n1      - the pointer p is passed where the exchange value belongs
+ *   n2      - &p (pointer to the pointer) is passed as the address
+ *   n3..n7  - the address is a char, short, long, long long or Dummy pointer
+ *   n8      - a Dummy object is passed as the exchange value
+ * NOTE(review): presumably every call is meant to be rejected at compile time; confirm against
+ * the harness that compiles this file.
+ */
+__global__ void atomicExch_int_n1(int* p, int v) { atomicExch(p, p); }
+__global__ void atomicExch_int_n2(int* p, int v) { atomicExch(&p, v); }
+__global__ void atomicExch_int_n3(char* p, int v) { atomicExch(p, v); }
+__global__ void atomicExch_int_n4(short* p, int v) { atomicExch(p, v); }
+__global__ void atomicExch_int_n5(long* p, int v) { atomicExch(p, v); }
+__global__ void atomicExch_int_n6(long long* p, int v) { atomicExch(p, v); }
+__global__ void atomicExch_int_n7(Dummy* p, int v) { atomicExch(p, v); }
+__global__ void atomicExch_int_n8(int* p, Dummy v) { atomicExch(p, v); }
+
+/* unsigned int atomicExch(unsigned int*, unsigned int)
+ * Negative-parameter kernels for the overload above. Each kernel makes one mismatched call:
+ *   n1      - the pointer p is passed where the exchange value belongs
+ *   n2      - &p (pointer to the pointer) is passed as the address
+ *   n3..n7  - the address is a char, short, long, long long or Dummy pointer
+ *   n8      - a Dummy object is passed as the exchange value
+ * NOTE(review): presumably every call is meant to be rejected at compile time; confirm against
+ * the harness that compiles this file.
+ */
+__global__ void atomicExch_unsigned_int_n1(unsigned int* p, unsigned int v) { atomicExch(p, p); }
+__global__ void atomicExch_unsigned_int_n2(unsigned int* p, unsigned int v) { atomicExch(&p, v); }
+__global__ void atomicExch_unsigned_int_n3(char* p, unsigned int v) { atomicExch(p, v); }
+__global__ void atomicExch_unsigned_int_n4(short* p, unsigned int v) { atomicExch(p, v); }
+__global__ void atomicExch_unsigned_int_n5(long* p, unsigned int v) { atomicExch(p, v); }
+__global__ void atomicExch_unsigned_int_n6(long long* p, unsigned int v) { atomicExch(p, v); }
+__global__ void atomicExch_unsigned_int_n7(Dummy* p, unsigned int v) { atomicExch(p, v); }
+__global__ void atomicExch_unsigned_int_n8(unsigned int* p, Dummy v) { atomicExch(p, v); }
+
+/* unsigned long long atomicExch(unsigned long long*, unsigned long long)
+ * Negative-parameter kernels for the overload above. Each kernel makes one mismatched call:
+ *   n1      - the pointer p is passed where the exchange value belongs
+ *   n2      - &p (pointer to the pointer) is passed as the address
+ *   n3..n7  - the address is a char, short, long, long long or Dummy pointer
+ *   n8      - a Dummy object is passed as the exchange value
+ * NOTE(review): presumably every call is meant to be rejected at compile time; confirm against
+ * the harness that compiles this file.
+ */
+__global__ void atomicExch_unsigned_long_long_n1(unsigned long long* p, unsigned long long v) {
+  atomicExch(p, p);
+}
+__global__ void atomicExch_unsigned_long_long_n2(unsigned long long* p, unsigned long long v) {
+  atomicExch(&p, v);
+}
+__global__ void atomicExch_unsigned_long_long_n3(char* p, unsigned long long v) {
+  atomicExch(p, v);
+}
+__global__ void atomicExch_unsigned_long_long_n4(short* p, unsigned long long v) {
+  atomicExch(p, v);
+}
+__global__ void atomicExch_unsigned_long_long_n5(long* p, unsigned long long v) {
+  atomicExch(p, v);
+}
+__global__ void atomicExch_unsigned_long_long_n6(long long* p, unsigned long long v) {
+  atomicExch(p, v);
+}
+__global__ void atomicExch_unsigned_long_long_n7(Dummy* p, unsigned long long v) {
+  atomicExch(p, v);
+}
+__global__ void atomicExch_unsigned_long_long_n8(unsigned long long* p, Dummy v) {
+  atomicExch(p, v);
+}
+
+/* float atomicExch(float*, float)
+ * Negative-parameter kernels for the overload above. Each kernel makes one mismatched call:
+ *   n1      - the pointer p is passed where the exchange value belongs
+ *   n2      - &p (pointer to the pointer) is passed as the address
+ *   n3..n7  - the address is a char, short, long, long long or Dummy pointer
+ *   n8      - a Dummy object is passed as the exchange value
+ * NOTE(review): presumably every call is meant to be rejected at compile time; confirm against
+ * the harness that compiles this file.
+ */
+__global__ void atomicExch_float_n1(float* p, float v) { atomicExch(p, p); }
+__global__ void atomicExch_float_n2(float* p, float v) { atomicExch(&p, v); }
+__global__ void atomicExch_float_n3(char* p, float v) { atomicExch(p, v); }
+__global__ void atomicExch_float_n4(short* p, float v) { atomicExch(p, v); }
+__global__ void atomicExch_float_n5(long* p, float v) { atomicExch(p, v); }
+__global__ void atomicExch_float_n6(long long* p, float v) { atomicExch(p, v); }
+__global__ void atomicExch_float_n7(Dummy* p, float v) { atomicExch(p, v); }
+__global__ void atomicExch_float_n8(float* p, Dummy v) { atomicExch(p, v); }
+
+/* double atomicExch(double*, double)
+ * Negative-parameter kernels for the overload above. Each kernel makes one mismatched call:
+ *   n1      - the pointer p is passed where the exchange value belongs
+ *   n2      - &p (pointer to the pointer) is passed as the address
+ *   n3..n7  - the address is a char, short, long, long long or Dummy pointer
+ *   n8      - a Dummy object is passed as the exchange value
+ * NOTE(review): presumably every call is meant to be rejected at compile time; confirm against
+ * the harness that compiles this file.
+ */
+__global__ void atomicExch_double_n1(double* p, double v) { atomicExch(p, p); }
+__global__ void atomicExch_double_n2(double* p, double v) { atomicExch(&p, v); }
+__global__ void atomicExch_double_n3(char* p, double v) { atomicExch(p, v); }
+__global__ void atomicExch_double_n4(short* p, double v) { atomicExch(p, v); }
+__global__ void atomicExch_double_n5(long* p, double v) { atomicExch(p, v); }
+__global__ void atomicExch_double_n6(long long* p, double v) { atomicExch(p, v); }
+__global__ void atomicExch_double_n7(Dummy* p, double v) { atomicExch(p, v); }
+__global__ void atomicExch_double_n8(double* p, Dummy v) { atomicExch(p, v); }
@@ -0,0 +1,124 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+static constexpr auto kAtomicExchInt{  // Source text: eight kernels misusing atomicExch (int).
+    R"(
+        struct Dummy {
+          __device__ Dummy() {}
+          __device__ ~Dummy() {}
+        };
+
+        __global__ void atomicExch_int_n1(int* p, int v) { atomicExch(p, p); }
+        __global__ void atomicExch_int_n2(int* p, int v) { atomicExch(&p, v); }
+        __global__ void atomicExch_int_n3(char* p, int v) { atomicExch(p, v); }
+        __global__ void atomicExch_int_n4(short* p, int v) { atomicExch(p, v); }
+        __global__ void atomicExch_int_n5(long* p, int v) { atomicExch(p, v); }
+        __global__ void atomicExch_int_n6(long long* p, int v) { atomicExch(p, v); }
+        __global__ void atomicExch_int_n7(Dummy* p, int v) { atomicExch(p, v); }
+        __global__ void atomicExch_int_n8(int* p, Dummy v) { atomicExch(p, v); }
+    )"};
+
+static constexpr auto kAtomicExchUnsignedInt{  // Source text: eight kernels misusing atomicExch (unsigned int).
+    R"(
+        struct Dummy {
+          __device__ Dummy() {}
+          __device__ ~Dummy() {}
+        };
+
+        __global__ void atomicExch_unsigned_int_n1(unsigned int* p, unsigned int v) { atomicExch(p, p); }
+        __global__ void atomicExch_unsigned_int_n2(unsigned int* p, unsigned int v) { atomicExch(&p, v); }
+        __global__ void atomicExch_unsigned_int_n3(char* p, unsigned int v) { atomicExch(p, v); }
+        __global__ void atomicExch_unsigned_int_n4(short* p, unsigned int v) { atomicExch(p, v); }
+        __global__ void atomicExch_unsigned_int_n5(long* p, unsigned int v) { atomicExch(p, v); }
+        __global__ void atomicExch_unsigned_int_n6(long long* p, unsigned int v) { atomicExch(p, v); }
+        __global__ void atomicExch_unsigned_int_n7(Dummy* p, unsigned int v) { atomicExch(p, v); }
+        __global__ void atomicExch_unsigned_int_n8(unsigned int* p, Dummy v) { atomicExch(p, v); }
+    )"};
+
+static constexpr auto kAtomicExchULL{  // Source text: eight kernels misusing atomicExch (unsigned long long).
+    R"(
+        struct Dummy {
+          __device__ Dummy() {}
+          __device__ ~Dummy() {}
+        };
+
+        __global__ void atomicExch_unsigned_long_long_n1(unsigned long long* p, unsigned long long v) {
+          atomicExch(p, p);
+        }
+        __global__ void atomicExch_unsigned_long_long_n2(unsigned long long* p, unsigned long long v) {
+          atomicExch(&p, v);
+        }
+        __global__ void atomicExch_unsigned_long_long_n3(char* p, unsigned long long v) {
+          atomicExch(p, v);
+        }
+        __global__ void atomicExch_unsigned_long_long_n4(short* p, unsigned long long v) {
+          atomicExch(p, v);
+        }
+        __global__ void atomicExch_unsigned_long_long_n5(long* p, unsigned long long v) {
+          atomicExch(p, v);
+        }
+        __global__ void atomicExch_unsigned_long_long_n6(long long* p, unsigned long long v) {
+          atomicExch(p, v);
+        }
+        __global__ void atomicExch_unsigned_long_long_n7(Dummy* p, unsigned long long v) {
+          atomicExch(p, v);
+        }
+        __global__ void atomicExch_unsigned_long_long_n8(unsigned long long* p, Dummy v) {
+          atomicExch(p, v);
+        }
+    )"};
+
+static constexpr auto kAtomicExchFloat{  // Source text: eight kernels misusing atomicExch (float).
+    R"(
+        struct Dummy {
+          __device__ Dummy() {}
+          __device__ ~Dummy() {}
+        };
+
+        __global__ void atomicExch_float_n1(float* p, float v) { atomicExch(p, p); }
+        __global__ void atomicExch_float_n2(float* p, float v) { atomicExch(&p, v); }
+        __global__ void atomicExch_float_n3(char* p, float v) { atomicExch(p, v); }
+        __global__ void atomicExch_float_n4(short* p, float v) { atomicExch(p, v); }
+        __global__ void atomicExch_float_n5(long* p, float v) { atomicExch(p, v); }
+        __global__ void atomicExch_float_n6(long long* p, float v) { atomicExch(p, v); }
+        __global__ void atomicExch_float_n7(Dummy* p, float v) { atomicExch(p, v); }
+        __global__ void atomicExch_float_n8(float* p, Dummy v) { atomicExch(p, v); }
+    )"};
+
+static constexpr auto kAtomicExchDouble{  // Source text: eight kernels misusing atomicExch (double).
+    R"(
+        struct Dummy {
+          __device__ Dummy() {}
+          __device__ ~Dummy() {}
+        };
+
+        __global__ void atomicExch_double_n1(double* p, double v) { atomicExch(p, p); }
+        __global__ void atomicExch_double_n2(double* p, double v) { atomicExch(&p, v); }
+        __global__ void atomicExch_double_n3(char* p, double v) { atomicExch(p, v); }
+        __global__ void atomicExch_double_n4(short* p, double v) { atomicExch(p, v); }
+        __global__ void atomicExch_double_n5(long* p, double v) { atomicExch(p, v); }
+        __global__ void atomicExch_double_n6(long long* p, double v) { atomicExch(p, v); }
+        __global__ void atomicExch_double_n7(Dummy* p, double v) { atomicExch(p, v); }
+        __global__ void atomicExch_double_n8(double* p, Dummy v) { atomicExch(p, v); }
+    )"};
@@ -0,0 +1,235 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "atomicExch_common.hh"
+#include "atomicExch_system_negative_kernels_rtc.hh"
+
+/**
+ * @addtogroup atomicExch_system atomicExch_system
+ * @{
+ * @ingroup AtomicsTest
+ */
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Executes a kernel two times concurrently on two devices wherein all threads will perform
+ * an atomic exchange into a runtime determined memory location. Each thread will exchange its own
+ * grid wide linear index + offset into the memory location, storing the return value into a
+ * separate output array slot corresponding to it. Once complete, the union of output array and
+ * exchange memory is validated to contain all values in the range [0, number_of_threads +
+ * number_of_exchange_memory_slots). Several memory access patterns are tested:
+ *      -# All threads exchange to a single memory location
+ *      -# Each thread exchanges into an array containing warp_size elements, using tid % warp_size
+ * for indexing
+ *      -# Same as the above, but the exchange elements are spread out by L1 cache line size bytes.
+ *
+ *    - The test is run for:
+ *      - All overloads of atomicExch_system
+ *      - hipMallocManaged, hipHostMalloc and hipHostRegister allocated exchange memory
+ *      - Several grid and block dimension combinations
+ * Test source
+ * ------------------------
+ *    - unit/atomics/atomicExch_system.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+#if HT_NVIDIA
+TEMPLATE_TEST_CASE("Unit_atomicExch_system_Positive_Peer_GPUs", "", int, unsigned int,
+                   unsigned long long, float) {
+#else
+TEMPLATE_TEST_CASE("Unit_atomicExch_system_Positive_Peer_GPUs", "", int, unsigned int,
+                   unsigned long, unsigned long long, float, double) {
+#endif // HT_NVIDIA
+  // Two devices with two kernels each; no host threads take part in this variant.
+  constexpr unsigned int kDevices = 2u;
+  constexpr unsigned int kKernels = 2u;
+  constexpr unsigned int kCacheLineBytes = 128u;
+  const unsigned int element_bytes = static_cast<unsigned int>(sizeof(TestType));
+
+  // The warp size of device 0 sets how many exchange slots the multi-slot sections use.
+  int slots = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&slots, hipDeviceAttributeWarpSize, 0));
+
+  for (int iteration = 0; iteration < cmd_options.iterations; ++iteration) {
+    DYNAMIC_SECTION("Same address " << iteration) {
+      AtomicExchMultipleDeviceMultipleKernelAndHostTest<TestType>(kDevices, kKernels, 1u,
+                                                                  element_bytes);
+    }
+
+    DYNAMIC_SECTION("Adjacent addresses " << iteration) {
+      AtomicExchMultipleDeviceMultipleKernelAndHostTest<TestType>(kDevices, kKernels, slots,
+                                                                  element_bytes);
+    }
+
+    DYNAMIC_SECTION("Scattered addresses " << iteration) {
+      AtomicExchMultipleDeviceMultipleKernelAndHostTest<TestType>(kDevices, kKernels, slots,
+                                                                  kCacheLineBytes);
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Executes a kernel on a single device wherein all threads will perform an atomic exchange
+ * into a runtime determined memory location. Each thread will exchange its own grid wide linear
+ * index + offset into the memory location, storing the return value into a separate output array
+ * slot corresponding to it. While the kernel is running, the host performs atomic exchanges, in 4
+ * threads, into the same memory location(s). Once complete, the union of output array, exchange
+ * memory, and host output is validated to contain all values in the range [0, number_of_threads +
+ * number_of_exchange_memory_slots + number_of_host_iterations). Several memory access patterns are
+ * tested:
+ *      -# All threads exchange to a single memory location
+ *      -# Each thread exchanges into an array containing warp_size elements, using tid % warp_size
+ * for indexing
+ *      -# Same as the above, but the exchange elements are spread out by L1 cache line size bytes.
+ *
+ *    - The test is run for:
+ *      - All overloads of atomicExch_system
+ *      - hipMallocManaged, hipHostMalloc and hipHostRegister allocated exchange memory
+ *      - Several grid and block dimension combinations
+ * Test source
+ * ------------------------
+ *    - unit/atomics/atomicExch_system.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+#if HT_NVIDIA
+TEMPLATE_TEST_CASE("Unit_atomicExch_system_Positive_Host_And_GPU", "", int, unsigned int,
+                   unsigned long long, float) {
+#else
+TEMPLATE_TEST_CASE("Unit_atomicExch_system_Positive_Host_And_GPU", "", int, unsigned int,
+                   unsigned long, unsigned long long, float, double) {
+#endif // HT_NVIDIA
+  // One device running one kernel, together with four host threads.
+  constexpr unsigned int kDevices = 1u;
+  constexpr unsigned int kKernels = 1u;
+  constexpr unsigned int kHostThreads = 4u;
+  constexpr unsigned int kCacheLineBytes = 128u;
+  const unsigned int element_bytes = static_cast<unsigned int>(sizeof(TestType));
+
+  // The warp size of device 0 sets how many exchange slots the multi-slot sections use.
+  int slots = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&slots, hipDeviceAttributeWarpSize, 0));
+
+  for (int iteration = 0; iteration < cmd_options.iterations; ++iteration) {
+    DYNAMIC_SECTION("Same address " << iteration) {
+      AtomicExchMultipleDeviceMultipleKernelAndHostTest<TestType>(kDevices, kKernels, 1u,
+                                                                  element_bytes, kHostThreads);
+    }
+
+    DYNAMIC_SECTION("Adjacent addresses " << iteration) {
+      AtomicExchMultipleDeviceMultipleKernelAndHostTest<TestType>(kDevices, kKernels, slots,
+                                                                  element_bytes, kHostThreads);
+    }
+
+    DYNAMIC_SECTION("Scattered addresses " << iteration) {
+      AtomicExchMultipleDeviceMultipleKernelAndHostTest<TestType>(kDevices, kKernels, slots,
+                                                                  kCacheLineBytes, kHostThreads);
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Executes a kernel two times concurrently on two devices wherein all threads will perform
+ * an atomic exchange into a runtime determined memory location. Each thread will exchange its own
+ * grid wide linear index + offset into the memory location, storing the return value into a
+ * separate output array slot corresponding to it. While the kernels are running, the
+ * host performs atomic exchanges, in 4 threads, into the same memory location(s). Once complete,
+ * the union of output array, exchange memory, and host output is validated to contain all values in
+ * the range [0, number_of_threads + number_of_exchange_memory_slots + number_of_host_iterations).
+ * Several memory access patterns are tested:
+ *      -# All threads exchange to a single memory location
+ *      -# Each thread exchanges into an array containing warp_size elements, using tid % warp_size
+ * for indexing
+ *      -# Same as the above, but the exchange elements are spread out by L1 cache line size bytes.
+ *
+ *    - The test is run for:
+ *      - All overloads of atomicExch_system
+ *      - hipMallocManaged, hipHostMalloc and hipHostRegister allocated exchange memory
+ *      - Several grid and block dimension combinations
+ * Test source
+ * ------------------------
+ *    - unit/atomics/atomicExch_system.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+#if HT_NVIDIA
+TEMPLATE_TEST_CASE("Unit_atomicExch_system_Positive_Host_And_Peer_GPUs", "", int, unsigned int,
+                   unsigned long long, float) {
+#else
+TEMPLATE_TEST_CASE("Unit_atomicExch_system_Positive_Host_And_Peer_GPUs", "", int, unsigned int,
+                   unsigned long, unsigned long long, float, double) {
+#endif // HT_NVIDIA
+  // Two devices with two kernels each, together with four host threads.
+  constexpr unsigned int kDevices = 2u;
+  constexpr unsigned int kKernels = 2u;
+  constexpr unsigned int kHostThreads = 4u;
+  constexpr unsigned int kCacheLineBytes = 128u;
+  const unsigned int element_bytes = static_cast<unsigned int>(sizeof(TestType));
+
+  // The warp size of device 0 sets how many exchange slots the multi-slot sections use.
+  int slots = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&slots, hipDeviceAttributeWarpSize, 0));
+
+  for (int iteration = 0; iteration < cmd_options.iterations; ++iteration) {
+    DYNAMIC_SECTION("Same address " << iteration) {
+      AtomicExchMultipleDeviceMultipleKernelAndHostTest<TestType>(kDevices, kKernels, 1u,
+                                                                  element_bytes, kHostThreads);
+    }
+
+    DYNAMIC_SECTION("Adjacent addresses " << iteration) {
+      AtomicExchMultipleDeviceMultipleKernelAndHostTest<TestType>(kDevices, kKernels, slots,
+                                                                  element_bytes, kHostThreads);
+    }
+
+    DYNAMIC_SECTION("Scattered addresses " << iteration) {
+      AtomicExchMultipleDeviceMultipleKernelAndHostTest<TestType>(kDevices, kKernels, slots,
+                                                                  kCacheLineBytes, kHostThreads);
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *    - RTCs kernels that pass combinations of arguments of invalid types for all overloads of
+ * atomicExch_system
+ * Test source
+ * ------------------------
+ *    - unit/atomics/atomicExch_system.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Unit_atomicExch_system_Negative_Parameters_RTC") {
+  // One generated run per source string; each string is compiled on its own.
+  const auto kernel_source =
+      GENERATE(kAtomicExchSystemInt, kAtomicExchSystemUnsignedInt, kAtomicExchSystemULL,
+               kAtomicExchSystemFloat, kAtomicExchSystemDouble);
+
+  hiprtcProgram rtc_program{};
+  HIPRTC_CHECK(hiprtcCreateProgram(&rtc_program, kernel_source, "atomicExch_negative.cc", 0,
+                                   nullptr, nullptr));
+  // The compile result is kept and checked after the log has been read and the program destroyed.
+  hiprtcResult compile_status{hiprtcCompileProgram(rtc_program, 0, nullptr)};
+
+  // Fetch the compiler log into a buffer of the reported size.
+  size_t log_bytes{};
+  HIPRTC_CHECK(hiprtcGetProgramLogSize(rtc_program, &log_bytes));
+  std::string compile_log(log_bytes, ' ');
+  HIPRTC_CHECK(hiprtcGetProgramLog(rtc_program, compile_log.data()));
+
+  // Count the occurrences of the error marker in the log.
+  const std::string marker{"error:"};
+  int error_count{0};
+  for (size_t at = compile_log.find(marker, 0); at != std::string::npos;
+       at = compile_log.find(marker, at + 1)) {
+    ++error_count;
+  }
+
+  HIPRTC_CHECK(hiprtcDestroyProgram(&rtc_program));
+  HIPRTC_CHECK_ERROR(compile_status, HIPRTC_ERROR_COMPILATION);
+
+  int expected_error_count{8};
+  REQUIRE(error_count == expected_error_count);
+}
@@ -0,0 +1,112 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_common.hh>
+
+struct Dummy {  // Empty device-side class used below as an invalid atomicExch_system argument type.
+  __device__ Dummy() {}
+  __device__ ~Dummy() {}
+};
+
+/* int atomicExch_system(int*, int)
+ * Negative-parameter kernels for the overload above. Each kernel makes one mismatched call:
+ *   n1      - the pointer p is passed where the exchange value belongs
+ *   n2      - &p (pointer to the pointer) is passed as the address
+ *   n3..n7  - the address is a char, short, long, long long or Dummy pointer
+ *   n8      - a Dummy object is passed as the exchange value
+ * NOTE(review): presumably every call is meant to be rejected at compile time; confirm against
+ * the harness that compiles this file.
+ */
+__global__ void atomicExch_system_int_n1(int* p, int v) { atomicExch_system(p, p); }
+__global__ void atomicExch_system_int_n2(int* p, int v) { atomicExch_system(&p, v); }
+__global__ void atomicExch_system_int_n3(char* p, int v) { atomicExch_system(p, v); }
+__global__ void atomicExch_system_int_n4(short* p, int v) { atomicExch_system(p, v); }
+__global__ void atomicExch_system_int_n5(long* p, int v) { atomicExch_system(p, v); }
+__global__ void atomicExch_system_int_n6(long long* p, int v) { atomicExch_system(p, v); }
+__global__ void atomicExch_system_int_n7(Dummy* p, int v) { atomicExch_system(p, v); }
+__global__ void atomicExch_system_int_n8(int* p, Dummy v) { atomicExch_system(p, v); }
+
+/* unsigned int atomicExch_system(unsigned int*, unsigned int)
+ * Negative-parameter kernels for the overload above. Each kernel makes one mismatched call:
+ *   n1      - the pointer p is passed where the exchange value belongs
+ *   n2      - &p (pointer to the pointer) is passed as the address
+ *   n3..n7  - the address is a char, short, long, long long or Dummy pointer
+ *   n8      - a Dummy object is passed as the exchange value
+ * NOTE(review): presumably every call is meant to be rejected at compile time; confirm against
+ * the harness that compiles this file.
+ */
+__global__ void atomicExch_system_unsigned_int_n1(unsigned int* p, unsigned int v) {
+  atomicExch_system(p, p);
+}
+__global__ void atomicExch_system_unsigned_int_n2(unsigned int* p, unsigned int v) {
+  atomicExch_system(&p, v);
+}
+__global__ void atomicExch_system_unsigned_int_n3(char* p, unsigned int v) {
+  atomicExch_system(p, v);
+}
+__global__ void atomicExch_system_unsigned_int_n4(short* p, unsigned int v) {
+  atomicExch_system(p, v);
+}
+__global__ void atomicExch_system_unsigned_int_n5(long* p, unsigned int v) {
+  atomicExch_system(p, v);
+}
+__global__ void atomicExch_system_unsigned_int_n6(long long* p, unsigned int v) {
+  atomicExch_system(p, v);
+}
+__global__ void atomicExch_system_unsigned_int_n7(Dummy* p, unsigned int v) {
+  atomicExch_system(p, v);
+}
+__global__ void atomicExch_system_unsigned_int_n8(unsigned int* p, Dummy v) {
+  atomicExch_system(p, v);
+}
+
+/* unsigned long long atomicExch_system(unsigned long long*, unsigned long long)
+ * Negative-parameter kernels for the overload above. Each kernel makes one mismatched call:
+ *   n1      - the pointer p is passed where the exchange value belongs
+ *   n2      - &p (pointer to the pointer) is passed as the address
+ *   n3..n7  - the address is a char, short, long, long long or Dummy pointer
+ *   n8      - a Dummy object is passed as the exchange value
+ * NOTE(review): presumably every call is meant to be rejected at compile time; confirm against
+ * the harness that compiles this file.
+ */
+__global__ void atomicExch_system_unsigned_long_long_n1(unsigned long long* p,
+                                                        unsigned long long v) {
+  atomicExch_system(p, p);
+}
+__global__ void atomicExch_system_unsigned_long_long_n2(unsigned long long* p,
+                                                        unsigned long long v) {
+  atomicExch_system(&p, v);
+}
+__global__ void atomicExch_system_unsigned_long_long_n3(char* p, unsigned long long v) {
+  atomicExch_system(p, v);
+}
+__global__ void atomicExch_system_unsigned_long_long_n4(short* p, unsigned long long v) {
+  atomicExch_system(p, v);
+}
+__global__ void atomicExch_system_unsigned_long_long_n5(long* p, unsigned long long v) {
+  atomicExch_system(p, v);
+}
+__global__ void atomicExch_system_unsigned_long_long_n6(long long* p, unsigned long long v) {
+  atomicExch_system(p, v);
+}
+__global__ void atomicExch_system_unsigned_long_long_n7(Dummy* p, unsigned long long v) {
+  atomicExch_system(p, v);
+}
+__global__ void atomicExch_system_unsigned_long_long_n8(unsigned long long* p, Dummy v) {
+  atomicExch_system(p, v);
+}
+
+/* float atomicExch_system(float*, float)
+ * Negative-parameter kernels for the overload above. Each kernel makes one mismatched call:
+ *   n1      - the pointer p is passed where the exchange value belongs
+ *   n2      - &p (pointer to the pointer) is passed as the address
+ *   n3..n7  - the address is a char, short, long, long long or Dummy pointer
+ *   n8      - a Dummy object is passed as the exchange value
+ * NOTE(review): presumably every call is meant to be rejected at compile time; confirm against
+ * the harness that compiles this file.
+ */
+__global__ void atomicExch_system_float_n1(float* p, float v) { atomicExch_system(p, p); }
+__global__ void atomicExch_system_float_n2(float* p, float v) { atomicExch_system(&p, v); }
+__global__ void atomicExch_system_float_n3(char* p, float v) { atomicExch_system(p, v); }
+__global__ void atomicExch_system_float_n4(short* p, float v) { atomicExch_system(p, v); }
+__global__ void atomicExch_system_float_n5(long* p, float v) { atomicExch_system(p, v); }
+__global__ void atomicExch_system_float_n6(long long* p, float v) { atomicExch_system(p, v); }
+__global__ void atomicExch_system_float_n7(Dummy* p, float v) { atomicExch_system(p, v); }
+__global__ void atomicExch_system_float_n8(float* p, Dummy v) { atomicExch_system(p, v); }
+
+/* double atomicExch_system(double*, double)
+ * Negative-parameter kernels for the overload above. Each kernel makes one mismatched call:
+ *   n1      - the pointer p is passed where the exchange value belongs
+ *   n2      - &p (pointer to the pointer) is passed as the address
+ *   n3..n7  - the address is a char, short, long, long long or Dummy pointer
+ *   n8      - a Dummy object is passed as the exchange value
+ * NOTE(review): presumably every call is meant to be rejected at compile time; confirm against
+ * the harness that compiles this file.
+ */
+__global__ void atomicExch_system_double_n1(double* p, double v) { atomicExch_system(p, p); }
+__global__ void atomicExch_system_double_n2(double* p, double v) { atomicExch_system(&p, v); }
+__global__ void atomicExch_system_double_n3(char* p, double v) { atomicExch_system(p, v); }
+__global__ void atomicExch_system_double_n4(short* p, double v) { atomicExch_system(p, v); }
+__global__ void atomicExch_system_double_n5(long* p, double v) { atomicExch_system(p, v); }
+__global__ void atomicExch_system_double_n6(long long* p, double v) { atomicExch_system(p, v); }
+__global__ void atomicExch_system_double_n7(Dummy* p, double v) { atomicExch_system(p, v); }
+__global__ void atomicExch_system_double_n8(double* p, Dummy v) { atomicExch_system(p, v); }
@@ -0,0 +1,142 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+static constexpr auto kAtomicExchSystemInt{  // Source text: eight kernels misusing atomicExch_system (int).
+    R"(
+        struct Dummy {
+          __device__ Dummy() {}
+          __device__ ~Dummy() {}
+        };
+
+        __global__ void atomicExch_system_int_n1(int* p, int v) { atomicExch_system(p, p); }
+        __global__ void atomicExch_system_int_n2(int* p, int v) { atomicExch_system(&p, v); }
+        __global__ void atomicExch_system_int_n3(char* p, int v) { atomicExch_system(p, v); }
+        __global__ void atomicExch_system_int_n4(short* p, int v) { atomicExch_system(p, v); }
+        __global__ void atomicExch_system_int_n5(long* p, int v) { atomicExch_system(p, v); }
+        __global__ void atomicExch_system_int_n6(long long* p, int v) { atomicExch_system(p, v); }
+        __global__ void atomicExch_system_int_n7(Dummy* p, int v) { atomicExch_system(p, v); }
+        __global__ void atomicExch_system_int_n8(int* p, Dummy v) { atomicExch_system(p, v); }
+    )"};
+
+static constexpr auto kAtomicExchSystemUnsignedInt{  // Source text: eight kernels misusing atomicExch_system (unsigned int).
+    R"(
+        struct Dummy {
+          __device__ Dummy() {}
+          __device__ ~Dummy() {}
+        };
+
+        __global__ void atomicExch_system_unsigned_int_n1(unsigned int* p, unsigned int v) {
+          atomicExch_system(p, p);
+        }
+        __global__ void atomicExch_system_unsigned_int_n2(unsigned int* p, unsigned int v) {
+          atomicExch_system(&p, v);
+        }
+        __global__ void atomicExch_system_unsigned_int_n3(char* p, unsigned int v) {
+          atomicExch_system(p, v);
+        }
+        __global__ void atomicExch_system_unsigned_int_n4(short* p, unsigned int v) {
+          atomicExch_system(p, v);
+        }
+        __global__ void atomicExch_system_unsigned_int_n5(long* p, unsigned int v) {
+          atomicExch_system(p, v);
+        }
+        __global__ void atomicExch_system_unsigned_int_n6(long long* p, unsigned int v) {
+          atomicExch_system(p, v);
+        }
+        __global__ void atomicExch_system_unsigned_int_n7(Dummy* p, unsigned int v) {
+          atomicExch_system(p, v);
+        }
+        __global__ void atomicExch_system_unsigned_int_n8(unsigned int* p, Dummy v) {
+          atomicExch_system(p, v);
+        }
+    )"};
+
+static constexpr auto kAtomicExchSystemULL{  // Source text: eight kernels misusing atomicExch_system (unsigned long long).
+    R"(
+        struct Dummy {
+          __device__ Dummy() {}
+          __device__ ~Dummy() {}
+        };
+
+        __global__ void atomicExch_system_unsigned_long_long_n1(unsigned long long* p,
+                                                                unsigned long long v) {
+          atomicExch_system(p, p);
+        }
+        __global__ void atomicExch_system_unsigned_long_long_n2(unsigned long long* p,
+                                                                unsigned long long v) {
+          atomicExch_system(&p, v);
+        }
+        __global__ void atomicExch_system_unsigned_long_long_n3(char* p, unsigned long long v) {
+          atomicExch_system(p, v);
+        }
+        __global__ void atomicExch_system_unsigned_long_long_n4(short* p, unsigned long long v) {
+          atomicExch_system(p, v);
+        }
+        __global__ void atomicExch_system_unsigned_long_long_n5(long* p, unsigned long long v) {
+          atomicExch_system(p, v);
+        }
+        __global__ void atomicExch_system_unsigned_long_long_n6(long long* p, unsigned long long v) {
+          atomicExch_system(p, v);
+        }
+        __global__ void atomicExch_system_unsigned_long_long_n7(Dummy* p, unsigned long long v) {
+          atomicExch_system(p, v);
+        }
+        __global__ void atomicExch_system_unsigned_long_long_n8(unsigned long long* p, Dummy v) {
+          atomicExch_system(p, v);
+        }
+    )"};
+
+static constexpr auto kAtomicExchSystemFloat{  // Source text: eight kernels misusing atomicExch_system (float).
+    R"(
+        struct Dummy {
+          __device__ Dummy() {}
+          __device__ ~Dummy() {}
+        };
+
+        __global__ void atomicExch_system_float_n1(float* p, float v) { atomicExch_system(p, p); }
+        __global__ void atomicExch_system_float_n2(float* p, float v) { atomicExch_system(&p, v); }
+        __global__ void atomicExch_system_float_n3(char* p, float v) { atomicExch_system(p, v); }
+        __global__ void atomicExch_system_float_n4(short* p, float v) { atomicExch_system(p, v); }
+        __global__ void atomicExch_system_float_n5(long* p, float v) { atomicExch_system(p, v); }
+        __global__ void atomicExch_system_float_n6(long long* p, float v) { atomicExch_system(p, v); }
+        __global__ void atomicExch_system_float_n7(Dummy* p, float v) { atomicExch_system(p, v); }
+        __global__ void atomicExch_system_float_n8(float* p, Dummy v) { atomicExch_system(p, v); }
+    )"};
+
+static constexpr auto kAtomicExchSystemDouble{  // Source text: eight kernels misusing atomicExch_system (double).
+    R"(
+        struct Dummy {
+          __device__ Dummy() {}
+          __device__ ~Dummy() {}
+        };
+
+        __global__ void atomicExch_system_double_n1(double* p, double v) { atomicExch_system(p, p); }
+        __global__ void atomicExch_system_double_n2(double* p, double v) { atomicExch_system(&p, v); }
+        __global__ void atomicExch_system_double_n3(char* p, double v) { atomicExch_system(p, v); }
+        __global__ void atomicExch_system_double_n4(short* p, double v) { atomicExch_system(p, v); }
+        __global__ void atomicExch_system_double_n5(long* p, double v) { atomicExch_system(p, v); }
+        __global__ void atomicExch_system_double_n6(long long* p, double v) { atomicExch_system(p, v); }
+        __global__ void atomicExch_system_double_n7(Dummy* p, double v) { atomicExch_system(p, v); }
+        __global__ void atomicExch_system_double_n8(double* p, Dummy v) { atomicExch_system(p, v); }
+    )"};
@@ -0,0 +1,107 @@
+# Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+import subprocess
+import sys
+import unittest
+
+class CompileAndCapture(unittest.TestCase):
+  """Compiles one source file with hipcc and checks how many diagnostics it produced.
+
+  Inputs are class attributes that must be assigned before unittest runs:
+    path: directory containing the file to compile; the include directories
+      are resolved relative to it.
+    file: name of the source file inside `path`.
+    hip_path: HIP installation root (the compiler is <hip_path>/bin/hipcc).
+    platform: 'amd' or 'nvidia'.
+    expected_error_count: exact number of 'error:' occurrences expected on
+      stderr; a negative value means "at least one".
+    expected_warning_count: same rule, applied to 'warning:'.
+  """
+  path = None
+  expected_error_count = 0
+  expected_warning_count = 0
+  hip_path = None
+  file = None
+  error_string = None
+  warning_string = None
+  platform = None
+
+  def setUp(self):
+    # Fix the diagnostic markers and reject a missing or unknown configuration
+    # before the compiler is ever invoked.
+    self.error_string = 'error:'
+    self.warning_string = 'warning:'
+    self.assertIsNotNone(self.hip_path)
+    self.assertIsNotNone(self.path)
+    self.assertIsNotNone(self.file)
+    self.assertIn(self.platform, ('amd', 'nvidia'))
+
+  def _assert_count(self, actual, expected):
+    # A negative expectation only requires "one or more"; any other value must
+    # match the observed count exactly.
+    if expected < 0:
+      self.assertGreater(actual, 0)
+    else:
+      self.assertEqual(actual, expected)
+
+  def test(self):
+    compiler_args = [
+      self.hip_path + '/bin/hipcc',
+      '-I' + self.path + '/../../external/Catch2',
+      '-I' + self.path + '/../../include',
+      '-I' + self.path + '/../../external/picojson',
+      '-c',
+      self.path + '/' + self.file,
+      ]
+    # HIP compiler on AMD platforms has limit of 20 errors, and some negative
+    # test cases expect that more errors are detected.
+    if self.platform == 'amd':
+      compiler_args.append('-ferror-limit=100')
+    compiler_output = subprocess.run(compiler_args, stderr=subprocess.PIPE)
+    # Echo the captured stderr so it is visible when ctest is run with -V.
+    # Undecodable bytes are replaced instead of raising, so an odd byte in a
+    # diagnostic cannot abort the check before the counts are compared.
+    compiler_stderr = compiler_output.stderr.decode('UTF-8', errors='replace')
+    print(compiler_stderr)
+
+    self._assert_count(compiler_stderr.count(self.error_string),
+                       self.expected_error_count)
+    self._assert_count(compiler_stderr.count(self.warning_string),
+                       self.expected_warning_count)
+
+if __name__ == '__main__':
+  def _argument(position, fallback=None, convert=None):
+    # Returns sys.argv[position], passed through `convert` when one is given,
+    # or `fallback` when that position was not supplied on the command line.
+    try:
+      raw = sys.argv[position]
+    except IndexError:
+      return fallback
+    return raw if convert is None else convert(raw)
+
+  # Positional arguments:
+  #   <path> <platform> <hip_path> <file> [expected_errors] [expected_warnings]
+  CompileAndCapture.path = _argument(1)
+  CompileAndCapture.platform = _argument(2)
+  CompileAndCapture.hip_path = _argument(3)
+  CompileAndCapture.file = _argument(4)
+  CompileAndCapture.expected_error_count = _argument(5, 0, int)
+  CompileAndCapture.expected_warning_count = _argument(6, 0, int)
+
+  # unittest parses the same argv as this script and would reject the
+  # positional arguments consumed above, so it is handed the program name only.
+  unittest.main(argv=[sys.argv[0]])
@@ -1,26 +1,25 @@
 # Common Tests - Test independent of all platforms
 set(TEST_SRC
-  hipCGThreadBlockType.cc
-  hipCGThreadBlockTypeViaBaseType.cc
-  hipCGThreadBlockTypeViaPublicApi.cc
-  hipCGMultiGridGroupType.cc
-  hipCGMultiGridGroupTypeViaBaseType.cc
-  hipCGMultiGridGroupTypeViaPublicApi.cc
+  hipCGThreadBlockType_old.cc
+  hipCGMultiGridGroupType_old.cc
+  hipCGGridGroupType_old.cc
+  hipCGTiledPartitionType_old.cc
+  hipCGThreadBlockTileTypeShfl_old.cc
+  hipCGCoalescedGroups_old.cc
+  hipLaunchCooperativeKernel_old.cc
+  hipLaunchCooperativeKernelMultiDevice_old.cc
  grid_group.cc
  coalesced_groups_shfl_down.cc
  coalesced_groups_shfl_up.cc
-  hipCGTiledPartition.cc
-  hipCGCoalescedGroups.cc
  coalesced_tiled_groups_metagrp.cc
 )
 if(HIP_PLATFORM STREQUAL "nvidia")
-  set_source_files_properties(hipCGMultiGridGroupType.cc PROPERTIES COMPILE_FLAGS "-rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80")
-  set_source_files_properties(hipCGMultiGridGroupTypeViaBaseType.cc PROPERTIES COMPILE_FLAGS "-D_CG_ABI_EXPERIMENTAL -rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80")
-  set_source_files_properties(hipCGMultiGridGroupTypeViaPublicApi.cc PROPERTIES COMPILE_FLAGS "-rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80")
+  set_source_files_properties(hipCGMultiGridGroupType_old.cc PROPERTIES COMPILE_FLAGS "-D_CG_ABI_EXPERIMENTAL -rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80")
+  set_source_files_properties(hipLaunchCooperativeKernelMultiDevice_old.cc PROPERTIES COMPILE_FLAGS "-D_CG_ABI_EXPERIMENTAL -rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80")
  hip_add_exe_to_target(NAME coopGrpTest
                      TEST_SRC ${TEST_SRC}
                      TEST_TARGET_NAME build_tests
-                      LINKER_LIBS "-rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80")
+                      # nvcc options are separated by spaces only; a comma left after a
+                      # code= value would become part of that -gencode argument.
+                      LINKER_LIBS "-rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode=arch=compute_86,code=compute_86")
 else()
  hip_add_exe_to_target(NAME coopGrpTest
                      TEST_SRC ${TEST_SRC}
@@ -0,0 +1,496 @@
+/*
+Copyright (c) 2020 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+#include <hip_test_common.hh>
+#include <hip/hip_cooperative_groups.h>
+
+#include <vector>
+
+#include "hip_cg_common.hh"
+
+namespace cg = cooperative_groups;
+
+// Device-global pair used by the grid-group kernels' sync check: thread 0 of block 0 stores
+// 10 in gm[0], thread 0 of block 1 stores 20 in gm[1], and every thread reads both values
+// after the group sync.
+static __device__ int gm[2];
+
+// Records, for every thread, what cg::grid_group reports for the current launch.
+//   size_dev, thd_rank_dev, is_valid_dev: receive size(), thread_rank() and is_valid().
+//   sync_dev: receives gm[0] * gm[1], read after the grid-wide sync.
+// Each output array is indexed by blockIdx.x * blockDim.x + threadIdx.x.
+static __global__ void kernel_cg_grid_group_type(int* size_dev, int* thd_rank_dev,
+                                                 int* is_valid_dev, int* sync_dev) {
+  const int slot = threadIdx.x + (blockDim.x * blockIdx.x);
+  cg::grid_group grid = cg::this_grid();
+
+  size_dev[slot] = grid.size();
+  thd_rank_dev[slot] = grid.thread_rank();
+  is_valid_dev[slot] = grid.is_valid();
+
+  // Sync check: the first thread of block 0 and of block 1 each publish one value, and all
+  // threads combine both values once the barrier has been passed.
+  if (threadIdx.x == 0) {
+    if (blockIdx.x == 0) {
+      gm[0] = 10;
+    } else if (blockIdx.x == 1) {
+      gm[1] = 20;
+    }
+  }
+  grid.sync();
+  sync_dev[slot] = gm[0] * gm[1];
+}
+
+// Records, for every thread, what the grid group reports when it is held through the
+// cg::thread_group base type.
+//   size_dev, thd_rank_dev: receive size() and thread_rank().
+//   is_valid_dev: receives is_valid() on AMD builds and a constant true otherwise.
+//   sync_dev: receives gm[0] * gm[1], read after the group sync.
+// Each output array is indexed by blockIdx.x * blockDim.x + threadIdx.x.
+static __global__ void kernel_cg_grid_group_type_via_base_type(int* size_dev, int* thd_rank_dev,
+                                                               int* is_valid_dev, int* sync_dev) {
+  const int slot = threadIdx.x + (blockDim.x * blockIdx.x);
+  cg::thread_group group = cg::this_grid();
+
+  size_dev[slot] = group.size();
+  thd_rank_dev[slot] = group.thread_rank();
+
+#ifdef __HIP_PLATFORM_AMD__
+  is_valid_dev[slot] = group.is_valid();
+#else
+  // thread_group offers no is_valid() on CUDA, so the group is reported as valid.
+  is_valid_dev[slot] = true;
+#endif
+
+  // Sync check: the first thread of block 0 and of block 1 each publish one value, and all
+  // threads combine both values once the barrier has been passed.
+  if (threadIdx.x == 0) {
+    if (blockIdx.x == 0) {
+      gm[0] = 10;
+    } else if (blockIdx.x == 1) {
+      gm[1] = 20;
+    }
+  }
+  group.sync();
+  sync_dev[slot] = gm[0] * gm[1];
+}
+
+// Records, for every thread, what the free-function cooperative-groups API reports for the
+// grid group.
+//   size_dev, thd_rank_dev: receive cg::group_size(gg) and cg::thread_rank(gg).
+//   is_valid_dev: receives the member call is_valid().
+//   sync_dev: receives gm[0] * gm[1], read after cg::sync().
+// Each output array is indexed by blockIdx.x * blockDim.x + threadIdx.x.
+static __global__ void kernel_cg_grid_group_type_via_public_api(int* size_dev, int* thd_rank_dev,
+                                                                int* is_valid_dev, int* sync_dev) {
+  const int slot = threadIdx.x + (blockDim.x * blockIdx.x);
+  cg::grid_group grid = cg::this_grid();
+
+  size_dev[slot] = cg::group_size(grid);
+  thd_rank_dev[slot] = cg::thread_rank(grid);
+  is_valid_dev[slot] = grid.is_valid();
+
+  // Sync check: the first thread of block 0 and of block 1 each publish one value, and all
+  // threads combine both values once the barrier has been passed.
+  if (threadIdx.x == 0) {
+    if (blockIdx.x == 0) {
+      gm[0] = 10;
+    } else if (blockIdx.x == 1) {
+      gm[1] = 20;
+    }
+  }
+  cg::sync(grid);
+  sync_dev[slot] = gm[0] * gm[1];
+}
+
+// Cooperative kernel that repeatedly mixes two arrays of array_len elements.
+// Each of the `loops` rounds has two phases separated by grid-wide syncs:
+//   1. second_array[k] += first_array[k]
+//   2. first_array[k]  += second_array[array_len - k - 1]
+// Every thread starts at its grid rank and advances by the grid size.
+static __global__ void coop_kernel(unsigned int* first_array, unsigned int* second_array,
+                                   unsigned int loops, unsigned int array_len) {
+  cg::grid_group grid = cg::this_grid();
+  const unsigned int rank = grid.thread_rank();
+  const unsigned int grid_size = grid.size();
+
+  // Counters are unsigned so they have the same type as the bounds and strides they are
+  // compared with and incremented by (loops, array_len, rank, grid_size).
+  for (unsigned int i = 0; i < loops; i++) {
+    // The goal of this loop is to directly add in values from
+    // array one into array two, on a per-wave basis.
+    for (unsigned int offset = rank; offset < array_len; offset += grid_size) {
+      second_array[offset] += first_array[offset];
+    }
+
+    grid.sync();
+
+    // The goal of this loop is to pull data from the "mirror" lane in
+    // array two and add it back into array one. This causes inter-
+    // thread swizzling.
+    for (unsigned int offset = rank; offset < array_len; offset += grid_size) {
+      const unsigned int swizzle_offset = array_len - offset - 1;
+      first_array[offset] += second_array[swizzle_offset];
+    }
+
+    grid.sync();
+  }
+}
+
+// Cooperative kernel used to check that grid.sync() holds all blocks back each iteration.
+// Per loop iteration, thread 0 of every block stores the value returned by atomicInc on
+// atomic_val[0] into array[blockIdx.x + i * gridDim.x]; the highest-ranked thread of the
+// grid first busy-waits on clock64() so that it reaches the atomic late.
+static __global__ void test_kernel(unsigned int* atomic_val, unsigned int* array,
+                                   unsigned int loops) {
+  cg::grid_group grid = cg::this_grid();
+  unsigned rank = grid.thread_rank();
+
+  int offset = blockIdx.x;
+  for (int i = 0; i < loops; i++) {
+    // Make the last thread run way behind everyone else.
+    // If the barrier below fails, then the other threads may hit the
+    // atomicInc instruction many times before the last thread ever gets to it.
+    // As such, without the barrier, the last array entry will eventually
+    // contain a very large value, defined by however many times the other
+    // wavefronts make it through this loop.
+    // If the barrier works, then it will likely contain some number
+    // near "total number of blocks". It will be the last wavefront to
+    // reach the atomicInc, but everyone will have only hit the atomic once.
+    if (rank == (grid.size() - 1)) {
+      long long time_diff = 0;
+      long long last_clock = clock64();
+      do {
+        long long cur_clock = clock64();
+        if (cur_clock > last_clock) {
+          time_diff += (cur_clock - last_clock);
+        }
+        // If it rolls over, we don't know how much to add to catch up.
+        // So just ignore those slipped cycles.
+        last_clock = cur_clock;
+      } while (time_diff < 1000000);
+    }
+
+    if (threadIdx.x == 0) {
+      array[offset] = atomicInc(&atomic_val[0], UINT_MAX);
+    }
+    grid.sync();
+    // Move to this block's slot for the next iteration.
+    offset += gridDim.x;
+  }
+}
+
+// Variant of the barrier-check kernel whose delay loop is timed with wall_clock64().
+// The body is compiled only when HT_AMD is set; otherwise the kernel is empty.
+// Declared static like the other kernels in this file so the symbol stays local to this
+// translation unit.
+static __global__ void test_kernel_gfx11(unsigned int* atomic_val, unsigned int* array,
+                                         unsigned int loops) {
+#if HT_AMD
+  cg::grid_group grid = cg::this_grid();
+  unsigned rank = grid.thread_rank();
+
+  int offset = blockIdx.x;
+  for (int i = 0; i < loops; i++) {
+    // Make the last thread run way behind everyone else.
+    // If the barrier below fails, then the other threads may hit the
+    // atomicInc instruction many times before the last thread ever gets
+    // to it.
+    // As such, without the barrier, the last array entry will eventually
+    // contain a very large value, defined by however many times the other
+    // wavefronts make it through this loop.
+    // If the barrier works, then it will likely contain some number
+    // near "total number of blocks". It will be the last wavefront to
+    // reach the atomicInc, but everyone will have only hit the atomic once.
+    if (rank == (grid.size() - 1)) {
+      long long time_diff = 0;
+      long long last_clock = wall_clock64();
+      do {
+        long long cur_clock = wall_clock64();
+        if (cur_clock > last_clock) {
+          time_diff += (cur_clock - last_clock);
+        }
+        // If it rolls over, we don't know how much to add to catch up.
+        // So just ignore those slipped cycles.
+        last_clock = cur_clock;
+      } while (time_diff < 1000000);
+    }
+
+    if (threadIdx.x == 0) {
+      array[offset] = atomicInc(&atomic_val[0], UINT_MAX);
+    }
+    grid.sync();
+    // Move to this block's slot for the next iteration.
+    offset += gridDim.x;
+  }
+#endif
+}
+
+// Replays coop_kernel's arithmetic on the host and compares it with the device results.
+//   host_input:   the data originally uploaded as the first array; it is updated in place
+//                 and ends up holding the expected contents of the first array.
+//   first_array:  first array as read back from the device.
+//   second_array: second array as read back from the device (expected to start at zero).
+//   loops:        number of rounds the kernel ran.
+//   array_len:    number of elements in each array.
+static void verify_coop_buffers(unsigned int* host_input, unsigned int* first_array,
+                                unsigned int* second_array, unsigned int loops,
+                                unsigned int array_len) {
+  unsigned int* expected_first_array = host_input;
+  // Owned by a vector so the storage is released on every exit path, including a REQUIRE
+  // that aborts the test, and so no unchecked allocation result is dereferenced.
+  std::vector<unsigned int> expected_second_array(array_len, 0u);
+
+  for (unsigned int i = 0; i < loops; i++) {
+    for (unsigned int offset = 0; offset < array_len; offset++) {
+      expected_second_array[offset] += expected_first_array[offset];
+    }
+
+    for (unsigned int offset = 0; offset < array_len; offset++) {
+      const unsigned int swizzle_offset = array_len - offset - 1;
+      expected_first_array[offset] += expected_second_array[swizzle_offset];
+    }
+  }
+
+  for (unsigned int i = 0; i < array_len; i++) {
+    REQUIRE(first_array[i] == expected_first_array[i]);
+    REQUIRE(second_array[i] == expected_second_array[i]);
+  }
+}
+
+// Checks the values recorded by the barrier kernel: entry j of iteration i, stored at
+// host_buffer[i * warps + j], must not exceed (i + 1) * warps.
+//   loops:       number of iterations recorded.
+//   warps:       number of entries recorded per iteration.
+//   host_buffer: loops * warps recorded values.
+static void verify_barrier_buffer(unsigned int loops, unsigned int warps,
+                                  unsigned int* host_buffer) {
+  for (unsigned int i = 0; i != loops; ++i) {
+    // Upper bound for anything recorded during iteration i.
+    const unsigned int max_in_this_loop = warps * (i + 1);
+    for (unsigned int j = 0; j != warps; ++j) {
+      REQUIRE(host_buffer[i * warps + j] <= max_in_this_loop);
+    }
+  }
+}
+
+// Launches kernel_func cooperatively on a grid of 2 blocks x block_size threads and checks
+// the four per-thread result arrays it fills:
+//   size        == 2 * block_size for every thread,
+//   thread rank == the thread's linear index,
+//   is_valid    == 1,
+//   sync value  == 200 (the product of the 10 and 20 the kernels publish through gm).
+//   kernel_func: kernel taking (int* size, int* thread_rank, int* is_valid, int* sync).
+//   block_size:  number of threads per block.
+template <typename F> static void test_cg_grid_group_type(F kernel_func, int block_size) {
+  // One int per thread of the 2-block grid.
+  int num_bytes = sizeof(int) * 2 * block_size;
+  int *size_dev, *size_host;
+  int *thd_rank_dev, *thd_rank_host;
+  int *is_valid_dev, *is_valid_host;
+  int *sync_dev, *sync_host;
+
+  // Allocate device memory
+  HIP_CHECK(hipMalloc(&size_dev, num_bytes));
+  HIP_CHECK(hipMalloc(&thd_rank_dev, num_bytes));
+  HIP_CHECK(hipMalloc(&is_valid_dev, num_bytes));
+  HIP_CHECK(hipMalloc(&sync_dev, num_bytes));
+
+  // Allocate host memory
+  HIP_CHECK(hipHostMalloc(&size_host, num_bytes));
+  HIP_CHECK(hipHostMalloc(&thd_rank_host, num_bytes));
+  HIP_CHECK(hipHostMalloc(&is_valid_host, num_bytes));
+  HIP_CHECK(hipHostMalloc(&sync_host, num_bytes));
+
+  // Launch Kernel
+  // Each entry points at the variable holding the corresponding kernel argument.
+  void* params[4];
+  params[0] = &size_dev;
+  params[1] = &thd_rank_dev;
+  params[2] = &is_valid_dev;
+  params[3] = &sync_dev;
+  HIP_CHECK(hipLaunchCooperativeKernel(kernel_func, 2, block_size, params, 0, 0));
+
+  // Copy result from device to host
+  HIP_CHECK(hipMemcpy(size_host, size_dev, num_bytes, hipMemcpyDeviceToHost));
+  HIP_CHECK(hipMemcpy(thd_rank_host, thd_rank_dev, num_bytes, hipMemcpyDeviceToHost));
+  HIP_CHECK(hipMemcpy(is_valid_host, is_valid_dev, num_bytes, hipMemcpyDeviceToHost));
+  HIP_CHECK(hipMemcpy(sync_host, sync_dev, num_bytes, hipMemcpyDeviceToHost));
+
+  // Validate results for both blocks together
+  for (int i = 0; i < 2 * block_size; ++i) {
+    ASSERT_EQUAL(size_host[i], 2 * block_size);
+    ASSERT_EQUAL(thd_rank_host[i], i);
+    ASSERT_EQUAL(is_valid_host[i], 1);
+    ASSERT_EQUAL(sync_host[i], 200);
+  }
+
+  // Free device memory
+  HIP_CHECK(hipFree(size_dev));
+  HIP_CHECK(hipFree(thd_rank_dev));
+  HIP_CHECK(hipFree(is_valid_dev));
+  HIP_CHECK(hipFree(sync_dev));
+
+  // Free host memory
+  HIP_CHECK(hipHostFree(size_host));
+  HIP_CHECK(hipHostFree(thd_rank_host));
+  HIP_CHECK(hipHostFree(is_valid_host));
+  HIP_CHECK(hipHostFree(sync_host));
+}
+
+// Checks the size, thread rank, validity and sync results reported for a grid group on the
+// current device, once per kernel variant selected by the SECTIONs below, for power-of-two
+// block sizes and for ten pseudo-random block sizes.
+// The test is skipped when the device does not report cooperative launch support.
+TEST_CASE("Unit_hipCGGridGroupType_Basic") {
+  // Use default device for validating the test
+  int device;
+  hipDeviceProp_t device_properties;
+  HIP_CHECK(hipGetDevice(&device));
+  HIP_CHECK(hipGetDeviceProperties(&device_properties, device));
+
+  if (!device_properties.cooperativeLaunch) {
+    HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
+    return;
+  }
+
+  // Starts out null so that a run in which no SECTION below is entered hands a well-defined
+  // pointer to the launch call instead of an indeterminate one.
+  void* (*kernel_func)(void) = nullptr;
+
+  SECTION("Default grid group API test") {
+    kernel_func = reinterpret_cast<void* (*)()>(kernel_cg_grid_group_type);
+  }
+#if HT_AMD
+  SECTION("Base type grid group API test") {
+    kernel_func = reinterpret_cast<void* (*)()>(kernel_cg_grid_group_type_via_base_type);
+  }
+#endif
+
+  SECTION("Public API grid group test") {
+    kernel_func = reinterpret_cast<void* (*)()>(kernel_cg_grid_group_type_via_public_api);
+  }
+
+  // Test for block_size in powers of 2
+  int max_threads_per_blk = device_properties.maxThreadsPerBlock;
+  for (int block_size = 2; block_size <= max_threads_per_blk; block_size = block_size * 2) {
+    test_cg_grid_group_type(kernel_func, block_size);
+  }
+
+  // Test for random blockSizes, but the sequence is the same every execution
+  srand(0);
+  for (int i = 0; i < 10; i++) {
+    // Test fails for only 1 thread per block
+    test_cg_grid_group_type(kernel_func, max(2, rand() % max_threads_per_blk));
+  }
+}
+
+// Runs coop_kernel on every device, for each combination of loop count and array width, and
+// compares both result arrays with a host-side replay of the same arithmetic.
+// A device that does not report cooperative launch support is skipped.
+TEST_CASE("Unit_hipCGGridGroupType_DataSharing") {
+  const auto device = GENERATE(range(0, HipTest::getDeviceCount()));
+  HIP_CHECK(hipSetDevice(device));
+
+  hipDeviceProp_t device_properties;
+
+  HIP_CHECK(hipGetDeviceProperties(&device_properties, device));
+
+  if (!device_properties.cooperativeLaunch) {
+    HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
+    return;
+  }
+
+  int loops = GENERATE(1, 2, 3, 4);
+  int width = GENERATE(512, 1024, 2048, 4096);
+
+  // Launch enough waves to fill up all of the GPU
+  int warp_size = device_properties.warpSize;
+  int num_sms = device_properties.multiProcessorCount;
+
+  // Calculate the device occupancy to know how many blocks can be run.
+  int max_blocks_per_sm;
+  HIP_CHECK(
+      hipOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm, coop_kernel, warp_size, 0));
+
+  int num_blocks = max_blocks_per_sm * num_sms;
+
+  // Create the stream that orders the copies and the kernel launch
+  hipStream_t stream;
+  HIP_CHECK(hipStreamCreate(&stream));
+
+  // Allocate and initialize data
+
+  // Allocate the host input buffer, and two device buffers
+  unsigned int* input_buffer =
+      reinterpret_cast<unsigned int*>(malloc(sizeof(unsigned int) * width));
+  for (int i = 0; i < width; i++) {
+    input_buffer[i] = i;
+  }
+
+  unsigned int *dev_mem_1, *host_mem_1;
+  host_mem_1 = reinterpret_cast<unsigned int*>(malloc(sizeof(unsigned int) * width));
+  HIP_CHECK(hipMalloc(&dev_mem_1, sizeof(unsigned int) * width));
+  HIP_CHECK(hipMemcpyAsync(dev_mem_1, input_buffer, sizeof(unsigned int) * width,
+                           hipMemcpyHostToDevice, stream));
+
+  // The second device buffer starts out zeroed
+  unsigned int *dev_mem_2, *host_mem_2;
+  host_mem_2 = reinterpret_cast<unsigned int*>(malloc(sizeof(unsigned int) * width));
+  HIP_CHECK(hipMalloc(&dev_mem_2, sizeof(unsigned int) * width));
+  HIP_CHECK(hipMemsetAsync(dev_mem_2, 0, width * sizeof(unsigned int), stream));
+
+  // Launch the kernels
+  INFO("Launching a cooperative kernel with " << num_blocks << " blocks, each with " << warp_size
+                                              << " threads");
+
+  // Each entry points at the variable holding the corresponding kernel argument
+  void* coop_params[4];
+  coop_params[0] = reinterpret_cast<void*>(&dev_mem_1);
+  coop_params[1] = reinterpret_cast<void*>(&dev_mem_2);
+  coop_params[2] = reinterpret_cast<void*>(&loops);
+  coop_params[3] = reinterpret_cast<void*>(&width);
+  HIP_CHECK(hipLaunchCooperativeKernel(coop_kernel, num_blocks, warp_size, coop_params, 0, stream));
+
+  // Read both device buffers back to the host
+  HIP_CHECK(hipMemcpyAsync(host_mem_1, dev_mem_1, sizeof(unsigned int) * width,
+                           hipMemcpyDeviceToHost, stream));
+  HIP_CHECK(hipMemcpyAsync(host_mem_2, dev_mem_2, sizeof(unsigned int) * width,
+                           hipMemcpyDeviceToHost, stream));
+
+  // Wait for the copies above before the host reads the results
+  HIP_CHECK(hipStreamSynchronize(stream));
+
+  // Note: the verification updates input_buffer in place while computing the expected values
+  verify_coop_buffers(input_buffer, host_mem_1, host_mem_2, loops, width);
+
+  HIP_CHECK(hipStreamDestroy(stream));
+  HIP_CHECK(hipFree(dev_mem_1));
+  HIP_CHECK(hipFree(dev_mem_2));
+  free(input_buffer);
+  free(host_mem_1);
+  free(host_mem_2);
+}
+
+// Runs the barrier-check kernel on every device, for each combination of loop count and
+// block count, and verifies that no value recorded in iteration i exceeds
+// (i + 1) * requested_blocks.
+// A device that does not report cooperative launch support is skipped; the test fails when
+// more blocks are requested than the occupancy query allows.
+TEST_CASE("Unit_hipCGGridGroupType_Barrier") {
+  const auto device = GENERATE(range(0, HipTest::getDeviceCount()));
+  HIP_CHECK(hipSetDevice(device));
+
+  hipDeviceProp_t device_properties;
+
+  HIP_CHECK(hipGetDeviceProperties(&device_properties, device));
+
+  if (!device_properties.cooperativeLaunch) {
+    HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
+    return;
+  }
+
+  uint32_t loops = GENERATE(1, 2, 3, 4);
+  uint32_t warps = GENERATE(4, 8, 16, 32);
+  uint32_t block_size = 1;
+
+  // Test whether the requested size will fit on the GPU
+  int max_blocks_per_sm;
+  int warp_size = device_properties.warpSize;
+  int num_sms = device_properties.multiProcessorCount;
+
+  int num_threads_in_block = block_size * warp_size;
+
+  // Pick the kernel variant according to IsGfx11()
+  auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
+  // Calculate the device occupancy to know how many blocks can be run.
+  HIP_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm, test_kernel_used,
+                                                         num_threads_in_block, 0));
+
+  int requested_blocks = warps / block_size;
+  if (requested_blocks > max_blocks_per_sm * num_sms) {
+    INFO("Too many blocks requested!");
+    REQUIRE(false);
+  }
+
+  // Each block will output a single value per loop.
+  uint32_t total_buffer_len = requested_blocks * loops;
+
+  // Allocate the buffer that will hold the kernel's output, and which will
+  // also be used to globally synchronize during GWS initialization
+  unsigned int* host_buffer =
+      reinterpret_cast<unsigned int*>(calloc(total_buffer_len, sizeof(unsigned int)));
+
+  // Device copy of the output buffer, initialised from the zeroed host buffer
+  unsigned int* kernel_buffer;
+  HIP_CHECK(hipMalloc(&kernel_buffer, sizeof(unsigned int) * total_buffer_len));
+  HIP_CHECK(hipMemcpy(kernel_buffer, host_buffer, sizeof(unsigned int) * total_buffer_len,
+                      hipMemcpyHostToDevice));
+
+  // Single counter that the kernel increments atomically, starting at zero
+  unsigned int* kernel_atomic;
+  HIP_CHECK(hipMalloc(&kernel_atomic, sizeof(unsigned int)));
+  HIP_CHECK(hipMemset(kernel_atomic, 0, sizeof(unsigned int)));
+
+  // Launch the kernel
+  INFO("Launching a cooperative kernel with " << warps << " warps in " << requested_blocks
+                                              << " thread blocks");
+
+  // Each entry points at the variable holding the corresponding kernel argument
+  void* params[3];
+  params[0] = reinterpret_cast<void*>(&kernel_atomic);
+  params[1] = reinterpret_cast<void*>(&kernel_buffer);
+  params[2] = reinterpret_cast<void*>(&loops);
+  HIP_CHECK(hipLaunchCooperativeKernel(test_kernel_used, requested_blocks, num_threads_in_block,
+                                       params, 0, 0));
+
+  // Read back the buffer to host
+  HIP_CHECK(hipMemcpy(host_buffer, kernel_buffer, sizeof(unsigned int) * total_buffer_len,
+                      hipMemcpyDeviceToHost));
+
+  verify_barrier_buffer(loops, requested_blocks, host_buffer);
+
+  HIP_CHECK(hipFree(kernel_buffer));
+  HIP_CHECK(hipFree(kernel_atomic));
+  free(host_buffer);
+}
@@ -1,240 +0,0 @@
-/*
-Copyright (c) 2020 - 2021 Advanced Micro Devices, Inc. All rights reserved.
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-
-
-/* HIT_START
- * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 -rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80
- * TEST: %t
- * HIT_END
- */
-
-#include <hip_test_common.hh>
-#include <hip/hip_cooperative_groups.h>
-
-#define ASSERT_EQUAL(lhs, rhs) HIPASSERT(lhs == rhs)
-#define ASSERT_LE(lhs, rhs) HIPASSERT(lhs <= rhs)
-#define ASSERT_GE(lhs, rhs) HIPASSERT(lhs >= rhs)
-
-using namespace cooperative_groups;
-constexpr int MaxGPUs = 8;
-
-static __global__
-void kernel_cg_multi_grid_group_type(int* numGridsTestD,
-                                     int* gridRankTestD,
-                                     int *sizeTestD,
-                                     int *thdRankTestD,
-                                     int *isValidTestD,
-                                     int *syncTestD,
-                                     int *syncResultD)
-{
-  multi_grid_group mg = this_multi_grid();
-  int gIdx = (blockIdx.x * blockDim.x) + threadIdx.x;
-
-  // Test num_grids
-  numGridsTestD[gIdx] = mg.num_grids();
-
-  // Test grid_rank
-  gridRankTestD[gIdx] = mg.grid_rank();
-
-  // Test size
-  sizeTestD[gIdx] = mg.size();
-
-  // Test thread_rank
-  thdRankTestD[gIdx] = mg.thread_rank();
-
-  // Test is_valid
-  isValidTestD[gIdx] = mg.is_valid();
-
-  // Test sync
-  //
-  // Eech thread assign 1 to their respective location
-  syncTestD[gIdx] = 1;
-  // Grid level sync
-  this_grid().sync();
-  // Thread 0 from work-group 0 of current grid (gpu) does grid level reduction
-  if (blockIdx.x == 0 && threadIdx.x == 0) {
-    for (uint i = 1; i < gridDim.x * blockDim.x; ++i) {
-      syncTestD[0] += syncTestD[i];
-    }
-    syncResultD[mg.grid_rank() + 1] = syncTestD[0];
-  }
-  // multi-grid level sync
-  mg.sync();
-  // grid (gpu) 0 does final reduction across all grids (gpus)
-  if (mg.grid_rank() == 0 && blockIdx.x == 0 && threadIdx.x == 0) {
-    syncResultD[0] = 0;
-    for (uint i = 1; i <= mg.num_grids(); ++i) {
-      syncResultD[0] += syncResultD[i];
-    }
-  }
-}
-
-static void test_cg_multi_grid_group_type(int blockSize, int nGpu)
-{
-  // Create a stream each device
-  hipStream_t stream[MaxGPUs];
-  for (int i = 0; i < nGpu; i++) {
-    HIPCHECK(hipSetDevice(i));
-    HIPCHECK(hipDeviceSynchronize());  // Make sure work is done on this device
-    HIPCHECK(hipStreamCreate(&stream[i]));
-  }
-
-  // Allocate host and device memory
-  int nBytes = sizeof(int) * 2 * blockSize;
-  int *numGridsTestD[MaxGPUs], *numGridsTestH[MaxGPUs];
-  int *gridRankTestD[MaxGPUs], *gridRankTestH[MaxGPUs];
-  int *sizeTestD[MaxGPUs], *sizeTestH[MaxGPUs];
-  int *thdRankTestD[MaxGPUs], *thdRankTestH[MaxGPUs];
-  int *isValidTestD[MaxGPUs], *isValidTestH[MaxGPUs];
-  int *syncTestD[MaxGPUs], *syncResultD;
-  for (int i = 0; i < nGpu; i++) {
-    HIPCHECK(hipSetDevice(i));
-
-    HIPCHECK(hipMalloc(&numGridsTestD[i], nBytes));
-    HIPCHECK(hipMalloc(&gridRankTestD[i], nBytes));
-    HIPCHECK(hipMalloc(&sizeTestD[i], nBytes));
-    HIPCHECK(hipMalloc(&thdRankTestD[i], nBytes));
-    HIPCHECK(hipMalloc(&isValidTestD[i], nBytes));
-    HIPCHECK(hipMalloc(&syncTestD[i], nBytes));
-
-    HIPCHECK(hipHostMalloc(&numGridsTestH[i], nBytes));
-    HIPCHECK(hipHostMalloc(&gridRankTestH[i], nBytes));
-    HIPCHECK(hipHostMalloc(&sizeTestH[i], nBytes));
-    HIPCHECK(hipHostMalloc(&thdRankTestH[i], nBytes));
-    HIPCHECK(hipHostMalloc(&isValidTestH[i], nBytes));
-
-    if (i == 0) {
-      HIPCHECK(hipHostMalloc(&syncResultD, sizeof(int) * (nGpu + 1), hipHostMallocCoherent));
-    }
-  }
-
-  // Launch Kernel
-  constexpr int NumKernelArgs = 7;
-  hipLaunchParams* launchParamsList = new hipLaunchParams[nGpu];
-  void* args[MaxGPUs * NumKernelArgs];
-  for (int i = 0; i < nGpu; i++) {
-    HIPCHECK(hipSetDevice(i));
-
-    args[i * NumKernelArgs]     = &numGridsTestD[i];
-    args[i * NumKernelArgs + 1] = &gridRankTestD[i];
-    args[i * NumKernelArgs + 2] = &sizeTestD[i];
-    args[i * NumKernelArgs + 3] = &thdRankTestD[i];
-    args[i * NumKernelArgs + 4] = &isValidTestD[i];
-    args[i * NumKernelArgs + 5] = &syncTestD[i];
-    args[i * NumKernelArgs + 6] = &syncResultD;
-
-    launchParamsList[i].func = reinterpret_cast<void*>(kernel_cg_multi_grid_group_type);
-    launchParamsList[i].gridDim = 2;
-    launchParamsList[i].blockDim = blockSize;
-    launchParamsList[i].sharedMem = 0;
-    launchParamsList[i].stream = stream[i];
-    launchParamsList[i].args = &args[i * NumKernelArgs];
-  }
-  HIPCHECK(hipLaunchCooperativeKernelMultiDevice(launchParamsList, nGpu, 0));
-
-  // Copy result from device to host
-  for (int i = 0; i < nGpu; i++) {
-    HIPCHECK(hipSetDevice(i));
-    HIPCHECK(hipMemcpy(numGridsTestH[i], numGridsTestD[i], nBytes, hipMemcpyDeviceToHost));
-    HIPCHECK(hipMemcpy(gridRankTestH[i], gridRankTestD[i], nBytes, hipMemcpyDeviceToHost));
-    HIPCHECK(hipMemcpy(sizeTestH[i], sizeTestD[i], nBytes, hipMemcpyDeviceToHost));
-    HIPCHECK(hipMemcpy(thdRankTestH[i], thdRankTestD[i], nBytes, hipMemcpyDeviceToHost));
-    HIPCHECK(hipMemcpy(isValidTestH[i], isValidTestD[i], nBytes, hipMemcpyDeviceToHost));
-  }
-
-  // Validate results
-  int gridsSeen[MaxGPUs];
-  for (int i = 0; i < nGpu; ++i) {
-    for (int j = 0; j < 2 * blockSize; ++j) {
-      ASSERT_EQUAL(numGridsTestH[i][j], nGpu);
-      ASSERT_GE(gridRankTestH[i][j], 0);
-      ASSERT_LE(gridRankTestH[i][j], nGpu-1);
-      ASSERT_EQUAL(gridRankTestH[i][j], gridRankTestH[i][0]);
-      ASSERT_EQUAL(sizeTestH[i][j], nGpu * 2 * blockSize);
-      int gridRank = gridRankTestH[i][j];
-      ASSERT_EQUAL(thdRankTestH[i][j], (gridRank * 2 * blockSize) + j);
-      ASSERT_EQUAL(isValidTestH[i][j], 1);
-    }
-    ASSERT_EQUAL(syncResultD[i+1],  2 * blockSize);
-
-    // Validate uniqueness property of grid rank
-    gridsSeen[i] = gridRankTestH[i][0];
-    for (int k = 0; k < i; ++k) {
-      if (gridsSeen[k] == gridsSeen[i]) {
-        assert(false && "Grid rank in multi-gpu setup should be unique");
-      }
-    }
-  }
-  ASSERT_EQUAL(syncResultD[0], nGpu * 2 * blockSize);
-
-  // Free host and device memory
-  delete [] launchParamsList;
-  for (int i = 0; i < nGpu; i++) {
-    HIPCHECK(hipSetDevice(i));
-
-    HIPCHECK(hipFree(numGridsTestD[i]));
-    HIPCHECK(hipFree(gridRankTestD[i]));
-    HIPCHECK(hipFree(sizeTestD[i]));
-    HIPCHECK(hipFree(thdRankTestD[i]));
-    HIPCHECK(hipFree(isValidTestD[i]));
-    HIPCHECK(hipFree(syncTestD[i]));
-
-    if (i == 0) {
-      HIPCHECK(hipHostFree(syncResultD));
-    }
-    HIPCHECK(hipHostFree(numGridsTestH[i]));
-    HIPCHECK(hipHostFree(gridRankTestH[i]));
-    HIPCHECK(hipHostFree(sizeTestH[i]));
-    HIPCHECK(hipHostFree(thdRankTestH[i]));
-    HIPCHECK(hipHostFree(isValidTestH[i]));
-  }
-}
-
-TEST_CASE("Unit_hipCGMultiGridGroupType") {
-  int nGpu = 0;
-  HIPCHECK(hipGetDeviceCount(&nGpu));
-  nGpu = min(nGpu, MaxGPUs);
-
-  // Set `maxThreadsPerBlock` by taking minimum among all available devices
-  int maxThreadsPerBlock = INT_MAX;
-  hipDeviceProp_t deviceProperties;
-  for (int i = 0; i < nGpu; i++) {
-    HIPCHECK(hipGetDeviceProperties(&deviceProperties, i));
-    if (!deviceProperties.cooperativeMultiDeviceLaunch) {
-      HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
-      return;
-    }
-    maxThreadsPerBlock = min(maxThreadsPerBlock, deviceProperties.maxThreadsPerBlock);
-  }
-
-  // Test for blockSizes in powers of 2
-  for (int blockSize = 2; blockSize <= maxThreadsPerBlock; blockSize = blockSize*2) {
-    test_cg_multi_grid_group_type(blockSize, nGpu);
-  }
-
-  // Test for random blockSizes, but the sequence is the same every execution
-  srand(0);
-  for (int i = 0; i < 10; i++) {
-    // Test fails for 0 thread per block
-    test_cg_multi_grid_group_type(max(2, rand() % maxThreadsPerBlock), nGpu);
-  }
-}
@@ -1,234 +0,0 @@
-/*
-Copyright (c) 2020 - 2021 Advanced Micro Devices, Inc. All rights reserved.
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-
-
-/* HIT_START
- * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 -D_CG_ABI_EXPERIMENTAL -rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80
- * TEST: %t
- * HIT_END
- */
-
-#include <hip_test_common.hh>
-#include <hip/hip_cooperative_groups.h>
-#include <cmath>
-#include <cstdlib>
-#include <climits>
-
-#define ASSERT_EQUAL(lhs, rhs) HIPASSERT(lhs == rhs)
-#define ASSERT_LE(lhs, rhs) HIPASSERT(lhs <= rhs)
-#define ASSERT_GE(lhs, rhs) HIPASSERT(lhs >= rhs)
-
-using namespace cooperative_groups;
-constexpr int MaxGPUs = 8;
-
-static __global__
-void kernel_cg_multi_grid_group_type_via_base_type(int *sizeTestD,
-                                                   int* gridRankTestD,
-                                                   int *thdRankTestD,
-                                                   int *isValidTestD,
-                                                   int *syncTestD,
-                                                   int *syncResultD)
-{
-  thread_group tg = this_multi_grid();  // This can work if _CG_ABI_EXPERIMENTAL defined on Cuda
-
-  int gIdx = (blockIdx.x * blockDim.x) + threadIdx.x;
-
-  // Test size
-  sizeTestD[gIdx] = tg.size();
-
-  // Test thread_rank
-  gridRankTestD[gIdx] = this_multi_grid().grid_rank();
-  thdRankTestD[gIdx] = tg.thread_rank();
-
-  // Test is_valid
-#ifdef __HIP_PLATFORM_AMD__
-  isValidTestD[gIdx] = tg.is_valid();
-#else
-  // Cuda has no thread_group.is_valid()
-  isValidTestD[gIdx] = true;
-#endif
-  // Test sync
-  //
-  // Eech thread assign 1 to their respective location
-  syncTestD[gIdx] = 1;
-  // Grid level sync
-  this_grid().sync();
-  // Thread 0 from work-group 0 of current grid (gpu) does grid level reduction
-  if (blockIdx.x == 0 && threadIdx.x == 0) {
-    for (uint i = 1; i < gridDim.x * blockDim.x; ++i) {
-      syncTestD[0] += syncTestD[i];
-    }
-    syncResultD[this_multi_grid().grid_rank() + 1] = syncTestD[0];
-  }
-  // multi-grid level sync
-  tg.sync();
-  // grid (gpu) 0 does final reduction across all grids (gpus)
-  if (this_multi_grid().grid_rank() == 0 && blockIdx.x == 0 && threadIdx.x == 0) {
-    syncResultD[0] = 0;
-    for (uint i = 1; i <= this_multi_grid().num_grids(); ++i) {
-      syncResultD[0] += syncResultD[i];
-    }
-  }
-}
-
-static void test_cg_multi_grid_group_type_via_base_type(int blockSize, int nGpu)
-{
-  // Create a stream each device
-  hipStream_t stream[MaxGPUs];
-  for (int i = 0; i < nGpu; i++) {
-    HIPCHECK(hipSetDevice(i));
-    HIPCHECK(hipDeviceSynchronize());  // Make sure work is done on this device
-    HIPCHECK(hipStreamCreate(&stream[i]));
-  }
-
-  // Allocate host and device memory
-  int nBytes = sizeof(int) * 2 * blockSize;
-  int *sizeTestD[MaxGPUs], *sizeTestH[MaxGPUs];
-  int *gridRankTestD[MaxGPUs], *gridRankTestH[MaxGPUs];
-  int *thdRankTestD[MaxGPUs], *thdRankTestH[MaxGPUs];
-  int *isValidTestD[MaxGPUs], *isValidTestH[MaxGPUs];
-  int *syncTestD[MaxGPUs], *syncResultD;
-  for (int i = 0; i < nGpu; i++) {
-    HIPCHECK(hipSetDevice(i));
-
-    HIPCHECK(hipMalloc(&sizeTestD[i], nBytes));
-    HIPCHECK(hipMalloc(&gridRankTestD[i], nBytes));
-    HIPCHECK(hipMalloc(&thdRankTestD[i], nBytes));
-    HIPCHECK(hipMalloc(&isValidTestD[i], nBytes));
-    HIPCHECK(hipMalloc(&syncTestD[i], nBytes));
-
-    HIPCHECK(hipHostMalloc(&sizeTestH[i], nBytes));
-    HIPCHECK(hipHostMalloc(&gridRankTestH[i], nBytes));
-    HIPCHECK(hipHostMalloc(&thdRankTestH[i], nBytes));
-    HIPCHECK(hipHostMalloc(&isValidTestH[i], nBytes));
-
-    if (i == 0) {
-      HIPCHECK(hipHostMalloc(&syncResultD, sizeof(int) * (nGpu + 1), hipHostMallocCoherent));
-    }
-  }
-
-  // Launch Kernel
-  constexpr int NumKernelArgs = 6;
-  hipLaunchParams* launchParamsList = new hipLaunchParams[nGpu];
-  void* args[MaxGPUs * NumKernelArgs];
-  for (int i = 0; i < nGpu; i++) {
-    HIPCHECK(hipSetDevice(i));
-
-    args[i * NumKernelArgs    ] = &sizeTestD[i];
-    args[i * NumKernelArgs + 1] = &gridRankTestD[i];
-    args[i * NumKernelArgs + 2] = &thdRankTestD[i];
-    args[i * NumKernelArgs + 3] = &isValidTestD[i];
-    args[i * NumKernelArgs + 4] = &syncTestD[i];
-    args[i * NumKernelArgs + 5] = &syncResultD;
-
-    launchParamsList[i].func = reinterpret_cast<void*>(kernel_cg_multi_grid_group_type_via_base_type);
-    launchParamsList[i].gridDim = 2;
-    launchParamsList[i].blockDim = blockSize;
-    launchParamsList[i].sharedMem = 0;
-    launchParamsList[i].stream = stream[i];
-    launchParamsList[i].args = &args[i * NumKernelArgs];
-  }
-  HIPCHECK(hipLaunchCooperativeKernelMultiDevice(launchParamsList, nGpu, 0));
-
-  // Copy result from device to host
-  for (int i = 0; i < nGpu; i++) {
-    HIPCHECK(hipSetDevice(i));
-    HIPCHECK(hipMemcpy(sizeTestH[i], sizeTestD[i], nBytes, hipMemcpyDeviceToHost));
-    HIPCHECK(hipMemcpy(gridRankTestH[i], gridRankTestD[i], nBytes, hipMemcpyDeviceToHost));
-    HIPCHECK(hipMemcpy(thdRankTestH[i], thdRankTestD[i], nBytes, hipMemcpyDeviceToHost));
-    HIPCHECK(hipMemcpy(isValidTestH[i], isValidTestD[i], nBytes, hipMemcpyDeviceToHost));
-  }
-
-  // Validate results
-  int gridsSeen[MaxGPUs];
-  for (int i = 0; i < nGpu; ++i) {
-    for (int j = 0; j < 2 * blockSize; ++j) {
-      ASSERT_EQUAL(sizeTestH[i][j], nGpu * 2 * blockSize);
-      ASSERT_GE(gridRankTestH[i][j], 0);
-      ASSERT_LE(gridRankTestH[i][j], nGpu-1);
-      ASSERT_EQUAL(gridRankTestH[i][j], gridRankTestH[i][0]);
-      int gridRank = gridRankTestH[i][j];
-      ASSERT_EQUAL(thdRankTestH[i][j], (gridRank * 2 * blockSize) + j);
-      ASSERT_EQUAL(isValidTestH[i][j], 1);
-    }
-    ASSERT_EQUAL(syncResultD[i+1],  2 * blockSize);
-
-    // Validate uniqueness property of grid rank
-    gridsSeen[i] = gridRankTestH[i][0];
-    for (int k = 0; k < i; ++k) {
-      if (gridsSeen[k] == gridsSeen[i]) {
-        assert (false && "Grid rank in multi-gpu setup should be unique");
-      }
-    }
-  }
-  ASSERT_EQUAL(syncResultD[0], nGpu * 2 * blockSize);
-
-  // Free host and device memory
-  delete [] launchParamsList;
-  for (int i = 0; i < nGpu; i++) {
-    HIPCHECK(hipSetDevice(i));
-
-    HIPCHECK(hipFree(sizeTestD[i]));
-    HIPCHECK(hipFree(gridRankTestD[i]));
-    HIPCHECK(hipFree(thdRankTestD[i]));
-    HIPCHECK(hipFree(isValidTestD[i]));
-    HIPCHECK(hipFree(syncTestD[i]));
-
-    if (i == 0)
-      HIPCHECK(hipHostFree(syncResultD));
-
-    HIPCHECK(hipHostFree(sizeTestH[i]));
-    HIPCHECK(hipHostFree(gridRankTestH[i]));
-    HIPCHECK(hipHostFree(thdRankTestH[i]));
-    HIPCHECK(hipHostFree(isValidTestH[i]));
-  }
-}
-
-TEST_CASE("Unit_hipCGMultiGridGroupType_BaseType") {
-  // Set `maxThreadsPerBlock` by taking minimum among all available devices
-  int nGpu = 0;
-  HIPCHECK(hipGetDeviceCount(&nGpu));
-  nGpu = min(nGpu, MaxGPUs);
-
-  int maxThreadsPerBlock = INT_MAX;
-  hipDeviceProp_t deviceProperties;
-  for (int i = 0; i < nGpu; i++) {
-    HIPCHECK(hipGetDeviceProperties(&deviceProperties, i));
-    if (!deviceProperties.cooperativeMultiDeviceLaunch) {
-      HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
-      return;
-    }
-    maxThreadsPerBlock = min(maxThreadsPerBlock, deviceProperties.maxThreadsPerBlock);
-  }
-
-  // Test for blockSizes in powers of 2
-  for (int blockSize = 2; blockSize <= maxThreadsPerBlock; blockSize = blockSize*2) {
-    test_cg_multi_grid_group_type_via_base_type(blockSize, nGpu);
-  }
-
-  // Test for random blockSizes, but the sequence is the same every execution
-  srand(0);
-  for (int i = 0; i < 10; i++) {
-    // Test fails for 0 thread per block
-    test_cg_multi_grid_group_type_via_base_type(max(2, rand() % maxThreadsPerBlock), nGpu);
-  }
-}
@@ -1,230 +0,0 @@
-/*
-Copyright (c) 2020 - 2021 Advanced Micro Devices, Inc. All rights reserved.
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-
-
-/* HIT_START
- * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 -rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80
- * TEST: %t
- * HIT_END
- */
-
-#include <hip_test_common.hh>
-#include <hip/hip_cooperative_groups.h>
-#include <cmath>
-#include <cstdlib>
-#include <climits>
-
-#define ASSERT_EQUAL(lhs, rhs) HIPASSERT(lhs == rhs)
-#define ASSERT_LE(lhs, rhs) HIPASSERT(lhs <= rhs)
-#define ASSERT_GE(lhs, rhs) HIPASSERT(lhs >= rhs)
-
-using namespace cooperative_groups;
-constexpr int MaxGPUs = 8;
-
-static __global__
-void kernel_cg_multi_grid_group_type_via_public_api(int *sizeTestD,
-                                                    int* gridRankTestD,
-                                                    int *thdRankTestD,
-                                                    int *isValidTestD,
-                                                    int *syncTestD,
-                                                    int *syncResultD)
-{
-  multi_grid_group mg = this_multi_grid();
-  int gIdx = (blockIdx.x * blockDim.x) + threadIdx.x;
-
-  // Test group_size api
-  sizeTestD[gIdx] = group_size(mg);
-
-  // Test thread_rank api
-  gridRankTestD[gIdx] = this_multi_grid().grid_rank();
-  thdRankTestD[gIdx] = thread_rank(mg);
-
-  // Test is_valid api
-  isValidTestD[gIdx] = mg.is_valid();
-
-  // Test sync api
-  //
-  // Eech thread assign 1 to their respective location
-  syncTestD[gIdx] = 1;
-  // Grid level sync
-  sync(this_grid());
-  // Thread 0 from work-group 0 of current grid (gpu) does grid level reduction
-  if (blockIdx.x == 0 && threadIdx.x == 0) {
-    for (uint i = 1; i < gridDim.x * blockDim.x; ++i) {
-      syncTestD[0] += syncTestD[i];
-    }
-    syncResultD[this_multi_grid().grid_rank() + 1] = syncTestD[0];
-  }
-  // multi-grid level sync via public api
-  sync(mg);
-  // grid (gpu) 0 does final reduction across all grids (gpus)
-  if (this_multi_grid().grid_rank() == 0 && blockIdx.x == 0 && threadIdx.x == 0) {
-    syncResultD[0] = 0;
-    for (uint i = 1; i <= this_multi_grid().num_grids(); ++i) {
-      syncResultD[0] += syncResultD[i];
-    }
-  }
-}
-
-static void test_cg_multi_grid_group_type_via_public_api(int blockSize, int nGpu)
-{
-  // Create a stream each device
-  hipStream_t stream[MaxGPUs];
-  for (int i = 0; i < nGpu; i++) {
-    HIPCHECK(hipSetDevice(i));
-    HIPCHECK(hipDeviceSynchronize());  // Make sure work is done on this device
-    HIPCHECK(hipStreamCreate(&stream[i]));
-  }
-
-  // Allocate host and device memory
-  int nBytes = sizeof(int) * 2 * blockSize;
-  int *sizeTestD[MaxGPUs], *sizeTestH[MaxGPUs];
-  int *gridRankTestD[MaxGPUs], *gridRankTestH[MaxGPUs];
-  int *thdRankTestD[MaxGPUs], *thdRankTestH[MaxGPUs];
-  int *isValidTestD[MaxGPUs], *isValidTestH[MaxGPUs];
-  int *syncTestD[MaxGPUs], *syncResultD;
-  for (int i = 0; i < nGpu; i++) {
-    HIPCHECK(hipSetDevice(i));
-
-    HIPCHECK(hipMalloc(&sizeTestD[i], nBytes));
-    HIPCHECK(hipMalloc(&gridRankTestD[i], nBytes));
-    HIPCHECK(hipMalloc(&thdRankTestD[i], nBytes));
-    HIPCHECK(hipMalloc(&isValidTestD[i], nBytes));
-    HIPCHECK(hipMalloc(&syncTestD[i], nBytes));
-
-    HIPCHECK(hipHostMalloc(&sizeTestH[i], nBytes));
-    HIPCHECK(hipHostMalloc(&gridRankTestH[i], nBytes));
-    HIPCHECK(hipHostMalloc(&thdRankTestH[i], nBytes));
-    HIPCHECK(hipHostMalloc(&isValidTestH[i], nBytes));
-
-    if (i == 0) {
-      HIPCHECK(hipHostMalloc(&syncResultD, sizeof(int) * (nGpu + 1), hipHostMallocCoherent));
-    }
-  }
-
-  // Launch Kernel
-  constexpr int NumKernelArgs = 6;
-  hipLaunchParams* launchParamsList = new hipLaunchParams[nGpu];
-  void* args[MaxGPUs * NumKernelArgs];
-  for (int i = 0; i < nGpu; i++) {
-    HIPCHECK(hipSetDevice(i));
-
-    args[i * NumKernelArgs    ] = &sizeTestD[i];
-    args[i * NumKernelArgs + 1] = &gridRankTestD[i];
-    args[i * NumKernelArgs + 2] = &thdRankTestD[i];
-    args[i * NumKernelArgs + 3] = &isValidTestD[i];
-    args[i * NumKernelArgs + 4] = &syncTestD[i];
-    args[i * NumKernelArgs + 5] = &syncResultD;
-
-    launchParamsList[i].func = reinterpret_cast<void*>(kernel_cg_multi_grid_group_type_via_public_api);
-    launchParamsList[i].gridDim = 2;
-    launchParamsList[i].blockDim = blockSize;
-    launchParamsList[i].sharedMem = 0;
-    launchParamsList[i].stream = stream[i];
-    launchParamsList[i].args = &args[i * NumKernelArgs];
-  }
-  HIPCHECK(hipLaunchCooperativeKernelMultiDevice(launchParamsList, nGpu, 0));
-
-  // Copy result from device to host
-  for (int i = 0; i < nGpu; i++) {
-    HIPCHECK(hipSetDevice(i));
-
-    HIPCHECK(hipMemcpy(sizeTestH[i], sizeTestD[i], nBytes, hipMemcpyDeviceToHost));
-    HIPCHECK(hipMemcpy(gridRankTestH[i], gridRankTestD[i], nBytes, hipMemcpyDeviceToHost));
-    HIPCHECK(hipMemcpy(thdRankTestH[i], thdRankTestD[i], nBytes, hipMemcpyDeviceToHost));
-    HIPCHECK(hipMemcpy(isValidTestH[i], isValidTestD[i], nBytes, hipMemcpyDeviceToHost));
-  }
-
-  // Validate results
-  int gridsSeen[MaxGPUs];
-  for (int i = 0; i < nGpu; ++i) {
-    for (int j = 0; j < 2 * blockSize; ++j) {
-      ASSERT_EQUAL(sizeTestH[i][j], nGpu * 2 * blockSize);
-      ASSERT_GE(gridRankTestH[i][j], 0);
-      ASSERT_LE(gridRankTestH[i][j], nGpu-1);
-      ASSERT_EQUAL(gridRankTestH[i][j], gridRankTestH[i][0]);
-      int gridRank = gridRankTestH[i][j];
-      ASSERT_EQUAL(thdRankTestH[i][j], (gridRank * 2 * blockSize) + j);
-      ASSERT_EQUAL(isValidTestH[i][j], 1);
-    }
-    ASSERT_EQUAL(syncResultD[i+1],  2 * blockSize);
-
-    // Validate uniqueness property of grid rank
-    gridsSeen[i] = gridRankTestH[i][0];
-    for (int k = 0; k < i; ++k) {
-      if (gridsSeen[k] == gridsSeen[i]) {
-        assert (false && "Grid rank in multi-gpu setup should be unique");
-      }
-    }
-  }
-  ASSERT_EQUAL(syncResultD[0], nGpu * 2 * blockSize);
-
-  // Free host and device memory
-  delete [] launchParamsList;
-  for (int i = 0; i < nGpu; i++) {
-    HIPCHECK(hipSetDevice(i));
-
-    HIPCHECK(hipFree(sizeTestD[i]));
-    HIPCHECK(hipFree(gridRankTestD[i]));
-    HIPCHECK(hipFree(thdRankTestD[i]));
-    HIPCHECK(hipFree(isValidTestD[i]));
-    HIPCHECK(hipFree(syncTestD[i]));
-
-    if (i == 0)
-      HIPCHECK(hipHostFree(syncResultD));
-
-    HIPCHECK(hipHostFree(sizeTestH[i]));
-    HIPCHECK(hipHostFree(gridRankTestH[i]));
-    HIPCHECK(hipHostFree(thdRankTestH[i]));
-    HIPCHECK(hipHostFree(isValidTestH[i]));
-  }
-}
-
-TEST_CASE("Unit_hipCGMultiGridGroupType_PublicApi") {
-  // Set `maxThreadsPerBlock` by taking minimum among all available devices
-  int nGpu = 0;
-  HIPCHECK(hipGetDeviceCount(&nGpu));
-  nGpu = min(nGpu, MaxGPUs);
-
-  int maxThreadsPerBlock = INT_MAX;
-  hipDeviceProp_t deviceProperties;
-  for (int i = 0; i < nGpu; i++) {
-    HIPCHECK(hipGetDeviceProperties(&deviceProperties, i));
-    if (!deviceProperties.cooperativeMultiDeviceLaunch) {
-      HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
-      return;
-    }
-    maxThreadsPerBlock = min(maxThreadsPerBlock, deviceProperties.maxThreadsPerBlock);
-  }
-
-  // Test for blockSizes in powers of 2
-  for (int blockSize = 2; blockSize <= maxThreadsPerBlock; blockSize = blockSize*2) {
-    test_cg_multi_grid_group_type_via_public_api(blockSize, nGpu);
-  }
-
-  // Test for random blockSizes, but the sequence is the same every execution
-  srand(0);
-  for (int i = 0; i < 10; i++) {
-    // Test fails for 0 thread per block
-    test_cg_multi_grid_group_type_via_public_api(max(2, rand() % maxThreadsPerBlock), nGpu);
-  }
-}
@@ -0,0 +1,638 @@
+/*
+Copyright (c) 2020 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+#include <hip_test_common.hh>
+#include <hip/hip_cooperative_groups.h>
+
+#include "hip_cg_common.hh"
+
+namespace cg = cooperative_groups;
+
+static __global__ void kernel_cg_multi_grid_group_type(int* grid_rank_dev, int* size_dev,
+                                                       int* thd_rank_dev, int* is_valid_dev,
+                                                       int* sync_dev, int* sync_result,
+                                                       int* num_grids_dev) {
+  cg::multi_grid_group mg = cg::this_multi_grid();
+  int gIdx = (blockIdx.x * blockDim.x) + threadIdx.x;  // per-thread slot in each *_dev buffer
+
+  // Test num_grids: each thread records the grid count reported by the multi-grid group
+  num_grids_dev[gIdx] = mg.num_grids();
+
+  // Test grid_rank: each thread records the rank reported for its own grid
+  grid_rank_dev[gIdx] = mg.grid_rank();
+
+  // Test size: each thread records the size reported by the multi-grid group
+  size_dev[gIdx] = mg.size();
+
+  // Test thread_rank: each thread records its rank within the multi-grid group
+  thd_rank_dev[gIdx] = mg.thread_rank();
+
+  // Test is_valid: each thread records the validity flag of the group handle
+  is_valid_dev[gIdx] = mg.is_valid();
+
+  // Test sync
+  // Per-grid sums go to sync_result[grid_rank + 1]; the overall total goes to sync_result[0].
+  // Each thread writes 1 to its own slot
+  sync_dev[gIdx] = 1;
+  // Grid level sync, placed between the per-thread writes above and the reduction below
+  cg::this_grid().sync();
+  // Thread 0 from work-group 0 of current grid (gpu) does grid level reduction
+  if (blockIdx.x == 0 && threadIdx.x == 0) {
+    for (uint i = 1; i < gridDim.x * blockDim.x; ++i) {
+      sync_dev[0] += sync_dev[i];
+    }
+    sync_result[mg.grid_rank() + 1] = sync_dev[0];
+  }
+  // Multi-grid level sync, placed between the per-grid sums and the final reduction
+  mg.sync();
+  // Grid (gpu) 0 does the final reduction across all grids (gpus)
+  if (mg.grid_rank() == 0 && blockIdx.x == 0 && threadIdx.x == 0) {
+    sync_result[0] = 0;
+    for (uint i = 1; i <= mg.num_grids(); ++i) {
+      sync_result[0] += sync_result[i];
+    }
+  }
+}
+
+static __global__ void kernel_cg_multi_grid_group_type_via_base_type(
+    int* grid_rank_dev, int* size_dev, int* thd_rank_dev, int* is_valid_dev, int* sync_dev,
+    int* sync_result) {
+  cg::thread_group tg =
+      cg::this_multi_grid();  // This can work if _CG_ABI_EXPERIMENTAL defined on Cuda
+
+  int gIdx = (blockIdx.x * blockDim.x) + threadIdx.x;  // per-thread slot in each *_dev buffer
+
+  // Test size through the thread_group handle
+  size_dev[gIdx] = tg.size();
+
+  // Test thread_rank through tg; grid_rank is queried from this_multi_grid() directly
+  grid_rank_dev[gIdx] = cg::this_multi_grid().grid_rank();
+  thd_rank_dev[gIdx] = tg.thread_rank();
+
+  // Test is_valid
+#ifdef __HIP_PLATFORM_AMD__
+  is_valid_dev[gIdx] = tg.is_valid();
+#else
+  // Cuda has no thread_group.is_valid(), so record the expected value unconditionally
+  is_valid_dev[gIdx] = true;
+#endif
+  // Test sync
+  // Per-grid sums go to sync_result[grid_rank + 1]; the overall total goes to sync_result[0].
+  // Each thread writes 1 to its own slot
+  sync_dev[gIdx] = 1;
+  // Grid level sync, placed between the per-thread writes above and the reduction below
+  cg::this_grid().sync();
+  // Thread 0 from work-group 0 of current grid (gpu) does grid level reduction
+  if (blockIdx.x == 0 && threadIdx.x == 0) {
+    for (uint i = 1; i < gridDim.x * blockDim.x; ++i) {
+      sync_dev[0] += sync_dev[i];
+    }
+    sync_result[cg::this_multi_grid().grid_rank() + 1] = sync_dev[0];
+  }
+  // Multi-grid level sync issued through the thread_group handle
+  tg.sync();
+  // Grid (gpu) 0 does the final reduction across all grids (gpus)
+  if (cg::this_multi_grid().grid_rank() == 0 && blockIdx.x == 0 && threadIdx.x == 0) {
+    sync_result[0] = 0;
+    for (uint i = 1; i <= cg::this_multi_grid().num_grids(); ++i) {
+      sync_result[0] += sync_result[i];
+    }
+  }
+}
+
+static __global__ void kernel_cg_multi_grid_group_type_via_public_api(
+    int* grid_rank_dev, int* size_dev, int* thd_rank_dev, int* is_valid_dev, int* sync_dev,
+    int* sync_result) {
+  cg::multi_grid_group mg = cg::this_multi_grid();
+  int gIdx = (blockIdx.x * blockDim.x) + threadIdx.x;  // per-thread slot in each *_dev buffer
+
+  // Test group_size api (free function taking the group)
+  size_dev[gIdx] = cg::group_size(mg);
+
+  // Test thread_rank api (free function); grid_rank is read via the member function
+  grid_rank_dev[gIdx] = cg::this_multi_grid().grid_rank();
+  thd_rank_dev[gIdx] = cg::thread_rank(mg);
+
+  // Test is_valid api
+  is_valid_dev[gIdx] = mg.is_valid();
+
+  // Test sync api
+  // Per-grid sums go to sync_result[grid_rank + 1]; the overall total goes to sync_result[0].
+  // Each thread writes 1 to its own slot
+  sync_dev[gIdx] = 1;
+  // Grid level sync, placed between the per-thread writes above and the reduction below
+  cg::sync(cg::this_grid());
+  // Thread 0 from work-group 0 of current grid (gpu) does grid level reduction
+  if (blockIdx.x == 0 && threadIdx.x == 0) {
+    for (uint i = 1; i < gridDim.x * blockDim.x; ++i) {
+      sync_dev[0] += sync_dev[i];
+    }
+    sync_result[cg::this_multi_grid().grid_rank() + 1] = sync_dev[0];
+  }
+  // Multi-grid level sync via public api
+  cg::sync(mg);
+  // Grid (gpu) 0 does the final reduction across all grids (gpus)
+  if (cg::this_multi_grid().grid_rank() == 0 && blockIdx.x == 0 && threadIdx.x == 0) {
+    sync_result[0] = 0;
+    for (uint i = 1; i <= cg::this_multi_grid().num_grids(); ++i) {
+      sync_result[0] += sync_result[i];
+    }
+  }
+}
+
+static __global__ void test_kernel(unsigned int* atomic_val, unsigned int* global_array,
+                                   unsigned int* array, uint32_t loops) {
+  cg::grid_group grid = cg::this_grid();
+  cg::multi_grid_group mgrid = cg::this_multi_grid();
+  unsigned rank = grid.thread_rank();
+  unsigned global_rank = mgrid.thread_rank();
+
+  int offset = blockIdx.x;  // this block's slot in `array`; advances by gridDim.x every loop
+  for (uint32_t i = 0; i < loops; i++) {  // index typed like `loops`: no signed/unsigned compare
+    // Make the last thread run way behind everyone else.
+    // If the grid barrier below fails, then the other threads may hit the
+    // atomicInc instruction many times before the last thread ever gets
+    // to it.
+    // As such, without the barrier, the last array entry will eventually
+    // contain a very large value, defined by however many times the other
+    // wavefronts make it through this loop.
+    // If the barrier works, then it will likely contain some number
+    // near "total number of blocks". It will be the last wavefront to
+    // reach the atomicInc, but everyone will have only hit the atomic once.
+    if (rank == (grid.size() - 1)) {
+      long long time_diff = 0;
+      long long last_clock = clock64();
+      do {
+        long long cur_clock = clock64();
+        if (cur_clock > last_clock) {
+          time_diff += (cur_clock - last_clock);
+        }
+        // If it rolls over, we don't know how much to add to catch up.
+        // So just ignore those slipped cycles.
+        last_clock = cur_clock;
+      } while (time_diff < 1000000);
+    }
+    if (threadIdx.x == 0) {
+      array[offset] = atomicInc(atomic_val, UINT_MAX);
+    }
+    grid.sync();
+
+    // Make the last thread in the entire multi-grid run way behind
+    // everyone else.
+    // If the mgrid barrier below fails, then the global_array entries
+    // will end up being out of sync, because the intermingling of adds
+    // and multiplies will not be aligned between the GPUs.
+    if (global_rank == (mgrid.size() - 1)) {
+      long long time_diff = 0;
+      long long last_clock = clock64();
+      do {
+        long long cur_clock = clock64();
+        if (cur_clock > last_clock) {
+          time_diff += (cur_clock - last_clock);
+        }
+        // If it rolls over, we don't know how much to add to catch up.
+        // So just ignore those slipped cycles.
+        last_clock = cur_clock;
+      } while (time_diff < 1000000);
+    }
+    // When i is a multiple of num_grids, add 2 into this grid's own array entry;
+    // otherwise double the entry at (grid_rank + i) % num_grids (the partner's on two GPUs)
+    unsigned grid_rank = mgrid.grid_rank();
+    unsigned inter_gpu_offset = (grid_rank + i) % mgrid.num_grids();
+    if (rank == (grid.size() - 1)) {
+      if (i % mgrid.num_grids() == 0) {
+        global_array[grid_rank] += 2;
+      } else {
+        global_array[inter_gpu_offset] *= 2;
+      }
+    }
+    mgrid.sync();
+    offset += gridDim.x;
+  }
+}
+
+__global__ void test_kernel_gfx11(unsigned int* atomic_val, unsigned int* global_array,
+                                  unsigned int* array, uint32_t loops) {
+#if HT_AMD
+  cg::grid_group grid = cg::this_grid();
+  cg::multi_grid_group mgrid = cg::this_multi_grid();
+  unsigned rank = grid.thread_rank();
+  unsigned global_rank = mgrid.thread_rank();
+
+  int offset = blockIdx.x;  // this block's slot in `array`; advances by gridDim.x every loop
+  for (uint32_t i = 0; i < loops; i++) {  // index typed like `loops`: no signed/unsigned compare
+    // Make the last thread run way behind everyone else.
+    // If the grid barrier below fails, then the other threads may hit the
+    // atomicInc instruction many times before the last thread ever gets
+    // to it.
+    // As such, without the barrier, the last array entry will eventually
+    // contain a very large value, defined by however many times the other
+    // wavefronts make it through this loop.
+    // If the barrier works, then it will likely contain some number
+    // near "total number of blocks". It will be the last wavefront to
+    // reach the atomicInc, but everyone will have only hit the atomic once.
+    if (rank == (grid.size() - 1)) {
+      long long time_diff = 0;
+      long long last_clock = wall_clock64();
+      do {
+        long long cur_clock = wall_clock64();
+        if (cur_clock > last_clock) {
+          time_diff += (cur_clock - last_clock);
+        }
+        // If it rolls over, we don't know how much to add to catch up.
+        // So just ignore those slipped cycles.
+        last_clock = cur_clock;
+      } while (time_diff < 1000000);
+    }
+    if (threadIdx.x == 0) {
+      array[offset] = atomicInc(atomic_val, UINT_MAX);
+    }
+    grid.sync();
+
+    // Make the last thread in the entire multi-grid run way behind
+    // everyone else.
+    // If the mgrid barrier below fails, then the global_array entries
+    // will end up being out of sync, because the intermingling of adds
+    // and multiplies will not be aligned between the GPUs.
+    if (global_rank == (mgrid.size() - 1)) {
+      long long time_diff = 0;
+      long long last_clock = wall_clock64();
+      do {
+        long long cur_clock = wall_clock64();
+        if (cur_clock > last_clock) {
+          time_diff += (cur_clock - last_clock);
+        }
+        // If it rolls over, we don't know how much to add to catch up.
+        // So just ignore those slipped cycles.
+        last_clock = cur_clock;
+      } while (time_diff < 1000000);
+    }
+    // When i is a multiple of num_grids, add 2 into this grid's own array entry;
+    // otherwise double the entry at (grid_rank + i) % num_grids (the partner's on two GPUs)
+    unsigned grid_rank = mgrid.grid_rank();
+    unsigned inter_gpu_offset = (grid_rank + i) % mgrid.num_grids();
+    if (rank == (grid.size() - 1)) {
+      if (i % mgrid.num_grids() == 0) {
+        global_array[grid_rank] += 2;
+      } else {
+        global_array[inter_gpu_offset] *= 2;
+      }
+    }
+    mgrid.sync();
+    offset += gridDim.x;
+  }
+#endif
+}
+
+static void verify_barrier_buffer(unsigned int loops, unsigned int warps, unsigned int* host_buffer,
+                                  unsigned int num_devs) {
+  unsigned int max_in_this_loop = 0;  // running upper bound allowed for the entries checked below
+  for (unsigned int i = 0; i < loops; i++) {
+    max_in_this_loop += (warps * num_devs);  // bound grows by warps * num_devs on every loop
+    for (unsigned int j = 0; j < warps; j++) {
+      REQUIRE(host_buffer[i * warps + j] <= max_in_this_loop);  // entry j recorded for loop i
+    }
+  }
+}
+
+static void verify_multi_gpu_buffer(unsigned int loops, unsigned int array_val) {
+  unsigned int desired_val = 0;  // expected value, rebuilt here by replaying `loops` updates
+  for (unsigned int i = 0; i < loops; i++) {  // index typed like `loops`: no sign mismatch
+    if (i % 2 == 0) {
+      desired_val += 2;  // even step: add 2
+    } else {
+      desired_val *= 2;  // odd step: double
+    }
+  }
+
+  REQUIRE(array_val == desired_val);
+}
+
+template <typename F>
+static void test_cg_multi_grid_group_type(F kernel_func, int num_devices, int block_size,
+                                          bool specific_api_test) {
+  // Create a stream each device
+  hipStream_t stream[MaxGPUs];
+  for (int i = 0; i < num_devices; i++) {
+    HIP_CHECK(hipSetDevice(i));
+    HIP_CHECK(hipDeviceSynchronize());  // Make sure work is done on this device
+    HIP_CHECK(hipStreamCreate(&stream[i]));
+  }
+
+  // Allocate host and device memory
+  int num_bytes = sizeof(int) * 2 * block_size;
+  int *num_grids_dev[MaxGPUs], *num_grids_host[MaxGPUs];
+  int *grid_rank_dev[MaxGPUs], *grid_rank_host[MaxGPUs];
+  int *size_dev[MaxGPUs], *size_host[MaxGPUs];
+  int *thd_rank_dev[MaxGPUs], *thd_rank_host[MaxGPUs];
+  int *is_valid_dev[MaxGPUs], *is_valid_host[MaxGPUs];
+  int *sync_dev[MaxGPUs], *sync_result;
+  for (int i = 0; i < num_devices; i++) {
+    HIP_CHECK(hipSetDevice(i));
+
+    if (specific_api_test) {
+      HIP_CHECK(hipMalloc(&num_grids_dev[i], num_bytes));
+      HIP_CHECK(hipHostMalloc(&num_grids_host[i], num_bytes));
+    }
+
+    HIP_CHECK(hipMalloc(&grid_rank_dev[i], num_bytes));
+    HIP_CHECK(hipMalloc(&size_dev[i], num_bytes));
+    HIP_CHECK(hipMalloc(&thd_rank_dev[i], num_bytes));
+    HIP_CHECK(hipMalloc(&is_valid_dev[i], num_bytes));
+    HIP_CHECK(hipMalloc(&sync_dev[i], num_bytes));
+
+    HIP_CHECK(hipHostMalloc(&grid_rank_host[i], num_bytes));
+    HIP_CHECK(hipHostMalloc(&size_host[i], num_bytes));
+    HIP_CHECK(hipHostMalloc(&thd_rank_host[i], num_bytes));
+    HIP_CHECK(hipHostMalloc(&is_valid_host[i], num_bytes));
+
+    if (i == 0) {
+      HIP_CHECK(
+          hipHostMalloc(&sync_result, sizeof(int) * (num_devices + 1), hipHostMallocCoherent));
+    }
+  }
+
+  // Launch Kernel
+  int NumKernelArgs = 6;
+  if (specific_api_test) {
+    NumKernelArgs = 7;
+  }
+  hipLaunchParams* launchParamsList = new hipLaunchParams[num_devices];
+  std::vector<void*> args(MaxGPUs * NumKernelArgs);
+  for (int i = 0; i < num_devices; i++) {
+    HIP_CHECK(hipSetDevice(i));
+
+    args[i * NumKernelArgs] = &grid_rank_dev[i];
+    args[i * NumKernelArgs + 1] = &size_dev[i];
+    args[i * NumKernelArgs + 2] = &thd_rank_dev[i];
+    args[i * NumKernelArgs + 3] = &is_valid_dev[i];
+    args[i * NumKernelArgs + 4] = &sync_dev[i];
+    args[i * NumKernelArgs + 5] = &sync_result;
+    if (specific_api_test) {
+      args[i * NumKernelArgs + 6] = &num_grids_dev[i];
+    }
+
+    launchParamsList[i].func = reinterpret_cast<void*>(kernel_func);
+    launchParamsList[i].gridDim = 2;
+    launchParamsList[i].blockDim = block_size;
+    launchParamsList[i].sharedMem = 0;
+    launchParamsList[i].stream = stream[i];
+    launchParamsList[i].args = &args[i * NumKernelArgs];
+  }
+  HIP_CHECK(hipLaunchCooperativeKernelMultiDevice(launchParamsList, num_devices, 0));
+
+  // Copy result from device to host
+  for (int i = 0; i < num_devices; i++) {
+    HIP_CHECK(hipSetDevice(i));
+    if (specific_api_test) {
+      HIP_CHECK(hipMemcpy(num_grids_host[i], num_grids_dev[i], num_bytes, hipMemcpyDeviceToHost));
+    }
+    HIP_CHECK(hipMemcpy(grid_rank_host[i], grid_rank_dev[i], num_bytes, hipMemcpyDeviceToHost));
+    HIP_CHECK(hipMemcpy(size_host[i], size_dev[i], num_bytes, hipMemcpyDeviceToHost));
+    HIP_CHECK(hipMemcpy(thd_rank_host[i], thd_rank_dev[i], num_bytes, hipMemcpyDeviceToHost));
+    HIP_CHECK(hipMemcpy(is_valid_host[i], is_valid_dev[i], num_bytes, hipMemcpyDeviceToHost));
+  }
+
+  // Validate results
+  int grids_seen[MaxGPUs];
+  for (int i = 0; i < num_devices; ++i) {
+    for (int j = 0; j < 2 * block_size; ++j) {
+      if (specific_api_test) {
+        ASSERT_EQUAL(num_grids_host[i][j], num_devices);
+      }
+      ASSERT_GE(grid_rank_host[i][j], 0);
+      ASSERT_LE(grid_rank_host[i][j], num_devices - 1);
+      ASSERT_EQUAL(grid_rank_host[i][j], grid_rank_host[i][0]);
+      ASSERT_EQUAL(size_host[i][j], num_devices * 2 * block_size);
+      int gridRank = grid_rank_host[i][j];
+      ASSERT_EQUAL(thd_rank_host[i][j], (gridRank * 2 * block_size) + j);
+      ASSERT_EQUAL(is_valid_host[i][j], 1);
+    }
+    ASSERT_EQUAL(sync_result[i + 1], 2 * block_size);
+
+    // Validate uniqueness property of grid rank
+    grids_seen[i] = grid_rank_host[i][0];
+    for (int k = 0; k < i; ++k) {
+      INFO("Grid rank in multi-gpu setup should be unique");
+      REQUIRE(grids_seen[k] != grids_seen[i]);
+    }
+  }
+  ASSERT_EQUAL(sync_result[0], num_devices * 2 * block_size);
+
+  // Free host and device memory
+  delete[] launchParamsList;
+  for (int i = 0; i < num_devices; i++) {
+    HIP_CHECK(hipSetDevice(i));
+
+    if (specific_api_test) {
+      HIP_CHECK(hipFree(num_grids_dev[i]));
+      HIP_CHECK(hipHostFree(num_grids_host[i]));
+    }
+
+    HIP_CHECK(hipFree(grid_rank_dev[i]));
+    HIP_CHECK(hipFree(size_dev[i]));
+    HIP_CHECK(hipFree(thd_rank_dev[i]));
+    HIP_CHECK(hipFree(is_valid_dev[i]));
+    HIP_CHECK(hipFree(sync_dev[i]));
+
+    if (i == 0) {
+      HIP_CHECK(hipHostFree(sync_result));
+    }
+    HIP_CHECK(hipHostFree(grid_rank_host[i]));
+    HIP_CHECK(hipHostFree(size_host[i]));
+    HIP_CHECK(hipHostFree(thd_rank_host[i]));
+    HIP_CHECK(hipHostFree(is_valid_host[i]));
+  }
+}
+
+/**
+ * Exercises the multi-grid group kernels through test_cg_multi_grid_group_type() on up to
+ * MaxGPUs devices. Each SECTION selects one kernel variant; the selected kernel is then run
+ * for power-of-two block sizes and for a fixed pseudo-random sequence of block sizes.
+ * The test is skipped when any device lacks cooperative multi-device launch support.
+ */
+TEST_CASE("Unit_hipCGMultiGridGroupType_Basic") {
+  int num_devices = 0;
+  HIP_CHECK(hipGetDeviceCount(&num_devices));
+  num_devices = min(num_devices, MaxGPUs);
+  if (num_devices < 1) {
+    // Without a device max_threads_per_blk would stay at INT_MAX and the doubling loop
+    // below would overflow.
+    HipTest::HIP_SKIP_TEST("No device available!");
+    return;
+  }
+
+  // Set `max_threads_per_blk` by taking minimum among all available devices, so that every
+  // block size used below is valid on each of them
+  int max_threads_per_blk = INT_MAX;
+  hipDeviceProp_t device_properties;
+  for (int i = 0; i < num_devices; i++) {
+    HIP_CHECK(hipGetDeviceProperties(&device_properties, i));
+    if (!device_properties.cooperativeMultiDeviceLaunch) {
+      HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
+      return;
+    }
+    max_threads_per_blk = min(max_threads_per_blk, device_properties.maxThreadsPerBlock);
+  }
+
+  // Start from a null kernel so that a run entering none of the sections below (for example
+  // when a section filter matches nothing) never launches through an indeterminate pointer.
+  void* (*kernel_func)(void) = nullptr;
+  bool specific_api_test = false;
+
+  SECTION("Default multi grid group API test") {
+    kernel_func = reinterpret_cast<void* (*)()>(kernel_cg_multi_grid_group_type);
+    // Only this kernel variant is run with the additional num_grids check enabled
+    specific_api_test = true;
+  }
+
+  SECTION("Base type multi grid group API test") {
+    kernel_func = reinterpret_cast<void* (*)()>(kernel_cg_multi_grid_group_type_via_base_type);
+  }
+
+  SECTION("Public API multi grid group test") {
+    kernel_func = reinterpret_cast<void* (*)()>(kernel_cg_multi_grid_group_type_via_public_api);
+  }
+
+  if (kernel_func == nullptr) {
+    return;
+  }
+
+  // Test for blockSizes in powers of 2
+  for (int block_size = 2; block_size <= max_threads_per_blk; block_size = block_size * 2) {
+    test_cg_multi_grid_group_type(kernel_func, num_devices, block_size, specific_api_test);
+  }
+
+  // Test for random blockSizes, but the sequence is the same every execution
+  srand(0);
+  for (int i = 0; i < 10; i++) {
+    // rand() % max_threads_per_blk may yield 0 or 1; block sizes below 2 are not exercised,
+    // so clamp the value to at least 2
+    test_cg_multi_grid_group_type(kernel_func, num_devices, max(2, rand() % max_threads_per_blk),
+                                  specific_api_test);
+  }
+}
+
+/**
+ * Multi-device barrier test. For every generated combination of `loops` and `warps` it
+ * launches test_kernel (or test_kernel_gfx11 when IsGfx11() reports so) cooperatively on all
+ * devices, then checks each device's output buffer with verify_barrier_buffer() and the
+ * host array shared by all devices with verify_multi_gpu_buffer().
+ * The test is skipped with fewer than two devices, when a device lacks cooperative
+ * multi-device launch support, or when the requested grid cannot be co-resident.
+ */
+TEST_CASE("Unit_hipCGMultiGridGroupType_Barrier") {
+  int num_devices = 0;
+  uint32_t loops = GENERATE(1, 2, 3, 4);
+  uint32_t warps = GENERATE(4, 8, 16, 32);
+  uint32_t block_size = 1;
+
+  HIP_CHECK(hipGetDeviceCount(&num_devices));
+  if (num_devices < 2) {
+    HipTest::HIP_SKIP_TEST("Device number is < 2");
+    return;
+  }
+
+  std::vector<hipDeviceProp_t> device_properties(num_devices);
+  for (int i = 0; i < num_devices; i++) {
+    HIP_CHECK(hipGetDeviceProperties(&device_properties[i], i));
+    if (!device_properties[i].cooperativeMultiDeviceLaunch) {
+      HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
+      return;
+    }
+  }
+
+  // Test whether the requested size will fit on the GPU: use the smallest warp size and
+  // multiprocessor count among all devices as the common denominator
+  int warp_size = INT_MAX;
+  int num_sm = INT_MAX;
+  for (int i = 0; i < num_devices; i++) {
+    if (device_properties[i].warpSize < warp_size) {
+      warp_size = device_properties[i].warpSize;
+    }
+    if (device_properties[i].multiProcessorCount < num_sm) {
+      num_sm = device_properties[i].multiProcessorCount;
+    }
+  }
+
+  int num_threads_in_block = block_size * warp_size;
+
+  // Calculate the device occupancy to know how many blocks can be run.
+  int max_blocks_per_sm = INT_MAX;
+  for (int i = 0; i < num_devices; i++) {
+    HIP_CHECK(hipSetDevice(i));
+    auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
+    int blocks_per_sm = 0;
+    HIP_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&blocks_per_sm, test_kernel_used,
+                                                           num_threads_in_block, 0));
+    if (blocks_per_sm < max_blocks_per_sm) {
+      max_blocks_per_sm = blocks_per_sm;
+    }
+  }
+
+  int requested_blocks = warps / block_size;
+
+  // A cooperative launch needs all blocks of the grid to be resident at the same time.
+  // Skip (before anything is allocated) when the weakest device cannot hold the grid.
+  if (requested_blocks > static_cast<long long>(max_blocks_per_sm) * num_sm) {
+    HipTest::HIP_SKIP_TEST("Requested grid does not fit on the device for a cooperative launch");
+    return;
+  }
+
+  // Each block will output a single value per loop.
+  uint32_t total_buffer_len = requested_blocks * loops;
+
+  // Allocate the buffer that will hold the kernel's output, and which will
+  // also be used to globally synchronize during GWS initialization
+  std::vector<unsigned int*> host_buffer(num_devices);
+  std::vector<unsigned int*> kernel_buffer(num_devices);
+  std::vector<unsigned int*> kernel_atomic(num_devices);
+  std::vector<hipStream_t> streams(num_devices);
+  for (int i = 0; i < num_devices; i++) {
+    host_buffer[i] = static_cast<unsigned int*>(calloc(total_buffer_len, sizeof(unsigned int)));
+    REQUIRE(host_buffer[i] != nullptr);
+    HIP_CHECK(hipSetDevice(i));
+    HIP_CHECK(hipMalloc(&kernel_buffer[i], sizeof(unsigned int) * total_buffer_len));
+    HIP_CHECK(hipMemcpy(kernel_buffer[i], host_buffer[i], sizeof(unsigned int) * total_buffer_len,
+                        hipMemcpyHostToDevice));
+    HIP_CHECK(hipMalloc(&kernel_atomic[i], sizeof(unsigned int)));
+    HIP_CHECK(hipMemset(kernel_atomic[i], 0, sizeof(unsigned int)));
+    HIP_CHECK(hipStreamCreate(&streams[i]));
+  }
+
+  // Single kernel atomic shared between both devices; put it on the host
+  unsigned int* global_array;
+  HIP_CHECK(hipHostMalloc(&global_array, sizeof(unsigned int) * num_devices));
+  HIP_CHECK(hipMemset(global_array, 0, num_devices * sizeof(unsigned int)));
+
+  // Launch the kernels
+  INFO("Launching a cooperative kernel with " << warps << " warps in " << requested_blocks
+                                              << " thread blocks");
+
+  // One argument list per device; the pointed-to objects must outlive the launch call
+  std::vector<std::vector<void*>> dev_params(num_devices, std::vector<void*>(4, nullptr));
+  std::vector<hipLaunchParams> md_params(num_devices);
+  for (int i = 0; i < num_devices; i++) {
+    HIP_CHECK(hipSetDevice(i));
+    auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
+    dev_params[i][0] = reinterpret_cast<void*>(&kernel_atomic[i]);
+    dev_params[i][1] = reinterpret_cast<void*>(&global_array);
+    dev_params[i][2] = reinterpret_cast<void*>(&kernel_buffer[i]);
+    dev_params[i][3] = reinterpret_cast<void*>(&loops);
+    md_params[i].func = reinterpret_cast<void*>(test_kernel_used);
+    md_params[i].gridDim = requested_blocks;
+    md_params[i].blockDim = num_threads_in_block;
+    md_params[i].sharedMem = 0;
+    md_params[i].stream = streams[i];
+    md_params[i].args = dev_params[i].data();
+  }
+
+  HIP_CHECK(hipLaunchCooperativeKernelMultiDevice(md_params.data(), num_devices, 0));
+
+  // hipDeviceSynchronize() only waits for the current device, so wait on each device in
+  // turn; all of them must be finished before any buffer or global_array is inspected.
+  for (int dev = 0; dev < num_devices; dev++) {
+    HIP_CHECK(hipSetDevice(dev));
+    HIP_CHECK(hipDeviceSynchronize());
+  }
+
+  // Read back the buffer to host
+  for (int dev = 0; dev < num_devices; dev++) {
+    HIP_CHECK(hipSetDevice(dev));
+    HIP_CHECK(hipMemcpy(host_buffer[dev], kernel_buffer[dev],
+                        sizeof(unsigned int) * total_buffer_len, hipMemcpyDeviceToHost));
+  }
+
+  for (int dev = 0; dev < num_devices; dev++) {
+    verify_barrier_buffer(loops, requested_blocks, host_buffer[dev], num_devices);
+  }
+
+  for (int dev = 0; dev < num_devices; dev++) {
+    verify_multi_gpu_buffer(loops, global_array[dev]);
+  }
+
+  // Release host and device resources
+  HIP_CHECK(hipHostFree(global_array));
+  for (int k = 0; k < num_devices; ++k) {
+    HIP_CHECK(hipFree(kernel_buffer[k]));
+    HIP_CHECK(hipFree(kernel_atomic[k]));
+    HIP_CHECK(hipStreamDestroy(streams[k]));
+    free(host_buffer[k]);
+  }
+}
@@ -0,0 +1,198 @@
+/*
+Copyright (c) 2020 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+#include <hip_test_common.hh>
+#include <hip/hip_cooperative_groups.h>
+
+#include "hip_cg_common.hh"
+
+namespace cg = cooperative_groups;
+
+// Selects the tile-level routine under test: shflDown and shflXor run a sum reduction built
+// on shfl_down / shfl_xor, shflUp runs a prefix sum built on shfl_up.
+enum class TiledGroupShflTests { shflDown, shflXor, shflUp };
+
+/**
+ * Sums `val` across the tile `g` by repeatedly adding the value fetched with shfl_down,
+ * halving the shuffle distance each round (size/2, size/4, ..., 1).
+ *
+ * Returns the accumulated value on the thread with rank 0 in the tile and -1 on every
+ * other thread.
+ */
+template <unsigned int tileSz>
+__device__ int reduction_kernel_shfl_down(cg::thread_block_tile<tileSz> const& g,
+                                          volatile int val) {
+  int distance = g.size() / 2;
+  while (distance > 0) {
+    val += g.shfl_down(val, distance);
+    distance >>= 1;
+  }
+
+  // Rank 0 holds the reduction value; the remaining threads return no useful value
+  return (g.thread_rank() == 0) ? val : -1;
+}
+
+/**
+ * Sums `val` across the tile `g` by repeatedly adding the value fetched with shfl_xor,
+ * using the lane masks size/2, size/4, ..., 1.
+ *
+ * Returns the accumulated value on the thread with rank 0 in the tile and -1 on every
+ * other thread.
+ */
+template <unsigned int tileSz>
+__device__ int reduction_kernel_shfl_xor(cg::thread_block_tile<tileSz> const& g, int val) {
+  const int tile_width = g.size();
+  int lane_mask = tile_width / 2;
+  while (lane_mask > 0) {
+    val += g.shfl_xor(val, lane_mask);
+    lane_mask >>= 1;
+  }
+
+  // Only the value returned by rank 0 is meaningful to the caller
+  if (g.thread_rank() != 0) {
+    return -1;
+  }
+  return val;
+}
+
+/**
+ * Prefix sum over the tile `g` built on shfl_up: in every round each thread fetches the
+ * value held `stride` ranks below it and, if its own rank is at least `stride`, adds it to
+ * its running value. The stride doubles each round (1, 2, 4, ...) while it is below the
+ * tile size.
+ *
+ * Returns the running value of the calling thread.
+ */
+template <unsigned int tileSz>
+__device__ int prefix_sum_kernel(cg::thread_block_tile<tileSz> const& g, volatile int val) {
+  const int tile_width = g.size();
+#pragma unroll
+  for (int stride = 1; stride < tile_width; stride <<= 1) {
+    // Every thread takes part in the shuffle, even those that will not use the result
+    const int from_lower_rank = g.shfl_up(val, stride);
+
+    if (!(g.thread_rank() < stride)) {
+      val += from_lower_rank;
+    }
+  }
+  return val;
+}
+
+/**
+ * Partitions the calling thread block into tiles of `tile_size` threads and runs the
+ * routine selected by `shfl_test` on every tile, feeding each thread its rank within the
+ * tile as input.
+ *
+ * @param result     Output buffer. For shflUp every thread stores its value at its rank in
+ *                   the thread block; for shflDown / shflXor rank 0 of each tile stores the
+ *                   tile's value at index (block rank / tile_size).
+ * @param shfl_test  Which shuffle-based routine to run.
+ */
+template <unsigned int tile_size>
+static __global__ void kernel_cg_group_partition_static(int* result,
+                                                        TiledGroupShflTests shfl_test) {
+  cg::thread_block thread_block_CG_ty = cg::this_thread_block();
+  int input;
+  // Initialised so that an unexpected enumerator cannot leave the value indeterminate
+  int output_sum = 0;
+
+  // Choose a leader thread to print the results
+  if (thread_block_CG_ty.thread_rank() == 0) {
+    // The sizes are unsigned; convert them explicitly to match the %d conversions
+    printf(" Creating %d groups, of tile size %d threads:\n\n",
+           static_cast<int>(thread_block_CG_ty.size() / tile_size), static_cast<int>(tile_size));
+  }
+
+  thread_block_CG_ty.sync();
+
+  cg::thread_block_tile<tile_size> tiled_part = cg::tiled_partition<tile_size>(thread_block_CG_ty);
+
+  input = tiled_part.thread_rank();
+
+  switch (shfl_test) {
+    case (TiledGroupShflTests::shflDown):
+      output_sum = reduction_kernel_shfl_down(tiled_part, input);
+      break;
+    case (TiledGroupShflTests::shflXor):
+      output_sum = reduction_kernel_shfl_xor(tiled_part, input);
+      break;
+    case (TiledGroupShflTests::shflUp):
+      // The prefix sum produces one value per thread, so every thread stores its own
+      output_sum = prefix_sum_kernel(tiled_part, input);
+      result[thread_block_CG_ty.thread_rank()] = output_sum;
+      break;
+  }
+
+  // The reductions produce one value per tile, reported and stored by the tile's rank 0
+  if (tiled_part.thread_rank() == 0 && shfl_test != TiledGroupShflTests::shflUp) {
+    printf("   Sum of all ranks 0..%d in this tiled_part group is %d\n",
+           static_cast<int>(tiled_part.size()) - 1, output_sum);
+    result[thread_block_CG_ty.thread_rank() / (tile_size)] = output_sum;
+  }
+}
+
+/**
+ * Fills `expected_result` with the reference values for the selected shuffle test.
+ *
+ * shflDown / shflXor: each of the `size` entries is the sum 0 + 1 + ... + (tile_size - 1).
+ * shflUp: for each of the size / tile_size tiles, entry j of the tile is the running sum
+ * 0 + 1 + ... + j.
+ */
+static void expected_result_calc(int* expected_result, int tile_size, int size,
+                                 TiledGroupShflTests shfl_test) {
+  if (shfl_test == TiledGroupShflTests::shflDown || shfl_test == TiledGroupShflTests::shflXor) {
+    // Sum of the ranks in one tile
+    const int tile_total = ((tile_size - 1) * tile_size / 2);
+    int idx = 0;
+    while (idx < size) {
+      expected_result[idx] = tile_total;
+      ++idx;
+    }
+    return;
+  }
+
+  if (shfl_test == TiledGroupShflTests::shflUp) {
+    // Walk the output tile by tile, restarting the running sum at every tile boundary
+    int* out = expected_result;
+    for (int tile = 0; tile < size / tile_size; ++tile) {
+      int running = 0;
+      for (int lane = 0; lane < tile_size; ++lane) {
+        running += lane;
+        *out = running;
+        ++out;
+      }
+    }
+  }
+}
+
+/**
+ * Host driver for one tile size: launches kernel_cg_group_partition_static<tile_size> with a
+ * single block of 64 threads, copies the result back and compares it against the values
+ * produced by expected_result_calc().
+ */
+template <unsigned int tile_size> static void test_group_partition(TiledGroupShflTests shfl_test) {
+  const int num_blocks = 1;
+  const int threads_in_block = 64;
+  const int total_threads = num_blocks * threads_in_block;
+
+  // shflUp yields one value per thread, the reductions one value per tile
+  const int num_elem = (shfl_test == TiledGroupShflTests::shflUp)
+      ? total_threads
+      : static_cast<int>(total_threads / tile_size);
+  const size_t num_bytes = num_elem * sizeof(int);
+
+  int* expected_result = new int[num_elem];
+
+  int* result_host = nullptr;
+  HIP_CHECK(hipHostMalloc(&result_host, num_bytes, hipHostMallocDefault));
+  memset(result_host, 0, num_bytes);
+
+  int* result_dev = nullptr;
+  HIP_CHECK(hipMalloc(&result_dev, num_bytes));
+
+  // Launch Kernel
+  hipLaunchKernelGGL(kernel_cg_group_partition_static<tile_size>, num_blocks, threads_in_block,
+                     threads_in_block * sizeof(int), 0, result_dev, shfl_test);
+  HIP_CHECK(hipDeviceSynchronize());
+
+  // Fetch the device output and check it against the host reference
+  HIP_CHECK(hipMemcpy(result_host, result_dev, num_bytes, hipMemcpyDeviceToHost));
+  expected_result_calc(expected_result, tile_size, num_elem, shfl_test);
+  compareResults(expected_result, result_host, num_bytes);
+
+  // Free all allocated memory on host and device
+  HIP_CHECK(hipFree(result_dev));
+  HIP_CHECK(hipHostFree(result_host));
+  delete[] expected_result;
+}
+
+/**
+ * Runs the tiled-partition shuffle test for tile sizes 2, 4, 8, 16 and 32, once for each
+ * generated shuffle variant (shflDown, shflXor, shflUp). Skipped when the current device
+ * does not report cooperative launch support.
+ */
+TEST_CASE("Unit_hipCGThreadBlockTileType_Shfl") {
+  // The test is validated on the currently selected device
+  int current_device;
+  HIP_CHECK(hipGetDevice(&current_device));
+
+  hipDeviceProp_t props;
+  HIP_CHECK(hipGetDeviceProperties(&props, current_device));
+
+  if (!props.cooperativeLaunch) {
+    HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
+    return;
+  }
+
+  TiledGroupShflTests variant = GENERATE(
+      TiledGroupShflTests::shflDown, TiledGroupShflTests::shflXor, TiledGroupShflTests::shflUp);
+
+  // The tile size is a template argument, so each size needs its own instantiation
+  test_group_partition<2>(variant);
+  test_group_partition<4>(variant);
+  test_group_partition<8>(variant);
+  test_group_partition<16>(variant);
+  test_group_partition<32>(variant);
+}
@@ -1,177 +0,0 @@
-/*
-Copyright (c) 2020 - 2021 Advanced Micro Devices, Inc. All rights reserved.
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-
-
-/* HIT_START
- * BUILD: %t %s ../../test_common.cpp
- * TEST: %t
- * HIT_END
- */
-
-#include <hip_test_common.hh>
-#include <hip/hip_cooperative_groups.h>
-#include <cstdlib>
-
-#define ASSERT_EQUAL(lhs, rhs) HIPASSERT(lhs == rhs)
-
-using namespace cooperative_groups;
-
-static __global__
-void kernel_cg_thread_block_type(int *sizeTestD,
-                                 int *thdRankTestD,
-                                 int *syncTestD,
-                                 dim3 *groupIndexTestD,
-                                 dim3 *thdIndexTestD,
-                                 dim3 *groupDimTestD)
-{
-  thread_block tb = this_thread_block();
-  int gIdx = (blockIdx.x * blockDim.x) + threadIdx.x;
-  // Test size
-  sizeTestD[gIdx] = tb.size();
-
-  // Test thread_rank
-  thdRankTestD[gIdx] = tb.thread_rank();
-
-  // Test sync
-  __shared__ int sm[2];
-  if (threadIdx.x == 0)
-    sm[0] = 10;
-  else if (threadIdx.x == 1)
-    sm[1] = 20;
-  tb.sync();
-  syncTestD[gIdx] = sm[1] * sm[0];
-
-  // Test group_index
-  groupIndexTestD[gIdx] = tb.group_index();
-
-  // Test thread_index
-  thdIndexTestD[gIdx] = tb.thread_index();
-
-  // Test group_dim aka number of threads in a block
-  groupDimTestD[gIdx] = tb.group_dim();
-}
-
-static void test_cg_thread_block_type(int blockSize)
-{
-  int nBytes = sizeof(int) * 2 * blockSize;
-  int nDim3Bytes = sizeof(dim3) * 2 * blockSize;
-  int *sizeTestD, *sizeTestH;
-  int *thdRankTestD, *thdRankTestH;
-  int *syncTestD, *syncTestH;
-  dim3 *groupIndexTestD, *groupIndexTestH;
-  dim3 *thdIndexTestD, *thdIndexTestH, *groupDimTestD, *groupDimTestH;
-
-  // Allocate device memory
-  HIPCHECK(hipMalloc(&sizeTestD, nBytes));
-  HIPCHECK(hipMalloc(&thdRankTestD, nBytes));
-  HIPCHECK(hipMalloc(&syncTestD, nBytes));
-  HIPCHECK(hipMalloc(&groupIndexTestD, nDim3Bytes));
-  HIPCHECK(hipMalloc(&thdIndexTestD, nDim3Bytes));
-  HIPCHECK(hipMalloc(&groupDimTestD, nDim3Bytes));
-
-  // Allocate host memory
-  HIPCHECK(hipHostMalloc(&sizeTestH, nBytes));
-  HIPCHECK(hipHostMalloc(&thdRankTestH, nBytes));
-  HIPCHECK(hipHostMalloc(&syncTestH, nBytes));
-  HIPCHECK(hipHostMalloc(&groupIndexTestH, nDim3Bytes));
-  HIPCHECK(hipHostMalloc(&thdIndexTestH, nDim3Bytes));
-  HIPCHECK(hipHostMalloc(&groupDimTestH, nDim3Bytes));
-
-  // Launch Kernel
-  hipLaunchKernelGGL(kernel_cg_thread_block_type,
-                     2,
-                     blockSize,
-                     0,
-                     0,
-                     sizeTestD,
-                     thdRankTestD,
-                     syncTestD,
-                     groupIndexTestD,
-                     thdIndexTestD,
-                     groupDimTestD);
-
-  // Copy result from device to host
-  HIPCHECK(hipMemcpy(sizeTestH, sizeTestD, nBytes, hipMemcpyDeviceToHost));
-  HIPCHECK(hipMemcpy(thdRankTestH, thdRankTestD, nBytes, hipMemcpyDeviceToHost));
-  HIPCHECK(hipMemcpy(syncTestH, syncTestD, nBytes, hipMemcpyDeviceToHost));
-  HIPCHECK(hipMemcpy(groupIndexTestH, groupIndexTestD, nDim3Bytes, hipMemcpyDeviceToHost));
-  HIPCHECK(hipMemcpy(thdIndexTestH, thdIndexTestD, nDim3Bytes, hipMemcpyDeviceToHost));
-  HIPCHECK(hipMemcpy(groupDimTestH, groupDimTestD, nDim3Bytes, hipMemcpyDeviceToHost));
-
-  // Validate results for both blocks together
-  for (int i = 0; i < 2 * blockSize; ++i) {
-    ASSERT_EQUAL(sizeTestH[i], blockSize);
-    ASSERT_EQUAL(thdRankTestH[i], i % blockSize);
-    ASSERT_EQUAL(syncTestH[i], 200);
-    ASSERT_EQUAL(groupIndexTestH[i].x, (uint) i / blockSize);
-    ASSERT_EQUAL(groupIndexTestH[i].y, 0);
-    ASSERT_EQUAL(groupIndexTestH[i].z, 0);
-    ASSERT_EQUAL(thdIndexTestH[i].x, (uint) i % blockSize);
-    ASSERT_EQUAL(thdIndexTestH[i].y, 0);
-    ASSERT_EQUAL(thdIndexTestH[i].z, 0);
-    ASSERT_EQUAL(groupDimTestH[i].x, blockSize);
-    ASSERT_EQUAL(groupDimTestH[i].y, 1);
-    ASSERT_EQUAL(groupDimTestH[i].z, 1);
-  }
-
-  // Free device memory
-  HIPCHECK(hipFree(sizeTestD));
-  HIPCHECK(hipFree(thdRankTestD));
-  HIPCHECK(hipFree(syncTestD));
-  HIPCHECK(hipFree(groupIndexTestD));
-  HIPCHECK(hipFree(thdIndexTestD));
-  HIPCHECK(hipFree(groupDimTestD));
-
-  //Free host memory
-  HIPCHECK(hipHostFree(sizeTestH));
-  HIPCHECK(hipHostFree(thdRankTestH));
-  HIPCHECK(hipHostFree(syncTestH));
-  HIPCHECK(hipHostFree(groupIndexTestH));
-  HIPCHECK(hipHostFree(thdIndexTestH));
-  HIPCHECK(hipHostFree(groupDimTestH));
-}
-
-TEST_CASE("Unit_hipCGThreadBlockType") {
-  // Use default device for validating the test
-  int deviceId;
-  hipDeviceProp_t deviceProperties;
-  HIPCHECK(hipGetDevice(&deviceId));
-  HIPCHECK(hipGetDeviceProperties(&deviceProperties, deviceId));
-
-  if (!deviceProperties.cooperativeLaunch) {
-    HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
-    return;
-  }
-
-  // Test for blockSizes in powers of 2
-  int maxThreadsPerBlock = deviceProperties.maxThreadsPerBlock;
-  for (int blockSize = 2; blockSize <= maxThreadsPerBlock; blockSize = blockSize*2) {
-    test_cg_thread_block_type(blockSize);
-  }
-
-  // Test for random blockSizes, but the sequence is the same every execution
-  srand(0);
-  for (int i = 0; i < 10; i++) {
-    // Test fails for only 1 thread per block
-    test_cg_thread_block_type(max(2, rand() % maxThreadsPerBlock));
-  }
-}
@@ -1,136 +0,0 @@
-/*
-Copyright (c) 2020 - 2021 Advanced Micro Devices, Inc. All rights reserved.
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-
-
-/* HIT_START
- * BUILD: %t %s ../../test_common.cpp
- * TEST: %t
- * HIT_END
- */
-
-#include <hip_test_common.hh>
-#include "hip/hip_cooperative_groups.h"
-#include <cstdlib>
-
-#define ASSERT_EQUAL(lhs, rhs) assert(lhs == rhs)
-
-using namespace cooperative_groups;
-
-static __global__
-void kernel_cg_thread_block_type_via_base_type(int *sizeTestD,
-                                               int *thdRankTestD,
-                                               int *syncTestD)
-{
-  thread_group tg = this_thread_block();
-  int gIdx = (blockIdx.x * blockDim.x) + threadIdx.x;
-
-  // Test size
-  sizeTestD[gIdx] = tg.size();
-
-  // Test thread_rank
-  thdRankTestD[gIdx] = tg.thread_rank();
-
-  // Test sync
-  __shared__ int sm[2];
-  if (threadIdx.x == 0)
-    sm[0] = 10;
-  else if (threadIdx.x == 1)
-    sm[1] = 20;
-  tg.sync();
-  syncTestD[gIdx] = sm[1] * sm[0];
-}
-
-static void test_cg_thread_block_type_via_base_type(int blockSize)
-{
-  int nBytes = sizeof(int) * 2 * blockSize;
-  int *sizeTestD, *sizeTestH;
-  int *thdRankTestD, *thdRankTestH;
-  int *syncTestD, *syncTestH;
-
-  // Allocate device memory
-  HIPCHECK(hipMalloc(&sizeTestD, nBytes));
-  HIPCHECK(hipMalloc(&thdRankTestD, nBytes));
-  HIPCHECK(hipMalloc(&syncTestD, nBytes));
-
-  // Allocate host memory
-  HIPCHECK(hipHostMalloc(&sizeTestH, nBytes));
-  HIPCHECK(hipHostMalloc(&thdRankTestH, nBytes));
-  HIPCHECK(hipHostMalloc(&syncTestH, nBytes));
-
-  // Launch Kernel
-  hipLaunchKernelGGL(kernel_cg_thread_block_type_via_base_type,
-                     2,
-                     blockSize,
-                     0,
-                     0,
-                     sizeTestD,
-                     thdRankTestD,
-                     syncTestD);
-
-  // Copy result from device to host
-  HIPCHECK(hipMemcpy(sizeTestH, sizeTestD, nBytes, hipMemcpyDeviceToHost));
-  HIPCHECK(hipMemcpy(thdRankTestH, thdRankTestD, nBytes, hipMemcpyDeviceToHost));
-  HIPCHECK(hipMemcpy(syncTestH, syncTestD, nBytes, hipMemcpyDeviceToHost));
-
-  // Validate results for both blocks together
-  for (int i = 0; i < 2 * blockSize; ++i) {
-    ASSERT_EQUAL(sizeTestH[i], blockSize);
-    ASSERT_EQUAL(thdRankTestH[i], i % blockSize);
-    ASSERT_EQUAL(syncTestH[i], 200);
-  }
-
-  // Free device memory
-  HIPCHECK(hipFree(sizeTestD));
-  HIPCHECK(hipFree(thdRankTestD));
-  HIPCHECK(hipFree(syncTestD));
-
-  //Free host memory
-  HIPCHECK(hipHostFree(sizeTestH));
-  HIPCHECK(hipHostFree(thdRankTestH));
-  HIPCHECK(hipHostFree(syncTestH));
-}
-
-TEST_CASE("Unit_hipCGThreadBlockType_BaseType") {
-  // Use default device for validating the test
-  int deviceId;
-  hipDeviceProp_t deviceProperties;
-  HIPCHECK(hipGetDevice(&deviceId));
-  HIPCHECK(hipGetDeviceProperties(&deviceProperties, deviceId));
-
-  if (!deviceProperties.cooperativeLaunch) {
-    HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
-    return;
-  }
-
-  // Test for blockSizes in powers of 2
-  int maxThreadsPerBlock = deviceProperties.maxThreadsPerBlock;
-  for (int blockSize = 2; blockSize <= maxThreadsPerBlock; blockSize = blockSize*2) {
-    test_cg_thread_block_type_via_base_type(blockSize);
-  }
-
-  // Test for random blockSizes, but the sequence is the same every execution
-  srand(0);
-  for (int i = 0; i < 10; i++) {
-    // Test fails for only 1 thread per block
-    test_cg_thread_block_type_via_base_type(max(2, rand() % maxThreadsPerBlock));
-  }
-}
@@ -1,136 +0,0 @@
-/*
-Copyright (c) 2020 - 2021 Advanced Micro Devices, Inc. All rights reserved.
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-
-
-/* HIT_START
- * BUILD: %t %s ../../test_common.cpp
- * TEST: %t
- * HIT_END
- */
-
-#include <hip_test_common.hh>
-#include "hip/hip_cooperative_groups.h"
-#include <cstdlib>
-
-#define ASSERT_EQUAL(lhs, rhs) assert(lhs == rhs)
-
-using namespace cooperative_groups;
-
-static __global__
-void kernel_cg_thread_block_type_via_public_api(int *sizeTestD,
-                                                int *thdRankTestD,
-                                                int *syncTestD)
-{
-  thread_block tb = this_thread_block();
-  int gIdx = (blockIdx.x * blockDim.x) + threadIdx.x;
-
-  // Test group_size api
-  sizeTestD[gIdx] = group_size(tb);
-
-  // Test thread_rank api
-  thdRankTestD[gIdx] = thread_rank(tb);
-
-  // Test sync api
-  __shared__ int sm[2];
-  if (threadIdx.x == 0)
-    sm[0] = 10;
-  else if (threadIdx.x == 1)
-    sm[1] = 20;
-  sync(tb);
-  syncTestD[gIdx] = sm[1] * sm[0];
-}
-
-static void test_cg_thread_block_type_via_public_api(int blockSize)
-{
-  int nBytes = sizeof(int) * 2 * blockSize;
-  int *sizeTestD, *sizeTestH;
-  int *thdRankTestD, *thdRankTestH;
-  int *syncTestD, *syncTestH;
-
-  // Allocate device memory
-  HIPCHECK(hipMalloc(&sizeTestD, nBytes));
-  HIPCHECK(hipMalloc(&thdRankTestD, nBytes));
-  HIPCHECK(hipMalloc(&syncTestD, nBytes));
-
-  // Allocate host memory
-  HIPCHECK(hipHostMalloc(&sizeTestH, nBytes));
-  HIPCHECK(hipHostMalloc(&thdRankTestH, nBytes));
-  HIPCHECK(hipHostMalloc(&syncTestH, nBytes));
-
-  // Launch Kernel
-  hipLaunchKernelGGL(kernel_cg_thread_block_type_via_public_api,
-                     2,
-                     blockSize,
-                     0,
-                     0,
-                     sizeTestD,
-                     thdRankTestD,
-                     syncTestD);
-
-  // Copy result from device to host
-  HIPCHECK(hipMemcpy(sizeTestH, sizeTestD, nBytes, hipMemcpyDeviceToHost));
-  HIPCHECK(hipMemcpy(thdRankTestH, thdRankTestD, nBytes, hipMemcpyDeviceToHost));
-  HIPCHECK(hipMemcpy(syncTestH, syncTestD, nBytes, hipMemcpyDeviceToHost));
-
-  // Validate results for both blocks together
-  for (int i = 0; i < 2 * blockSize; ++i) {
-    ASSERT_EQUAL(sizeTestH[i], blockSize);
-    ASSERT_EQUAL(thdRankTestH[i], i % blockSize);
-    ASSERT_EQUAL(syncTestH[i], 200);
-  }
-
-  // Free device memory
-  HIPCHECK(hipFree(sizeTestD));
-  HIPCHECK(hipFree(thdRankTestD));
-  HIPCHECK(hipFree(syncTestD));
-
-  //Free host memory
-  HIPCHECK(hipHostFree(sizeTestH));
-  HIPCHECK(hipHostFree(thdRankTestH));
-  HIPCHECK(hipHostFree(syncTestH));
-}
-
-TEST_CASE("Unit_hipCGThreadBlockType_PublicApi") {
-  // Use default device for validating the test
-  int deviceId;
-  hipDeviceProp_t deviceProperties;
-  HIPCHECK(hipGetDevice(&deviceId));
-  HIPCHECK(hipGetDeviceProperties(&deviceProperties, deviceId));
-
-  if (!deviceProperties.cooperativeLaunch) {
-    HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
-    return;
-  }
-
-  // Test for blockSizes in powers of 2
-  int maxThreadsPerBlock = deviceProperties.maxThreadsPerBlock;
-  for (int blockSize = 2; blockSize <= maxThreadsPerBlock; blockSize = blockSize*2) {
-    test_cg_thread_block_type_via_public_api(blockSize);
-  }
-
-  // Test for random blockSizes, but the sequence is the same every execution
-  srand(0);
-  for (int i = 0; i < 10; i++) {
-    // Test fails for only 1 thread per block
-    test_cg_thread_block_type_via_public_api(max(2, rand() % maxThreadsPerBlock));
-  }
-}
@@ -0,0 +1,225 @@
+/*
+Copyright (c) 2020 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+#include <hip_test_common.hh>
+#include <hip/hip_cooperative_groups.h>
+
+#include "hip_cg_common.hh"
+
+namespace cg = cooperative_groups;
+
+enum class ThreadBlockTypeTests { basicApi, baseType, publicApi };
+
+static __global__ void kernel_cg_thread_block_type(int* size_dev, int* thd_rank_dev, int* sync_dev,
+                                                   dim3* group_index_dev, dim3* thd_index_dev,
+                                                   dim3* group_dim_dev) {
+  cg::thread_block tb = cg::this_thread_block();
+  int gIdx = (blockIdx.x * blockDim.x) + threadIdx.x;  // this thread's slot in every output array
+  // Test size: record the value tb.size() reports to this thread
+  size_dev[gIdx] = tb.size();
+
+  // Test thread_rank: record the value tb.thread_rank() reports to this thread
+  thd_rank_dev[gIdx] = tb.thread_rank();
+
+  // Test sync: threads 0 and 1 each write one shared value; every thread reads both after sync
+  __shared__ int sm[2];
+  if (threadIdx.x == 0)
+    sm[0] = 10;
+  else if (threadIdx.x == 1)
+    sm[1] = 20;
+  tb.sync();
+  sync_dev[gIdx] = sm[1] * sm[0];  // 20 * 10 = 200 when both writes are visible
+
+  // Test group_index: record the value tb.group_index() reports to this thread
+  group_index_dev[gIdx] = tb.group_index();
+
+  // Test thread_index: record the value tb.thread_index() reports to this thread
+  thd_index_dev[gIdx] = tb.thread_index();
+
+  // Test group_dim aka number of threads in a block
+  group_dim_dev[gIdx] = tb.group_dim();
+}
+
+static __global__ void kernel_cg_thread_block_type_via_base_type(int* size_dev, int* thd_rank_dev,
+                                                                 int* sync_dev) {
+  cg::thread_group tg = cg::this_thread_block();  // thread block held as a cg::thread_group
+  int gIdx = (blockIdx.x * blockDim.x) + threadIdx.x;  // this thread's slot in every output array
+
+  // Test size: record the value tg.size() reports to this thread
+  size_dev[gIdx] = tg.size();
+
+  // Test thread_rank: record the value tg.thread_rank() reports to this thread
+  thd_rank_dev[gIdx] = tg.thread_rank();
+
+  // Test sync: threads 0 and 1 each write one shared value; every thread reads both after sync
+  __shared__ int sm[2];
+  if (threadIdx.x == 0)
+    sm[0] = 10;
+  else if (threadIdx.x == 1)
+    sm[1] = 20;
+  tg.sync();
+  sync_dev[gIdx] = sm[1] * sm[0];  // 20 * 10 = 200 when both writes are visible
+}
+
+static __global__ void kernel_cg_thread_block_type_via_public_api(int* size_dev, int* thd_rank_dev,
+                                                                  int* sync_dev) {
+  cg::thread_block tb = cg::this_thread_block();
+  int gIdx = (blockIdx.x * blockDim.x) + threadIdx.x;  // this thread's slot in every output array
+
+  // Test group_size api: free function cg::group_size() applied to the thread block
+  size_dev[gIdx] = cg::group_size(tb);
+
+  // Test thread_rank api: free function cg::thread_rank() applied to the thread block
+  thd_rank_dev[gIdx] = cg::thread_rank(tb);
+
+  // Test sync api: threads 0 and 1 each write one shared value; all threads read both after sync
+  __shared__ int sm[2];
+  if (threadIdx.x == 0)
+    sm[0] = 10;
+  else if (threadIdx.x == 1)
+    sm[1] = 20;
+  cg::sync(tb);
+  sync_dev[gIdx] = sm[1] * sm[0];  // 20 * 10 = 200 when both writes are visible
+}
+
+static void test_cg_thread_block_type(ThreadBlockTypeTests test_type, int block_size) {
+  int num_bytes = sizeof(int) * 2 * block_size;  // one int per thread, two blocks are launched
+  int num_dim3_bytes = sizeof(dim3) * 2 * block_size;  // one dim3 per thread, two blocks
+  int *size_dev, *size_host;
+  int *thd_rank_dev, *thd_rank_host;
+  int *sync_dev, *sync_host;
+  dim3 *group_index_dev, *group_index_host;  // dim3 buffers are only allocated for basicApi
+  dim3 *thd_index_dev, *thd_index_host;
+  dim3 *group_dim_dev, *group_dim_host;
+
+  // Allocate device memory for the results every test type produces
+  HIP_CHECK(hipMalloc(&size_dev, num_bytes));
+  HIP_CHECK(hipMalloc(&thd_rank_dev, num_bytes));
+  HIP_CHECK(hipMalloc(&sync_dev, num_bytes));
+
+  // Allocate host memory for the results every test type produces
+  HIP_CHECK(hipHostMalloc(&size_host, num_bytes));
+  HIP_CHECK(hipHostMalloc(&thd_rank_host, num_bytes));
+  HIP_CHECK(hipHostMalloc(&sync_host, num_bytes));
+
+  switch (test_type) {  // launch the kernel matching test_type on 2 blocks of block_size threads
+    case (ThreadBlockTypeTests::basicApi):
+      HIP_CHECK(hipMalloc(&group_index_dev, num_dim3_bytes));
+      HIP_CHECK(hipMalloc(&thd_index_dev, num_dim3_bytes));
+      HIP_CHECK(hipMalloc(&group_dim_dev, num_dim3_bytes));
+      HIP_CHECK(hipHostMalloc(&group_index_host, num_dim3_bytes));
+      HIP_CHECK(hipHostMalloc(&thd_index_host, num_dim3_bytes));
+      HIP_CHECK(hipHostMalloc(&group_dim_host, num_dim3_bytes));
+
+      hipLaunchKernelGGL(kernel_cg_thread_block_type, 2, block_size, 0, 0, size_dev, thd_rank_dev,
+                         sync_dev, group_index_dev, thd_index_dev, group_dim_dev);
+      break;
+    case (ThreadBlockTypeTests::baseType):
+      hipLaunchKernelGGL(kernel_cg_thread_block_type_via_base_type, 2, block_size, 0, 0, size_dev,
+                         thd_rank_dev, sync_dev);
+      break;
+    case (ThreadBlockTypeTests::publicApi):
+      hipLaunchKernelGGL(kernel_cg_thread_block_type_via_public_api, 2, block_size, 0, 0, size_dev,
+                         thd_rank_dev, sync_dev);
+  }
+
+  // Copy result from device to host (the dim3 results exist only for basicApi)
+  HIP_CHECK(hipMemcpy(size_host, size_dev, num_bytes, hipMemcpyDeviceToHost));
+  HIP_CHECK(hipMemcpy(thd_rank_host, thd_rank_dev, num_bytes, hipMemcpyDeviceToHost));
+  HIP_CHECK(hipMemcpy(sync_host, sync_dev, num_bytes, hipMemcpyDeviceToHost));
+  if (test_type == ThreadBlockTypeTests::basicApi) {
+    HIP_CHECK(hipMemcpy(group_index_host, group_index_dev, num_dim3_bytes, hipMemcpyDeviceToHost));
+    HIP_CHECK(hipMemcpy(thd_index_host, thd_index_dev, num_dim3_bytes, hipMemcpyDeviceToHost));
+    HIP_CHECK(hipMemcpy(group_dim_host, group_dim_dev, num_dim3_bytes, hipMemcpyDeviceToHost));
+  }
+
+  // Validate results for both blocks together; i is the global thread index
+  for (int i = 0; i < 2 * block_size; ++i) {
+    ASSERT_EQUAL(size_host[i], block_size);
+    ASSERT_EQUAL(thd_rank_host[i], i % block_size);
+    ASSERT_EQUAL(sync_host[i], 200);  // 10 * 20, the two shared values written by the kernel
+    if (test_type == ThreadBlockTypeTests::basicApi) {
+      ASSERT_EQUAL(group_index_host[i].x, (uint)i / block_size);
+      ASSERT_EQUAL(group_index_host[i].y, 0);
+      ASSERT_EQUAL(group_index_host[i].z, 0);
+      ASSERT_EQUAL(thd_index_host[i].x, (uint)i % block_size);
+      ASSERT_EQUAL(thd_index_host[i].y, 0);
+      ASSERT_EQUAL(thd_index_host[i].z, 0);
+      ASSERT_EQUAL(group_dim_host[i].x, block_size);
+      ASSERT_EQUAL(group_dim_host[i].y, 1);
+      ASSERT_EQUAL(group_dim_host[i].z, 1);
+    }
+  }
+
+  // Free device memory
+  HIP_CHECK(hipFree(size_dev));
+  HIP_CHECK(hipFree(thd_rank_dev));
+  HIP_CHECK(hipFree(sync_dev));
+
+  // Free host memory
+  HIP_CHECK(hipHostFree(size_host));
+  HIP_CHECK(hipHostFree(thd_rank_host));
+  HIP_CHECK(hipHostFree(sync_host));
+
+  if (test_type == ThreadBlockTypeTests::basicApi) {  // release the basicApi-only dim3 buffers
+    HIP_CHECK(hipFree(group_index_dev));
+    HIP_CHECK(hipFree(thd_index_dev));
+    HIP_CHECK(hipFree(group_dim_dev));
+    HIP_CHECK(hipHostFree(group_index_host));
+    HIP_CHECK(hipHostFree(thd_index_host));
+    HIP_CHECK(hipHostFree(group_dim_host));
+  }
+}
+
+
+TEST_CASE("Unit_hipCGThreadBlockType") {
+  // Query the properties of the currently selected device
+  int device;
+  hipDeviceProp_t device_properties;
+  HIP_CHECK(hipGetDevice(&device));
+  HIP_CHECK(hipGetDeviceProperties(&device_properties, device));
+
+  if (!device_properties.cooperativeLaunch) {
+    HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
+    return;
+  }
+
+  ThreadBlockTypeTests test_type = ThreadBlockTypeTests::basicApi;  // set by the SECTION that runs
+
+  SECTION("Default thread block API test") { test_type = ThreadBlockTypeTests::basicApi; }
+
+  SECTION("Base type thread block API test") { test_type = ThreadBlockTypeTests::baseType; }
+
+  SECTION("Public API thread block test") { test_type = ThreadBlockTypeTests::publicApi; }
+
+  // Test for block sizes in powers of 2, from 2 up to the device's maxThreadsPerBlock
+  int max_threads_per_blk = device_properties.maxThreadsPerBlock;
+  for (int block_size = 2; block_size <= max_threads_per_blk; block_size = block_size * 2) {
+    test_cg_thread_block_type(test_type, block_size);
+  }
+
+  // Test for random block_size; srand(0) fixes the seed so the sequence is the same every run
+  srand(0);
+  for (int i = 0; i < 10; i++) {
+    // Clamp to >= 2 threads: the sync check needs threads 0 and 1 to both write shared values
+    test_cg_thread_block_type(test_type, max(2, rand() % max_threads_per_blk));
+  }
+}
@@ -1,385 +0,0 @@
-/*
-Copyright (c) 2020 - 2023 Advanced Micro Devices, Inc. All rights reserved.
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-
-// Test Description:
-/* This test implements sum reduction kernel, first with each threads own rank
-   as input and comparing the sum with expected sum output derieved from n(n-1)/2
-   formula. The second part, partitions this parent group into child subgroups
-   a.k.a tiles using using tiled_partition() collective operation. This can be called
-   with a static tile size, passed in templated non-type variable-tiled_partition<tileSz>,
-   or in runtime as tiled_partition(thread_group parent, tileSz). This test covers both these
-   cases.
-   This test tests functionality of cg group partitioning, (static and dynamic) and its respective
-   API's size(), thread_rank(), and sync().
-*/
-
-#include <hip_test_common.hh>
-#include <hip/hip_cooperative_groups.h>
-#include <stdio.h>
-#include <vector>
-
-using namespace cooperative_groups;
-
-/* Parallel reduce kernel.
- *
- * Step complexity: O(log n)
- * Work complexity: O(n)
- *
- * Note: This kernel works only with power of 2 input arrays.
- */
-__device__ int reduction_kernel(thread_group g, int* x, int val) {
-  int lane = g.thread_rank();
-
-  for (int i = g.size() / 2; i > 0; i /= 2) {
-    // use lds to store the temporary result
-    x[lane] = val;
-    // Ensure all the stores are completed.
-    g.sync();
-
-    if (lane < i) {
-      val += x[lane + i];
-    }
-    // It must work on one tiled thread group at a time,
-    // and it must make sure all memory operations are
-    // completed before moving to the next stride.
-    // sync() here just does that.
-    g.sync();
-  }
-
-  // Choose the 0'th indexed thread that holds the reduction value to return
-  if (g.thread_rank() == 0) {
-    return val;
-  }
-  // Rest of the threads return no useful values
-  else {
-    return -1;
-  }
-}
-
-template <unsigned int tileSz>
-__global__ void kernel_cg_group_partition_static(int* result, bool isGlobalMem, int* globalMem) {
-  thread_block threadBlockCGTy = this_thread_block();
-  int threadBlockGroupSize = threadBlockCGTy.size();
-
-  int* workspace = NULL;
-
-  if (isGlobalMem) {
-    workspace = globalMem;
-  } else {
-    // Declare a shared memory
-    extern __shared__ int sharedMem[];
-    workspace = sharedMem;
-  }
-
-  int input, outputSum, expectedOutput;
-
-  // we pass its own thread rank as inputs
-  input = threadBlockCGTy.thread_rank();
-
-  expectedOutput = (threadBlockGroupSize - 1) * threadBlockGroupSize / 2;
-
-  outputSum = reduction_kernel(threadBlockCGTy, workspace, input);
-
-  // Choose a leader thread to print the results
-  if (threadBlockCGTy.thread_rank() == 0) {
-    printf(" Sum of all ranks 0..%d in threadBlockCooperativeGroup is %d (expected %d)\n\n",
-           (int)threadBlockCGTy.size() - 1, outputSum, expectedOutput);
-    printf(" Creating %d groups, of tile size %d threads:\n\n",
-           (int)threadBlockCGTy.size() / tileSz, tileSz);
-  }
-
-  threadBlockCGTy.sync();
-
-  thread_block_tile<tileSz> tiledPartition = tiled_partition<tileSz>(threadBlockCGTy);
-
-  // This offset allows each group to have its own unique area in the workspace array
-  int workspaceOffset = threadBlockCGTy.thread_rank() - tiledPartition.thread_rank();
-
-  outputSum = reduction_kernel(tiledPartition, workspace + workspaceOffset, input);
-
-  if (tiledPartition.thread_rank() == 0) {
-    printf(
-        "   Sum of all ranks 0..%d in this tiledPartition group is %d. Corresponding parent thread "
-        "rank via meta_group_rank : %d and the total number of groups created when partitioned : %d\n",
-        tiledPartition.size() - 1, outputSum, tiledPartition.meta_group_rank(), tiledPartition.meta_group_size());
-    result[input / (tileSz)] = outputSum;
-  }
-  return;
-}
-
-
-__global__ void kernel_cg_group_partition_dynamic(unsigned int tileSz, int* result,
-                                                  bool isGlobalMem, int* globalMem) {
-  thread_block threadBlockCGTy = this_thread_block();
-
-  int* workspace = NULL;
-
-  if (isGlobalMem) {
-    workspace = globalMem;
-  } else {
-    // Declare a shared memory
-    extern __shared__ int sharedMem[];
-    workspace = sharedMem;
-  }
-
-  int input, outputSum;
-
-  // input to reduction, for each thread, is its' rank in the group
-  input = threadBlockCGTy.thread_rank();
-
-  outputSum = reduction_kernel(threadBlockCGTy, workspace, input);
-
-  if (threadBlockCGTy.thread_rank() == 0) {
-    printf(" Sum of all ranks 0..%d in threadBlockCooperativeGroup is %d\n\n",
-           (int)threadBlockCGTy.size() - 1, outputSum);
-    printf(" Creating %d groups, of tile size %d threads:\n\n",
-           (int)threadBlockCGTy.size() / tileSz, tileSz);
-  }
-
-  threadBlockCGTy.sync();
-
-  thread_group tiledPartition = tiled_partition(threadBlockCGTy, tileSz);
-
-  // This offset allows each group to have its own unique area in the workspace array
-  int workspaceOffset = threadBlockCGTy.thread_rank() - tiledPartition.thread_rank();
-
-  outputSum = reduction_kernel(tiledPartition, workspace + workspaceOffset, input);
-
-  if (tiledPartition.thread_rank() == 0) {
-     printf(
-        "   Sum of all ranks 0..%d in this tiledPartition group is %d. Corresponding parent thread "
-        " %d\n", tiledPartition.size() - 1, outputSum, input);
-    result[input / (tileSz)] = outputSum;
-  }
-  return;
-}
-
-// Search if the sum exists in the expected results array
-void verifyResults(int* hPtr, int* dPtr, int size) {
-  int i = 0, j = 0;
-  for (i = 0; i < size; i++) {
-    for (j = 0; j < size; j++) {
-      if (hPtr[i] == dPtr[j]) {
-        break;
-      }
-    }
-    if (j == size) {
-      REQUIRE(" Result verification failed!");
-    }
-  }
-}
-
-
-template <unsigned int tileSz> static void test_group_partition(bool useGlobalMem) {
-  hipError_t err;
-  int blockSize = 1;
-  int threadsPerBlock = 64;
-
-  int numTiles = (blockSize * threadsPerBlock) / tileSz;
-
-  // Build an array of expected reduction sum output on the host
-  // based on the sum of their respective thread ranks for verification.
-  // eg: parent group has 64threads.
-  // child thread ranks: 0-15, 16-31, 32-47, 48-63
-  // expected sum:       120,   376,  632,  888
-  int* expectedSum = new int[numTiles];
-  int temp = 0, sum = 0;
-
-  for (int i = 1; i <= numTiles; i++) {
-    sum = temp;
-    temp = (((tileSz * i) - 1) * (tileSz * i)) / 2;
-    expectedSum[i-1] = temp - sum;
-  }
-
-  int* dResult = NULL;
-  HIPCHECK(hipMalloc((void**)&dResult, numTiles * sizeof(int)));
-
-  int* globalMem = NULL;
-  if (useGlobalMem) {
-    HIPCHECK(hipMalloc((void**)&globalMem, threadsPerBlock * sizeof(int)));
-  }
-
-  int* hResult = NULL;
-  HIPCHECK(hipHostMalloc(&hResult, numTiles * sizeof(int), hipHostMallocDefault));
-  memset(hResult, 0, numTiles * sizeof(int));
-
-  if (useGlobalMem) {
-    // Launch Kernel
-    hipLaunchKernelGGL(kernel_cg_group_partition_static<tileSz>, blockSize, threadsPerBlock, 0, 0,
-                       dResult, useGlobalMem, globalMem);
-    err = hipDeviceSynchronize();
-    if (err != hipSuccess) {
-      fprintf(stderr, "Failed to launch kernel (error code %s)!\n", hipGetErrorString(err));
-    }
-  } else {
-    // Launch Kernel
-    hipLaunchKernelGGL(kernel_cg_group_partition_static<tileSz>, blockSize, threadsPerBlock,
-                       threadsPerBlock * sizeof(int), 0, dResult, useGlobalMem, globalMem);
-    err = hipDeviceSynchronize();
-    if (err != hipSuccess) {
-      fprintf(stderr, "Failed to launch kernel (error code %s)!\n", hipGetErrorString(err));
-    }
-  }
-
-  HIPCHECK(hipMemcpy(hResult, dResult, numTiles * sizeof(int), hipMemcpyDeviceToHost));
-
-  verifyResults(expectedSum, hResult, numTiles);
-
-  // Free all allocated memory on host and device
-  HIPCHECK(hipFree(dResult));
-  HIPCHECK(hipFree(hResult));
-  if (useGlobalMem) {
-    HIPCHECK(hipFree(globalMem));
-  }
-  delete[] expectedSum;
-
-  printf("\n...PASSED.\n\n");
-}
-
-static void test_group_partition(unsigned int tileSz, bool useGlobalMem) {
-  hipError_t err;
-  int blockSize = 1;
-  int threadsPerBlock = 64;
-
-  int numTiles = (blockSize * threadsPerBlock) / tileSz;
-  // Build an array of expected reduction sum output on the host
-  // based on the sum of their respective thread ranks to use for verification
-  int* expectedSum = new int[numTiles];
-  int temp = 0, sum = 0;
-  for (int i = 1; i <= numTiles; i++) {
-    sum = temp;
-    temp = (((tileSz * i) - 1) * (tileSz * i)) / 2;
-    expectedSum[i-1] = temp - sum;
-  }
-
-  int* dResult = NULL;
-  HIPCHECK(hipMalloc(&dResult, sizeof(int) * numTiles));
-
-  int* globalMem = NULL;
-  if (useGlobalMem) {
-    HIPCHECK(hipMalloc((void**)&globalMem, threadsPerBlock * sizeof(int)));
-  }
-
-  int* hResult = NULL;
-  HIPCHECK(hipHostMalloc(&hResult, numTiles * sizeof(int), hipHostMallocDefault));
-  memset(hResult, 0, numTiles * sizeof(int));
-
-  // Launch Kernel
-  if (useGlobalMem) {
-    hipLaunchKernelGGL(kernel_cg_group_partition_dynamic, blockSize, threadsPerBlock, 0, 0, tileSz,
-                       dResult, useGlobalMem, globalMem);
-
-    err = hipDeviceSynchronize();
-    if (err != hipSuccess) {
-      fprintf(stderr, "Failed to launch kernel (error code %s)!\n", hipGetErrorString(err));
-    }
-  } else {
-    hipLaunchKernelGGL(kernel_cg_group_partition_dynamic, blockSize, threadsPerBlock,
-                       threadsPerBlock * sizeof(int), 0, tileSz, dResult, useGlobalMem, globalMem);
-
-    err = hipDeviceSynchronize();
-    if (err != hipSuccess) {
-      fprintf(stderr, "Failed to launch kernel (error code %s)!\n", hipGetErrorString(err));
-    }
-  }
-
-  HIPCHECK(hipMemcpy(hResult, dResult, numTiles * sizeof(int), hipMemcpyDeviceToHost));
-
-  verifyResults(expectedSum, hResult, numTiles);
-
-  // Free all allocated memory on host and device
-  HIPCHECK(hipFree(dResult));
-  HIPCHECK(hipFree(hResult));
-  if (useGlobalMem) {
-    HIPCHECK(hipFree(globalMem));
-  }
-  delete[] expectedSum;
-
-  printf("\n...PASSED.\n\n");
-}
-
-TEST_CASE("Unit_tiled_partition") {
-  // Use default device for validating the test
-  int deviceId;
-  HIP_CHECK_ERROR(hipGetDevice(&deviceId), hipSuccess);
-  hipDeviceProp_t deviceProperties;
-  HIP_CHECK_ERROR(hipGetDeviceProperties(&deviceProperties, deviceId), hipSuccess);
-
-  if (!deviceProperties.cooperativeLaunch) {
-    HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
-  }
-
-  bool useGlobalMem = true;
-  std::cout << "Testing static tiled_partition for different tile sizes" << std::endl;
-  std::cout << "\nUsing global memory for computation\n";
-  /* Test static tile_partition */
-  std::cout << "TEST 1:" << '\n' << std::endl;
-  test_group_partition<2>(useGlobalMem);
-  std::cout << "TEST 2:" << '\n' << std::endl;
-  test_group_partition<4>(useGlobalMem);
-  std::cout << "TEST 3:" << '\n' << std::endl;
-  test_group_partition<8>(useGlobalMem);
-  std::cout << "TEST 4:" << '\n' << std::endl;
-  test_group_partition<16>(useGlobalMem);
-  std::cout << "TEST 5:" << '\n' << std::endl;
-  test_group_partition<32>(useGlobalMem);
-
-  useGlobalMem = false;
-  std::cout << "Testing static tiled_partition for different tile sizes" << std::endl;
-  std::cout << "\nUsing shared memory for computation\n";
-  /* Test static tile_partition */
-  std::cout << "TEST 1:" << '\n' << std::endl;
-  test_group_partition<2>(useGlobalMem);
-  std::cout << "TEST 2:" << '\n' << std::endl;
-  test_group_partition<4>(useGlobalMem);
-  std::cout << "TEST 3:" << '\n' << std::endl;
-  test_group_partition<8>(useGlobalMem);
-  std::cout << "TEST 4:" << '\n' << std::endl;
-  test_group_partition<16>(useGlobalMem);
-  std::cout << "TEST 5:" << '\n' << std::endl;
-  test_group_partition<32>(useGlobalMem);
-
-
-  std::cout << "Now testing dynamic tiled_partition for different tile sizes" << '\n' << std::endl;
-
-  /* Test dynamic group partition*/
-  useGlobalMem = true;
-  int testNo = 1;
-  std::vector<unsigned int> tileSizes = {2, 4, 8, 16, 32};
-  std::cout << "\nUsing global memory for computation\n";
-  for (auto i : tileSizes) {
-    std::cout << "TEST " << testNo << ":" << '\n' << std::endl;
-    test_group_partition(i, useGlobalMem);
-    testNo++;
-  }
-
-  useGlobalMem = false;
-  testNo = 1;
-  std::cout << "\nUsing shared memory for computation\n";
-  for (auto i : tileSizes) {
-    std::cout << "TEST " << testNo << ":" << '\n' << std::endl;
-    test_group_partition(i, useGlobalMem);
-    testNo++;
-  }
-  printf("\n...PASSED.\n\n");
-  return;
-}
@@ -0,0 +1,279 @@
+/*
+Copyright (c) 2020 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+// Test Description:
+/* This test implements a sum reduction kernel, first with each thread's own rank
+   as input and comparing the sum with the expected sum output derived from the n(n-1)/2
+   formula. The second part partitions this parent group into child subgroups
+   a.k.a. tiles using the tiled_partition() collective operation. This can be called
+   with a static tile size, passed in templated non-type variable-tiled_partition<tileSz>,
+   or in runtime as tiled_partition(thread_group parent, tileSz). This test covers both these
+   cases.
+   This test tests functionality of cg group partitioning (static and dynamic) and its respective
+   APIs size(), thread_rank(), and sync().
+*/
+
+#include <hip_test_common.hh>
+#include <hip/hip_cooperative_groups.h>
+#include <cstdlib>
+
+#include "hip_cg_common.hh"
+
+namespace cg = cooperative_groups;
+
+/* Parallel sum reduction over the threads of group g, using x as scratch space.
+ *
+ * Step complexity: O(log n)
+ * Work complexity: O(n)
+ *
+ * Note: This only works for power of 2 group sizes. Rank 0 returns the sum, others -1.
+ */
+__device__ int reduction_kernel(cg::thread_group g, int* x, int val) {
+  int lane = g.thread_rank();
+
+  for (int i = g.size() / 2; i > 0; i /= 2) {  // stride halves on every pass
+    // stage this thread's partial sum in the caller-provided workspace
+    x[lane] = val;
+    // Ensure all the stores are completed.
+    g.sync();
+
+    if (lane < i) {
+      val += x[lane + i];
+    }
+    // It must work on one tiled thread group at a time,
+    // and it must make sure all memory operations are
+    // completed before moving to the next stride.
+    // sync() here just does that.
+    g.sync();
+  }
+
+  // Choose the 0'th indexed thread that holds the reduction value to return
+  if (g.thread_rank() == 0) {
+    return val;
+  }
+  // Rest of the threads return no useful values
+  else {
+    return -1;
+  }
+}
+
+template <unsigned int tile_size>
+__global__ void kernel_cg_group_partition_static(int* result, bool is_global_mem, int* global_mem) {
+  cg::thread_block thread_block_CG_ty = cg::this_thread_block();
+
+  int* workspace = NULL;  // scratch array handed to reduction_kernel
+
+  if (is_global_mem) {
+    workspace = global_mem;
+  } else {
+    // Use the dynamically sized shared memory array as the workspace
+    extern __shared__ int shared_mem[];
+    workspace = shared_mem;
+  }
+
+  int input, output_sum, expected_output;
+
+  // input to reduction, for each thread, is its rank in the group
+  input = thread_block_CG_ty.thread_rank();
+
+  expected_output = (thread_block_CG_ty.size() - 1) * thread_block_CG_ty.size() / 2;
+
+  output_sum = reduction_kernel(thread_block_CG_ty, workspace, input);  // whole-block reduction
+
+  if (thread_block_CG_ty.thread_rank() == 0) {  // only rank 0 holds the reduced sum
+    printf(" Sum of all ranks 0..%d in threadBlockCooperativeGroup is %d (expected %d)\n\n",
+           (int)thread_block_CG_ty.size() - 1, output_sum, expected_output);
+    printf(" Creating %d groups, of tile size %d threads:\n\n",
+           (int)thread_block_CG_ty.size() / tile_size, tile_size);
+  }
+
+  thread_block_CG_ty.sync();
+
+  cg::thread_block_tile<tile_size> tiled_part = cg::tiled_partition<tile_size>(thread_block_CG_ty);
+
+  // This offset allows each group to have its own unique area in the workspace array
+  int workspace_offset = thread_block_CG_ty.thread_rank() - tiled_part.thread_rank();
+
+  output_sum = reduction_kernel(tiled_part, workspace + workspace_offset, input);  // per-tile sum
+
+  if (tiled_part.thread_rank() == 0) {  // tile leader publishes the tile's sum
+    printf(
+        "   Sum of all ranks 0..%d in this tiledPartition group is %d. Corresponding parent thread "
+        "rank: via meta_group_rank : %d and the total number of groups created when partitioned : "
+        "%d\n",
+        tiled_part.size() - 1, output_sum, tiled_part.meta_group_rank(),
+        tiled_part.meta_group_size());
+    result[input / (tile_size)] = output_sum;  // one result slot per tile
+  }
+  return;
+}
+
+__global__ void kernel_cg_group_partition_dynamic(unsigned int tile_size, int* result,
+                                                  bool is_global_mem, int* global_mem) {
+  cg::thread_block thread_block_CG_ty = cg::this_thread_block();
+
+  int* workspace = NULL;  // scratch array handed to reduction_kernel
+
+  if (is_global_mem) {
+    workspace = global_mem;
+  } else {
+    // Use the dynamically sized shared memory array as the workspace
+    extern __shared__ int shared_mem[];
+    workspace = shared_mem;
+  }
+
+  int input, output_sum;
+
+  // input to reduction, for each thread, is its rank in the group
+  input = thread_block_CG_ty.thread_rank();
+
+  output_sum = reduction_kernel(thread_block_CG_ty, workspace, input);  // whole-block reduction
+
+  if (thread_block_CG_ty.thread_rank() == 0) {  // only rank 0 holds the reduced sum
+    printf("\n\n\n Sum of all ranks 0..%d in threadBlockCooperativeGroup is %d\n\n",
+           (int)thread_block_CG_ty.size() - 1, output_sum);
+    printf(" Creating %d groups, of tile size %d threads:\n\n",
+           (int)thread_block_CG_ty.size() / tile_size, tile_size);
+  }
+
+  thread_block_CG_ty.sync();
+
+  cg::thread_group tiled_part = cg::tiled_partition(thread_block_CG_ty, tile_size);
+
+  // This offset allows each group to have its own unique area in the workspace array
+  int workspace_offset = thread_block_CG_ty.thread_rank() - tiled_part.thread_rank();
+
+  output_sum = reduction_kernel(tiled_part, workspace + workspace_offset, input);  // per-tile sum
+
+  if (tiled_part.thread_rank() == 0) {  // tile leader publishes the tile's sum
+    printf(
+        "   Sum of all ranks 0..%d in this tiledPartition group is %d. Corresponding parent thread "
+        "rank: %d\n",
+        static_cast<int>(tiled_part.size()) - 1, output_sum, input);
+    result[input / (tile_size)] = output_sum;  // one result slot per tile
+  }
+  return;
+}
+
+template <typename F>
+static void common_group_partition(F kernel_func, unsigned int tile_size, void** params,
+                                   size_t num_params, bool use_global_mem) {
+  int block_size = 1;  // passed as the grid dimension of the launch below
+  int threads_per_blk = 64;
+
+  int num_tiles = (block_size * threads_per_blk) / tile_size;
+
+  // Build an array of expected reduction sum output on the host
+  // based on the sum of their respective thread ranks for verification.
+  // e.g. parent group has 64 threads and tile_size is 16:
+  // child thread ranks: 0-15, 16-31, 32-47, 48-63
+  // expected sum:       120,   376,  632,  888
+  int* expected_sum = new int[num_tiles];
+  int temp = 0, sum = 0;
+
+  for (int i = 1; i <= num_tiles; i++) {
+    sum = temp;  // running total up to the end of the previous tile
+    temp = (((tile_size * i) - 1) * (tile_size * i)) / 2;  // sum of ranks 0 .. tile_size*i - 1
+    expected_sum[i - 1] = temp - sum;
+  }
+
+  int* result_dev = NULL;
+  HIP_CHECK(hipMalloc((void**)&result_dev, num_tiles * sizeof(int)));
+
+  int* global_mem = NULL;  // stays NULL when the kernel uses shared memory instead
+  if (use_global_mem) {
+    HIP_CHECK(hipMalloc((void**)&global_mem, threads_per_blk * sizeof(int)));
+  }
+
+  int* result_host = NULL;
+  HIP_CHECK(hipHostMalloc(&result_host, num_tiles * sizeof(int), hipHostMallocDefault));
+  memset(result_host, 0, num_tiles * sizeof(int));
+
+  params[num_params + 0] = &result_dev;  // common kernel args follow the caller-supplied ones
+  params[num_params + 1] = &use_global_mem;
+  params[num_params + 2] = &global_mem;
+
+  if (use_global_mem) {
+    // Launch kernel without dynamic shared memory; global_mem is the workspace
+    HIP_CHECK(hipLaunchCooperativeKernel(kernel_func, block_size, threads_per_blk, params, 0, 0));
+    HIP_CHECK(hipDeviceSynchronize());
+  } else {
+    // Launch kernel with one int of dynamic shared memory per thread as the workspace
+    HIP_CHECK(hipLaunchCooperativeKernel(kernel_func, block_size, threads_per_blk, params,
+                                         threads_per_blk * sizeof(int), 0));
+    HIP_CHECK(hipDeviceSynchronize());
+  }
+
+  HIP_CHECK(hipMemcpy(result_host, result_dev, num_tiles * sizeof(int), hipMemcpyDeviceToHost));
+
+  verifyResults(expected_sum, result_host, num_tiles);
+
+  // Free all allocated memory on host and device
+  HIP_CHECK(hipFree(result_dev));
+  HIP_CHECK(hipHostFree(result_host));
+  if (use_global_mem) {
+    HIP_CHECK(hipFree(global_mem));
+  }
+  delete[] expected_sum;
+}
+
+template <unsigned int tile_size> static void test_group_partition(bool use_global_mem) {
+  void* params[3];  // all three slots are filled in by common_group_partition
+  size_t num_params = 0;  // the static kernel has no arguments ahead of the common ones
+  common_group_partition(kernel_cg_group_partition_static<tile_size>, tile_size, params, num_params,
+                         use_global_mem);
+}
+
+static void test_group_partition(unsigned int tile_size, bool use_global_mem) {
+  void* params[4];  // slot 0 is tile_size; slots 1-3 are filled in by common_group_partition
+  params[0] = &tile_size;
+  size_t num_params = 1;  // one kernel argument precedes the common ones
+  common_group_partition(kernel_cg_group_partition_dynamic, tile_size, params, num_params,
+                         use_global_mem);
+}
+
+TEST_CASE("Unit_hipCGThreadBlockTileType") {
+  // Query the properties of the currently selected device
+  int device;
+  hipDeviceProp_t device_properties;
+  HIP_CHECK(hipGetDevice(&device));
+  HIP_CHECK(hipGetDeviceProperties(&device_properties, device));
+
+  if (!device_properties.cooperativeLaunch) {
+    HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
+    return;
+  }
+
+  bool use_global_mem = GENERATE(true, false);  // global-memory vs. shared-memory workspace
+
+  SECTION("Static tile partition") {  // tile size is a template argument
+    test_group_partition<2>(use_global_mem);
+    test_group_partition<4>(use_global_mem);
+    test_group_partition<8>(use_global_mem);
+    test_group_partition<16>(use_global_mem);
+    test_group_partition<32>(use_global_mem);
+  }
+
+  SECTION("Dynamic tile partition") {  // tile size is passed as a kernel argument
+    unsigned int tile_size = GENERATE(2, 4, 8, 16, 32);
+    test_group_partition(tile_size, use_global_mem);
+  }
+}
@@ -0,0 +1,606 @@
+/*
+Copyright (c) 2020 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+// Test Description:
+/*The general idea of the application is to test how multi-GPU Cooperative
+Groups kernel launches to a stream interact with other things that may be
+simultaneously running in the same streams.
+
+The HIP specification says that a multi-GPU cooperative launch will wait
+until all of the streams it's using finish their work. Only then will the
+cooperative kernel be launched to all of the devices. Then no other work
+can run in any of the streams until all of the multi-GPU cooperative work
+is done.
+
+However, there are flags that allow you to disable each of these
+serialization points: hipCooperativeLaunchMultiDeviceNoPreSync and
+hipCooperativeLaunchMultiDeviceNoPostSync.
+
+As such, this benchmark tests the following five situations launching
+to two GPUs (and thus two streams):
+
+    1. Normal multi-GPU cooperative kernel:
+        This should result in the following pattern:
+        Stream 0: Cooperative
+        Stream 1: Cooperative
+    2. Regular kernel launches and multi-GPU cooperative kernel launches
+       with the default flags, resulting in the following pattern:
+        Stream 0: Regular --> Cooperative
+        Stream 1:         --> Cooperative --> Regular
+
+    3. Regular kernel launches and multi-GPU cooperative kernel launches
+       that turn off "pre-sync". This should allow a cooperative kernel
+       to launch even if work is already in a stream pointing to
+       another GPU.
+        This should result in the following pattern:
+        Stream 0: Regular --> Cooperative
+        Stream 1: Cooperative            --> Regular
+
+    4. Regular kernel launches and multi-GPU cooperative kernel launches
+       that turn off "post-sync". This should allow a new kernel to enter
+       a GPU even if another GPU still has a cooperative kernel on it.
+        This should result in the following pattern:
+        Stream 0: Regular --> Cooperative
+        Stream 1:         --> Cooperative--> Regular
+
+    5. Regular kernel launches and multi-GPU cooperative kernel launches
+       that turn off both pre- and post-sync. This should allow any of
+       the kernels to launch to their GPU regardless of the status of
+       other kernels in other multi-GPU stream groups.
+        This should result in the following pattern:
+        Stream 0: Regular --> Cooperative
+        Stream 1: Cooperative --> Regular
+
+We time how long it takes to run each of these benchmarks and print it as
+the output of the benchmark. The kernels themselves are just useless time-
+wasting code so that the kernel takes a meaningful amount of time on the
+GPU before it exits. We only launch a single wavefront for each kernel, so
+any serialization should not be because of GPU occupancy concerns.
+
+If tests 2, 3, and 4 take roughly 3x as long as #1, that implies that
+cooperative kernels are serialized as expected.
+
+If test #5 takes roughly twice as long as #1, that implies that the
+overlap-allowing flags work as expected.
+*/
+
+#include <hip_test_common.hh>
+#include <hip/hip_cooperative_groups.h>
+
+namespace cg = cooperative_groups;
+
+// Number of int elements in each per-device input buffer of the Basic test case
+// (the host buffer holds kBufferLen elements for every device).
+static constexpr size_t kBufferLen = 1024 * 1024;
+
+// Multi-device cooperative reduction kernel.
+//   buf      - input values summed by this grid
+//   buf_size - number of elements in buf
+//   tmp_buf  - scratch array with one slot per block of this grid
+//   result   - result[0] receives the sum over all grids; result[1 + r] receives
+//              the sum produced by the grid with multi-grid rank r
+// Requires blockDim.x longs of dynamic shared memory.
+__global__ void test_gws(uint* buf, uint buf_size, long* tmp_buf, long* result) {
+  extern __shared__ long tmp[];
+  const uint num_blocks = gridDim.x;
+  const uint block = blockIdx.x;
+  const uint lane = threadIdx.x;
+  const uint stride = gridDim.x * blockDim.x;
+
+  // Every thread accumulates the elements it reaches when stepping by the grid size
+  long sum = 0;
+  for (uint idx = block * blockDim.x + lane; idx < buf_size; idx += stride) {
+    sum += buf[idx];
+  }
+  tmp[lane] = sum;
+  __syncthreads();
+
+  // Thread 0 of each block folds the per-thread sums into one value per block
+  if (lane == 0) {
+    sum = 0;
+    for (uint t = 0; t < blockDim.x; ++t) {
+      sum += tmp[t];
+    }
+    tmp_buf[block] = sum;
+  }
+
+  // Wait until every block of this grid has published its sum
+  cg::this_grid().sync();
+
+  // The very first thread of the grid already holds block 0's sum; add the rest
+  if (block * blockDim.x + lane == 0) {
+    for (uint b = 1; b < num_blocks; ++b) {
+      sum += tmp_buf[b];
+    }
+    result[1 + cg::this_multi_grid().grid_rank()] = sum;
+  }
+
+  // Wait until every grid has published its sum
+  cg::this_multi_grid().sync();
+
+  // The grid with rank 0 combines the per-grid sums into result[0]
+  if (cg::this_multi_grid().grid_rank() == 0) {
+    sum = 0;
+    for (uint g = 1; g <= cg::this_multi_grid().num_grids(); ++g) {
+      sum += result[g];
+    }
+    *result = sum;
+  }
+}
+
+// Time-wasting kernel used for the multi-device cooperative launches.
+//   loops    - number of busy-wait rounds to execute
+//   array    - per-thread output; each round adds the current clock64() value
+//   fast_gpu - multi-grid rank that should return immediately (a value that
+//              matches no rank makes every grid run the full busy-wait)
+__global__ void test_coop_kernel(unsigned int loops, long long* array, int fast_gpu) {
+  cg::multi_grid_group mgrid = cg::this_multi_grid();
+  unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x;
+
+  // The grid designated as "fast" does no work at all
+  if (mgrid.grid_rank() == fast_gpu) {
+    return;
+  }
+
+  // Unsigned index: matches the type of `loops` and cannot hit signed overflow
+  for (unsigned int i = 0; i < loops; i++) {
+    long long time_diff = 0;
+    long long last_clock = clock64();
+    // Spin until at least 1000000 clock ticks have been observed
+    do {
+      long long cur_clock = clock64();
+      if (cur_clock > last_clock) {
+        time_diff += (cur_clock - last_clock);
+      }
+      // If it rolls over, we don't know how much to add to catch up.
+      // So just ignore those slipped cycles.
+      last_clock = cur_clock;
+    } while (time_diff < 1000000);
+    array[rank] += clock64();
+  }
+}
+
+// Variant of test_coop_kernel that measures time with wall_clock64() instead of
+// clock64(); the host code selects it when IsGfx11() is true. The body is only
+// compiled when HT_AMD is set, otherwise the kernel is empty.
+//   loops    - number of busy-wait rounds to execute
+//   array    - per-thread output; each round adds the current wall_clock64() value
+//   fast_gpu - multi-grid rank that should return immediately
+__global__ void test_coop_kernel_gfx11(unsigned int loops, long long* array, int fast_gpu) {
+#if HT_AMD
+  cg::multi_grid_group mgrid = cg::this_multi_grid();
+  unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x;
+
+  // The grid designated as "fast" does no work at all
+  if (mgrid.grid_rank() == fast_gpu) {
+    return;
+  }
+
+  // Unsigned index: matches the type of `loops` and cannot hit signed overflow
+  for (unsigned int i = 0; i < loops; i++) {
+    long long time_diff = 0;
+    long long last_clock = wall_clock64();
+    // Spin until at least 1000000 wall-clock ticks have been observed
+    do {
+      long long cur_clock = wall_clock64();
+      if (cur_clock > last_clock) {
+        time_diff += (cur_clock - last_clock);
+      }
+      // If it rolls over, we don't know how much to add to catch up.
+      // So just ignore those slipped cycles.
+      last_clock = cur_clock;
+    } while (time_diff < 1000000);
+    array[rank] += wall_clock64();
+  }
+#endif
+}
+
+// Time-wasting kernel used for the regular (non-cooperative) launches.
+//   loops - number of busy-wait rounds to execute
+//   array - per-thread output; each round adds the current clock64() value
+__global__ void test_kernel(uint32_t loops, unsigned long long* array) {
+  unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x;
+
+  // Unsigned index: matches the type of `loops` and cannot hit signed overflow
+  for (uint32_t i = 0; i < loops; i++) {
+    long long time_diff = 0;
+    long long last_clock = clock64();
+    // Spin until at least 1000000 clock ticks have been observed
+    do {
+      long long cur_clock = clock64();
+      if (cur_clock > last_clock) {
+        time_diff += (cur_clock - last_clock);
+      }
+      // If it rolls over, we don't know how much to add to catch up.
+      // So just ignore those slipped cycles.
+      last_clock = cur_clock;
+    } while (time_diff < 1000000);
+    array[rank] += clock64();
+  }
+}
+
+// Variant of test_kernel that measures time with wall_clock64() instead of
+// clock64(); the host code selects it when IsGfx11() is true. The body is only
+// compiled when HT_AMD is set, otherwise the kernel is empty.
+//   loops - number of busy-wait rounds to execute
+//   array - per-thread output; each round adds the current wall_clock64() value
+__global__ void test_kernel_gfx11(uint32_t loops, unsigned long long* array) {
+#if HT_AMD
+  unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x;
+
+  // Unsigned index: matches the type of `loops` and cannot hit signed overflow
+  for (uint32_t i = 0; i < loops; i++) {
+    long long time_diff = 0;
+    long long last_clock = wall_clock64();
+    // Spin until at least 1000000 wall-clock ticks have been observed
+    do {
+      long long cur_clock = wall_clock64();
+      if (cur_clock > last_clock) {
+        time_diff += (cur_clock - last_clock);
+      }
+      // If it rolls over, we don't know how much to add to catch up.
+      // So just ignore those slipped cycles.
+      last_clock = cur_clock;
+    } while (time_diff < 1000000);
+    array[rank] += wall_clock64();
+  }
+#endif
+}
+
+// Requires that multi_kernel_time lies within
+// [low_bound * single_kernel_time, high_bound * single_kernel_time].
+static void verify_time(double single_kernel_time, double multi_kernel_time, float low_bound,
+                        float high_bound) {
+  const double shortest_allowed = low_bound * single_kernel_time;
+  const double longest_allowed = high_bound * single_kernel_time;
+
+  REQUIRE(multi_kernel_time >= shortest_allowed);
+  REQUIRE(multi_kernel_time <= longest_allowed);
+}
+
+// Checks how multi-device cooperative launches serialize against regular kernels
+// that are queued in the same streams.
+//
+// For every pair of adjacent devices (dev, dev + 1) the function:
+//   1. creates one stream and one zeroed device buffer per device,
+//   2. times a plain multi-device cooperative launch as the baseline,
+//   3. in each SECTION queues "regular kernel on stream 0 -> cooperative launch ->
+//      regular kernel on stream 1" with a particular flag combination and requires
+//      the elapsed time to fall into a window expressed as a multiple of the
+//      baseline (about 3x for the default flags, about 2x when pre-sync and/or
+//      post-sync is disabled).
+//
+// device_num - number of devices to iterate over; callers pass at least 2.
+void test_multigrid_streams(int device_num) {
+  uint32_t loops = 2000;
+  // Multi-grid rank whose cooperative kernel returns immediately; -1 matches no rank,
+  // so by default the cooperative kernel is long on both devices.
+  int32_t fast_gpu = -1;
+
+  // Track the smallest warp size and multiprocessor count seen so far, so the chosen
+  // launch configuration is valid on every device involved.
+  int warp_sizes[2];
+  int num_sms[2];
+  hipDeviceProp_t device_properties[2];
+  int warp_size = INT_MAX;
+  int num_sm = INT_MAX;
+  for (int dev = 0; dev < (device_num - 1); ++dev) {
+    for (int i = 0; i < 2; i++) {
+      HIP_CHECK(hipGetDeviceProperties(&device_properties[i], (dev + i)));
+      warp_sizes[i] = device_properties[i].warpSize;
+      if (warp_sizes[i] < warp_size) {
+        warp_size = warp_sizes[i];
+      }
+      num_sms[i] = device_properties[i].multiProcessorCount;
+      if (num_sms[i] < num_sm) {
+        num_sm = num_sms[i];
+      }
+    }
+
+    // Calculate the device occupancy to know how many blocks can be run.
+    int max_blocks_per_sm_arr[2];
+    int max_blocks_per_sm = INT_MAX;
+    for (int i = 0; i < 2; i++) {
+      HIP_CHECK(hipSetDevice(dev + i));
+      // The kernel variant is re-selected after every hipSetDevice() call
+      auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
+      HIP_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm_arr[i],
+                                                             test_kernel_used, warp_size, 0));
+      if (max_blocks_per_sm_arr[i] < max_blocks_per_sm) {
+        max_blocks_per_sm = max_blocks_per_sm_arr[i];
+      }
+    }
+    // A single block per kernel, so serialization is not caused by occupancy limits
+    int desired_blocks = 1;
+
+    if (desired_blocks > max_blocks_per_sm * num_sm) {
+      INFO("The requested number of blocks will not fit on the GPU");
+      REQUIRE(desired_blocks < max_blocks_per_sm * num_sm);
+      return;
+    }
+
+    // Create the streams we will use in this test
+    hipStream_t streams[2];
+    for (int i = 0; i < 2; i++) {
+      HIP_CHECK(hipSetDevice(dev + i));
+      HIP_CHECK(hipStreamCreate(&streams[i]));
+    }
+
+    // Set up data to pass into the kernel
+    // Allocate one zero-initialized device buffer per GPU; every thread of a kernel
+    // owns one element of it.
+    unsigned long long* dev_array[2];
+    for (int i = 0; i < 2; i++) {
+      const size_t good_size = desired_blocks * warp_size * sizeof(long long);
+      HIP_CHECK(hipSetDevice(dev + i));
+      HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&dev_array[i]), good_size));
+      HIP_CHECK(hipMemsetAsync(dev_array[i], 0, good_size, streams[i]));
+    }
+    for (int i = 0; i < 2; i++) {
+      HIP_CHECK(hipSetDevice(dev + i));
+      HIP_CHECK(hipDeviceSynchronize());
+    }
+
+    /* Launch the kernels ****************************************************/
+    void* dev_params[2][3];
+    hipLaunchParams md_params[2];
+    std::chrono::time_point<std::chrono::system_clock> start_time[2];
+    std::chrono::time_point<std::chrono::system_clock> end_time[2];
+
+    // Test 0: Launching a multi-GPU cooperative kernel
+    // Both GPUs launch a long cooperative kernel
+    INFO("GPU " << dev << ": Long Coop Kernel");
+    INFO("GPU " << (dev + 1) << ": Long Coop Kernel");
+
+    auto test_coop_kernel_used = IsGfx11() ? test_coop_kernel_gfx11 : test_coop_kernel;
+    for (int i = 0; i < 2; i++) {
+      // The argument slots point at the local variables, so a later change of
+      // fast_gpu is picked up by the next cooperative launch.
+      dev_params[i][0] = reinterpret_cast<void*>(&loops);
+      dev_params[i][1] = reinterpret_cast<void*>(&dev_array[i]);
+      dev_params[i][2] = reinterpret_cast<void*>(&fast_gpu);
+      md_params[i].func = reinterpret_cast<void*>(test_coop_kernel_used);
+      md_params[i].gridDim = desired_blocks;
+      md_params[i].blockDim = warp_size;
+      md_params[i].sharedMem = 0;
+      md_params[i].stream = streams[i];
+      md_params[i].args = dev_params[i];
+    }
+
+    // Baseline: one long cooperative kernel on both devices
+    start_time[0] = std::chrono::system_clock::now();
+    HIP_CHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0));
+    for (int i = 0; i < 2; i++) {
+      HIP_CHECK(hipSetDevice(dev + i));
+      HIP_CHECK(hipDeviceSynchronize());
+    }
+    end_time[0] = std::chrono::system_clock::now();
+
+    std::chrono::duration<double> single_kernel_time = (end_time[0] - start_time[0]);
+    INFO("A single kernel on both GPUs took: " << single_kernel_time.count() << " seconds");
+
+    // Default flags, cooperative kernel long on the first device only
+    SECTION("GPU1 - Standard/ Long Coop, GPU2 - Coop/Standard") {
+      INFO("GPU " << dev << ": Standard/Long Coop");
+      INFO("GPU " << (dev + 1) << ": Coop/Standard");
+      fast_gpu = 1;
+      start_time[1] = std::chrono::system_clock::now();
+      HIP_CHECK(hipSetDevice(dev));
+      auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
+      hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0, streams[0],
+                         loops, dev_array[0]);
+      HIP_CHECK(hipGetLastError());
+      HIP_CHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0));
+      HIP_CHECK(hipSetDevice(dev + 1));
+      test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
+      hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0, streams[1],
+                         loops, dev_array[1]);
+      HIP_CHECK(hipGetLastError());
+      for (int i = 0; i < 2; i++) {
+        HIP_CHECK(hipSetDevice(dev + i));
+        HIP_CHECK(hipDeviceSynchronize());
+      }
+      end_time[1] = std::chrono::system_clock::now();
+      std::chrono::duration<double> serialized_gpu0_time = (end_time[1] - start_time[1]);
+      INFO("Serialized set of three kernels with GPU0 being long took: "
+           << serialized_gpu0_time.count() << " seconds");
+
+      verify_time(single_kernel_time.count(), serialized_gpu0_time.count(), 2.7f, 3.3f);
+    }
+
+    // Default flags, cooperative kernel long on the second device only
+    SECTION("GPU1 - Standard/Coop, GPU2 - Long Coop/Standard") {
+      INFO("GPU " << dev << ": Standard/Coop");
+      INFO("GPU " << (dev + 1) << ": Long Coop/Standard");
+      fast_gpu = 0;
+      start_time[1] = std::chrono::system_clock::now();
+      HIP_CHECK(hipSetDevice(dev));
+      auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
+      hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0, streams[0],
+                         loops, dev_array[0]);
+      HIP_CHECK(hipGetLastError());
+      HIP_CHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0));
+      HIP_CHECK(hipSetDevice(dev + 1));
+      test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
+      hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0, streams[1],
+                         loops, dev_array[1]);
+      HIP_CHECK(hipGetLastError());
+      for (int i = 0; i < 2; i++) {
+        HIP_CHECK(hipSetDevice(dev + i));
+        HIP_CHECK(hipDeviceSynchronize());
+      }
+      end_time[1] = std::chrono::system_clock::now();
+      std::chrono::duration<double> serialized_gpu1_time = (end_time[1] - start_time[1]);
+      INFO("Serialized set of three kernels with GPU1 being long took: "
+           << serialized_gpu1_time.count() << " seconds");
+
+      verify_time(single_kernel_time.count(), serialized_gpu1_time.count(), 2.7f, 3.3f);
+    }
+
+    // Pre-sync disabled, cooperative kernel long on the second device only
+    SECTION(
+        "GPU1 - Standard/Coop, GPU2 - Long Coop/Standard - regular and coop kernel overlap at "
+        "beginning") {
+      INFO("GPU " << dev << ": Standard/Coop with multi device no pre sync");
+      INFO("GPU " << (dev + 1) << ": Long Coop/Standard with multi device no pre sync");
+      fast_gpu = 0;
+      start_time[1] = std::chrono::system_clock::now();
+      HIP_CHECK(hipSetDevice(dev));
+      auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
+      hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0, streams[0],
+                         loops, dev_array[0]);
+      HIP_CHECK(hipGetLastError());
+      HIP_CHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2,
+                                                      hipCooperativeLaunchMultiDeviceNoPreSync));
+      HIP_CHECK(hipSetDevice(dev + 1));
+      test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
+      hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0, streams[1],
+                         loops, dev_array[1]);
+      HIP_CHECK(hipGetLastError());
+      for (int i = 0; i < 2; i++) {
+        HIP_CHECK(hipSetDevice(dev + i));
+        HIP_CHECK(hipDeviceSynchronize());
+      }
+      end_time[1] = std::chrono::system_clock::now();
+      std::chrono::duration<double> pre_overlapped_time = (end_time[1] - start_time[1]);
+      INFO("Multiple kernels with pre-overlap allowed took: " << pre_overlapped_time.count()
+                                                              << " seconds");
+
+      verify_time(single_kernel_time.count(), pre_overlapped_time.count(), 1.7f, 2.3f);
+    }
+
+    // Post-sync disabled, cooperative kernel long on the first device only
+    SECTION(
+        "GPU1 - Standard/Long Coop, GPU2 - Coop/Standard - regular and coop kernel overlap at "
+        "end") {
+      INFO("GPU " << dev << ": Standard/Long Coop with multi device no post sync");
+      INFO("GPU " << (dev + 1) << ": Coop/Standard with multi device no post sync");
+      fast_gpu = 1;
+      start_time[1] = std::chrono::system_clock::now();
+      HIP_CHECK(hipSetDevice(dev));
+      auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
+      hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0, streams[0],
+                         loops, dev_array[0]);
+      HIP_CHECK(hipGetLastError());
+      HIP_CHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2,
+                                                      hipCooperativeLaunchMultiDeviceNoPostSync));
+      HIP_CHECK(hipSetDevice(dev + 1));
+      test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
+      hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0, streams[1],
+                         loops, dev_array[1]);
+      // Check the launch result here as well, like every other section does
+      HIP_CHECK(hipGetLastError());
+      for (int i = 0; i < 2; i++) {
+        HIP_CHECK(hipSetDevice(dev + i));
+        HIP_CHECK(hipDeviceSynchronize());
+      }
+      end_time[1] = std::chrono::system_clock::now();
+      std::chrono::duration<double> post_overlapped_time = (end_time[1] - start_time[1]);
+      INFO("Multiple kernels with post-overlap allowed took: " << post_overlapped_time.count()
+                                                               << " seconds");
+
+      verify_time(single_kernel_time.count(), post_overlapped_time.count(), 1.7f, 2.3f);
+    }
+
+    // Pre- and post-sync disabled; fast_gpu keeps its initial value of -1, so the
+    // cooperative kernel is long on both devices
+    SECTION(
+        "GPU1 - Standard/Long Coop, GPU2 - Long Coop/Standard - regular and coop kernel overlap") {
+      INFO("GPU " << dev << ": Standard/Long Coop with multi device no pre or post sync");
+      INFO("GPU " << (dev + 1) << ": Long Coop/Standard with multi device no pre or post sync");
+      start_time[1] = std::chrono::system_clock::now();
+      HIP_CHECK(hipSetDevice(dev));
+      auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
+      hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0, streams[0],
+                         loops, dev_array[0]);
+      HIP_CHECK(hipGetLastError());
+      HIP_CHECK(hipLaunchCooperativeKernelMultiDevice(
+          md_params, 2,
+          hipCooperativeLaunchMultiDeviceNoPreSync | hipCooperativeLaunchMultiDeviceNoPostSync));
+      HIP_CHECK(hipSetDevice(dev + 1));
+      test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
+      hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0, streams[1],
+                         loops, dev_array[1]);
+      HIP_CHECK(hipGetLastError());
+      for (int i = 0; i < 2; i++) {
+        HIP_CHECK(hipSetDevice(dev + i));
+        HIP_CHECK(hipDeviceSynchronize());
+      }
+      end_time[1] = std::chrono::system_clock::now();
+      std::chrono::duration<double> overlapped_time = (end_time[1] - start_time[1]);
+      INFO("Multiple kernels with overlap allowed took: " << overlapped_time.count() << " seconds");
+
+      verify_time(single_kernel_time.count(), overlapped_time.count(), 1.8f, 2.2f);
+    }
+
+    // Release the per-pair buffers and streams
+    for (int k = 0; k < 2; ++k) {
+      HIP_CHECK(hipFree(dev_array[k]));
+      HIP_CHECK(hipStreamDestroy(streams[k]));
+    }
+  }
+}
+
+// Launches the test_gws reduction kernel cooperatively on every device and checks
+// that the combined result equals the sum 0 + 1 + ... + (N - 1), where N is the
+// total number of input elements spread over all devices. The test is repeated
+// for block sizes of 64, 128 and 256 threads.
+TEST_CASE("Unit_hipLaunchCooperativeKernelMultiDevice_Basic") {
+  constexpr uint num_kernel_args = 4;
+
+  int device_num = 0;
+  HIP_CHECK(hipGetDeviceCount(&device_num));
+
+  // Check every device before allocating anything, so that skipping the test
+  // does not leave host or device memory behind.
+  std::vector<hipDeviceProp_t> device_properties(device_num);
+  for (int i = 0; i < device_num; i++) {
+    // Query device i itself: its multiProcessorCount sizes the grid for that device below.
+    HIP_CHECK(hipGetDeviceProperties(&device_properties[i], i));
+    if (!device_properties[i].cooperativeMultiDeviceLaunch) {
+      HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
+      return;
+    }
+  }
+
+  const size_t buffer_size = kBufferLen * sizeof(int);
+  const size_t total_elements = kBufferLen * device_num;
+
+  // Host input is 0, 1, 2, ... so the expected total has a closed form
+  std::vector<int> A_h(total_elements);
+  for (size_t i = 0; i < total_elements; ++i) {
+    A_h[i] = static_cast<int>(i);
+  }
+
+  std::vector<int*> A_d(device_num);
+  std::vector<long*> B_d(device_num);
+  std::vector<hipStream_t> stream(device_num);
+
+  // Result area allocated with hipHostMalloc: slot 0 holds the total, slots 1..device_num
+  // hold the per-grid sums written by the kernel.
+  long* C_d = nullptr;
+  HIP_CHECK(hipSetDevice(0));
+  HIP_CHECK(hipHostMalloc(&C_d, (device_num + 1) * sizeof(long)));
+
+  // Every device receives its own kBufferLen-element slice of the host input
+  for (int i = 0; i < device_num; i++) {
+    HIP_CHECK(hipSetDevice(i));
+    HIP_CHECK(hipMalloc(&A_d[i], buffer_size));
+    HIP_CHECK(hipMemcpy(A_d[i], A_h.data() + i * kBufferLen, buffer_size, hipMemcpyHostToDevice));
+    HIP_CHECK(hipStreamCreate(&stream[i]));
+    HIP_CHECK(hipDeviceSynchronize());
+  }
+
+  dim3 dimBlock;
+  dim3 dimGrid;
+  dimGrid.x = 1;
+  dimGrid.y = 1;
+  dimGrid.z = 1;
+  dimBlock.x = 64;
+  dimBlock.y = 1;
+  dimBlock.z = 1;
+
+  int num_blocks = 0;
+  uint workgroup = GENERATE(64, 128, 256);
+
+  // The kernel declares `uint buf_size`; pass an argument of exactly that type
+  // instead of pointing at the size_t constant.
+  uint buffer_len = static_cast<uint>(kBufferLen);
+
+  std::vector<hipLaunchParams> launch_params_list(device_num);
+  std::vector<void*> args(device_num * num_kernel_args);
+
+  for (int i = 0; i < device_num; i++) {
+    HIP_CHECK(hipSetDevice(i));
+
+    // Calculate the device occupancy to know how many blocks can be run concurrently
+    dimBlock.x = workgroup;
+    HIP_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(
+        &num_blocks, test_gws, dimBlock.x * dimBlock.y * dimBlock.z, dimBlock.x * sizeof(long)));
+
+    INFO("GPU" << i << " has block size = " << dimBlock.x << " and num blocks per CU " << num_blocks
+               << "\n");
+
+    dimGrid.x = device_properties[i].multiProcessorCount * std::min(num_blocks, 32);
+
+    // One scratch slot per block of this device's grid
+    HIP_CHECK(hipMalloc(&B_d[i], dimGrid.x * sizeof(long)));
+
+    // Each slot points at the storage of the corresponding kernel argument
+    args[i * num_kernel_args] = &A_d[i];
+    args[i * num_kernel_args + 1] = &buffer_len;
+    args[i * num_kernel_args + 2] = &B_d[i];
+    args[i * num_kernel_args + 3] = &C_d;
+
+    launch_params_list[i].func = reinterpret_cast<void*>(test_gws);
+    launch_params_list[i].gridDim = dimGrid;
+    launch_params_list[i].blockDim = dimBlock;
+    launch_params_list[i].sharedMem = dimBlock.x * sizeof(long);
+    launch_params_list[i].stream = stream[i];
+    launch_params_list[i].args = &args[i * num_kernel_args];
+  }
+
+  HIP_CHECK(hipLaunchCooperativeKernelMultiDevice(launch_params_list.data(), device_num, 0));
+  for (int i = 0; i < device_num; i++) {
+    HIP_CHECK(hipStreamSynchronize(stream[i]));
+  }
+
+  // Sum of 0..N-1 is N * (N - 1) / 2
+  size_t processed_Dwords = kBufferLen * device_num;
+  REQUIRE(*C_d == (((long)(processed_Dwords) * (processed_Dwords - 1)) / 2));
+
+  HIP_CHECK(hipSetDevice(0));
+  HIP_CHECK(hipHostFree(C_d));
+  for (int i = 0; i < device_num; i++) {
+    HIP_CHECK(hipSetDevice(i));
+    HIP_CHECK(hipFree(A_d[i]));
+    HIP_CHECK(hipFree(B_d[i]));
+    HIP_CHECK(hipStreamDestroy(stream[i]));
+  }
+}
+
+// Runs the stream-interaction checks of test_multigrid_streams when at least two
+// devices are present and all of them report cooperativeMultiDeviceLaunch.
+TEST_CASE("Unit_hipLaunchCooperativeKernelMultiDevice_Streams") {
+  int device_num = 0;
+  HIP_CHECK(hipGetDeviceCount(&device_num));
+
+  // A multi-device launch needs at least a pair of devices
+  if (device_num < 2) {
+    HipTest::HIP_SKIP_TEST("Skipping because devices < 2");
+    return;
+  }
+
+  // Skip as soon as one device lacks multi-device cooperative launch support
+  hipDeviceProp_t props;
+  for (int dev = 0; dev < device_num; ++dev) {
+    HIP_CHECK(hipGetDeviceProperties(&props, dev));
+    if (props.cooperativeMultiDeviceLaunch == 0) {
+      HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
+      return;
+    }
+  }
+
+  test_multigrid_streams(device_num);
+}
@@ -0,0 +1,364 @@
+/*
+Copyright (c) 2020 - 2021 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_common.hh>
+#include <hip/hip_cooperative_groups.h>
+
+namespace cg = cooperative_groups;
+
+// Buffer length constant (1024 * 1024).
+// NOTE(review): presumably the element count of the test_gws input buffer; its
+// users are outside this excerpt - confirm.
+static constexpr size_t kBufferLen = 1024 * 1024;
+
+// Single-device cooperative reduction kernel.
+//   buf      - input values to sum
+//   buf_size - number of elements in buf
+//   tmp_buf  - scratch array with one slot per block
+//   result   - receives the sum over the whole grid
+// Requires blockDim.x longs of dynamic shared memory.
+__global__ void test_gws(int* buf, size_t buf_size, long* tmp_buf, long* result) {
+  extern __shared__ long tmp[];
+  cg::grid_group grid = cg::this_grid();
+  const uint global_id = blockIdx.x * blockDim.x + threadIdx.x;
+  const uint grid_threads = gridDim.x * blockDim.x;
+
+  // Every thread accumulates the elements it reaches when stepping by the grid size
+  long sum = 0;
+  uint idx = global_id;
+  while (idx < buf_size) {
+    sum += buf[idx];
+    idx += grid_threads;
+  }
+  tmp[threadIdx.x] = sum;
+
+  __syncthreads();
+
+  // Thread 0 of each block folds the per-thread sums into one value per block
+  if (threadIdx.x == 0) {
+    sum = 0;
+    for (uint t = 0; t < blockDim.x; ++t) {
+      sum += tmp[t];
+    }
+    tmp_buf[blockIdx.x] = sum;
+  }
+
+  // Wait until every block has published its sum
+  grid.sync();
+
+  // The first thread of the grid already holds block 0's sum; add the other blocks
+  if (global_id == 0) {
+    for (uint b = 1; b < gridDim.x; ++b) {
+      sum += tmp_buf[b];
+    }
+    *result = sum;
+  }
+}
+
+// Time-wasting kernel with a block-wide barrier in every round.
+//   loops      - number of busy-wait rounds to execute
+//   array      - per-thread output; each round adds the current clock64() value
+//   totalTicks - minimum number of clock ticks to spin per round
+__global__ void test_kernel(uint32_t loops, unsigned long long* array, long long totalTicks) {
+  cg::thread_block tb = cg::this_thread_block();
+  unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x;
+
+  // Unsigned index: matches the type of `loops` and cannot hit signed overflow
+  for (uint32_t i = 0; i < loops; i++) {
+    long long time_diff = 0;
+    long long last_clock = clock64();
+    do {
+      long long cur_clock = clock64();
+      if (cur_clock > last_clock) {
+        time_diff += (cur_clock - last_clock);
+      }
+      // If it rolls over, we don't know how much to add to catch up.
+      // So just ignore those slipped cycles.
+      last_clock = cur_clock;
+    } while (time_diff < totalTicks);
+    // All threads of the block finish the round together
+    tb.sync();
+    array[rank] += clock64();
+  }
+}
+
+// Variant of test_kernel that measures time with wall_clock64() instead of
+// clock64(). The body is only compiled when HT_AMD is set, otherwise the kernel
+// is empty.
+//   loops      - number of busy-wait rounds to execute
+//   array      - per-thread output; each round adds the current wall_clock64() value
+//   totalTicks - minimum number of wall-clock ticks to spin per round
+__global__ void test_kernel_gfx11(uint32_t loops, unsigned long long* array, long long totalTicks) {
+#if HT_AMD
+  cg::thread_block tb = cg::this_thread_block();
+  unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x;
+
+  // Unsigned index: matches the type of `loops` and cannot hit signed overflow
+  for (uint32_t i = 0; i < loops; i++) {
+    long long time_diff = 0;
+    long long last_clock = wall_clock64();
+    do {
+      long long cur_clock = wall_clock64();
+      if (cur_clock > last_clock) {
+        time_diff += (cur_clock - last_clock);
+      }
+      // If it rolls over, we don't know how much to add to catch up.
+      // So just ignore those slipped cycles.
+      last_clock = cur_clock;
+    } while (time_diff < totalTicks);
+    // All threads of the block finish the round together
+    tb.sync();
+    array[rank] += wall_clock64();
+  }
+#endif
+}
+
+// Timing expectations for the single-block configuration.
+// On AMD the two cooperative kernels must take 1.8x-2.2x the single-kernel time;
+// elsewhere they must take 0.8x-1.2x. In both cases the three-kernel run must not
+// exceed 1.1x the two-kernel time.
+template <typename T>
+static void verifyLeastCapacity(T& single_kernel_time, T& double_kernel_time,
+                                T& triple_kernel_time) {
+  const auto single = single_kernel_time.count();
+  const auto twice = double_kernel_time.count();
+  const auto thrice = triple_kernel_time.count();
+
+#if HT_AMD
+  // hipLaunchCooperativeKernel() follows serialization policy on AMD devices,
+  // so two cooperative kernels cost about two single kernels.
+  REQUIRE(twice >= 1.8 * single);
+  REQUIRE(twice <= 2.2 * single);
+#else
+  // hipLaunchCooperativeKernel() doesn't follow serialization policy on NV devices,
+  // so two cooperative kernels cost about one single kernel.
+  REQUIRE(twice >= 0.8 * single);
+  REQUIRE(twice <= 1.2 * single);
+#endif
+
+  // Adding the third kernel must not noticeably lengthen the run
+  REQUIRE(thrice <= 1.1 * twice);
+}
+
+// Timing expectations for the half-capacity configuration: two cooperative
+// kernels take 1.8x-2.2x the single-kernel time, and the three-kernel run takes
+// at most 1.1x the two-kernel time.
+template <typename T>
+static void verifyHalfCapacity(T& single_kernel_time, T& double_kernel_time,
+                               T& triple_kernel_time) {
+  const auto single = single_kernel_time.count();
+  const auto twice = double_kernel_time.count();
+  const auto thrice = triple_kernel_time.count();
+
+  // Two cooperative kernels cost about two single kernels
+  REQUIRE(twice >= 1.8 * single);
+  REQUIRE(twice <= 2.2 * single);
+
+  // Adding the third kernel must not noticeably lengthen the run
+  REQUIRE(thrice <= 1.1 * twice);
+}
+
+// Timing expectations for the full-capacity configuration: two cooperative
+// kernels take 1.8x-2.2x the single-kernel time, and the three-kernel run takes
+// at most 1.7x the two-kernel time.
+template <typename T>
+static void verifyFullCapacity(T& single_kernel_time, T& double_kernel_time,
+                               T& triple_kernel_time) {
+  const auto single = single_kernel_time.count();
+  const auto twice = double_kernel_time.count();
+  const auto thrice = triple_kernel_time.count();
+
+  // Two cooperative kernels cost about two single kernels
+  REQUIRE(twice >= 1.8 * single);
+  REQUIRE(twice <= 2.2 * single);
+
+  // The three kernels together are expected to take roughly 1.6 times as long as
+  // the two cooperative kernels. If the first 2 kernels run very fast, the third
+  // won't share much time with the second kernel, hence the 1.7 limit.
+  REQUIRE(thrice <= 1.7 * twice);
+}
+
+// Dispatches to the timing check that belongs to the given test configuration:
+// 0 - least capacity, 1 - half capacity, 2 - full capacity.
+// Any other value performs no check.
+template <typename T>
+static void verify(int tests, T& single_kernel_time, T& double_kernel_time, T& triple_kernel_time) {
+  if (tests == 0) {
+    verifyLeastCapacity(single_kernel_time, double_kernel_time, triple_kernel_time);
+  } else if (tests == 1) {
+    verifyHalfCapacity(single_kernel_time, double_kernel_time, triple_kernel_time);
+  } else if (tests == 2) {
+    verifyFullCapacity(single_kernel_time, double_kernel_time, triple_kernel_time);
+  }
+}
+
+// Times cooperative-kernel launches on several streams of device `dev` and hands
+// the three measured wall-clock durations to verify().
+//
+// dev     - device ordinal passed to hipSetDevice / hipGetDeviceProperties.
+// p_tests - scenario selector:
+//             0: one block per kernel,
+//             1: just over half of the occupancy-derived block count for each
+//                cooperative kernel and the remainder for the regular kernel,
+//             2: the full occupancy-derived block count for every kernel.
+//           Any other value leaves both block counts at 0.
+//
+// Prints a message and returns without checking anything when the device reports
+// cooperativeLaunch == 0.
+static void test_cooperative_streams(int dev, int p_tests) {
+  // Intervals are measured with a monotonic clock so that wall-clock adjustments
+  // during a run cannot distort (or even negate) the measured durations.
+  using Clock = std::chrono::steady_clock;
+
+  hipStream_t streams[3];
+  unsigned long long* dev_array[3];
+  int loops = 1000;
+
+  HIP_CHECK(hipSetDevice(dev));
+  hipDeviceProp_t device_properties;
+  HIP_CHECK(hipGetDeviceProperties(&device_properties, dev));
+
+  // Test whether target device supports cooperative groups
+  if (device_properties.cooperativeLaunch == 0) {
+    std::cout << "Cooperative group support not available in device " << dev << std::endl;
+    return;
+  }
+
+  // We will launch enough waves to fill up all of the GPU
+  int warp_size = device_properties.warpSize;
+  int num_sms = device_properties.multiProcessorCount;
+  long long totalTicks = device_properties.clockRate;
+  int max_blocks_per_sm = 0;
+  // Calculate the device occupancy to know how many blocks can be run.
+  auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
+  HIP_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm, test_kernel_used,
+                                                         warp_size, 0));
+  int max_active_blocks = max_blocks_per_sm * num_sms;
+  int coop_blocks = 0;
+  int reg_blocks = 0;
+
+  switch (p_tests) {
+    case 0:
+      // 1 block
+      coop_blocks = 1;
+      reg_blocks = 1;
+      break;
+    case 1:
+      // Half capacity
+      // To make sure the second kernel launched by hipLaunchCooperativeKernel
+      // is invoked after the first kernel finished
+      coop_blocks = max_active_blocks / 2 + 1;
+      // To make sure the third kernel launched by hipLaunchKernelGGL is invoked
+      // concurrently with the second kernel
+      reg_blocks = max_active_blocks - coop_blocks;
+      break;
+    case 2:
+      // Full capacity
+      coop_blocks = max_active_blocks;
+      reg_blocks = max_active_blocks;
+      break;
+    default:
+      break;
+  }
+
+  for (int i = 0; i < 3; i++) {
+    HIP_CHECK(hipStreamCreate(&streams[i]));
+  }
+
+  // Set up data to pass into the kernel: one zero-initialised buffer of warp_size
+  // 64-bit elements per stream.
+  for (int i = 0; i < 3; i++) {
+    HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&dev_array[i]), warp_size * sizeof(long long)));
+    HIP_CHECK(hipMemsetAsync(dev_array[i], 0, warp_size * sizeof(long long), streams[i]));
+  }
+
+  HIP_CHECK(hipDeviceSynchronize());
+
+  // Argument arrays for the cooperative launches; each entry points at the host
+  // variable holding the corresponding kernel argument.
+  void* coop_params[3][3];
+  for (int i = 0; i < 3; i++) {
+    coop_params[i][0] = reinterpret_cast<void*>(&loops);
+    coop_params[i][1] = reinterpret_cast<void*>(&dev_array[i]);
+    coop_params[i][2] = reinterpret_cast<void*>(&totalTicks);
+  }
+
+  // Warm-up launch: excluded from the measurements because the first launch also
+  // pays for loading the code object.
+  HIP_CHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel_used), max_active_blocks,
+                                       warp_size, coop_params[0], 0, streams[0]));
+  HIP_CHECK(hipDeviceSynchronize());
+
+  // Launching a single cooperative kernel
+  auto single_start = Clock::now();
+  HIP_CHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel_used), max_active_blocks,
+                                       warp_size, coop_params[0], 0, streams[0]));
+  HIP_CHECK(hipDeviceSynchronize());
+  auto single_end = Clock::now();
+
+  std::chrono::duration<double> single_kernel_time = (single_end - single_start);
+
+  // Launching 2 cooperative kernels to different streams
+  auto double_start = Clock::now();
+  HIP_CHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel_used), coop_blocks,
+                                       warp_size, coop_params[0], 0, streams[0]));
+  HIP_CHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel_used), coop_blocks,
+                                       warp_size, coop_params[1], 0, streams[1]));
+
+  HIP_CHECK(hipDeviceSynchronize());
+  auto double_end = Clock::now();
+
+  std::chrono::duration<double> double_kernel_time = (double_end - double_start);
+
+  // Launching 2 cooperative kernels and 1 normal kernel
+  auto triple_start = Clock::now();
+  HIP_CHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel_used), coop_blocks,
+                                       warp_size, coop_params[0], 0, streams[0]));
+  HIP_CHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel_used), coop_blocks,
+                                       warp_size, coop_params[1], 0, streams[1]));
+  // NOTE(review): the result of this regular launch is not checked - consider
+  // following it with HIP_CHECK(hipGetLastError()).
+  hipLaunchKernelGGL(test_kernel_used, dim3(reg_blocks), dim3(warp_size), 0, streams[2], loops,
+                     dev_array[2], totalTicks);
+
+  HIP_CHECK(hipDeviceSynchronize());
+  auto triple_end = Clock::now();
+  std::chrono::duration<double> triple_kernel_time = (triple_end - triple_start);
+
+  for (int k = 0; k < 3; ++k) {
+    HIP_CHECK(hipFree(dev_array[k]));
+    HIP_CHECK(hipStreamDestroy(streams[k]));
+  }
+
+
+  INFO("A single kernel took : " << single_kernel_time.count() << " seconds");
+  INFO("Two cooperative kernels took: " << double_kernel_time.count() << " seconds");
+  INFO("Two coop kernels and a third regular kernel took: " << triple_kernel_time.count()
+                                                            << " seconds");
+
+  verify(p_tests, single_kernel_time, double_kernel_time, triple_kernel_time);
+}
+
+// Launches test_gws cooperatively over a buffer holding 0..kBufferLen-1 and checks
+// that the value written to the host-visible result equals the sum of that range.
+// The case is repeated for block sizes 32, 64, 128 and 256 and is skipped when the
+// current device reports no cooperative-launch support.
+TEST_CASE("Unit_hipLaunchCooperativeKernel_Basic") {
+  // Use default device for validating the test
+  int device;
+  int *A_h, *A_d;
+  long* B_d;
+  long* C_d;
+  hipDeviceProp_t device_properties;
+  HIP_CHECK(hipGetDevice(&device));
+  HIP_CHECK(hipGetDeviceProperties(&device_properties, device));
+
+  if (!device_properties.cooperativeLaunch) {
+    HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
+    return;
+  }
+
+  size_t buffer_size = kBufferLen * sizeof(int);
+
+  A_h = reinterpret_cast<int*>(malloc(buffer_size));
+  // Fail the test cleanly instead of writing through a null pointer below.
+  REQUIRE(A_h != nullptr);
+  for (uint32_t i = 0; i < kBufferLen; ++i) {
+    A_h[i] = static_cast<int>(i);
+  }
+
+  HIP_CHECK(hipMalloc(&A_d, buffer_size));
+  HIP_CHECK(hipMemcpy(A_d, A_h, buffer_size, hipMemcpyHostToDevice));
+  // The result slot is host memory obtained from hipHostMalloc so it can be read
+  // directly in the REQUIRE below after the stream has been synchronised.
+  HIP_CHECK(hipHostMalloc(&C_d, sizeof(long)));
+
+  hipStream_t stream;
+  HIP_CHECK(hipStreamCreate(&stream));
+
+  dim3 dimBlock = dim3(1);
+  dim3 dimGrid = dim3(1);
+  int numBlocks = 0;
+
+  uint32_t workgroup = GENERATE(32, 64, 128, 256);
+
+  dimBlock.x = workgroup;
+
+  // Calculate the device occupancy to know how many blocks can be run concurrently
+  HIP_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(
+      &numBlocks, test_gws, dimBlock.x * dimBlock.y * dimBlock.z, dimBlock.x * sizeof(long)));
+
+  // Grid size: occupancy per multiprocessor (capped at 32) times the number of
+  // multiprocessors; B_d gets one long per block.
+  dimGrid.x = device_properties.multiProcessorCount * std::min(numBlocks, 32);
+  HIP_CHECK(hipMalloc(&B_d, dimGrid.x * sizeof(long)));
+
+  void* params[4];
+  params[0] = (void*)&A_d;
+  params[1] = (void*)&kBufferLen;
+  params[2] = (void*)&B_d;
+  params[3] = (void*)&C_d;
+
+  INFO("Testing with grid size = " << dimGrid.x << " and block size = " << dimBlock.x << "\n");
+  HIP_CHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_gws), dimGrid, dimBlock, params,
+                                       dimBlock.x * sizeof(long), stream));
+
+  HIP_CHECK(hipStreamSynchronize(stream));
+
+  // Expected value is 0 + 1 + ... + (kBufferLen - 1).
+  REQUIRE(((unsigned long long)*C_d) == (((unsigned long long)(kBufferLen) * (kBufferLen - 1)) / 2));
+
+  HIP_CHECK(hipStreamDestroy(stream));
+  HIP_CHECK(hipHostFree(C_d));
+  HIP_CHECK(hipFree(B_d));
+  HIP_CHECK(hipFree(A_d));
+  free(A_h);
+}
+
+// Runs test_cooperative_streams for every combination of device ordinal
+// (0 .. HipTest::getDeviceCount() - 1) and scenario index (0, 1, 2).
+TEST_CASE("Unit_hipLaunchCooperativeKernel_Streams") {
+  const auto device = GENERATE(range(0, HipTest::getDeviceCount()));
+  int p_tests = GENERATE(0, 1, 2);
+
+  test_cooperative_streams(device, p_tests);
+}
@@ -0,0 +1,68 @@
+/*
+Copyright (c) 2020 - 2021 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#include <hip_test_common.hh>
+#include <hip/hip_cooperative_groups.h>
+
+// Comparison shorthands built on the HIP_ASSERT / HIPASSERT macros. Both operands
+// are parenthesised so that arguments containing lower-precedence operators
+// (for example `a & b` or `c ? x : y`) are compared as whole values instead of
+// being re-associated with the comparison operator.
+#define ASSERT_EQUAL(lhs, rhs) HIP_ASSERT((lhs) == (rhs))
+#define ASSERT_LE(lhs, rhs) HIPASSERT((lhs) <= (rhs))
+#define ASSERT_GE(lhs, rhs) HIPASSERT((lhs) >= (rhs))
+
+// NOTE(review): presumably the upper bound on devices the multi-GPU tests size
+// their per-device arrays for - confirm against the users of this constant.
+constexpr int MaxGPUs = 8;
+
+// Streams the first `size` elements of `ptr` to std::cout, each followed by one
+// space, then ends the line with '\n'. Nothing but the newline is written when
+// `size` is zero or negative.
+template <typename T>
+void printResults(T* ptr, int size) {
+  T* cursor = ptr;
+  for (int remaining = size; remaining > 0; --remaining, ++cursor) {
+    std::cout << *cursor << " ";
+  }
+  std::cout << '\n';
+}
+
+// Walks `cpu` and `gpu` element by element over size / sizeof(T) entries. When a
+// pair differs, the index is attached through INFO and the REQUIRE on that pair
+// reports the failure.
+// NOTE(review): the bound divides `size` by sizeof(T), so `size` looks like a byte
+// count rather than an element count - confirm against callers.
+template <typename T>
+void compareResults(T* cpu, T* gpu, int size) {
+  const auto count = size / sizeof(T);
+  for (unsigned int i = 0; i < count; i++) {
+    if (!(cpu[i] != gpu[i])) {
+      continue;
+    }
+    INFO("Results do not match at index " << i);
+    REQUIRE(cpu[i] == gpu[i]);
+  }
+}
+
+
+// Checks that every one of the first `size` values of `hPtr` occurs somewhere in
+// the first `size` values of `dPtr`; positions do not have to match. A value that
+// cannot be found fails the test through REQUIRE.
+template <typename T>
+void verifyResults(T* hPtr, T* dPtr, int size) {
+  for (int i = 0; i < size; ++i) {
+    // Scan dPtr until a match is found or the range is exhausted.
+    int j = 0;
+    while (j < size && !(hPtr[i] == dPtr[j])) {
+      ++j;
+    }
+    if (j == size) {
+      INFO("Result verification failed!");
+      REQUIRE(j != size);
+    }
+  }
+}
@@ -0,0 +1,30 @@
+# Sources of the OpenGL interop test executable.
+set(TEST_SRC
+    hipGLGetDevices.cc
+    hipGraphicsGLRegisterBuffer.cc
+    hipGraphicsGLRegisterImage.cc
+    hipGraphicsMapResources.cc
+    hipGraphicsSubResourceGetMappedArray.cc
+    hipGraphicsResourceGetMappedPointer.cc
+    hipGraphicsUnmapResources.cc
+    hipGraphicsUnregisterResource.cc
+)
+
+# The tests need both the OpenGL and EGL components; without them this directory
+# is left out of the build (return() stops processing of this file).
+find_package(OpenGL COMPONENTS OpenGL EGL)
+message(STATUS "OpenGL_FOUND: ${OpenGL_FOUND}")
+if(NOT OpenGL_FOUND)
+  message(STATUS "OpenGL not found, OpenGL interop tests not enabled.")
+  return()
+endif()
+
+# GLUT is required as well; same opt-out when it is missing.
+find_package(GLUT)
+message(STATUS "GLUT_FOUND: ${GLUT_FOUND}")
+if(NOT GLUT_FOUND)
+  message(STATUS "GLUT not found, OpenGL interop tests not enabled.")
+  return()
+endif()
+
+# hip_add_exe_to_target is a project helper (defined outside this file); here it
+# is asked for an executable named GLInteropTest, attached to build_tests and
+# compiled as C++17.
+hip_add_exe_to_target(NAME GLInteropTest
+                      TEST_SRC ${TEST_SRC}
+                      TEST_TARGET_NAME build_tests
+                      COMPILE_OPTIONS -std=c++17)
+target_link_libraries(GLInteropTest OpenGL::GL OpenGL::EGL GLUT::GLUT)
@@ -0,0 +1,219 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#include <array>
+#include <cstdlib>
+#include <memory>
+#include <mutex>
+#include <string>
+#include <variant>
+
+#define GL_GLEXT_PROTOTYPES
+#include <GL/freeglut.h>
+#include <GL/freeglut_ext.h>
+
+#include <EGL/egl.h>
+#include <EGL/eglext.h>
+
+#include <hip_test_common.hh>
+
+// Owns one OpenGL buffer object with kSize bytes of uninitialised GL_DYNAMIC_DRAW
+// storage. The buffer is created in the constructor (which fails the test if GL
+// reports an error) and deleted in the destructor. Converts implicitly to the
+// GLuint buffer name so it can be passed straight to GL / HIP interop calls.
+//
+// The object is the sole owner of the GL name, so copying and moving are disabled;
+// a copy would make two destructors delete the same buffer.
+class GLBufferObject {
+ public:
+  static constexpr size_t kSize = 512 * 512 * 4 * sizeof(float);
+
+  GLBufferObject() {
+    glGenBuffers(1, &vbo_);
+    glBindBuffer(GL_ARRAY_BUFFER, vbo_);
+    // nullptr data pointer: allocate storage without uploading any contents.
+    glBufferData(GL_ARRAY_BUFFER, kSize, nullptr, GL_DYNAMIC_DRAW);
+    glBindBuffer(GL_ARRAY_BUFFER, 0);
+    REQUIRE(glGetError() == GL_NO_ERROR);
+  }
+
+  ~GLBufferObject() { glDeleteBuffers(1, &vbo_); }
+
+  GLBufferObject(const GLBufferObject&) = delete;
+  GLBufferObject& operator=(const GLBufferObject&) = delete;
+
+  GLBufferObject(GLBufferObject&&) = delete;
+  GLBufferObject& operator=(GLBufferObject&&) = delete;
+
+  operator GLuint() const { return vbo_; }
+
+ private:
+  GLuint vbo_ = 0;
+};
+
+// Owns one kWidth x kHeight GL_TEXTURE_2D texture with GL_RGBA8UI_EXT storage,
+// clamp-to-edge wrapping and nearest filtering. The texture is created in the
+// constructor (which fails the test if GL reports an error) and deleted in the
+// destructor. Converts implicitly to the GLuint texture name.
+//
+// The object is the sole owner of the GL name, so copying and moving are disabled;
+// a copy would make two destructors delete the same texture.
+class GLImageObject {
+ public:
+  static constexpr size_t kWidth = 512, kHeight = 512;
+
+  GLImageObject() {
+    glGenTextures(1, &tex_);
+    glBindTexture(GL_TEXTURE_2D, tex_);
+    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
+    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
+    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+    // nullptr pixel pointer: allocate the image without uploading any texels.
+    glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8UI_EXT, kWidth, kHeight, 0, GL_RGBA_INTEGER_EXT,
+                 GL_UNSIGNED_BYTE, nullptr);
+    REQUIRE(glGetError() == GL_NO_ERROR);
+  }
+
+  ~GLImageObject() { glDeleteTextures(1, &tex_); }
+
+  GLImageObject(const GLImageObject&) = delete;
+  GLImageObject& operator=(const GLImageObject&) = delete;
+
+  GLImageObject(GLImageObject&&) = delete;
+  GLImageObject& operator=(GLImageObject&&) = delete;
+
+  operator GLuint() const { return tex_; }
+
+ private:
+  GLuint tex_ = 0;
+};
+
+// Guards the one-time GLUT initialisation. Declared `inline` so that every
+// translation unit including this header shares a single flag; a `static` flag
+// would exist once per source file and let glutInit() run again in an executable
+// built from several test sources, which freeglut treats as a fatal error.
+inline std::once_flag glut_init_flag;
+
+// Creates a GLUT window (and with it a GL context) for the lifetime of the guard
+// and destroys that window on scope exit. GLUT itself is initialised on first use.
+class GLUTContextScopeGuard {
+ public:
+  GLUTContextScopeGuard() {
+    std::call_once(glut_init_flag, &GLUTContextScopeGuard::init);
+    glut_window_ = glutCreateWindow("");
+  }
+
+  ~GLUTContextScopeGuard() { glutDestroyWindow(glut_window_); }
+
+  GLUTContextScopeGuard(const GLUTContextScopeGuard&) = delete;
+  GLUTContextScopeGuard& operator=(const GLUTContextScopeGuard&) = delete;
+
+  GLUTContextScopeGuard(GLUTContextScopeGuard&&) = delete;
+  GLUTContextScopeGuard& operator=(GLUTContextScopeGuard&&) = delete;
+
+ private:
+  int glut_window_ = 0;
+
+  // Initialises GLUT with a synthetic, empty argv and sets the display mode and
+  // window size used by windows created afterwards.
+  static void init() {
+    // glutInit takes mutable argc/argv; the storage is static so it stays valid
+    // after this function returns.
+    static char proc_name[] = "";
+    static std::array<char*, 2> glut_argv = {proc_name, nullptr};
+    static int glut_argc = 1;
+
+    glutInit(&glut_argc, glut_argv.data());
+    glutInitDisplayMode(GLUT_RGB | GLUT_DOUBLE | GLUT_DEPTH);
+    glutInitWindowSize(512, 512);
+  }
+};
+
+// Creates an off-screen OpenGL context through EGL for the lifetime of the guard:
+// picks the first device returned by eglQueryDevicesEXT, initialises a display on
+// it, creates a small pbuffer surface and a context, and makes them current. Every
+// step is checked with REQUIRE so a missing extension or failed call fails the
+// test instead of crashing later. On destruction the context is released from the
+// calling thread and the display is terminated.
+class EGLContextScopeGuard {
+ public:
+  EGLContextScopeGuard() {
+    // 1. Initialize EGL
+    // Both entry points are extensions and may be absent, in which case
+    // eglGetProcAddress returns a null pointer.
+    auto eglQueryDevicesEXT =
+        reinterpret_cast<PFNEGLQUERYDEVICESEXTPROC>(eglGetProcAddress("eglQueryDevicesEXT"));
+    REQUIRE((eglQueryDevicesEXT != nullptr));
+
+    REQUIRE(eglQueryDevicesEXT(static_cast<EGLint>(egl_devices_.size()), egl_devices_.data(),
+                               &num_devices_));
+
+    INFO("Detected " << num_devices_ << " devices");
+    // Device 0 is used below, so at least one device must have been reported.
+    REQUIRE(num_devices_ > 0);
+
+    auto eglGetPlatformDisplayEXT = reinterpret_cast<PFNEGLGETPLATFORMDISPLAYEXTPROC>(
+        eglGetProcAddress("eglGetPlatformDisplayEXT"));
+    REQUIRE((eglGetPlatformDisplayEXT != nullptr));
+
+    egl_display_ = eglGetPlatformDisplayEXT(EGL_PLATFORM_DEVICE_EXT, egl_devices_.at(0), nullptr);
+    REQUIRE(egl_display_ != EGL_NO_DISPLAY);
+
+    REQUIRE(eglInitialize(egl_display_, &major_, &minor_));
+
+    // 2. Select an appropriate configuration
+    REQUIRE(eglChooseConfig(egl_display_, kConfigAttribs, &egl_config_, 1, &num_configs_));
+
+    // 3. Create a surface
+    egl_surface_ = eglCreatePbufferSurface(egl_display_, egl_config_, kPbufferAttribs);
+    REQUIRE(egl_surface_ != EGL_NO_SURFACE);
+
+    // 4. Bind the API
+    REQUIRE(eglBindAPI(EGL_OPENGL_API));
+
+    // 5. Create a context and make it current
+    egl_context_ = eglCreateContext(egl_display_, egl_config_, EGL_NO_CONTEXT, nullptr);
+    REQUIRE(egl_context_ != EGL_NO_CONTEXT);
+
+    REQUIRE(eglMakeCurrent(egl_display_, egl_surface_, egl_surface_, egl_context_));
+  }
+
+  ~EGLContextScopeGuard() {
+    // 6. Terminate EGL when finished
+    // Resources that are still current are not released by eglTerminate, so
+    // unbind the context and surface from this thread first.
+    eglMakeCurrent(egl_display_, EGL_NO_SURFACE, EGL_NO_SURFACE, EGL_NO_CONTEXT);
+    eglTerminate(egl_display_);
+  }
+
+  EGLContextScopeGuard(const EGLContextScopeGuard&) = delete;
+  EGLContextScopeGuard& operator=(const EGLContextScopeGuard&) = delete;
+
+  EGLContextScopeGuard(EGLContextScopeGuard&&) = delete;
+  EGLContextScopeGuard& operator=(EGLContextScopeGuard&&) = delete;
+
+ private:
+  // Requested framebuffer configuration: pbuffer-capable, 8-bit RGB and depth,
+  // renderable with desktop OpenGL.
+  // clang-format off
+  static constexpr EGLint kConfigAttribs[] = {
+      EGL_SURFACE_TYPE,
+      EGL_PBUFFER_BIT,
+      EGL_BLUE_SIZE, 8,
+      EGL_GREEN_SIZE, 8,
+      EGL_RED_SIZE, 8,
+      EGL_DEPTH_SIZE, 8,
+      EGL_RENDERABLE_TYPE,
+      EGL_OPENGL_BIT,
+      EGL_NONE
+  };
+  // clang-format on
+
+  static constexpr int kPbufferWidth = 9;
+  static constexpr int kPbufferHeight = 9;
+
+  static constexpr EGLint kPbufferAttribs[] = {
+      EGL_WIDTH, kPbufferWidth, EGL_HEIGHT, kPbufferHeight, EGL_NONE,
+  };
+
+  std::array<EGLDeviceEXT, 8> egl_devices_{};
+  EGLint num_devices_ = 0;
+  EGLDisplay egl_display_ = EGL_NO_DISPLAY;
+  EGLint major_ = 0, minor_ = 0;
+  EGLint num_configs_ = 0;
+  EGLConfig egl_config_{};
+  EGLSurface egl_surface_ = EGL_NO_SURFACE;
+  EGLContext egl_context_ = EGL_NO_CONTEXT;
+};
+
+// Scope guard that provides a GL context through either GLUT or EGL, selected by
+// the GL_CONTEXT_TYPE environment variable: unset, empty or "GLUT" picks GLUT,
+// "EGL" picks EGL, and any other value fails the test. The chosen backend guard
+// lives in a variant member and is torn down together with this object.
+class GLContextScopeGuard {
+ public:
+  using GLUTContextScopeGuardPtr = std::unique_ptr<GLUTContextScopeGuard>;
+  using EGLContextScopeGuardPtr = std::unique_ptr<EGLContextScopeGuard>;
+  using GLContextScopeGuardVariant =
+      std::variant<GLUTContextScopeGuardPtr, EGLContextScopeGuardPtr>;
+
+  static constexpr char kEnvarName[] = "GL_CONTEXT_TYPE";
+
+  GLContextScopeGuard() {
+    const char* raw_value = std::getenv(kEnvarName);
+    const std::string requested = (raw_value != nullptr) ? raw_value : "";
+
+    const bool wants_glut = requested.empty() || requested == "GLUT";
+    if (wants_glut) {
+      gl_context_ = std::make_unique<GLUTContextScopeGuard>();
+      return;
+    }
+
+    if (requested == "EGL") {
+      gl_context_ = std::make_unique<EGLContextScopeGuard>();
+      return;
+    }
+
+    // Anything else is a configuration error: report it and fail the test.
+    INFO("Unsupported " << kEnvarName << " value '" << requested << "'");
+    INFO("Supported values are ['GLUT', 'EGL']");
+    REQUIRE(false);
+  }
+
+  GLContextScopeGuard(const GLContextScopeGuard&) = delete;
+  GLContextScopeGuard& operator=(const GLContextScopeGuard&) = delete;
+
+  GLContextScopeGuard(GLContextScopeGuard&&) = delete;
+  GLContextScopeGuard& operator=(GLContextScopeGuard&&) = delete;
+
+ private:
+  GLContextScopeGuardVariant gl_context_;
+};
@@ -0,0 +1,90 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_common.hh>
+#include <hip/hip_runtime_api.h>
+#include <hip/hip_gl_interop.h>
+
+#include "gl_interop_common.hh"
+
+namespace {
+// The hipGLDeviceList selectors the positive test iterates over via GENERATE.
+constexpr std::array<hipGLDeviceList, 3> kDeviceLists{
+    hipGLDeviceListAll, hipGLDeviceListCurrentFrame, hipGLDeviceListNextFrame};
+}  // anonymous namespace
+
+// For each selector in kDeviceLists, queries the HIP devices associated with the
+// current GL context and expects exactly one device whose ordinal is 0.
+// NOTE(review): the expected count/ordinal presumably reflect the test machines'
+// GL setup - confirm for multi-GPU systems.
+TEST_CASE("Unit_hipGLGetDevices_Positive_Basic") {
+  GLContextScopeGuard gl_context;
+
+  const auto device_list = GENERATE(from_range(begin(kDeviceLists), end(kDeviceLists)));
+
+  const int device_count = HipTest::getDeviceCount();
+
+  unsigned int gl_device_count = 0;
+  // Pre-filled with -1 so untouched entries are recognisable.
+  std::vector<int> gl_devices(device_count, -1);
+
+  HIP_CHECK(hipGLGetDevices(&gl_device_count, gl_devices.data(), device_count, device_list));
+
+  REQUIRE(gl_device_count == 1);
+  REQUIRE(gl_devices.at(0) == 0);
+}
+
+// Checks that hipGLGetDevices succeeds when one of its optional outputs is
+// omitted or when the caller-provided capacity is zero.
+TEST_CASE("Unit_hipGLGetDevices_Positive_Parameters") {
+  GLContextScopeGuard gl_context;
+
+  const int device_count = HipTest::getDeviceCount();
+
+  unsigned int gl_device_count = 0;
+  // Pre-filled with -1 so the sections can tell whether an entry was written.
+  std::vector<int> gl_devices(device_count, -1);
+
+  // Without a count output the device array is still expected to be filled.
+  SECTION("pHipDeviceCount == nullptr") {
+    HIP_CHECK(hipGLGetDevices(nullptr, gl_devices.data(), device_count, hipGLDeviceListAll));
+    REQUIRE(gl_devices.at(0) == 0);
+  }
+
+  // Without a device array the count is still expected to be reported.
+  SECTION("pHipDevices == nullptr") {
+    HIP_CHECK(hipGLGetDevices(&gl_device_count, nullptr, device_count, hipGLDeviceListAll));
+    REQUIRE(gl_device_count == 1);
+  }
+
+  // With zero capacity the count is reported but the array must stay untouched.
+  SECTION("hipDeviceCount == 0") {
+    HIP_CHECK(hipGLGetDevices(&gl_device_count, gl_devices.data(), 0, hipGLDeviceListAll));
+    REQUIRE(gl_device_count == 1);
+    REQUIRE(gl_devices.at(0) == -1);
+  }
+}
+
+// Checks that an out-of-range device-list selector is rejected with
+// hipErrorInvalidValue and that neither output is modified.
+TEST_CASE("Unit_hipGLGetDevices_Negative_Parameters") {
+  GLContextScopeGuard gl_context;
+
+  const int device_count = HipTest::getDeviceCount();
+
+  unsigned int gl_device_count = 0;
+  std::vector<int> gl_devices(device_count, -1);
+
+  SECTION("invalid deviceList") {
+    HIP_CHECK_ERROR(hipGLGetDevices(&gl_device_count, gl_devices.data(), device_count,
+                                    static_cast<hipGLDeviceList>(-1)),
+                    hipErrorInvalidValue);
+    // Outputs must still hold their initial values.
+    REQUIRE(gl_device_count == 0);
+    REQUIRE(gl_devices.at(0) == -1);
+  }
+}
@@ -0,0 +1,98 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_common.hh>
+#include <hip/hip_runtime_api.h>
+#include <hip/hip_gl_interop.h>
+
+#include "gl_interop_common.hh"
+
+namespace {
+// Register flags the positive buffer-registration test iterates over via GENERATE.
+constexpr std::array<unsigned int, 3> kFlags{hipGraphicsRegisterFlagsNone,
+                                             hipGraphicsRegisterFlagsReadOnly,
+                                             hipGraphicsRegisterFlagsWriteDiscard};
+}  // anonymous namespace
+
+// Registers a GL buffer object with HIP using each flag in kFlags and unregisters
+// it again; both calls must succeed.
+TEST_CASE("Unit_hipGraphicsGLRegisterBuffer_Positive_Basic") {
+  GLContextScopeGuard gl_context;
+
+  const auto flags = GENERATE(from_range(begin(kFlags), end(kFlags)));
+
+  GLBufferObject vbo;
+
+  hipGraphicsResource* vbo_resource;
+
+  HIP_CHECK(hipGraphicsGLRegisterBuffer(&vbo_resource, vbo, flags));
+
+  HIP_CHECK(hipGraphicsUnregisterResource(vbo_resource));
+}
+
+// Registers the same GL buffer object twice and expects both registrations, and
+// both subsequent unregistrations, to succeed.
+TEST_CASE("Unit_hipGraphicsGLRegisterBuffer_Positive_Register_Twice") {
+  GLContextScopeGuard gl_context;
+
+  GLBufferObject vbo;
+
+  hipGraphicsResource *vbo_resource_1, *vbo_resource_2;
+
+  HIP_CHECK(hipGraphicsGLRegisterBuffer(&vbo_resource_1, vbo, hipGraphicsRegisterFlagsNone));
+  HIP_CHECK(hipGraphicsGLRegisterBuffer(&vbo_resource_2, vbo, hipGraphicsRegisterFlagsNone));
+
+  HIP_CHECK(hipGraphicsUnregisterResource(vbo_resource_1));
+  HIP_CHECK(hipGraphicsUnregisterResource(vbo_resource_2));
+}
+
+// Checks that hipGraphicsGLRegisterBuffer rejects invalid arguments; every section
+// expects hipErrorInvalidValue.
+TEST_CASE("Unit_hipGraphicsGLRegisterBuffer_Negative_Parameters") {
+  GLContextScopeGuard gl_context;
+
+  GLBufferObject vbo;
+
+  hipGraphicsResource* vbo_resource;
+
+  SECTION("resource == nullptr") {
+    HIP_CHECK_ERROR(hipGraphicsGLRegisterBuffer(nullptr, vbo, hipGraphicsRegisterFlagsNone),
+                    hipErrorInvalidValue);
+  }
+
+  // GLuint{} is 0, used here as a buffer name that was never generated.
+  SECTION("invalid buffer") {
+    HIP_CHECK_ERROR(
+        hipGraphicsGLRegisterBuffer(&vbo_resource, GLuint{}, hipGraphicsRegisterFlagsNone),
+        hipErrorInvalidValue);
+  }
+
+  SECTION("invalid flags") {
+    HIP_CHECK_ERROR(
+        hipGraphicsGLRegisterBuffer(&vbo_resource, vbo, std::numeric_limits<unsigned int>::max()),
+        hipErrorInvalidValue);
+  }
+
+  // The two flags below are expected to be rejected for buffer objects.
+  SECTION("flags == hipGraphicsRegisterFlagsSurfaceLoadStore") {
+    HIP_CHECK_ERROR(
+        hipGraphicsGLRegisterBuffer(&vbo_resource, vbo, hipGraphicsRegisterFlagsSurfaceLoadStore),
+        hipErrorInvalidValue);
+  }
+
+  SECTION("flags == hipGraphicsRegisterFlagsTextureGather") {
+    HIP_CHECK_ERROR(
+        hipGraphicsGLRegisterBuffer(&vbo_resource, vbo, hipGraphicsRegisterFlagsTextureGather),
+        hipErrorInvalidValue);
+  }
+}
@@ -0,0 +1,102 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_common.hh>
+#include <hip/hip_runtime_api.h>
+#include <hip/hip_gl_interop.h>
+
+#include "gl_interop_common.hh"
+
+namespace {
+// Register flags the positive image-registration test iterates over via GENERATE.
+constexpr std::array<unsigned int, 5> kFlags{
+    hipGraphicsRegisterFlagsNone, hipGraphicsRegisterFlagsReadOnly,
+    hipGraphicsRegisterFlagsWriteDiscard, hipGraphicsRegisterFlagsSurfaceLoadStore,
+    hipGraphicsRegisterFlagsTextureGather};
+}  // anonymous namespace
+
+// Registers a GL_TEXTURE_2D texture with HIP using each flag in kFlags and
+// unregisters it again; both calls must succeed.
+TEST_CASE("Unit_hipGraphicsGLRegisterImage_Positive_Basic") {
+  GLContextScopeGuard gl_context;
+
+  const auto flags = GENERATE(from_range(begin(kFlags), end(kFlags)));
+
+  GLImageObject tex;
+
+  hipGraphicsResource* tex_resource;
+
+  HIP_CHECK(hipGraphicsGLRegisterImage(&tex_resource, tex, GL_TEXTURE_2D, flags));
+
+  HIP_CHECK(hipGraphicsUnregisterResource(tex_resource));
+}
+
+// Registers the same GL texture twice and expects both registrations, and both
+// subsequent unregistrations, to succeed.
+TEST_CASE("Unit_hipGraphicsGLRegisterImage_Positive_Register_Twice") {
+  GLContextScopeGuard gl_context;
+
+  GLImageObject tex;
+
+  hipGraphicsResource *tex_resource_1, *tex_resource_2;
+
+  HIP_CHECK(hipGraphicsGLRegisterImage(&tex_resource_1, tex, GL_TEXTURE_2D,
+                                       hipGraphicsRegisterFlagsNone));
+  HIP_CHECK(hipGraphicsGLRegisterImage(&tex_resource_2, tex, GL_TEXTURE_2D,
+                                       hipGraphicsRegisterFlagsNone));
+
+  HIP_CHECK(hipGraphicsUnregisterResource(tex_resource_1));
+  HIP_CHECK(hipGraphicsUnregisterResource(tex_resource_2));
+}
+
+// Checks that hipGraphicsGLRegisterImage rejects invalid arguments; every section
+// expects hipErrorInvalidValue.
+TEST_CASE("Unit_hipGraphicsGLRegisterImage_Negative_Parameters") {
+  GLContextScopeGuard gl_context;
+
+  GLImageObject tex;
+
+  hipGraphicsResource* tex_resource;
+
+  SECTION("resource == nullptr") {
+    HIP_CHECK_ERROR(
+        hipGraphicsGLRegisterImage(nullptr, tex, GL_TEXTURE_2D, hipGraphicsRegisterFlagsNone),
+        hipErrorInvalidValue);
+  }
+
+  // GLuint{} is 0, used here as a texture name that was never generated.
+  SECTION("invalid image") {
+    HIP_CHECK_ERROR(hipGraphicsGLRegisterImage(&tex_resource, GLuint{}, GL_TEXTURE_2D,
+                                               hipGraphicsRegisterFlagsNone),
+                    hipErrorInvalidValue);
+  }
+
+  // GL_BUFFER is passed as a target value that is not accepted for images.
+  SECTION("invalid target") {
+    HIP_CHECK_ERROR(
+        hipGraphicsGLRegisterImage(&tex_resource, tex, GL_BUFFER, hipGraphicsRegisterFlagsNone),
+        hipErrorInvalidValue);
+  }
+
+  // `tex` was created as GL_TEXTURE_2D, so GL_RENDERBUFFER does not describe it.
+  SECTION("target does not match the object") {
+    HIP_CHECK_ERROR(hipGraphicsGLRegisterImage(&tex_resource, tex, GL_RENDERBUFFER,
+                                               hipGraphicsRegisterFlagsNone),
+                    hipErrorInvalidValue);
+  }
+
+  SECTION("invalid flags") {
+    HIP_CHECK_ERROR(hipGraphicsGLRegisterImage(&tex_resource, tex, GL_TEXTURE_2D,
+                                               std::numeric_limits<unsigned int>::max()),
+                    hipErrorInvalidValue);
+  }
+}
@@ -0,0 +1,93 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_common.hh>
+#include <hip/hip_runtime_api.h>
+#include <hip/hip_gl_interop.h>
+
+#include "gl_interop_common.hh"
+
+// Registers one GL buffer and one GL texture, maps both in a single
+// hipGraphicsMapResources call on a freshly created stream, unmaps them on the
+// same stream and cleans up; every call must succeed.
+TEST_CASE("Unit_hipGraphicsMapResources_Positive_Basic") {
+  GLContextScopeGuard gl_context;
+
+  GLBufferObject vbo;
+  GLImageObject tex;
+
+  // Index 0 holds the buffer resource, index 1 the image resource.
+  std::array<hipGraphicsResource_t, 2> resources;
+
+  HIP_CHECK(hipGraphicsGLRegisterBuffer(&resources.at(0), vbo, hipGraphicsRegisterFlagsNone));
+  HIP_CHECK(hipGraphicsGLRegisterImage(&resources.at(1), tex, GL_TEXTURE_2D,
+                                       hipGraphicsRegisterFlagsNone));
+
+  hipStream_t stream;
+  HIP_CHECK(hipStreamCreate(&stream));
+
+  HIP_CHECK(hipGraphicsMapResources(resources.size(), resources.data(), stream));
+
+  HIP_CHECK(hipGraphicsUnmapResources(resources.size(), resources.data(), stream));
+
+  HIP_CHECK(hipStreamDestroy(stream));
+
+  HIP_CHECK(hipGraphicsUnregisterResource(resources.at(0)));
+  HIP_CHECK(hipGraphicsUnregisterResource(resources.at(1)));
+}
+
+// Checks the error codes hipGraphicsMapResources returns for invalid arguments.
+// A buffer resource is registered before the sections and unregistered after them.
+TEST_CASE("Unit_hipGraphicsMapResources_Negative_Parameters") {
+  GLContextScopeGuard gl_context;
+
+  GLBufferObject vbo;
+
+  hipGraphicsResource* vbo_resource;
+
+  HIP_CHECK(hipGraphicsGLRegisterBuffer(&vbo_resource, vbo, hipGraphicsRegisterFlagsNone));
+
+  SECTION("count == 0") {
+    HIP_CHECK_ERROR(hipGraphicsMapResources(0, &vbo_resource, 0), hipErrorInvalidValue);
+  }
+
+  SECTION("resources == nullptr") {
+    HIP_CHECK_ERROR(hipGraphicsMapResources(1, nullptr, 0), hipErrorInvalidValue);
+  }
+
+  // A handle that was registered and then unregistered must be rejected as an
+  // invalid handle.
+  SECTION("unregistered resource") {
+    hipGraphicsResource* unregistered_resource;
+    HIP_CHECK(
+        hipGraphicsGLRegisterBuffer(&unregistered_resource, vbo, hipGraphicsRegisterFlagsNone));
+    HIP_CHECK(hipGraphicsUnregisterResource(unregistered_resource));
+    HIP_CHECK_ERROR(hipGraphicsMapResources(1, &unregistered_resource, 0), hipErrorInvalidHandle);
+  }
+
+  // Mapping twice without unmapping in between must fail; the resource is
+  // unmapped afterwards so the final unregister sees it unmapped.
+  SECTION("already mapped resource") {
+    HIP_CHECK(hipGraphicsMapResources(1, &vbo_resource, 0));
+    HIP_CHECK_ERROR(hipGraphicsMapResources(1, &vbo_resource, 0), hipErrorAlreadyMapped);
+    HIP_CHECK(hipGraphicsUnmapResources(1, &vbo_resource, 0));
+  }
+
+  // Uses a stream handle that has already been destroyed.
+  SECTION("invalid stream") {
+    hipStream_t stream;
+    HIP_CHECK(hipStreamCreate(&stream));
+    HIP_CHECK(hipStreamDestroy(stream));
+    HIP_CHECK_ERROR(hipGraphicsMapResources(1, &vbo_resource, stream), hipErrorContextIsDestroyed);
+  }
+
+  HIP_CHECK(hipGraphicsUnregisterResource(vbo_resource));
+}
@@ -0,0 +1,151 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_common.hh>
+#include <hip/hip_runtime_api.h>
+#include <hip/hip_gl_interop.h>
+
+#include "gl_interop_common.hh"
+
+// Basic positive path for hipGraphicsResourceGetMappedPointer: register a GL buffer, map it, and
+// verify the call yields a non-null device pointer and a size equal to the buffer object's kSize.
+TEST_CASE("Unit_hipGraphicsResourceGetMappedPointer_Positive_Basic") {
+  GLContextScopeGuard gl_context;
+  GLBufferObject gl_buffer;
+
+  hipGraphicsResource* buffer_res;
+  HIP_CHECK(hipGraphicsGLRegisterBuffer(&buffer_res, gl_buffer, hipGraphicsRegisterFlagsNone));
+  HIP_CHECK(hipGraphicsMapResources(1, &buffer_res, 0));
+
+  // Outputs start as nullptr / 0 so the checks below show that the call actually wrote to them.
+  float* mapped_ptr = nullptr;
+  size_t mapped_bytes = 0;
+  void** const mapped_ptr_out = reinterpret_cast<void**>(&mapped_ptr);
+  HIP_CHECK(hipGraphicsResourceGetMappedPointer(mapped_ptr_out, &mapped_bytes, buffer_res));
+
+  REQUIRE(mapped_ptr != nullptr);
+  REQUIRE(mapped_bytes == gl_buffer.kSize);
+
+  // Tear down in reverse order: unmap first, then drop the registration.
+  HIP_CHECK(hipGraphicsUnmapResources(1, &buffer_res, 0));
+  HIP_CHECK(hipGraphicsUnregisterResource(buffer_res));
+}
+
+// Checks that hipGraphicsResourceGetMappedPointer succeeds when exactly one of its two output
+// arguments is nullptr, and that the remaining output is still filled in.
+TEST_CASE("Unit_hipGraphicsResourceGetMappedPointer_Positive_Parameters") {
+  GLContextScopeGuard gl_context;
+  GLBufferObject gl_buffer;
+
+  hipGraphicsResource* buffer_res;
+  HIP_CHECK(hipGraphicsGLRegisterBuffer(&buffer_res, gl_buffer, hipGraphicsRegisterFlagsNone));
+  HIP_CHECK(hipGraphicsMapResources(1, &buffer_res, 0));
+
+  float* mapped_ptr = nullptr;
+  size_t mapped_bytes = 0;
+
+  SECTION("devPtr == nullptr") {
+    // Only the size is requested; it must match the buffer object's kSize.
+    HIP_CHECK(hipGraphicsResourceGetMappedPointer(nullptr, &mapped_bytes, buffer_res));
+    REQUIRE(mapped_bytes == gl_buffer.kSize);
+  }
+
+  SECTION("size == nullptr") {
+    // Only the device pointer is requested; it must come back non-null.
+    void** const mapped_ptr_out = reinterpret_cast<void**>(&mapped_ptr);
+    HIP_CHECK(hipGraphicsResourceGetMappedPointer(mapped_ptr_out, nullptr, buffer_res));
+    REQUIRE(mapped_ptr != nullptr);
+  }
+
+  // Common teardown for whichever section ran.
+  HIP_CHECK(hipGraphicsUnmapResources(1, &buffer_res, 0));
+  HIP_CHECK(hipGraphicsUnregisterResource(buffer_res));
+}
+
+// Negative checks for hipGraphicsResourceGetMappedPointer. A GL buffer is registered and mapped
+// up front; each SECTION then builds a second resource in a state for which the test expects the
+// pointer query to fail with a specific error code.
+TEST_CASE("Unit_hipGraphicsResourceGetMappedPointer_Negative_Parameters") {
+  GLContextScopeGuard gl_context;
+  GLBufferObject gl_buffer;
+
+  hipGraphicsResource* buffer_res;
+  HIP_CHECK(hipGraphicsGLRegisterBuffer(&buffer_res, gl_buffer, hipGraphicsRegisterFlagsNone));
+  HIP_CHECK(hipGraphicsMapResources(1, &buffer_res, 0));
+
+  // Output slots handed to every failing query below.
+  float* mapped_ptr = nullptr;
+  size_t mapped_bytes = 0;
+  void** const mapped_ptr_out = reinterpret_cast<void**>(&mapped_ptr);
+
+  SECTION("non-pointer resource") {
+    // Register and map a GL image, then ask for a pointer to it.
+    GLImageObject gl_image;
+    hipGraphicsResource* image_res;
+    HIP_CHECK(hipGraphicsGLRegisterImage(&image_res, gl_image, GL_TEXTURE_2D,
+                                         hipGraphicsRegisterFlagsNone));
+    HIP_CHECK(hipGraphicsMapResources(1, &image_res, 0));
+
+    HIP_CHECK_ERROR(hipGraphicsResourceGetMappedPointer(mapped_ptr_out, &mapped_bytes, image_res),
+                    hipErrorNotMappedAsPointer);
+
+    HIP_CHECK(hipGraphicsUnmapResources(1, &image_res, 0));
+    HIP_CHECK(hipGraphicsUnregisterResource(image_res));
+  }
+
+  SECTION("unregistered resource") {
+    // Register and immediately unregister so the handle is stale when queried.
+    hipGraphicsResource* stale_res;
+    HIP_CHECK(hipGraphicsGLRegisterBuffer(&stale_res, gl_buffer, hipGraphicsRegisterFlagsNone));
+    HIP_CHECK(hipGraphicsUnregisterResource(stale_res));
+    HIP_CHECK_ERROR(hipGraphicsResourceGetMappedPointer(mapped_ptr_out, &mapped_bytes, stale_res),
+                    hipErrorContextIsDestroyed);
+  }
+
+  SECTION("not mapped resource") {
+    // Registered but never mapped.
+    hipGraphicsResource* never_mapped_res;
+    HIP_CHECK(
+        hipGraphicsGLRegisterBuffer(&never_mapped_res, gl_buffer, hipGraphicsRegisterFlagsNone));
+    HIP_CHECK_ERROR(
+        hipGraphicsResourceGetMappedPointer(mapped_ptr_out, &mapped_bytes, never_mapped_res),
+        hipErrorNotMapped);
+    HIP_CHECK(hipGraphicsUnregisterResource(never_mapped_res));
+  }
+
+  SECTION("unmapped resource") {
+    // Mapped once and unmapped again before the query.
+    hipGraphicsResource* remapped_res;
+    HIP_CHECK(hipGraphicsGLRegisterBuffer(&remapped_res, gl_buffer, hipGraphicsRegisterFlagsNone));
+    HIP_CHECK(hipGraphicsMapResources(1, &remapped_res, 0));
+    HIP_CHECK(hipGraphicsUnmapResources(1, &remapped_res, 0));
+
+    HIP_CHECK_ERROR(
+        hipGraphicsResourceGetMappedPointer(mapped_ptr_out, &mapped_bytes, remapped_res),
+        hipErrorNotMapped);
+
+    HIP_CHECK(hipGraphicsUnregisterResource(remapped_res));
+  }
+
+  // Release the resource that was mapped before the sections.
+  HIP_CHECK(hipGraphicsUnmapResources(1, &buffer_res, 0));
+  HIP_CHECK(hipGraphicsUnregisterResource(buffer_res));
+}
@@ -0,0 +1,132 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_common.hh>
+#include <hip/hip_runtime_api.h>
+#include <hip/hip_gl_interop.h>
+
+#include "gl_interop_common.hh"
+
+// Basic positive path for hipGraphicsSubResourceGetMappedArray: register a GL image as a
+// GL_TEXTURE_2D resource, map it, and verify that querying array index 0 / mip level 0 yields a
+// non-null hipArray pointer.
+TEST_CASE("Unit_hipGraphicsSubResourceGetMappedArray_Positive_Basic") {
+  GLContextScopeGuard gl_context;
+  GLImageObject gl_image;
+
+  hipGraphicsResource* image_res;
+  HIP_CHECK(hipGraphicsGLRegisterImage(&image_res, gl_image, GL_TEXTURE_2D,
+                                       hipGraphicsRegisterFlagsNone));
+  HIP_CHECK(hipGraphicsMapResources(1, &image_res, 0));
+
+  // Starts as nullptr so the REQUIRE below shows the call wrote a value.
+  hipArray* mapped_array = nullptr;
+  HIP_CHECK(hipGraphicsSubResourceGetMappedArray(&mapped_array, image_res, 0, 0));
+  REQUIRE(mapped_array != nullptr);
+
+  // Tear down in reverse order: unmap first, then drop the registration.
+  HIP_CHECK(hipGraphicsUnmapResources(1, &image_res, 0));
+  HIP_CHECK(hipGraphicsUnregisterResource(image_res));
+}
+
+// Parameter checks for hipGraphicsSubResourceGetMappedArray. A GL image is registered and mapped
+// up front; each SECTION then passes one questionable argument or a resource in an unsuitable
+// state and asserts the result this test expects.
+TEST_CASE("Unit_hipGraphicsSubResourceGetMappedArray_Negative_Parameters") {
+  GLContextScopeGuard gl_context;
+  GLImageObject gl_image;
+
+  hipGraphicsResource* image_res;
+  HIP_CHECK(hipGraphicsGLRegisterImage(&image_res, gl_image, GL_TEXTURE_2D,
+                                       hipGraphicsRegisterFlagsNone));
+  HIP_CHECK(hipGraphicsMapResources(1, &image_res, 0));
+
+  // Output slot handed to the queries below.
+  hipArray* mapped_array = nullptr;
+
+  // Largest int value, used as an out-of-range array index / mip level.
+  constexpr int kOutOfRange = std::numeric_limits<int>::max();
+
+  SECTION("array == nullptr") {
+    // NOTE(review): this section asserts success even though the test case is named "Negative";
+    // confirm that a null output array is really meant to be accepted.
+    HIP_CHECK(hipGraphicsSubResourceGetMappedArray(nullptr, image_res, 0, 0));
+  }
+
+  SECTION("non-texture resource") {
+    // Register and map a GL buffer, then ask for an array view of it.
+    GLBufferObject gl_buffer;
+    hipGraphicsResource* buffer_res;
+    HIP_CHECK(hipGraphicsGLRegisterBuffer(&buffer_res, gl_buffer, hipGraphicsRegisterFlagsNone));
+    HIP_CHECK(hipGraphicsMapResources(1, &buffer_res, 0));
+
+    HIP_CHECK_ERROR(hipGraphicsSubResourceGetMappedArray(&mapped_array, buffer_res, 0, 0),
+                    hipErrorNotMappedAsArray);
+
+    HIP_CHECK(hipGraphicsUnmapResources(1, &buffer_res, 0));
+    HIP_CHECK(hipGraphicsUnregisterResource(buffer_res));
+  }
+
+  SECTION("unregistered resource") {
+    // Register and immediately unregister so the handle is stale when queried.
+    hipGraphicsResource* stale_res;
+    HIP_CHECK(hipGraphicsGLRegisterImage(&stale_res, gl_image, GL_TEXTURE_2D,
+                                         hipGraphicsRegisterFlagsNone));
+    HIP_CHECK(hipGraphicsUnregisterResource(stale_res));
+    HIP_CHECK_ERROR(hipGraphicsSubResourceGetMappedArray(&mapped_array, stale_res, 0, 0),
+                    hipErrorContextIsDestroyed);
+  }
+
+  SECTION("not mapped resource") {
+    // Registered but never mapped.
+    hipGraphicsResource* never_mapped_res;
+    HIP_CHECK(hipGraphicsGLRegisterImage(&never_mapped_res, gl_image, GL_TEXTURE_2D,
+                                         hipGraphicsRegisterFlagsNone));
+    HIP_CHECK_ERROR(hipGraphicsSubResourceGetMappedArray(&mapped_array, never_mapped_res, 0, 0),
+                    hipErrorNotMapped);
+    HIP_CHECK(hipGraphicsUnregisterResource(never_mapped_res));
+  }
+
+  SECTION("unmapped resource") {
+    // Mapped once and unmapped again before the query.
+    hipGraphicsResource* remapped_res;
+    HIP_CHECK(hipGraphicsGLRegisterImage(&remapped_res, gl_image, GL_TEXTURE_2D,
+                                         hipGraphicsRegisterFlagsNone));
+    HIP_CHECK(hipGraphicsMapResources(1, &remapped_res, 0));
+    HIP_CHECK(hipGraphicsUnmapResources(1, &remapped_res, 0));
+
+    HIP_CHECK_ERROR(hipGraphicsSubResourceGetMappedArray(&mapped_array, remapped_res, 0, 0),
+                    hipErrorNotMapped);
+
+    HIP_CHECK(hipGraphicsUnregisterResource(remapped_res));
+  }
+
+  SECTION("invalid arrayIndex") {
+    HIP_CHECK_ERROR(hipGraphicsSubResourceGetMappedArray(&mapped_array, image_res, kOutOfRange, 0),
+                    hipErrorInvalidValue);
+  }
+
+  SECTION("invalid mipLevel") {
+    HIP_CHECK_ERROR(hipGraphicsSubResourceGetMappedArray(&mapped_array, image_res, 0, kOutOfRange),
+                    hipErrorInvalidValue);
+  }
+
+  // Release the resource that was mapped before the sections.
+  HIP_CHECK(hipGraphicsUnmapResources(1, &image_res, 0));
+  HIP_CHECK(hipGraphicsUnregisterResource(image_res));
+}
@@ -0,0 +1,66 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_common.hh>
+#include <hip/hip_runtime_api.h>
+#include <hip/hip_gl_interop.h>
+
+#include "gl_interop_common.hh"
+
+// Negative parameter checks for hipGraphicsUnmapResources. A GL buffer is registered and mapped
+// up front; each SECTION issues one unmap call that this test expects to fail, after which the
+// shared resource is unmapped and unregistered normally.
+TEST_CASE("Unit_hipGraphicsUnmapResources_Negative_Parameters") {
+  GLContextScopeGuard gl_context;
+  GLBufferObject gl_buffer;
+
+  hipGraphicsResource* buffer_res;
+  HIP_CHECK(hipGraphicsGLRegisterBuffer(&buffer_res, gl_buffer, hipGraphicsRegisterFlagsNone));
+  HIP_CHECK(hipGraphicsMapResources(1, &buffer_res, 0));
+
+  SECTION("count == 0") {
+    HIP_CHECK_ERROR(hipGraphicsUnmapResources(0, &buffer_res, 0), hipErrorInvalidValue);
+  }
+
+  SECTION("resources == nullptr") {
+    HIP_CHECK_ERROR(hipGraphicsUnmapResources(1, nullptr, 0), hipErrorInvalidValue);
+  }
+
+  SECTION("not mapped resource") {
+    // A second registration of the same buffer that is never mapped.
+    hipGraphicsResource* never_mapped_res;
+    HIP_CHECK(
+        hipGraphicsGLRegisterBuffer(&never_mapped_res, gl_buffer, hipGraphicsRegisterFlagsNone));
+    HIP_CHECK_ERROR(hipGraphicsUnmapResources(1, &never_mapped_res, 0), hipErrorNotMapped);
+    HIP_CHECK(hipGraphicsUnregisterResource(never_mapped_res));
+  }
+
+  SECTION("invalid stream") {
+    // Create and destroy a stream so the handle passed to the unmap call is no longer live.
+    hipStream_t destroyed_stream;
+    HIP_CHECK(hipStreamCreate(&destroyed_stream));
+    HIP_CHECK(hipStreamDestroy(destroyed_stream));
+    HIP_CHECK_ERROR(hipGraphicsUnmapResources(1, &buffer_res, destroyed_stream),
+                    hipErrorContextIsDestroyed);
+  }
+
+  // The failing calls above are expected to leave the resource mapped; release it here.
+  HIP_CHECK(hipGraphicsUnmapResources(1, &buffer_res, 0));
+  HIP_CHECK(hipGraphicsUnregisterResource(buffer_res));
+}
@@ -0,0 +1,48 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_common.hh>
+#include <hip/hip_runtime_api.h>
+#include <hip/hip_gl_interop.h>
+
+#include "gl_interop_common.hh"
+
+// Negative checks for hipGraphicsUnregisterResource: unregistering a handle twice, and
+// unregistering a resource that is still mapped. Each SECTION asserts the error code this test
+// expects from the failing call.
+TEST_CASE("Unit_hipGraphicsUnregisterResource_Negative_Parameters") {
+  GLContextScopeGuard gl_context;
+
+  GLBufferObject vbo;
+
+  SECTION("already unregistered resource") {
+    // The first unregister succeeds; repeating it on the stale handle must be rejected.
+    hipGraphicsResource* unregistered_resource;
+    HIP_CHECK(
+        hipGraphicsGLRegisterBuffer(&unregistered_resource, vbo, hipGraphicsRegisterFlagsNone));
+    HIP_CHECK(hipGraphicsUnregisterResource(unregistered_resource));
+    HIP_CHECK_ERROR(hipGraphicsUnregisterResource(unregistered_resource), hipErrorInvalidContext);
+  }
+
+  SECTION("mapped resource") {
+    hipGraphicsResource* mapped_resource;
+    HIP_CHECK(hipGraphicsGLRegisterBuffer(&mapped_resource, vbo, hipGraphicsRegisterFlagsNone));
+    HIP_CHECK(hipGraphicsMapResources(1, &mapped_resource, 0));
+    HIP_CHECK_ERROR(hipGraphicsUnregisterResource(mapped_resource), hipErrorAlreadyMapped);
+    // The rejected unregister leaves the resource mapped and registered. Release it explicitly so
+    // it does not leak past the destruction of the GL buffer and context at the end of the test.
+    HIP_CHECK(hipGraphicsUnmapResources(1, &mapped_resource, 0));
+    HIP_CHECK(hipGraphicsUnregisterResource(mapped_resource));
+  }
+}
@@ -103,6 +103,7 @@ set(TEST_SRC
  hipGraphKernelNodeSetParams.cc
  hipGraphExecKernelNodeSetParams.cc
  hipGraphLaunch.cc
+  hipGraphLaunch_old.cc
  hipGraphMemcpyNodeSetParams1D.cc
  hipGraphExecMemcpyNodeSetParamsToSymbol_old.cc
  hipGraphExecMemcpyNodeSetParamsToSymbol.cc
@@ -40,19 +40,26 @@ end. Instantiate and Launch the Graph. Wait for the event to complete.
 Verify that hipEventElapsedTime() returns error.
 6) Validate scenario 2 by running the graph multiple times in a loop
 (100 times) after instantiation.
- 7) Negative Scenarios
+ 7) Validate that no error is reported when numDeps <= dependencies length
+ 8) Negative Scenarios
    - Output node is a nullptr.
    - Input graph is a nullptr.
    - Input dependencies is a nullptr.
+    - Node in dependency is from different graph
+    - Invalid numNodes
+    - Duplicate node in dependencies
    - Input event is a nullptr.
    - Input graph is uninitialized.
    - Input event is uninitialized.
 */
+#include <functional>

-#include <hip_test_common.hh>
 #include <hip_test_checkers.hh>
+#include <hip_test_common.hh>
 #include <hip_test_kernels.hh>

+#include "graph_tests_common.hh"
+
 /**
 * Scenario 1: Create a simple graph with just one event record
 * node and instantiate and launch the graph.
@@ -66,8 +73,7 @@ TEST_CASE("Unit_hipGraphAddEventRecordNode_Functional_Simple") {
  hipEvent_t event;
  HIP_CHECK(hipEventCreate(&event));
  hipGraphNode_t eventrec;
-  HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0,
-                                                            event));
+  HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0, event));
  // Instantiate and launch the graph
  HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
  HIP_CHECK(hipGraphLaunch(graphExec, streamForGraph));
@@ -82,8 +88,8 @@ TEST_CASE("Unit_hipGraphAddEventRecordNode_Functional_Simple") {
 /**
 * Local test function
 */
-static void validateAddEventRecordNode(bool measureTime, bool withFlags,
-                            int nstep, unsigned flag = 0) {
+static void validateAddEventRecordNode(bool measureTime, bool withFlags, int nstep,
+                                       unsigned flag = 0) {
  constexpr size_t N = 1024;
  constexpr size_t Nbytes = N * sizeof(int);
  constexpr auto blocksPerCU = 6;  // to hide latency
@@ -111,8 +117,7 @@ static void validateAddEventRecordNode(bool measureTime, bool withFlags,
  memsetParams.elementSize = sizeof(char);
  memsetParams.width = Nbytes;
  memsetParams.height = 1;
-  HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph, nullptr, 0,
-                                  &memsetParams));
+  HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph, nullptr, 0, &memsetParams));
  memset(&memsetParams, 0, sizeof(memsetParams));
  memsetParams.dst = reinterpret_cast<void*>(B_d);
  memsetParams.value = 0;
@@ -120,38 +125,34 @@ static void validateAddEventRecordNode(bool measureTime, bool withFlags,
  memsetParams.elementSize = sizeof(char);
  memsetParams.width = Nbytes;
  memsetParams.height = 1;
-  HIP_CHECK(hipGraphAddMemsetNode(&memset_B, graph, nullptr, 0,
-                                  &memsetParams));
+  HIP_CHECK(hipGraphAddMemsetNode(&memset_B, graph, nullptr, 0, &memsetParams));

-  void* kernelArgs1[] = {&C_d, &memsetVal, reinterpret_cast<void *>(&NElem)};
-  kernelNodeParams.func =
-                       reinterpret_cast<void *>(HipTest::memsetReverse<int>);
+  void* kernelArgs1[] = {&C_d, &memsetVal, reinterpret_cast<void*>(&NElem)};
+  kernelNodeParams.func = reinterpret_cast<void*>(HipTest::memsetReverse<int>);
  kernelNodeParams.gridDim = dim3(blocks);
  kernelNodeParams.blockDim = dim3(threadsPerBlock);
  kernelNodeParams.sharedMemBytes = 0;
  kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs1);
  kernelNodeParams.extra = nullptr;
-  HIP_CHECK(hipGraphAddKernelNode(&memsetKer_C, graph, nullptr, 0,
-                                  &kernelNodeParams));
+  HIP_CHECK(hipGraphAddKernelNode(&memsetKer_C, graph, nullptr, 0, &kernelNodeParams));

-  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyH2D_A, graph, nullptr, 0, A_d,
-                                  A_h, Nbytes, hipMemcpyHostToDevice));
+  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyH2D_A, graph, nullptr, 0, A_d, A_h, Nbytes,
+                                    hipMemcpyHostToDevice));

-  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyH2D_B, graph, nullptr, 0, B_d,
-                                  B_h, Nbytes, hipMemcpyHostToDevice));
+  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyH2D_B, graph, nullptr, 0, B_d, B_h, Nbytes,
+                                    hipMemcpyHostToDevice));

-  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyD2H_C, graph, nullptr, 0, C_h,
-                                  C_d, Nbytes, hipMemcpyDeviceToHost));
+  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyD2H_C, graph, nullptr, 0, C_h, C_d, Nbytes,
+                                    hipMemcpyDeviceToHost));

-  void* kernelArgs2[] = {&A_d, &B_d, &C_d, reinterpret_cast<void *>(&NElem)};
-  kernelNodeParams.func = reinterpret_cast<void *>(HipTest::vectorADD<int>);
+  void* kernelArgs2[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&NElem)};
+  kernelNodeParams.func = reinterpret_cast<void*>(HipTest::vectorADD<int>);
  kernelNodeParams.gridDim = dim3(blocks);
  kernelNodeParams.blockDim = dim3(threadsPerBlock);
  kernelNodeParams.sharedMemBytes = 0;
  kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs2);
  kernelNodeParams.extra = nullptr;
-  HIP_CHECK(hipGraphAddKernelNode(&ker_vecAdd, graph, nullptr, 0,
-                                                        &kernelNodeParams));
+  HIP_CHECK(hipGraphAddKernelNode(&ker_vecAdd, graph, nullptr, 0, &kernelNodeParams));
  hipEvent_t eventstart, eventend;
  if (withFlags) {
    HIP_CHECK(hipEventCreateWithFlags(&eventstart, flag));
@@ -161,10 +162,8 @@ static void validateAddEventRecordNode(bool measureTime, bool withFlags,
    HIP_CHECK(hipEventCreate(&eventend));
  }
  hipGraphNode_t event_start, event_final;
-  HIP_CHECK(hipGraphAddEventRecordNode(&event_start, graph, nullptr, 0,
-                                                            eventstart));
-  HIP_CHECK(hipGraphAddEventRecordNode(&event_final, graph, nullptr, 0,
-                                                            eventend));
+  HIP_CHECK(hipGraphAddEventRecordNode(&event_start, graph, nullptr, 0, eventstart));
+  HIP_CHECK(hipGraphAddEventRecordNode(&event_final, graph, nullptr, 0, eventend));
  // Create dependencies
  HIP_CHECK(hipGraphAddDependencies(graph, &event_start, &memset_A, 1));
  HIP_CHECK(hipGraphAddDependencies(graph, &event_start, &memset_B, 1));
@@ -260,7 +259,7 @@ TEST_CASE("Unit_hipGraphAddEventRecordNode_Functional_TimingDisabled") {
  HIP_CHECK(hipEventCreateWithFlags(&event_start, hipEventDisableTiming));
  HIP_CHECK(hipEventCreateWithFlags(&event_end, hipEventDisableTiming));
  // memset node
-  char *A_d;
+  char* A_d;
  hipGraphNode_t memset_A;
  hipMemsetParams memsetParams{};
  HIP_CHECK(hipMalloc(&A_d, Nbytes));
@@ -271,14 +270,11 @@ TEST_CASE("Unit_hipGraphAddEventRecordNode_Functional_TimingDisabled") {
  memsetParams.elementSize = sizeof(char);
  memsetParams.width = Nbytes;
  memsetParams.height = 1;
-  HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph, nullptr, 0,
-                                   &memsetParams));
+  HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph, nullptr, 0, &memsetParams));

  hipGraphNode_t event_node_start, event_node_end;
-  HIP_CHECK(hipGraphAddEventRecordNode(&event_node_start, graph, nullptr, 0,
-                                                            event_start));
-  HIP_CHECK(hipGraphAddEventRecordNode(&event_node_end, graph, nullptr, 0,
-                                                            event_end));
+  HIP_CHECK(hipGraphAddEventRecordNode(&event_node_start, graph, nullptr, 0, event_start));
+  HIP_CHECK(hipGraphAddEventRecordNode(&event_node_end, graph, nullptr, 0, event_end));
  // Add dependencies between nodes
  HIP_CHECK(hipGraphAddDependencies(graph, &event_node_start, &memset_A, 1));
  HIP_CHECK(hipGraphAddDependencies(graph, &memset_A, &event_node_end, 1));
@@ -290,7 +286,7 @@ TEST_CASE("Unit_hipGraphAddEventRecordNode_Functional_TimingDisabled") {
  // Validate hipEventElapsedTime returns error code because timing is
  // disabled for start and end event nodes.
  float t;
-  REQUIRE(hipSuccess != hipEventElapsedTime(&t, event_start, event_end));
+  HIP_CHECK_ERROR(hipEventElapsedTime(&t, event_start, event_end), hipErrorInvalidHandle);

  HIP_CHECK(hipGraphExecDestroy(graphExec));
  HIP_CHECK(hipFree(A_d));
@@ -301,44 +297,73 @@ TEST_CASE("Unit_hipGraphAddEventRecordNode_Functional_TimingDisabled") {
 }

 /**
- * Scenario 7: All negative tests
+ * Scenario 7: Positive parameter tests
 */
-TEST_CASE("Unit_hipGraphAddEventRecordNode_Negative") {
+TEST_CASE("Unit_hipGraphAddEventRecordNode_Positive_Parameters") {
  hipGraph_t graph;
  HIP_CHECK(hipGraphCreate(&graph, 0));
  hipEvent_t event;
  HIP_CHECK(hipEventCreate(&event));
-  hipGraphNode_t eventwait;
-  SECTION("pGraphNode = nullptr") {
-    REQUIRE(hipErrorInvalidValue == hipGraphAddEventRecordNode(nullptr,
-                                    graph, nullptr, 0, event));
+  hipGraphNode_t eventrec;
+
+  hipGraphNode_t dep_node = nullptr;
+  hipGraphNode_t dep_node2 = nullptr;
+  HIP_CHECK(hipGraphAddEmptyNode(&dep_node, graph, nullptr, 0));
+  HIP_CHECK(hipGraphAddEmptyNode(&dep_node2, graph, nullptr, 0));
+  hipGraphNode_t dep_nodes[] = {dep_node, dep_node2};
+
+  size_t numDeps = 0;
+  SECTION("numDependencies is zero, dependencies is not nullptr") {
+    HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, dep_nodes, 0, event));
+    HIP_CHECK(hipGraphNodeGetDependencies(eventrec, nullptr, &numDeps));
+    REQUIRE(numDeps == 0);
  }

-  SECTION("graph = nullptr") {
-    REQUIRE(hipErrorInvalidValue == hipGraphAddEventRecordNode(&eventwait,
-                                    nullptr, nullptr, 0, event));
+  SECTION("numDependencies < dependencies length") {
+    HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, dep_nodes, 1, event));
+    HIP_CHECK(hipGraphNodeGetDependencies(eventrec, nullptr, &numDeps));
+    REQUIRE(numDeps == 1);
  }

-  SECTION("pDependencies = nullptr and numDependencies != 0") {
-    REQUIRE(hipErrorInvalidValue == hipGraphAddEventRecordNode(&eventwait,
-                                    graph, nullptr, 1, event));
-  }
-
-  SECTION("event = nullptr") {
-    REQUIRE(hipErrorInvalidValue == hipGraphAddEventRecordNode(&eventwait,
-                                    graph, nullptr, 0, nullptr));
-  }
-
-  SECTION("graph is uninitialized") {
-    hipGraph_t graph_uninit{};
-    REQUIRE(hipErrorInvalidValue == hipGraphAddEventRecordNode(&eventwait,
-                                    graph_uninit, nullptr, 0, nullptr));
-  }
-
-  SECTION("event is uninitialized") {
-    hipEvent_t event_uninit{};
-    REQUIRE(hipErrorInvalidValue == hipGraphAddEventRecordNode(&eventwait,
-                                    graph, nullptr, 0, event_uninit));
+  SECTION("numDependencies == dependencies length") {
+    HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, dep_nodes, 2, event));
+    HIP_CHECK(hipGraphNodeGetDependencies(eventrec, nullptr, &numDeps));
+    REQUIRE(numDeps == 2);
+  }
+
+  HIP_CHECK(hipGraphDestroy(graph));
+  HIP_CHECK(hipEventDestroy(event));
+}
+
+/**
+ * Scenario 8: All negative tests
+ */
+TEST_CASE("Unit_hipGraphAddEventRecordNode_Negative") {
+  using namespace std::placeholders;
+  hipGraph_t graph;
+  HIP_CHECK(hipGraphCreate(&graph, 0));
+  hipEvent_t event;
+  HIP_CHECK(hipEventCreate(&event));
+  hipGraphNode_t eventrec;
+
+  GraphAddNodeCommonNegativeTests(std::bind(hipGraphAddEventRecordNode, _1, _2, _3, _4, event),
+                                  graph);
+
+  SECTION("event = nullptr") {
+    HIP_CHECK_ERROR(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0, nullptr),
+                    hipErrorInvalidValue);
+  }
+
+  SECTION("graph is uninitialized") {
+    hipGraph_t graph_uninit{};
+    HIP_CHECK_ERROR(hipGraphAddEventRecordNode(&eventrec, graph_uninit, nullptr, 0, event),
+                    hipErrorInvalidValue);
+  }
+
+  SECTION("event is uninitialized") {
+    hipEvent_t event_uninit{};
+    HIP_CHECK_ERROR(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0, event_uninit),
+                    hipErrorInvalidValue);
  }

  HIP_CHECK(hipGraphDestroy(graph));
@@ -32,20 +32,25 @@ both graphs.
 (100 times).
 4) Execute scenario 2 with stream1 = stream2.
 5) Repeat scenario 2 for different event flags.
- 6) Negative Scenarios
+ 6) Validate that no error is reported when numDeps <= dependencies length
+ 7) Negative Scenarios
    - Pass input node parameter as nullptr.
    - Pass input graph parameter as nullptr.
    - Pass input dependency parameter as nullptr.
+    - Node in dependency is from different graph
+    - Invalid numNodes
+    - Duplicate node in dependencies
    - Pass input event parameter as nullptr.
    - Pass uninitialized input graph parameter.
    - Pass uninitialized input event parameter.
 */
+#include <functional>

-#include <hip_test_common.hh>
 #include <hip_test_checkers.hh>
+#include <hip_test_common.hh>
 #include <hip_test_kernels.hh>

-#define LEN 512
+#include "graph_tests_common.hh"

 /**
 * Scenario 1
@@ -60,13 +65,10 @@ TEST_CASE("Unit_hipGraphAddEventWaitNode_Functional_Simple") {
  HIP_CHECK(hipEventCreate(&event));
  hipGraphNode_t event_rec_node, event_wait_node;
  // Create a event record node in graph
-  HIP_CHECK(hipGraphAddEventRecordNode(&event_rec_node, graph, nullptr, 0,
-                                                            event));
+  HIP_CHECK(hipGraphAddEventRecordNode(&event_rec_node, graph, nullptr, 0, event));
  // Create a event wait node in graph
-  HIP_CHECK(hipGraphAddEventWaitNode(&event_wait_node, graph, nullptr, 0,
-                                                            event));
-  HIP_CHECK(hipGraphAddDependencies(graph, &event_rec_node,
-                                    &event_wait_node, 1));
+  HIP_CHECK(hipGraphAddEventWaitNode(&event_wait_node, graph, nullptr, 0, event));
+  HIP_CHECK(hipGraphAddDependencies(graph, &event_rec_node, &event_wait_node, 1));
  // Instantiate and launch the graph
  HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
  HIP_CHECK(hipGraphLaunch(graphExec, streamForGraph));
@@ -80,13 +82,14 @@ TEST_CASE("Unit_hipGraphAddEventWaitNode_Functional_Simple") {
 /**
 * Local Function
 */
-static void validate_hipGraphAddEventWaitNode_internodedep(int test,
-                         int nstep, unsigned flag = hipEventDefault) {
-  size_t memsize = LEN * sizeof(int);
+static void validate_hipGraphAddEventWaitNode_internodedep(int test, int nstep,
+                                                           unsigned flag = hipEventDefault) {
+  constexpr size_t N = 1024;
+  size_t memsize = N * sizeof(int);
  constexpr auto blocksPerCU = 6;  // to hide latency
  constexpr auto threadsPerBlock = 256;
-  unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, LEN);
-  size_t NElem{LEN};
+  unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
+  size_t NElem{N};
  hipGraph_t graph1, graph2;
  hipStream_t streamForGraph1, streamForGraph2;
  hipGraphExec_t graphExec1, graphExec2;
@@ -114,68 +117,57 @@ static void validate_hipGraphAddEventWaitNode_internodedep(int test,
  HIP_CHECK(hipMalloc(&out_d_g1, memsize));
  HIP_CHECK(hipMalloc(&out_d_g2, memsize));
  // Initialize host buffer
-  for (uint32_t i = 0; i < LEN; i++) {
+  for (uint32_t i = 0; i < N; i++) {
    inp_h[i] = i;
    out_h_g1[i] = 0;
    out_h_g2[i] = 0;
  }
  // Graph1 creation ...........
  // Create event1 record node in graph1
-  HIP_CHECK(hipGraphAddEventRecordNode(&event_rec_node, graph1, nullptr, 0,
-                                                            event1));
+  HIP_CHECK(hipGraphAddEventRecordNode(&event_rec_node, graph1, nullptr, 0, event1));

  // Create memcpy and kernel nodes for graph1
  hipGraphNode_t memcpyH2D, memcpyD2H_1, kernelnode_1;
  hipKernelNodeParams kernelNodeParams1{};
-  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyH2D, graph1, nullptr, 0, inp_d,
-                inp_h, memsize, hipMemcpyHostToDevice));
-  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyD2H_1, graph1, nullptr, 0,
-                out_h_g1, out_d_g1, memsize, hipMemcpyDeviceToHost));
+  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyH2D, graph1, nullptr, 0, inp_d, inp_h, memsize,
+                                    hipMemcpyHostToDevice));
+  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyD2H_1, graph1, nullptr, 0, out_h_g1, out_d_g1, memsize,
+                                    hipMemcpyDeviceToHost));

-  void* kernelArgs1[] = {&inp_d, &out_d_g1, reinterpret_cast<void *>(&NElem)};
-  kernelNodeParams1.func =
-  reinterpret_cast<void *>(HipTest::vector_square<int>);
+  void* kernelArgs1[] = {&inp_d, &out_d_g1, reinterpret_cast<void*>(&NElem)};
+  kernelNodeParams1.func = reinterpret_cast<void*>(HipTest::vector_square<int>);
  kernelNodeParams1.gridDim = dim3(blocks);
  kernelNodeParams1.blockDim = dim3(threadsPerBlock);
  kernelNodeParams1.sharedMemBytes = 0;
  kernelNodeParams1.kernelParams = reinterpret_cast<void**>(kernelArgs1);
  kernelNodeParams1.extra = nullptr;
-  HIP_CHECK(hipGraphAddKernelNode(&kernelnode_1, graph1, nullptr, 0,
-                                  &kernelNodeParams1));
+  HIP_CHECK(hipGraphAddKernelNode(&kernelnode_1, graph1, nullptr, 0, &kernelNodeParams1));
  // Create dependencies for graph1
-  HIP_CHECK(hipGraphAddDependencies(graph1, &memcpyH2D,
-                                    &event_rec_node, 1));
-  HIP_CHECK(hipGraphAddDependencies(graph1, &event_rec_node,
-                                    &kernelnode_1, 1));
-  HIP_CHECK(hipGraphAddDependencies(graph1, &kernelnode_1,
-                                    &memcpyD2H_1, 1));
+  HIP_CHECK(hipGraphAddDependencies(graph1, &memcpyH2D, &event_rec_node, 1));
+  HIP_CHECK(hipGraphAddDependencies(graph1, &event_rec_node, &kernelnode_1, 1));
+  HIP_CHECK(hipGraphAddDependencies(graph1, &kernelnode_1, &memcpyD2H_1, 1));

  // Graph2 creation ...........
  // Create event1 record node in graph2
-  HIP_CHECK(hipGraphAddEventWaitNode(&event_wait_node, graph2, nullptr, 0,
-                                                            event1));
+  HIP_CHECK(hipGraphAddEventWaitNode(&event_wait_node, graph2, nullptr, 0, event1));

  // Create memcpy and kernel nodes for graph2
  hipGraphNode_t memcpyD2H_2, kernelnode_2;
  hipKernelNodeParams kernelNodeParams2{};
-  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyD2H_2, graph2, nullptr, 0,
-                out_h_g2, out_d_g2, memsize, hipMemcpyDeviceToHost));
+  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyD2H_2, graph2, nullptr, 0, out_h_g2, out_d_g2, memsize,
+                                    hipMemcpyDeviceToHost));

-  void* kernelArgs2[] = {&inp_d, &out_d_g2, reinterpret_cast<void *>(&NElem)};
-  kernelNodeParams2.func =
-  reinterpret_cast<void *>(HipTest::vector_cubic<int>);
+  void* kernelArgs2[] = {&inp_d, &out_d_g2, reinterpret_cast<void*>(&NElem)};
+  kernelNodeParams2.func = reinterpret_cast<void*>(HipTest::vector_cubic<int>);
  kernelNodeParams2.gridDim = dim3(blocks);
  kernelNodeParams2.blockDim = dim3(threadsPerBlock);
  kernelNodeParams2.sharedMemBytes = 0;
  kernelNodeParams2.kernelParams = reinterpret_cast<void**>(kernelArgs2);
  kernelNodeParams2.extra = nullptr;
-  HIP_CHECK(hipGraphAddKernelNode(&kernelnode_2, graph2, nullptr, 0,
-                                  &kernelNodeParams2));
+  HIP_CHECK(hipGraphAddKernelNode(&kernelnode_2, graph2, nullptr, 0, &kernelNodeParams2));
  // Create dependencies for graph2
-  HIP_CHECK(hipGraphAddDependencies(graph2, &event_wait_node,
-                                    &kernelnode_2, 1));
-  HIP_CHECK(hipGraphAddDependencies(graph2, &kernelnode_2,
-                                    &memcpyD2H_2, 1));
+  HIP_CHECK(hipGraphAddDependencies(graph2, &event_wait_node, &kernelnode_2, 1));
+  HIP_CHECK(hipGraphAddDependencies(graph2, &kernelnode_2, &memcpyD2H_2, 1));

  // Instantiate and launch the graphs
  HIP_CHECK(hipGraphInstantiate(&graphExec1, graph1, nullptr, nullptr, 0));
@@ -187,16 +179,16 @@ static void validate_hipGraphAddEventWaitNode_internodedep(int test,
    HIP_CHECK(hipStreamSynchronize(streamForGraph2));
    // Validate output
    bool btestPassed1 = true;
-    for (uint32_t i = 0; i < LEN; i++) {
-      if (out_h_g1[i] != (inp_h[i]*inp_h[i])) {
+    for (uint32_t i = 0; i < N; i++) {
+      if (out_h_g1[i] != (inp_h[i] * inp_h[i])) {
        btestPassed1 = false;
        break;
      }
    }
    REQUIRE(btestPassed1 == true);
    bool btestPassed2 = true;
-    for (uint32_t i = 0; i < LEN; i++) {
-      if (out_h_g2[i] != (inp_h[i]*inp_h[i]*inp_h[i])) {
+    for (uint32_t i = 0; i < N; i++) {
+      if (out_h_g2[i] != (inp_h[i] * inp_h[i] * inp_h[i])) {
        btestPassed2 = false;
        break;
      }
@@ -247,55 +239,81 @@ TEST_CASE("Unit_hipGraphAddEventWaitNode_MultGraphOneStrmDependency") {
 */
 TEST_CASE("Unit_hipGraphAddEventWaitNode_differentFlags") {
  SECTION("flag = hipEventBlockingSync") {
-    validate_hipGraphAddEventWaitNode_internodedep(0, 1,
-                       hipEventBlockingSync);
+    validate_hipGraphAddEventWaitNode_internodedep(0, 1, hipEventBlockingSync);
  }
  SECTION("graph = hipEventDisableTiming") {
-    validate_hipGraphAddEventWaitNode_internodedep(0, 1,
-                       hipEventDisableTiming);
+    validate_hipGraphAddEventWaitNode_internodedep(0, 1, hipEventDisableTiming);
  }
 }

 /**
- * Scenario 6
+ * Scenario 6: Positive parameter tests
 */
-TEST_CASE("Unit_hipGraphAddEventWaitNode_Negative") {
+TEST_CASE("Unit_hipGraphAddEventWaitNode_Positive_Parameters") {
  hipGraph_t graph;
  HIP_CHECK(hipGraphCreate(&graph, 0));
  hipEvent_t event;
  HIP_CHECK(hipEventCreate(&event));
  hipGraphNode_t eventwait;

-  SECTION("pGraphNode = nullptr") {
-    REQUIRE(hipErrorInvalidValue == hipGraphAddEventWaitNode(nullptr,
-                                    graph, nullptr, 0, event));
+  hipGraphNode_t dep_node = nullptr;
+  hipGraphNode_t dep_node2 = nullptr;
+  HIP_CHECK(hipGraphAddEmptyNode(&dep_node, graph, nullptr, 0));
+  HIP_CHECK(hipGraphAddEmptyNode(&dep_node2, graph, nullptr, 0));
+  hipGraphNode_t dep_nodes[] = {dep_node, dep_node2};
+
+  size_t numDeps = 0;
+  SECTION("numDependencies is zero, dependencies is not nullptr") {
+    HIP_CHECK(hipGraphAddEventWaitNode(&eventwait, graph, dep_nodes, 0, event));
+    HIP_CHECK(hipGraphNodeGetDependencies(eventwait, nullptr, &numDeps));
+    REQUIRE(numDeps == 0);
  }

-  SECTION("graph = nullptr") {
-    REQUIRE(hipErrorInvalidValue == hipGraphAddEventWaitNode(&eventwait,
-                                    nullptr, nullptr, 0, event));
+  SECTION("numDependencies < dependencies length") {
+    HIP_CHECK(hipGraphAddEventWaitNode(&eventwait, graph, dep_nodes, 1, event));
+    HIP_CHECK(hipGraphNodeGetDependencies(eventwait, nullptr, &numDeps));
+    REQUIRE(numDeps == 1);
  }

-  SECTION("pDependencies = nullptr") {
-    REQUIRE(hipErrorInvalidValue == hipGraphAddEventWaitNode(&eventwait,
-                                    graph, nullptr, 1, event));
-  }
-
-  SECTION("event = nullptr") {
-    REQUIRE(hipErrorInvalidValue == hipGraphAddEventWaitNode(&eventwait,
-                                    graph, nullptr, 0, nullptr));
-  }
-
-  SECTION("graph is uninitialized") {
-    hipGraph_t graph_uninit{};
-    REQUIRE(hipErrorInvalidValue == hipGraphAddEventWaitNode(&eventwait,
-                                    graph_uninit, nullptr, 0, event));
-  }
-
-  SECTION("event is uninitialized") {
-    hipEvent_t event_uninit{};
-    REQUIRE(hipErrorInvalidValue == hipGraphAddEventWaitNode(&eventwait,
-                                    graph, nullptr, 0, event_uninit));
+  SECTION("numDependencies == dependencies length") {
+    HIP_CHECK(hipGraphAddEventWaitNode(&eventwait, graph, dep_nodes, 2, event));
+    HIP_CHECK(hipGraphNodeGetDependencies(eventwait, nullptr, &numDeps));
+    REQUIRE(numDeps == 2);
+  }
+
+  HIP_CHECK(hipGraphDestroy(graph));
+  HIP_CHECK(hipEventDestroy(event));
+}
+
+/**
+ * Scenario 7: Negative parameter tests
+ */
+TEST_CASE("Unit_hipGraphAddEventWaitNode_Negative") {
+  using namespace std::placeholders;
+  hipGraph_t graph;
+  HIP_CHECK(hipGraphCreate(&graph, 0));
+  hipEvent_t event;
+  HIP_CHECK(hipEventCreate(&event));
+  hipGraphNode_t eventwait;
+
+  GraphAddNodeCommonNegativeTests(std::bind(hipGraphAddEventWaitNode, _1, _2, _3, _4, event),
+                                  graph);
+
+  SECTION("event = nullptr") {
+    HIP_CHECK_ERROR(hipGraphAddEventWaitNode(&eventwait, graph, nullptr, 0, nullptr),
+                    hipErrorInvalidValue);
+  }
+
+  SECTION("graph is uninitialized") {
+    hipGraph_t graph_uninit{};
+    HIP_CHECK_ERROR(hipGraphAddEventWaitNode(&eventwait, graph_uninit, nullptr, 0, event),
+                    hipErrorInvalidValue);
+  }
+
+  SECTION("event is uninitialized") {
+    hipEvent_t event_uninit{};
+    HIP_CHECK_ERROR(hipGraphAddEventWaitNode(&eventwait, graph, nullptr, 0, event_uninit),
+                    hipErrorInvalidValue);
  }

  HIP_CHECK(hipGraphDestroy(graph));
@@ -26,11 +26,12 @@ with the event set in hipGraphAddEventRecordNode.
    - Output event is a nullptr.
    - Input node is an empty node.
    - Input node is a memset node.
+    - Input node is an event wait node.
    - Input node is an uninitialized node.
 */

-#include <hip_test_common.hh>
 #include <hip_test_checkers.hh>
+#include <hip_test_common.hh>
 #include <hip_test_kernels.hh>

 /**
@@ -42,8 +43,7 @@ static void validateEventRecordNodeGetEvent(unsigned flag) {
  hipEvent_t event, event_out;
  HIP_CHECK(hipEventCreateWithFlags(&event, flag));
  hipGraphNode_t eventrec;
-  HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0,
-                                                            event));
+  HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0, event));
  HIP_CHECK(hipGraphEventRecordNodeGetEvent(eventrec, &event_out));
  // validate set event and get event are same
  REQUIRE(event == event_out);
@@ -77,31 +77,32 @@ TEST_CASE("Unit_hipGraphEventRecordNodeGetEvent_Functional") {
 TEST_CASE("Unit_hipGraphEventRecordNodeGetEvent_Negative") {
  hipGraph_t graph;
  HIP_CHECK(hipGraphCreate(&graph, 0));
-  hipEvent_t event, event_out;
-  HIP_CHECK(hipEventCreate(&event));
-  hipGraphNode_t eventrec;
-  HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0,
-                                                            event));
+  hipEvent_t event_out;
+  hipEvent_t event1, event2;
+  HIP_CHECK(hipEventCreate(&event1));
+  HIP_CHECK(hipEventCreate(&event2));
+  hipGraphNode_t eventrec, eventwait;
+  HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0, event1));
+  HIP_CHECK(hipGraphAddEventWaitNode(&eventwait, graph, nullptr, 0, event2));
+
  SECTION("node = nullptr") {
-    REQUIRE(hipErrorInvalidValue == hipGraphEventRecordNodeGetEvent(nullptr,
-                                    &event_out));
+    HIP_CHECK_ERROR(hipGraphEventRecordNodeGetEvent(nullptr, &event_out), hipErrorInvalidValue);
  }

  SECTION("event_out = nullptr") {
-    REQUIRE(hipErrorInvalidValue == hipGraphEventRecordNodeGetEvent(eventrec,
-                                    nullptr));
+    HIP_CHECK_ERROR(hipGraphEventRecordNodeGetEvent(eventrec, nullptr), hipErrorInvalidValue);
  }

  SECTION("input node is empty node") {
    hipGraphNode_t EmptyGraphNode;
    HIP_CHECK(hipGraphAddEmptyNode(&EmptyGraphNode, graph, nullptr, 0));
-    REQUIRE(hipErrorInvalidValue ==
-            hipGraphEventRecordNodeGetEvent(EmptyGraphNode, &event_out));
+    HIP_CHECK_ERROR(hipGraphEventRecordNodeGetEvent(EmptyGraphNode, &event_out),
+                    hipErrorInvalidValue);
  }

  SECTION("input node is memset node") {
    constexpr size_t Nbytes = 1024;
-    char *A_d;
+    char* A_d;
    hipGraphNode_t memset_A;
    hipMemsetParams memsetParams{};
    HIP_CHECK(hipMalloc(&A_d, Nbytes));
@@ -112,19 +113,21 @@ TEST_CASE("Unit_hipGraphEventRecordNodeGetEvent_Negative") {
    memsetParams.elementSize = sizeof(char);
    memsetParams.width = Nbytes;
    memsetParams.height = 1;
-    HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph, nullptr, 0,
-                                    &memsetParams));
-    REQUIRE(hipErrorInvalidValue ==
-            hipGraphEventRecordNodeGetEvent(memset_A, &event_out));
+    HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph, nullptr, 0, &memsetParams));
+    HIP_CHECK_ERROR(hipGraphEventRecordNodeGetEvent(memset_A, &event_out), hipErrorInvalidValue);
    HIP_CHECK(hipFree(A_d));
  }

+  SECTION("input node is event wait node") {
+    HIP_CHECK_ERROR(hipGraphEventRecordNodeGetEvent(eventwait, &event_out), hipErrorInvalidValue);
+  }
+
  SECTION("input node is uninitialized node") {
    hipGraphNode_t node_unit{};
-    REQUIRE(hipErrorInvalidValue ==
-            hipGraphEventRecordNodeGetEvent(node_unit, &event_out));
+    HIP_CHECK_ERROR(hipGraphEventRecordNodeGetEvent(node_unit, &event_out), hipErrorInvalidValue);
  }

  HIP_CHECK(hipGraphDestroy(graph));
-  HIP_CHECK(hipEventDestroy(event));
+  HIP_CHECK(hipEventDestroy(event1));
+  HIP_CHECK(hipEventDestroy(event2));
 }
@@ -30,14 +30,16 @@ Testcase Scenarios :
    - Input event parameter is nullptr.
    - Empty node is passed as input node.
    - Memset node is passed as input node.
+    - Event wait node is passed as input node.
    - Input node is an uninitialized node.
    - Input event is an uninitialized event.
 */

-#include <hip_test_common.hh>
 #include <hip_test_checkers.hh>
+#include <hip_test_common.hh>
 #include <hip_test_kernels.hh>

+
 /**
 * Local Function: Set Get test
 */
@@ -49,8 +51,7 @@ static void validateEventRecordNodeSetEvent(unsigned flag) {
  HIP_CHECK(hipEventCreate(&event1));
  HIP_CHECK(hipEventCreateWithFlags(&event2, flag));
  hipGraphNode_t eventrec;
-  HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0,
-                                                            event1));
+  HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0, event1));
  // Set a different event
  HIP_CHECK(hipGraphEventRecordNodeSetEvent(eventrec, event2));
  HIP_CHECK(hipGraphEventRecordNodeGetEvent(eventrec, &event_out));
@@ -73,11 +74,9 @@ static void setEventWaitNode() {
  HIP_CHECK(hipEventCreate(&event1));
  HIP_CHECK(hipEventCreate(&event2));
  hipGraphNode_t eventwait;
-  HIP_CHECK(hipGraphAddEventWaitNode(&eventwait, graph, nullptr, 0,
-                                                           event1));
+  HIP_CHECK(hipGraphAddEventWaitNode(&eventwait, graph, nullptr, 0, event1));
  // Set a different event eventwait using hipGraphEventRecordNodeSetEvent
-  REQUIRE(hipErrorInvalidValue ==
-          hipGraphEventRecordNodeSetEvent(eventwait, event2));
+  HIP_CHECK_ERROR(hipGraphEventRecordNodeSetEvent(eventwait, event2), hipErrorInvalidValue);
  // Free resources
  HIP_CHECK(hipGraphDestroy(graph));
  HIP_CHECK(hipEventDestroy(event1));
@@ -98,13 +97,11 @@ TEST_CASE("Unit_hipGraphEventRecordNodeSetEvent_SetEventProperty") {
  HIP_CHECK(hipEventCreateWithFlags(&event2_end, hipEventDisableTiming));
  // Create nodes
  hipGraphNode_t event_start_rec, event_end_rec;
-  HIP_CHECK(hipGraphAddEventRecordNode(&event_start_rec, graph, nullptr, 0,
-                                                          event1_start));
-  HIP_CHECK(hipGraphAddEventRecordNode(&event_end_rec, graph, nullptr, 0,
-                                                          event1_end));
+  HIP_CHECK(hipGraphAddEventRecordNode(&event_start_rec, graph, nullptr, 0, event1_start));
+  HIP_CHECK(hipGraphAddEventRecordNode(&event_end_rec, graph, nullptr, 0, event1_end));
  // Create memset node
  constexpr size_t Nbytes = 1024;
-  char *A_d;
+  char* A_d;
  hipGraphNode_t memset_A;
  hipMemsetParams memsetParams{};
  HIP_CHECK(hipMalloc(&A_d, Nbytes));
@@ -115,8 +112,7 @@ TEST_CASE("Unit_hipGraphEventRecordNodeSetEvent_SetEventProperty") {
  memsetParams.elementSize = sizeof(char);
  memsetParams.width = Nbytes;
  memsetParams.height = 1;
-  HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph, nullptr, 0,
-                                  &memsetParams));
+  HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph, nullptr, 0, &memsetParams));
  // Create dependencies
  // event_start_rec --> memset_A --> event_end_rec
  HIP_CHECK(hipGraphAddDependencies(graph, &event_start_rec, &memset_A, 1));
@@ -132,8 +128,7 @@ TEST_CASE("Unit_hipGraphEventRecordNodeSetEvent_SetEventProperty") {
  // Validate by measuring time difference between event_end_rec &
  // event_start_rec
  float t = 0.0f;
-  REQUIRE(hipSuccess == hipEventElapsedTime(&t, event1_start,
-                                            event1_end));
+  REQUIRE(hipSuccess == hipEventElapsedTime(&t, event1_start, event1_end));
  REQUIRE(t > 0.0f);
  // Change the event property after instantiation
  HIP_CHECK(hipGraphEventRecordNodeSetEvent(event_start_rec, event2_start));
@@ -145,8 +140,7 @@ TEST_CASE("Unit_hipGraphEventRecordNodeSetEvent_SetEventProperty") {
  // hipErrorInvalidHandle when events are created using
  // hipEventDisableTiming flag.
  t = 0.0f;
-  REQUIRE(hipErrorInvalidHandle ==
-          hipEventElapsedTime(&t, event2_start, event2_end));
+  HIP_CHECK_ERROR(hipEventElapsedTime(&t, event2_start, event2_end), hipErrorInvalidHandle);
  // Free resources
  HIP_CHECK(hipGraphExecDestroy(graphExec));
  HIP_CHECK(hipStreamDestroy(streamForGraph));
@@ -185,28 +179,24 @@ TEST_CASE("Unit_hipGraphEventRecordNodeSetEvent_Negative") {
  HIP_CHECK(hipEventCreate(&event1));
  HIP_CHECK(hipEventCreate(&event2));
  hipGraphNode_t eventrec;
-  HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0,
-                                                            event1));
+  HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0, event1));
  SECTION("node = nullptr") {
-    REQUIRE(hipErrorInvalidValue == hipGraphEventRecordNodeSetEvent(nullptr,
-                                    event2));
+    HIP_CHECK_ERROR(hipGraphEventRecordNodeSetEvent(nullptr, event2), hipErrorInvalidValue);
  }

  SECTION("event_out = nullptr") {
-    REQUIRE(hipErrorInvalidValue == hipGraphEventRecordNodeSetEvent(eventrec,
-                                    nullptr));
+    HIP_CHECK_ERROR(hipGraphEventRecordNodeSetEvent(eventrec, nullptr), hipErrorInvalidValue);
  }

  SECTION("input node is empty node") {
    hipGraphNode_t EmptyGraphNode;
    HIP_CHECK(hipGraphAddEmptyNode(&EmptyGraphNode, graph, nullptr, 0));
-    REQUIRE(hipErrorInvalidValue ==
-            hipGraphEventRecordNodeSetEvent(EmptyGraphNode, event2));
+    HIP_CHECK_ERROR(hipGraphEventRecordNodeSetEvent(EmptyGraphNode, event2), hipErrorInvalidValue);
  }

  SECTION("input node is memset node") {
    constexpr size_t Nbytes = 1024;
-    char *A_d;
+    char* A_d;
    hipGraphNode_t memset_A;
    hipMemsetParams memsetParams{};
    HIP_CHECK(hipMalloc(&A_d, Nbytes));
@@ -217,10 +207,8 @@ TEST_CASE("Unit_hipGraphEventRecordNodeSetEvent_Negative") {
    memsetParams.elementSize = sizeof(char);
    memsetParams.width = Nbytes;
    memsetParams.height = 1;
-    HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph, nullptr, 0,
-                                    &memsetParams));
-    REQUIRE(hipErrorInvalidValue ==
-            hipGraphEventRecordNodeSetEvent(memset_A, event2));
+    HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph, nullptr, 0, &memsetParams));
+    HIP_CHECK_ERROR(hipGraphEventRecordNodeSetEvent(memset_A, event2), hipErrorInvalidValue);
    HIP_CHECK(hipFree(A_d));
  }

@@ -230,14 +218,12 @@ TEST_CASE("Unit_hipGraphEventRecordNodeSetEvent_Negative") {

  SECTION("input node is uninitialized node") {
    hipGraphNode_t node_uninit{};
-    REQUIRE(hipErrorInvalidValue ==
-            hipGraphEventRecordNodeSetEvent(node_uninit, event2));
+    HIP_CHECK_ERROR(hipGraphEventRecordNodeSetEvent(node_uninit, event2), hipErrorInvalidValue);
  }

  SECTION("input event is uninitialized") {
    hipEvent_t event_uninit{};
-    REQUIRE(hipErrorInvalidValue ==
-            hipGraphEventRecordNodeSetEvent(eventrec, event_uninit));
+    HIP_CHECK_ERROR(hipGraphEventRecordNodeSetEvent(eventrec, event_uninit), hipErrorInvalidValue);
  }

  HIP_CHECK(hipGraphDestroy(graph));
@@ -26,13 +26,15 @@ with the event set in hipGraphAddEventWaitNode.
    - Output event parameter is passed as nullptr.
    - Input node parameter is an empty node.
    - Input node parameter is a memset node.
+    - Input node parameter is an event record node.
    - Input node parameter is an uninitialized node.
 */

-#include <hip_test_common.hh>
 #include <hip_test_checkers.hh>
+#include <hip_test_common.hh>
 #include <hip_test_kernels.hh>

+
 /**
 * Local Function
 */
@@ -42,8 +44,7 @@ static void validateEventWaitNodeGetEvent(unsigned flag) {
  hipEvent_t event, event_out;
  HIP_CHECK(hipEventCreateWithFlags(&event, flag));
  hipGraphNode_t eventwait;
-  HIP_CHECK(hipGraphAddEventWaitNode(&eventwait, graph, nullptr, 0,
-                                                          event));
+  HIP_CHECK(hipGraphAddEventWaitNode(&eventwait, graph, nullptr, 0, event));
  HIP_CHECK(hipGraphEventWaitNodeGetEvent(eventwait, &event_out));
  // validate set event and get event are same
  REQUIRE(event == event_out);
@@ -77,31 +78,32 @@ TEST_CASE("Unit_hipGraphEventWaitNodeGetEvent_Functional") {
 TEST_CASE("Unit_hipGraphEventWaitNodeGetEvent_Negative") {
  hipGraph_t graph;
  HIP_CHECK(hipGraphCreate(&graph, 0));
-  hipEvent_t event, event_out;
-  HIP_CHECK(hipEventCreate(&event));
-  hipGraphNode_t eventwait;
-  HIP_CHECK(hipGraphAddEventWaitNode(&eventwait, graph, nullptr, 0,
-                                                            event));
+  hipEvent_t event_out;
+  hipEvent_t event1, event2;
+  HIP_CHECK(hipEventCreate(&event1));
+  HIP_CHECK(hipEventCreate(&event2));
+  hipGraphNode_t eventrec, eventwait;
+  HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0, event1));
+  HIP_CHECK(hipGraphAddEventWaitNode(&eventwait, graph, nullptr, 0, event2));
+
  SECTION("node = nullptr") {
-    REQUIRE(hipErrorInvalidValue == hipGraphEventWaitNodeGetEvent(nullptr,
-                                    &event_out));
+    HIP_CHECK_ERROR(hipGraphEventWaitNodeGetEvent(nullptr, &event_out), hipErrorInvalidValue);
  }

  SECTION("event_out = nullptr") {
-    REQUIRE(hipErrorInvalidValue == hipGraphEventWaitNodeGetEvent(eventwait,
-                                    nullptr));
+    HIP_CHECK_ERROR(hipGraphEventWaitNodeGetEvent(eventwait, nullptr), hipErrorInvalidValue);
  }

  SECTION("input node is empty node") {
    hipGraphNode_t EmptyGraphNode;
    HIP_CHECK(hipGraphAddEmptyNode(&EmptyGraphNode, graph, nullptr, 0));
-    REQUIRE(hipErrorInvalidValue ==
-            hipGraphEventWaitNodeGetEvent(EmptyGraphNode, &event_out));
+    HIP_CHECK_ERROR(hipGraphEventWaitNodeGetEvent(EmptyGraphNode, &event_out),
+                    hipErrorInvalidValue);
  }

  SECTION("input node is memset node") {
    constexpr size_t Nbytes = 1024;
-    char *A_d;
+    char* A_d;
    hipGraphNode_t memset_A;
    hipMemsetParams memsetParams{};
    HIP_CHECK(hipMalloc(&A_d, Nbytes));
@@ -112,19 +114,21 @@ TEST_CASE("Unit_hipGraphEventWaitNodeGetEvent_Negative") {
    memsetParams.elementSize = sizeof(char);
    memsetParams.width = Nbytes;
    memsetParams.height = 1;
-    HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph, nullptr, 0,
-                                    &memsetParams));
-    REQUIRE(hipErrorInvalidValue ==
-            hipGraphEventWaitNodeGetEvent(memset_A, &event_out));
+    HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph, nullptr, 0, &memsetParams));
+    HIP_CHECK_ERROR(hipGraphEventWaitNodeGetEvent(memset_A, &event_out), hipErrorInvalidValue);
    HIP_CHECK(hipFree(A_d));
  }

+  SECTION("input node is event record node") {
+    HIP_CHECK_ERROR(hipGraphEventWaitNodeGetEvent(eventrec, &event_out), hipErrorInvalidValue);
+  }
+
  SECTION("input node is uninitialized") {
    hipGraphNode_t node_uninit{};
-    REQUIRE(hipErrorInvalidValue ==
-            hipGraphEventWaitNodeGetEvent(node_uninit, &event_out));
+    HIP_CHECK_ERROR(hipGraphEventWaitNodeGetEvent(node_uninit, &event_out), hipErrorInvalidValue);
  }

  HIP_CHECK(hipGraphDestroy(graph));
-  HIP_CHECK(hipEventDestroy(event));
+  HIP_CHECK(hipEventDestroy(event1));
+  HIP_CHECK(hipEventDestroy(event2));
 }
@@ -37,11 +37,10 @@ Testcase Scenarios :
    - Input event is an uninitialized node.
 */

-#include <hip_test_common.hh>
 #include <hip_test_checkers.hh>
+#include <hip_test_common.hh>
 #include <hip_test_kernels.hh>

-#define LEN 512

 /**
 * Local Function
@@ -54,8 +53,7 @@ static void validateEventWaitNodeSetEvent(unsigned flag) {
  HIP_CHECK(hipEventCreate(&event1));
  HIP_CHECK(hipEventCreateWithFlags(&event2, flag));
  hipGraphNode_t eventwait;
-  HIP_CHECK(hipGraphAddEventWaitNode(&eventwait, graph, nullptr, 0,
-                                                            event1));
+  HIP_CHECK(hipGraphAddEventWaitNode(&eventwait, graph, nullptr, 0, event1));
  // Set a different event
  HIP_CHECK(hipGraphEventWaitNodeSetEvent(eventwait, event2));
  HIP_CHECK(hipGraphEventWaitNodeGetEvent(eventwait, &event_out));
@@ -78,11 +76,9 @@ static void setEventRecordNode() {
  HIP_CHECK(hipEventCreate(&event1));
  HIP_CHECK(hipEventCreate(&event2));
  hipGraphNode_t eventrec;
-  HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0,
-                                                            event1));
+  HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0, event1));
  // Set a different event eventrec using hipGraphEventWaitNodeSetEvent
-  REQUIRE(hipErrorInvalidValue ==
-          hipGraphEventWaitNodeSetEvent(eventrec, event2));
+  HIP_CHECK_ERROR(hipGraphEventWaitNodeSetEvent(eventrec, event2), hipErrorInvalidValue);
  // Free resources
  HIP_CHECK(hipGraphDestroy(graph));
  HIP_CHECK(hipEventDestroy(event1));
@@ -93,11 +89,12 @@ static void setEventRecordNode() {
 * Scenario 2
 */
 TEST_CASE("Unit_hipGraphEventWaitNodeSetEvent_SetProp") {
-  size_t memsize = LEN * sizeof(int);
+  constexpr size_t N = 512;
+  size_t memsize = N * sizeof(int);
  constexpr auto blocksPerCU = 6;  // to hide latency
  constexpr auto threadsPerBlock = 256;
-  unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, LEN);
-  size_t NElem{LEN};
+  unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
+  size_t NElem{N};
  hipGraph_t graph1, graph2;
  hipStream_t streamForGraph1, streamForGraph2;
  hipGraphExec_t graphExec1, graphExec2;
@@ -123,67 +120,56 @@ TEST_CASE("Unit_hipGraphEventWaitNodeSetEvent_SetProp") {
  HIP_CHECK(hipMalloc(&out_d_g1, memsize));
  HIP_CHECK(hipMalloc(&out_d_g2, memsize));
  // Initialize host buffer
-  for (uint32_t i = 0; i < LEN; i++) {
+  for (uint32_t i = 0; i < N; i++) {
    inp_h[i] = i;
    out_h_g1[i] = 0;
    out_h_g2[i] = 0;
  }
  // Graph1 creation ...........
  // Create event1 record node in graph1
-  HIP_CHECK(hipGraphAddEventRecordNode(&event_rec_node, graph1, nullptr, 0,
-                                                            event1));
+  HIP_CHECK(hipGraphAddEventRecordNode(&event_rec_node, graph1, nullptr, 0, event1));

  // Create memcpy and kernel nodes for graph1
  hipGraphNode_t memcpyH2D, memcpyD2H_1, kernelnode_1;
  hipKernelNodeParams kernelNodeParams1{};
-  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyH2D, graph1, nullptr, 0, inp_d,
-                inp_h, memsize, hipMemcpyHostToDevice));
-  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyD2H_1, graph1, nullptr, 0,
-                out_h_g1, out_d_g1, memsize, hipMemcpyDeviceToHost));
+  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyH2D, graph1, nullptr, 0, inp_d, inp_h, memsize,
+                                    hipMemcpyHostToDevice));
+  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyD2H_1, graph1, nullptr, 0, out_h_g1, out_d_g1, memsize,
+                                    hipMemcpyDeviceToHost));

-  void* kernelArgs1[] = {&inp_d, &out_d_g1, reinterpret_cast<void *>(&NElem)};
-  kernelNodeParams1.func =
-  reinterpret_cast<void *>(HipTest::vector_square<int>);
+  void* kernelArgs1[] = {&inp_d, &out_d_g1, reinterpret_cast<void*>(&NElem)};
+  kernelNodeParams1.func = reinterpret_cast<void*>(HipTest::vector_square<int>);
  kernelNodeParams1.gridDim = dim3(blocks);
  kernelNodeParams1.blockDim = dim3(threadsPerBlock);
  kernelNodeParams1.sharedMemBytes = 0;
  kernelNodeParams1.kernelParams = reinterpret_cast<void**>(kernelArgs1);
  kernelNodeParams1.extra = nullptr;
-  HIP_CHECK(hipGraphAddKernelNode(&kernelnode_1, graph1, nullptr, 0,
-                                  &kernelNodeParams1));
+  HIP_CHECK(hipGraphAddKernelNode(&kernelnode_1, graph1, nullptr, 0, &kernelNodeParams1));
  // Create dependencies for graph1
-  HIP_CHECK(hipGraphAddDependencies(graph1, &memcpyH2D,
-                                    &event_rec_node, 1));
-  HIP_CHECK(hipGraphAddDependencies(graph1, &event_rec_node,
-                                    &kernelnode_1, 1));
-  HIP_CHECK(hipGraphAddDependencies(graph1, &kernelnode_1,
-                                    &memcpyD2H_1, 1));
+  HIP_CHECK(hipGraphAddDependencies(graph1, &memcpyH2D, &event_rec_node, 1));
+  HIP_CHECK(hipGraphAddDependencies(graph1, &event_rec_node, &kernelnode_1, 1));
+  HIP_CHECK(hipGraphAddDependencies(graph1, &kernelnode_1, &memcpyD2H_1, 1));

  // Graph2 creation ...........
  // Create event1 record node in graph2
-  HIP_CHECK(hipGraphAddEventWaitNode(&event_wait_node, graph2, nullptr, 0,
-                                                            event1));
+  HIP_CHECK(hipGraphAddEventWaitNode(&event_wait_node, graph2, nullptr, 0, event1));
  // Create memcpy and kernel nodes for graph2
  hipGraphNode_t memcpyD2H_2, kernelnode_2;
  hipKernelNodeParams kernelNodeParams2{};
-  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyD2H_2, graph2, nullptr, 0,
-                out_h_g2, out_d_g2, memsize, hipMemcpyDeviceToHost));
+  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyD2H_2, graph2, nullptr, 0, out_h_g2, out_d_g2, memsize,
+                                    hipMemcpyDeviceToHost));

-  void* kernelArgs2[] = {&inp_d, &out_d_g2, reinterpret_cast<void *>(&NElem)};
-  kernelNodeParams2.func =
-  reinterpret_cast<void *>(HipTest::vector_cubic<int>);
+  void* kernelArgs2[] = {&inp_d, &out_d_g2, reinterpret_cast<void*>(&NElem)};
+  kernelNodeParams2.func = reinterpret_cast<void*>(HipTest::vector_cubic<int>);
  kernelNodeParams2.gridDim = dim3(blocks);
  kernelNodeParams2.blockDim = dim3(threadsPerBlock);
  kernelNodeParams2.sharedMemBytes = 0;
  kernelNodeParams2.kernelParams = reinterpret_cast<void**>(kernelArgs2);
  kernelNodeParams2.extra = nullptr;
-  HIP_CHECK(hipGraphAddKernelNode(&kernelnode_2, graph2, nullptr, 0,
-                                  &kernelNodeParams2));
+  HIP_CHECK(hipGraphAddKernelNode(&kernelnode_2, graph2, nullptr, 0, &kernelNodeParams2));
  // Create dependencies for graph2
-  HIP_CHECK(hipGraphAddDependencies(graph2, &event_wait_node,
-                                    &kernelnode_2, 1));
-  HIP_CHECK(hipGraphAddDependencies(graph2, &kernelnode_2,
-                                    &memcpyD2H_2, 1));
+  HIP_CHECK(hipGraphAddDependencies(graph2, &event_wait_node, &kernelnode_2, 1));
+  HIP_CHECK(hipGraphAddDependencies(graph2, &kernelnode_2, &memcpyD2H_2, 1));

  // Instantiate and launch the graphs
  HIP_CHECK(hipGraphInstantiate(&graphExec1, graph1, nullptr, nullptr, 0));
@@ -198,16 +184,16 @@ TEST_CASE("Unit_hipGraphEventWaitNodeSetEvent_SetProp") {
  HIP_CHECK(hipStreamSynchronize(streamForGraph2));
  // Validate output
  bool btestPassed1 = true;
-  for (uint32_t i = 0; i < LEN; i++) {
-    if (out_h_g1[i] != (inp_h[i]*inp_h[i])) {
+  for (uint32_t i = 0; i < N; i++) {
+    if (out_h_g1[i] != (inp_h[i] * inp_h[i])) {
      btestPassed1 = false;
      break;
    }
  }
  REQUIRE(btestPassed1 == true);
  bool btestPassed2 = true;
-  for (uint32_t i = 0; i < LEN; i++) {
-    if (out_h_g2[i] != (inp_h[i]*inp_h[i]*inp_h[i])) {
+  for (uint32_t i = 0; i < N; i++) {
+    if (out_h_g2[i] != (inp_h[i] * inp_h[i] * inp_h[i])) {
      btestPassed2 = false;
      break;
    }
@@ -256,28 +242,24 @@ TEST_CASE("Unit_hipGraphEventWaitNodeSetEvent_Negative") {
  HIP_CHECK(hipEventCreate(&event1));
  HIP_CHECK(hipEventCreate(&event2));
  hipGraphNode_t eventwait;
-  HIP_CHECK(hipGraphAddEventWaitNode(&eventwait, graph, nullptr, 0,
-                                                        event1));
+  HIP_CHECK(hipGraphAddEventWaitNode(&eventwait, graph, nullptr, 0, event1));
  SECTION("node = nullptr") {
-    REQUIRE(hipErrorInvalidValue == hipGraphEventWaitNodeSetEvent(
-                                    nullptr, event2));
+    HIP_CHECK_ERROR(hipGraphEventWaitNodeSetEvent(nullptr, event2), hipErrorInvalidValue);
  }

  SECTION("event = nullptr") {
-    REQUIRE(hipErrorInvalidValue == hipGraphEventWaitNodeSetEvent(
-                                    eventwait, nullptr));
+    HIP_CHECK_ERROR(hipGraphEventWaitNodeSetEvent(eventwait, nullptr), hipErrorInvalidValue);
  }

  SECTION("input node is empty node") {
    hipGraphNode_t EmptyGraphNode;
    HIP_CHECK(hipGraphAddEmptyNode(&EmptyGraphNode, graph, nullptr, 0));
-    REQUIRE(hipErrorInvalidValue ==
-            hipGraphEventWaitNodeSetEvent(EmptyGraphNode, event2));
+    HIP_CHECK_ERROR(hipGraphEventWaitNodeSetEvent(EmptyGraphNode, event2), hipErrorInvalidValue);
  }

  SECTION("input node is memset node") {
    constexpr size_t Nbytes = 1024;
-    char *A_d;
+    char* A_d;
    hipGraphNode_t memset_A;
    hipMemsetParams memsetParams{};
    HIP_CHECK(hipMalloc(&A_d, Nbytes));
@@ -288,10 +270,8 @@ TEST_CASE("Unit_hipGraphEventWaitNodeSetEvent_Negative") {
    memsetParams.elementSize = sizeof(char);
    memsetParams.width = Nbytes;
    memsetParams.height = 1;
-    HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph, nullptr, 0,
-                                    &memsetParams));
-    REQUIRE(hipErrorInvalidValue ==
-            hipGraphEventWaitNodeSetEvent(memset_A, event2));
+    HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph, nullptr, 0, &memsetParams));
+    HIP_CHECK_ERROR(hipGraphEventWaitNodeSetEvent(memset_A, event2), hipErrorInvalidValue);
    HIP_CHECK(hipFree(A_d));
  }

@@ -301,14 +281,12 @@ TEST_CASE("Unit_hipGraphEventWaitNodeSetEvent_Negative") {

  SECTION("input node is uninitialized node") {
    hipGraphNode_t node_uninit{};
-    REQUIRE(hipErrorInvalidValue ==
-            hipGraphEventWaitNodeSetEvent(node_uninit, event2));
+    HIP_CHECK_ERROR(hipGraphEventWaitNodeSetEvent(node_uninit, event2), hipErrorInvalidValue);
  }

  SECTION("input event is uninitialized") {
    hipEvent_t event_uninit{};
-    REQUIRE(hipErrorInvalidValue == hipGraphEventWaitNodeSetEvent(
-                                    eventwait, event_uninit));
+    HIP_CHECK_ERROR(hipGraphEventWaitNodeSetEvent(eventwait, event_uninit), hipErrorInvalidValue);
  }

  HIP_CHECK(hipGraphDestroy(graph));
@@ -20,26 +20,51 @@ THE SOFTWARE.
 #include <hip_test_common.hh>

 /**
-Negative Testcase Scenarios :
-1) Pass hipGraphExecDestroy with nullptr.
-2) Pass hipGraphExecDestroy with un-initilze structure.
-3) Destroy graph before exec-graph destroyed and verify no crash occurs.
-*/
+ * @addtogroup hipGraphExecDestroy hipGraphExecDestroy
+ * @{
+ * @ingroup GraphTest
+ * `hipGraphExecDestroy(hipGraphExec_t graphExec)` -
+ * Destroys an executable graph
+ */
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Test to verify API behavior with invalid arguments:
+ *        -# GraphExec is nullptr
+ *        -# GraphExec is uninitialized
+ * Test source
+ * ------------------------
+ *    - unit/graph/hipGraphExecDestroy.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Unit_hipGraphExecDestroy_Negative_Parameters") {

-TEST_CASE("Unit_hipGraphExecDestroy_Negative") {
-  hipError_t ret;
  SECTION("Pass hipGraphExecDestroy with nullptr") {
-    ret = hipGraphExecDestroy(nullptr);
-    REQUIRE(hipErrorInvalidValue == ret);
+    HIP_CHECK_ERROR(hipGraphExecDestroy(nullptr), hipErrorInvalidValue);
  }
+
  SECTION("Pass hipGraphExecDestroy with un-initilze structure") {
-    hipGraphExec_t graphExec{};
-    ret = hipGraphExecDestroy(graphExec);
-    REQUIRE(hipErrorInvalidValue == ret);
+    hipGraphExec_t graph_exec{};
+    HIP_CHECK_ERROR(hipGraphExecDestroy(graph_exec), hipErrorInvalidValue);
  }
 }

-TEST_CASE("Unit_hipGraphExecDestroy_Sequence") {
+/**
+ * Test Description
+ * ------------------------
+ *    - Basic positive test for hipGraphExecDestroy
+ *    - create an executable graph and then destroy it
+ * Test source
+ * ------------------------
+ *    - unit/graph/hipGraphExecDestroy.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Unit_hipGraphExecDestroy_Positive_Basic") {
  hipGraph_t graph;
  hipGraphExec_t graphExec;
  hipStream_t streamForGraph;
@@ -70,4 +95,3 @@ TEST_CASE("Unit_hipGraphExecDestroy_Sequence") {
  HIP_CHECK(hipGraphExecDestroy(graphExec));
  HIP_CHECK(hipStreamDestroy(streamForGraph));
 }
-
@@ -33,7 +33,12 @@ Testcase Scenarios :
    the graph to create an executable graph. Change the event in the
    executable graph to event2. Verify that the event record node still
    contains event1.
- 3) Negative Scenarios
+  3) Scenario to verify that hipGraphExecEventRecordNodeSetEvent can set an event
+     created on a different device. Create an event record node with event1 and add it to the
+     graph. Instantiate the graph to create an executable graph. Call the API to change the event
+     in the executable graph to event2, which was created on a different device. Verify that the
+     graph can be launched and no error is reported.
+  4) Negative Scenarios
    - Input executable graph is a nullptr.
    - Input node is a nullptr.
    - Input event to set is a nullptr.
@@ -45,27 +50,26 @@ Testcase Scenarios :
    - Input node is a event wait node.
 */

-#include <hip_test_common.hh>
 #include <hip_test_checkers.hh>
+#include <hip_test_common.hh>
 #include <hip_test_kernels.hh>

-#define GRID_DIM 512
-#define BLK_DIM 512
-#define LEN (GRID_DIM * BLK_DIM)
-
 /**
 * Kernel Functions to copy.
 */
-static __global__ void copy_ker_func(int* a, int* b) {
-  int tx = blockIdx.x*blockDim.x + threadIdx.x;
-  if (tx < LEN) b[tx] = a[tx];
+static __global__ void copy_ker_func(int* a, int* b, size_t N) {
+  const size_t tx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;  // same type as N
+  if (tx < N) b[tx] = a[tx];  // threads whose index is >= N copy nothing
 }

 /**
 * Scenario 1: Functional scenario (See description Above)
 */
 TEST_CASE("Unit_hipGraphExecEventRecordNodeSetEvent_Functional") {
-  size_t memsize = LEN*sizeof(int);
+  constexpr size_t gridSize = 512;
+  constexpr size_t blockSize = 512;
+  constexpr size_t N = gridSize * blockSize;
+  size_t memsize = N * sizeof(int);
  hipGraph_t graph;
  HIP_CHECK(hipGraphCreate(&graph, 0));
  // Create events
@@ -75,10 +79,8 @@ TEST_CASE("Unit_hipGraphExecEventRecordNodeSetEvent_Functional") {
  HIP_CHECK(hipEventCreate(&event2_end));
  // Create nodes with event_start and event1_end
  hipGraphNode_t event_start_rec, event_end_rec;
-  HIP_CHECK(hipGraphAddEventRecordNode(&event_start_rec, graph, nullptr, 0,
-                                                          event_start));
-  HIP_CHECK(hipGraphAddEventRecordNode(&event_end_rec, graph, nullptr, 0,
-                                                          event1_end));
+  HIP_CHECK(hipGraphAddEventRecordNode(&event_start_rec, graph, nullptr, 0, event_start));
+  HIP_CHECK(hipGraphAddEventRecordNode(&event_end_rec, graph, nullptr, 0, event1_end));
  int *inp_h, *inp_d, *out_h, *out_d;
  // Allocate host buffers
  inp_h = reinterpret_cast<int*>(malloc(memsize));
@@ -89,7 +91,7 @@ TEST_CASE("Unit_hipGraphExecEventRecordNodeSetEvent_Functional") {
  HIP_CHECK(hipMalloc(&inp_d, memsize));
  HIP_CHECK(hipMalloc(&out_d, memsize));
  // Initialize host buffer
-  for (uint32_t i = 0; i < LEN; i++) {
+  for (uint32_t i = 0; i < N; i++) {
    inp_h[i] = i;
    out_h[i] = 0;
  }
@@ -97,44 +99,39 @@ TEST_CASE("Unit_hipGraphExecEventRecordNodeSetEvent_Functional") {
  // Create memcpy and kernel nodes for graph
  hipGraphNode_t memcpyH2D, memcpyD2H, kernelnode;
  hipKernelNodeParams kernelNodeParams{};
-  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyH2D, graph, nullptr, 0, inp_d,
-                inp_h, memsize, hipMemcpyHostToDevice));
-  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyD2H, graph, nullptr, 0,
-                out_h, out_d, memsize, hipMemcpyDeviceToHost));
-  void* kernelArgs1[] = {&inp_d, &out_d};
-  kernelNodeParams.func = reinterpret_cast<void *>(copy_ker_func);
-  kernelNodeParams.gridDim = dim3(GRID_DIM);
-  kernelNodeParams.blockDim = dim3(BLK_DIM);
+  size_t NElem{N};
+  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyH2D, graph, nullptr, 0, inp_d, inp_h, memsize,
+                                    hipMemcpyHostToDevice));
+  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyD2H, graph, nullptr, 0, out_h, out_d, memsize,
+                                    hipMemcpyDeviceToHost));
+  void* kernelArgs1[] = {&inp_d, &out_d, reinterpret_cast<void*>(&NElem)};
+  kernelNodeParams.func = reinterpret_cast<void*>(copy_ker_func);
+  kernelNodeParams.gridDim = dim3(gridSize);
+  kernelNodeParams.blockDim = dim3(blockSize);
  kernelNodeParams.sharedMemBytes = 0;
  kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs1);
  kernelNodeParams.extra = nullptr;
-  HIP_CHECK(hipGraphAddKernelNode(&kernelnode, graph, nullptr, 0,
-                                  &kernelNodeParams));
+  HIP_CHECK(hipGraphAddKernelNode(&kernelnode, graph, nullptr, 0, &kernelNodeParams));

  // Create dependencies for graph
-  HIP_CHECK(hipGraphAddDependencies(graph, &event_start_rec,
-                                    &memcpyH2D, 1));
-  HIP_CHECK(hipGraphAddDependencies(graph, &memcpyH2D,
-                                    &kernelnode, 1));
-  HIP_CHECK(hipGraphAddDependencies(graph, &kernelnode,
-                                    &memcpyD2H, 1));
-  HIP_CHECK(hipGraphAddDependencies(graph, &memcpyD2H,
-                                    &event_end_rec, 1));
+  HIP_CHECK(hipGraphAddDependencies(graph, &event_start_rec, &memcpyH2D, 1));
+  HIP_CHECK(hipGraphAddDependencies(graph, &memcpyH2D, &kernelnode, 1));
+  HIP_CHECK(hipGraphAddDependencies(graph, &kernelnode, &memcpyD2H, 1));
+  HIP_CHECK(hipGraphAddDependencies(graph, &memcpyD2H, &event_end_rec, 1));
  // Instantiate and launch the graph
  hipStream_t streamForGraph;
  hipGraphExec_t graphExec;
  HIP_CHECK(hipStreamCreate(&streamForGraph));
  HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
  // Change the event at event_end_rec node to event2_end
-  HIP_CHECK(hipGraphExecEventRecordNodeSetEvent(graphExec,
-                                            event_end_rec, event2_end));
+  HIP_CHECK(hipGraphExecEventRecordNodeSetEvent(graphExec, event_end_rec, event2_end));

  HIP_CHECK(hipGraphLaunch(graphExec, streamForGraph));
  // Wait for graph to complete
  HIP_CHECK(hipStreamSynchronize(streamForGraph));
  // Validate output
  bool btestPassed = true;
-  for (uint32_t i = 0; i < LEN; i++) {
+  for (uint32_t i = 0; i < N; i++) {
    if (out_h[i] != inp_h[i]) {
      btestPassed = false;
      break;
@@ -147,8 +144,7 @@ TEST_CASE("Unit_hipGraphExecEventRecordNodeSetEvent_Functional") {
  REQUIRE(t > 0.0f);
  // Since event1_end is never recorded, hipEventElapsedTime
  // should return error code.
-  REQUIRE(hipErrorInvalidResourceHandle ==
-          hipEventElapsedTime(&t, event_start, event1_end));
+  HIP_CHECK_ERROR(hipEventElapsedTime(&t, event_start, event1_end), hipErrorInvalidResourceHandle);
  // Free resources
  HIP_CHECK(hipGraphExecDestroy(graphExec));
  HIP_CHECK(hipStreamDestroy(streamForGraph));
@@ -173,12 +169,10 @@ TEST_CASE("Unit_hipGraphExecEventRecordNodeSetEvent_VerifyEventNotChanged") {
  HIP_CHECK(hipEventCreate(&event1));
  HIP_CHECK(hipEventCreate(&event2));
  hipGraphNode_t eventrec;
-  HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0,
-                                                          event1));
+  HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0, event1));
  hipGraphExec_t graphExec;
  HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
-  HIP_CHECK(hipGraphExecEventRecordNodeSetEvent(graphExec,
-                                                eventrec, event2));
+  HIP_CHECK(hipGraphExecEventRecordNodeSetEvent(graphExec, eventrec, event2));
  HIP_CHECK(hipGraphEventRecordNodeGetEvent(eventrec, &event_out));
  // validate set event and get event are same
  REQUIRE(event1 == event_out);
@@ -190,7 +184,48 @@ TEST_CASE("Unit_hipGraphExecEventRecordNodeSetEvent_VerifyEventNotChanged") {
 }

 /**
- * Scenario 3: Negative Tests
+ * Scenario 3: Verifies that the event in a node of the executable graph can be changed to an
+ * event created on a different device.
+ */
+TEST_CASE("Unit_hipGraphExecEventRecordNodeSetEvent_Positive_DifferentDevices") {
+  const auto device_count = HipTest::getDeviceCount();
+  if (device_count < 2) {
+    HipTest::HIP_SKIP_TEST("Skipping because devices < 2");
+    return;
+  }
+  hipGraphExec_t graphExec;
+  hipStream_t streamForGraph;
+  hipGraph_t graph;
+  hipEvent_t event1, event2;
+
+  HIP_CHECK(hipSetDevice(0));  // event1 is created while device 0 is current
+  HIP_CHECK(hipEventCreate(&event1));
+  HIP_CHECK(hipSetDevice(1));  // event2 is created while device 1 is current
+  HIP_CHECK(hipEventCreate(&event2));
+
+  HIP_CHECK(hipSetDevice(0));  // graph and stream below are created with device 0 current
+  hipGraphNode_t eventrec;
+  HIP_CHECK(hipGraphCreate(&graph, 0));
+  HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0, event1));
+
+  // Instantiate the graph, then replace the record node's event in the executable graph with
+  // event2 (created while device 1 was current) and launch it; every call must succeed.
+  HIP_CHECK(hipStreamCreate(&streamForGraph));
+  HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
+  HIP_CHECK(hipGraphExecEventRecordNodeSetEvent(graphExec, eventrec, event2));
+  HIP_CHECK(hipGraphLaunch(graphExec, streamForGraph));
+  // Wait for graph to complete
+  HIP_CHECK(hipStreamSynchronize(streamForGraph));
+  // Free resources
+  HIP_CHECK(hipGraphExecDestroy(graphExec));
+  HIP_CHECK(hipStreamDestroy(streamForGraph));
+  HIP_CHECK(hipGraphDestroy(graph));
+  HIP_CHECK(hipEventDestroy(event2));
+  HIP_CHECK(hipEventDestroy(event1));
+}
+
+/**
+ * Scenario 4: Negative Parameter Tests
 */
 TEST_CASE("Unit_hipGraphExecEventRecordNodeSetEvent_Negative") {
  hipGraph_t graph;
@@ -199,11 +234,10 @@ TEST_CASE("Unit_hipGraphExecEventRecordNodeSetEvent_Negative") {
  HIP_CHECK(hipEventCreate(&event1));
  HIP_CHECK(hipEventCreate(&event2));
  hipGraphNode_t eventrec;
-  HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0,
-                                                            event1));
+  HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0, event1));
  // Create memset
  constexpr size_t Nbytes = 1024;
-  char *A_d;
+  char* A_d;
  hipGraphNode_t memset_A;
  hipMemsetParams memsetParams{};
  HIP_CHECK(hipMalloc(&A_d, Nbytes));
@@ -219,66 +253,61 @@ TEST_CASE("Unit_hipGraphExecEventRecordNodeSetEvent_Negative") {
  HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));

  SECTION("hGraphExec = nullptr") {
-    REQUIRE(hipErrorInvalidValue ==
-        hipGraphExecEventRecordNodeSetEvent(nullptr, eventrec, event2));
+    HIP_CHECK_ERROR(hipGraphExecEventRecordNodeSetEvent(nullptr, eventrec, event2),
+                    hipErrorInvalidValue);
  }

  SECTION("hNode = nullptr") {
-    REQUIRE(hipErrorInvalidValue ==
-        hipGraphExecEventRecordNodeSetEvent(graphExec, nullptr, event2));
+    HIP_CHECK_ERROR(hipGraphExecEventRecordNodeSetEvent(graphExec, nullptr, event2),
+                    hipErrorInvalidValue);
  }

  SECTION("event = nullptr") {
-    REQUIRE(hipErrorInvalidValue ==
-        hipGraphExecEventRecordNodeSetEvent(graphExec, eventrec, nullptr));
+    HIP_CHECK_ERROR(hipGraphExecEventRecordNodeSetEvent(graphExec, eventrec, nullptr),
+                    hipErrorInvalidValue);
  }

  SECTION("hGraphExec is uninitialized") {
    hipGraphExec_t graphExec1{};
-    REQUIRE(hipErrorInvalidValue ==
-        hipGraphExecEventRecordNodeSetEvent(graphExec1, eventrec, event2));
+    HIP_CHECK_ERROR(hipGraphExecEventRecordNodeSetEvent(graphExec1, eventrec, event2),
+                    hipErrorInvalidValue);
  }

  SECTION("hNode is uninitialized") {
    hipGraphNode_t dummy{};
-    REQUIRE(hipErrorInvalidValue ==
-        hipGraphExecEventRecordNodeSetEvent(graphExec, dummy, event2));
+    HIP_CHECK_ERROR(hipGraphExecEventRecordNodeSetEvent(graphExec, dummy, event2),
+                    hipErrorInvalidValue);
  }

  SECTION("event is uninitialized") {
    hipEvent_t event_dummy{};
-    REQUIRE(hipErrorInvalidValue ==
-        hipGraphExecEventRecordNodeSetEvent(graphExec, eventrec,
-                                            event_dummy));
+    HIP_CHECK_ERROR(hipGraphExecEventRecordNodeSetEvent(graphExec, eventrec, event_dummy),
+                    hipErrorInvalidValue);
  }

  SECTION("event record node does not exist") {
    hipGraph_t graph1;
    HIP_CHECK(hipGraphCreate(&graph1, 0));
-    HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph1, nullptr, 0,
-                                    &memsetParams));
+    HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph1, nullptr, 0, &memsetParams));
    hipGraphExec_t graphExec1;
    HIP_CHECK(hipGraphInstantiate(&graphExec1, graph1, nullptr, nullptr, 0));
-    REQUIRE(hipErrorInvalidValue ==
-        hipGraphExecEventRecordNodeSetEvent(graphExec1, eventrec, event2));
+    HIP_CHECK_ERROR(hipGraphExecEventRecordNodeSetEvent(graphExec1, eventrec, event2),
+                    hipErrorInvalidValue);
    HIP_CHECK(hipGraphExecDestroy(graphExec1));
    HIP_CHECK(hipGraphDestroy(graph1));
  }

  SECTION("pass memset node as hNode") {
-    HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph, nullptr, 0,
-                                    &memsetParams));
-    REQUIRE(hipErrorInvalidValue ==
-        hipGraphExecEventRecordNodeSetEvent(graphExec, memset_A, event2));
+    HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph, nullptr, 0, &memsetParams));
+    HIP_CHECK_ERROR(hipGraphExecEventRecordNodeSetEvent(graphExec, memset_A, event2),
+                    hipErrorInvalidValue);
  }

  SECTION("pass event wait node as hNode") {
    hipGraphNode_t event_wait_node;
-    HIP_CHECK(hipGraphAddEventWaitNode(&event_wait_node, graph, nullptr, 0,
-                                                            event1));
-    REQUIRE(hipErrorInvalidValue ==
-        hipGraphExecEventRecordNodeSetEvent(graphExec, event_wait_node,
-                                            event2));
+    HIP_CHECK(hipGraphAddEventWaitNode(&event_wait_node, graph, nullptr, 0, event1));
+    HIP_CHECK_ERROR(hipGraphExecEventRecordNodeSetEvent(graphExec, event_wait_node, event2),
+                    hipErrorInvalidValue);
  }

  HIP_CHECK(hipFree(A_d));
@@ -47,33 +47,30 @@ Testcase Scenarios :
    - Pass event record node as input node.
 */

-#include <hip_test_common.hh>
 #include <hip_test_checkers.hh>
+#include <hip_test_common.hh>
 #include <hip_test_kernels.hh>

-#define GRID_DIM 64
-#define BLK_DIM 256
-#define LEN (GRID_DIM * BLK_DIM)
-#define DELAY_IN_MS 2000
-
 /**
 * Kernel Functions to perform square and introduce delay in device.
 */
-static __global__ void sqr_ker_func(int* a, int* b, int clockrate) {
-  int tx = hipBlockIdx_x*hipBlockDim_x + hipThreadIdx_x;
-  if (tx < LEN) b[tx] = a[tx]*a[tx];
-  uint64_t wait_t = DELAY_IN_MS,
-  start = clock64()/clockrate, cur;
-  do { cur = clock64()/clockrate - start;}while (cur < wait_t);
+static __global__ void sqr_ker_func(int* a, int* b, size_t N, int clockrate, size_t delayMs) {
+  const size_t tx = static_cast<size_t>(hipBlockIdx_x) * hipBlockDim_x + hipThreadIdx_x;
+  if (tx < N) b[tx] = a[tx] * a[tx];  // threads whose index is >= N write nothing
+  uint64_t wait_t = delayMs, start = clock64() / clockrate, cur;
+  do {  // busy-wait; every thread spins, including those with tx >= N
+    cur = clock64() / clockrate - start;
+  } while (cur < wait_t);  // until clock64()/clockrate has advanced by delayMs
 }

-static __global__ void sqr_ker_func_gfx11(int* a, int* b, int clockrate) {
+static __global__ void sqr_ker_func_gfx11(int* a, int* b, size_t N, int clockrate, size_t delayMs) {
 #if HT_AMD
-  int tx = hipBlockIdx_x*hipBlockDim_x + hipThreadIdx_x;
-  if (tx < LEN) b[tx] = a[tx]*a[tx];
-  uint64_t wait_t = DELAY_IN_MS,
-  start = wall_clock64()/clockrate, cur;
-  do { cur = wall_clock64()/clockrate - start;}while (cur < wait_t);
+  const size_t tx = static_cast<size_t>(hipBlockIdx_x) * hipBlockDim_x + hipThreadIdx_x;
+  if (tx < N) b[tx] = a[tx] * a[tx];  // threads whose index is >= N write nothing
+  uint64_t wait_t = delayMs, start = wall_clock64() / clockrate, cur;
+  do {  // busy-wait; every thread spins, including those with tx >= N
+    cur = wall_clock64() / clockrate - start;
+  } while (cur < wait_t);  // until wall_clock64()/clockrate has advanced by delayMs
 #endif
 }

@@ -81,7 +78,10 @@ static __global__ void sqr_ker_func_gfx11(int* a, int* b, int clockrate) {
 * Scenario 1: Test to validate setting different events in executable graph.
 */
 TEST_CASE("Unit_hipGraphExecEventWaitNodeSetEvent_SetAndVerifyMemory") {
-  size_t memsize = LEN*sizeof(int);
+  constexpr size_t gridSize = 64;
+  constexpr size_t blockSize = 256;
+  constexpr size_t N = gridSize * blockSize;
+  size_t memsize = N * sizeof(int);
  hipGraph_t graph1, graph2;
  HIP_CHECK(hipGraphCreate(&graph1, 0));
  HIP_CHECK(hipGraphCreate(&graph2, 0));
@@ -91,8 +91,7 @@ TEST_CASE("Unit_hipGraphExecEventWaitNodeSetEvent_SetAndVerifyMemory") {
  HIP_CHECK(hipEventCreate(&event2));
  // Create nodes with event_start and event1_end
  hipGraphNode_t event_rec;
-  HIP_CHECK(hipGraphAddEventRecordNode(&event_rec, graph1, nullptr, 0,
-                                                    event1));
+  HIP_CHECK(hipGraphAddEventRecordNode(&event_rec, graph1, nullptr, 0, event1));
  int *inp_h, *inp_d, *out_h, *out_d;
  // Allocate host buffers
  inp_h = reinterpret_cast<int*>(malloc(memsize));
@@ -103,7 +102,7 @@ TEST_CASE("Unit_hipGraphExecEventWaitNodeSetEvent_SetAndVerifyMemory") {
  HIP_CHECK(hipMalloc(&inp_d, memsize));
  HIP_CHECK(hipMalloc(&out_d, memsize));
  // Initialize host buffer
-  for (uint32_t i = 0; i < LEN; i++) {
+  for (uint32_t i = 0; i < N; i++) {
    inp_h[i] = i;
    out_h[i] = 0;
  }
@@ -112,10 +111,12 @@ TEST_CASE("Unit_hipGraphExecEventWaitNodeSetEvent_SetAndVerifyMemory") {
  // MemcpyH2D -> kernel1 -> event_rec
  hipGraphNode_t memcpyH2D, kernelnode1;
  hipKernelNodeParams kernelNodeParams1{};
-  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyH2D, graph1, nullptr, 0, inp_d,
-                inp_h, memsize, hipMemcpyHostToDevice));
+  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyH2D, graph1, nullptr, 0, inp_d, inp_h, memsize,
+                                    hipMemcpyHostToDevice));
  // Get device clock rate
  int clkRate = 0;
+  size_t NElem{N};
+  size_t delayMs{2000};
  if (IsGfx11()) {
    HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeWallClockRate, 0));
  } else {
@@ -123,29 +124,25 @@ TEST_CASE("Unit_hipGraphExecEventWaitNodeSetEvent_SetAndVerifyMemory") {
  }
  // kernel1
  auto sqr_ker_func_used = IsGfx11() ? sqr_ker_func_gfx11 : sqr_ker_func;
-  void* kernelArgs[] = {&inp_d, &out_d, reinterpret_cast<void *>(&clkRate)};
-  kernelNodeParams1.func = reinterpret_cast<void *>(sqr_ker_func_used);
-  kernelNodeParams1.gridDim = dim3(GRID_DIM);
-  kernelNodeParams1.blockDim = dim3(BLK_DIM);
+  void* kernelArgs[] = {&inp_d, &out_d, reinterpret_cast<void*>(&NElem),
+                        reinterpret_cast<void*>(&clkRate), reinterpret_cast<void*>(&delayMs)};
+  kernelNodeParams1.func = reinterpret_cast<void*>(sqr_ker_func_used);
+  kernelNodeParams1.gridDim = dim3(gridSize);
+  kernelNodeParams1.blockDim = dim3(blockSize);
  kernelNodeParams1.sharedMemBytes = 0;
  kernelNodeParams1.kernelParams = reinterpret_cast<void**>(kernelArgs);
  kernelNodeParams1.extra = nullptr;
-  HIP_CHECK(hipGraphAddKernelNode(&kernelnode1, graph1, nullptr, 0,
-                                  &kernelNodeParams1));
+  HIP_CHECK(hipGraphAddKernelNode(&kernelnode1, graph1, nullptr, 0, &kernelNodeParams1));
  // Create dependencies for graph1
-  HIP_CHECK(hipGraphAddDependencies(graph1, &memcpyH2D,
-                                    &kernelnode1, 1));
-  HIP_CHECK(hipGraphAddDependencies(graph1, &kernelnode1,
-                                    &event_rec, 1));
+  HIP_CHECK(hipGraphAddDependencies(graph1, &memcpyH2D, &kernelnode1, 1));
+  HIP_CHECK(hipGraphAddDependencies(graph1, &kernelnode1, &event_rec, 1));
  // graph2 creation ...........
  // waitnode(event1) -> MemcpyD2H
  hipGraphNode_t event_wait_node, memcpyD2H;
-  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyD2H, graph2, nullptr, 0,
-                out_h, out_d, memsize, hipMemcpyDeviceToHost));
-  HIP_CHECK(hipGraphAddEventWaitNode(&event_wait_node, graph2, nullptr, 0,
-                                                            event1));
-  HIP_CHECK(hipGraphAddDependencies(graph2, &event_wait_node,
-                                    &memcpyD2H, 1));
+  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyD2H, graph2, nullptr, 0, out_h, out_d, memsize,
+                                    hipMemcpyDeviceToHost));
+  HIP_CHECK(hipGraphAddEventWaitNode(&event_wait_node, graph2, nullptr, 0, event1));
+  HIP_CHECK(hipGraphAddDependencies(graph2, &event_wait_node, &memcpyD2H, 1));
  // Instantiate graph1 and graph2
  hipStream_t streamForGraph1, streamForGraph2;
  hipGraphExec_t graphExec1, graphExec2;
@@ -160,8 +157,8 @@ TEST_CASE("Unit_hipGraphExecEventWaitNodeSetEvent_SetAndVerifyMemory") {
  HIP_CHECK(hipStreamSynchronize(streamForGraph2));
  // Validate output
  bool btestPassed = true;
-  for (uint32_t i = 0; i < LEN; i++) {
-    if (out_h[i] != (inp_h[i]*inp_h[i])) {
+  for (uint32_t i = 0; i < N; i++) {
+    if (out_h[i] != (inp_h[i] * inp_h[i])) {
      btestPassed = false;
      break;
    }
@@ -170,10 +167,8 @@ TEST_CASE("Unit_hipGraphExecEventWaitNodeSetEvent_SetAndVerifyMemory") {
  // hipGraphExecEventWaitNodeSetEvent() TEST
  // Change the event at event_wait_node node to event2 and
  // the event at event_rec node to event2.
-  HIP_CHECK(hipGraphExecEventRecordNodeSetEvent(graphExec1,
-                                            event_rec, event2));
-  HIP_CHECK(hipGraphExecEventWaitNodeSetEvent(graphExec2,
-                                            event_wait_node, event2));
+  HIP_CHECK(hipGraphExecEventRecordNodeSetEvent(graphExec1, event_rec, event2));
+  HIP_CHECK(hipGraphExecEventWaitNodeSetEvent(graphExec2, event_wait_node, event2));
  // Launch graph1 and graph2
  HIP_CHECK(hipGraphLaunch(graphExec1, streamForGraph1));
  HIP_CHECK(hipGraphLaunch(graphExec2, streamForGraph2));
@@ -181,8 +176,8 @@ TEST_CASE("Unit_hipGraphExecEventWaitNodeSetEvent_SetAndVerifyMemory") {
  HIP_CHECK(hipStreamSynchronize(streamForGraph2));
  // Validate output
  btestPassed = true;
-  for (uint32_t i = 0; i < LEN; i++) {
-    if (out_h[i] != (inp_h[i]*inp_h[i])) {
+  for (uint32_t i = 0; i < N; i++) {
+    if (out_h[i] != (inp_h[i] * inp_h[i])) {
      btestPassed = false;
      break;
    }
@@ -214,12 +209,10 @@ TEST_CASE("Unit_hipGraphExecEventWaitNodeSetEvent_VerifyEventNotChanged") {
  HIP_CHECK(hipEventCreate(&event1));
  HIP_CHECK(hipEventCreate(&event2));
  hipGraphNode_t eventwait;
-  HIP_CHECK(hipGraphAddEventWaitNode(&eventwait, graph, nullptr, 0,
-                                                          event1));
+  HIP_CHECK(hipGraphAddEventWaitNode(&eventwait, graph, nullptr, 0, event1));
  hipGraphExec_t graphExec;
  HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
-  HIP_CHECK(hipGraphExecEventWaitNodeSetEvent(graphExec,
-                                              eventwait, event2));
+  HIP_CHECK(hipGraphExecEventWaitNodeSetEvent(graphExec, eventwait, event2));
  HIP_CHECK(hipGraphEventWaitNodeGetEvent(eventwait, &event_out));
  // validate set event and get event are same
  REQUIRE(event1 == event_out);
@@ -240,13 +233,11 @@ TEST_CASE("Unit_hipGraphExecEventWaitNodeSetEvent_Negative") {
  HIP_CHECK(hipEventCreate(&event1));
  HIP_CHECK(hipEventCreate(&event2));
  hipGraphNode_t eventrec, eventwait;
-  HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0,
-                                                        event1));
-  HIP_CHECK(hipGraphAddEventWaitNode(&eventwait, graph, nullptr, 0,
-                                                        event1));
+  HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0, event1));
+  HIP_CHECK(hipGraphAddEventWaitNode(&eventwait, graph, nullptr, 0, event1));
  // Create memset
  constexpr size_t Nbytes = 1024;
-  char *A_d;
+  char* A_d;
  hipGraphNode_t memset_A;
  hipMemsetParams memsetParams{};
  HIP_CHECK(hipMalloc(&A_d, Nbytes));
@@ -262,62 +253,59 @@ TEST_CASE("Unit_hipGraphExecEventWaitNodeSetEvent_Negative") {
  HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));

  SECTION("hGraphExec = nullptr") {
-    REQUIRE(hipErrorInvalidValue ==
-        hipGraphExecEventWaitNodeSetEvent(nullptr, eventwait, event2));
+    HIP_CHECK_ERROR(hipGraphExecEventWaitNodeSetEvent(nullptr, eventwait, event2),
+                    hipErrorInvalidValue);
  }

  SECTION("hNode = nullptr") {
-    REQUIRE(hipErrorInvalidValue ==
-        hipGraphExecEventWaitNodeSetEvent(graphExec, nullptr, event2));
+    HIP_CHECK_ERROR(hipGraphExecEventWaitNodeSetEvent(graphExec, nullptr, event2),
+                    hipErrorInvalidValue);
  }

  SECTION("event = nullptr") {
-    REQUIRE(hipErrorInvalidValue ==
-        hipGraphExecEventWaitNodeSetEvent(graphExec, eventwait, nullptr));
+    HIP_CHECK_ERROR(hipGraphExecEventWaitNodeSetEvent(graphExec, eventwait, nullptr),
+                    hipErrorInvalidValue);
  }

  SECTION("hGraphExec is uninitialized") {
    hipGraphExec_t graphExec1{};
-    REQUIRE(hipErrorInvalidValue ==
-        hipGraphExecEventWaitNodeSetEvent(graphExec1, eventwait, event2));
+    HIP_CHECK_ERROR(hipGraphExecEventWaitNodeSetEvent(graphExec1, eventwait, event2),
+                    hipErrorInvalidValue);
  }

  SECTION("hNode is uninitialized") {
    hipGraphNode_t dummy{};
-    REQUIRE(hipErrorInvalidValue ==
-        hipGraphExecEventWaitNodeSetEvent(graphExec, dummy, event2));
+    HIP_CHECK_ERROR(hipGraphExecEventWaitNodeSetEvent(graphExec, dummy, event2),
+                    hipErrorInvalidValue);
  }

  SECTION("event is uninitialized") {
    hipEvent_t event_dummy{};
-    REQUIRE(hipErrorInvalidValue ==
-        hipGraphExecEventWaitNodeSetEvent(graphExec, eventwait,
-                                          event_dummy));
+    HIP_CHECK_ERROR(hipGraphExecEventWaitNodeSetEvent(graphExec, eventwait, event_dummy),
+                    hipErrorInvalidValue);
  }

  SECTION("event wait node does not exist") {
    hipGraph_t graph1;
    HIP_CHECK(hipGraphCreate(&graph1, 0));
-    HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph1, nullptr, 0,
-                                    &memsetParams));
+    HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph1, nullptr, 0, &memsetParams));
    hipGraphExec_t graphExec1;
    HIP_CHECK(hipGraphInstantiate(&graphExec1, graph1, nullptr, nullptr, 0));
-    REQUIRE(hipErrorInvalidValue ==
-        hipGraphExecEventWaitNodeSetEvent(graphExec1, eventwait, event2));
+    HIP_CHECK_ERROR(hipGraphExecEventWaitNodeSetEvent(graphExec1, eventwait, event2),
+                    hipErrorInvalidValue);
    HIP_CHECK(hipGraphExecDestroy(graphExec1));
    HIP_CHECK(hipGraphDestroy(graph1));
  }

  SECTION("pass memset node as hNode") {
-    HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph, nullptr, 0,
-                                    &memsetParams));
-    REQUIRE(hipErrorInvalidValue ==
-        hipGraphExecEventWaitNodeSetEvent(graphExec, memset_A, event2));
+    HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph, nullptr, 0, &memsetParams));
+    HIP_CHECK_ERROR(hipGraphExecEventWaitNodeSetEvent(graphExec, memset_A, event2),
+                    hipErrorInvalidValue);
  }

  SECTION("pass event record node as hNode") {
-    REQUIRE(hipErrorInvalidValue ==
-        hipGraphExecEventWaitNodeSetEvent(graphExec, eventrec, event2));
+    HIP_CHECK_ERROR(hipGraphExecEventWaitNodeSetEvent(graphExec, eventrec, event2),
+                    hipErrorInvalidValue);
  }

  HIP_CHECK(hipFree(A_d));
@@ -27,22 +27,6 @@ THE SOFTWARE.
 * and perform the update if possible.
 */

-/**
-Testcase Scenarios :
-Functional-
-1) Make a clone of the created graph and update the executable-graph from a clone or same graph again.
-2) Update the executable-graph from a graph and make sure they are taking effect.
-Negative-
-1) When Pass hGraphExec as nullptr and verify api returns error code.
-2) When Pass hGraph as nullptr and verify api returns error code.
-3) When Pass hErrorNode_out as nullptr and verify api returns error code.
-4) When Pass updateResult_out as nullptr and verify api returns error code.
-5) When the a graphExec was updated with with different type of node and verify api returns error code.
-6) When a node is deleted in hGraph but not its pair from hGraphExec and verify api returns error code.
-7) When a node is deleted in hGraphExec but not its pair from hGraph and verify api returns error code.
-8) When grpah dependencies differ but graph have same node and verify api returns error code.
-*/
-
 #include <hip_test_common.hh>
 #include <hip_test_checkers.hh>
 #include <hip_test_kernels.hh>
@@ -65,13 +49,11 @@ TEST_CASE("Unit_hipGraphExecUpdate_Negative_Basic") {
  hipGraphNode_t hErrorNode_out{};
  hipGraphExecUpdateResult updateResult_out{};
  SECTION("Pass hGraphExec as nullptr") {
-    ret = hipGraphExecUpdate(nullptr, graph, &hErrorNode_out,
-                             &updateResult_out);
+    ret = hipGraphExecUpdate(nullptr, graph, &hErrorNode_out, &updateResult_out);
    REQUIRE(hipErrorInvalidValue == ret);
  }
  SECTION("Pass hGraph as nullptr") {
-    ret = hipGraphExecUpdate(graphExec, nullptr, &hErrorNode_out,
-                             &updateResult_out);
+    ret = hipGraphExecUpdate(graphExec, nullptr, &hErrorNode_out, &updateResult_out);
    REQUIRE(hipErrorInvalidValue == ret);
  }
  SECTION("Pass hErrorNode_out as nullptr") {
@@ -101,10 +83,9 @@ TEST_CASE("Unit_hipGraphExecUpdate_Negative_TypeChange") {
  constexpr size_t N = 1024;
  constexpr size_t Nbytes = N * sizeof(char);
  constexpr size_t val = 0;
-  char *devData;
+  char* devData;
  int *A_d, *A_h;
-  HipTest::initArrays<int>(&A_d, nullptr, nullptr,
-                           &A_h, nullptr, nullptr, N, false);
+  HipTest::initArrays<int>(&A_d, nullptr, nullptr, &A_h, nullptr, nullptr, N, false);
  HIP_CHECK(hipMalloc(&devData, Nbytes));
  hipGraph_t graph, graph2;
  hipGraphExec_t graphExec;
@@ -122,18 +103,16 @@ TEST_CASE("Unit_hipGraphExecUpdate_Negative_TypeChange") {
  memsetParams.elementSize = sizeof(char);
  memsetParams.width = Nbytes;
  memsetParams.height = 1;
-  HIP_CHECK(hipGraphAddMemsetNode(&memsetNode, graph, nullptr, 0,
-                                  &memsetParams));
+  HIP_CHECK(hipGraphAddMemsetNode(&memsetNode, graph, nullptr, 0, &memsetParams));
  std::vector<hipGraphNode_t> dependencies;
  dependencies.push_back(memsetNode);
  HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
  HIP_CHECK(hipGraphCreate(&graph2, 0));
  HIP_CHECK(hipStreamCreate(&streamForGraph));
-  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph2, nullptr, 0, A_d, A_h,
-                                    Nbytes, hipMemcpyHostToDevice));
+  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph2, nullptr, 0, A_d, A_h, Nbytes,
+                                    hipMemcpyHostToDevice));
-  // graphExec was created before memcpyTemp was added to graph.
+  // graphExec was instantiated from graph (memset node); graph2 holds a memcpy node instead.
-  ret = hipGraphExecUpdate(graphExec, graph2, &hErrorNode_out,
-                           &updateResult_out);
+  ret = hipGraphExecUpdate(graphExec, graph2, &hErrorNode_out, &updateResult_out);
  REQUIRE(hipGraphExecUpdateErrorNodeTypeChanged == updateResult_out);
  REQUIRE(hipErrorGraphExecUpdateFailure == ret);
  HIP_CHECK(hipFree(devData));
@@ -164,7 +143,7 @@ TEST_CASE("Unit_hipGraphExecUpdate_Negative_CountDiffer") {
  int *A_d, *B_d, *C_d;
  int *A_h, *B_h, *C_h;
  size_t NElem{N};
-  int *hData = reinterpret_cast<int*>(malloc(Nbytes));
+  int* hData = reinterpret_cast<int*>(malloc(Nbytes));
  REQUIRE(hData != nullptr);
  memset(hData, 0, Nbytes);
  hipGraphNode_t memcpy_A, memcpy_B, memcpy_C, memcpyTemp;
@@ -180,57 +159,52 @@ TEST_CASE("Unit_hipGraphExecUpdate_Negative_CountDiffer") {
  unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
  HIP_CHECK(hipGraphCreate(&graph1, 0));
  HIP_CHECK(hipStreamCreate(&streamForGraph));
-  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph1, nullptr, 0, A_d, A_h,
-                                    Nbytes, hipMemcpyHostToDevice));
-  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph1, nullptr, 0, B_d, B_h,
-                                    Nbytes, hipMemcpyHostToDevice));
-  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph1, nullptr, 0, C_h, C_d,
-                                    Nbytes, hipMemcpyDeviceToHost));
-  void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast<void *>(&NElem)};
-  kernelNodeParams.func = reinterpret_cast<void *>(HipTest::vectorADD<int>);
+  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph1, nullptr, 0, A_d, A_h, Nbytes,
+                                    hipMemcpyHostToDevice));
+  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph1, nullptr, 0, B_d, B_h, Nbytes,
+                                    hipMemcpyHostToDevice));
+  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph1, nullptr, 0, C_h, C_d, Nbytes,
+                                    hipMemcpyDeviceToHost));
+  void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&NElem)};
+  kernelNodeParams.func = reinterpret_cast<void*>(HipTest::vectorADD<int>);
  kernelNodeParams.gridDim = dim3(blocks);
  kernelNodeParams.blockDim = dim3(threadsPerBlock);
  kernelNodeParams.sharedMemBytes = 0;
  kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs);
  kernelNodeParams.extra = nullptr;
-  HIP_CHECK(hipGraphAddKernelNode(&kernel_vecAdd, graph1, nullptr, 0,
-                                                        &kernelNodeParams));
+  HIP_CHECK(hipGraphAddKernelNode(&kernel_vecAdd, graph1, nullptr, 0, &kernelNodeParams));
  // Create dependencies
  HIP_CHECK(hipGraphAddDependencies(graph1, &memcpy_A, &kernel_vecAdd, 1));
  HIP_CHECK(hipGraphAddDependencies(graph1, &memcpy_B, &kernel_vecAdd, 1));
  HIP_CHECK(hipGraphAddDependencies(graph1, &kernel_vecAdd, &memcpy_C, 1));
  // Create a cloned graph and added extra node to it
  HIP_CHECK(hipGraphClone(&graph2, graph1));
-  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyTemp, graph2, nullptr, 0,
-                                    C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
+  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyTemp, graph2, nullptr, 0, C_h, C_d, Nbytes,
+                                    hipMemcpyDeviceToHost));
  HIP_CHECK(hipGraphInstantiate(&graphExec1, graph1, nullptr, nullptr, 0));
  HIP_CHECK(hipGraphInstantiate(&graphExec2, graph2, nullptr, nullptr, 0));
  SECTION("When a node deleted from Graph but not from its pair GraphExec") {
-    ret = hipGraphExecUpdate(graphExec2, graph1, &hErrorNode_out,
-                             &updateResult_out);
+    ret = hipGraphExecUpdate(graphExec2, graph1, &hErrorNode_out, &updateResult_out);
    REQUIRE(hipErrorGraphExecUpdateFailure == ret);
  }
  SECTION("When a node deleted from GraphExec but not from its pair Graph") {
-    ret = hipGraphExecUpdate(graphExec1, graph2, &hErrorNode_out,
-                                 &updateResult_out);
+    ret = hipGraphExecUpdate(graphExec1, graph2, &hErrorNode_out, &updateResult_out);
    REQUIRE(hipErrorGraphExecUpdateFailure == ret);
  }
  SECTION("When the dependent nodes of a pair differ") {
    HIP_CHECK(hipGraphCreate(&graph3, 0));
-    HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph3, nullptr, 0, A_d, A_h,
-                                    Nbytes, hipMemcpyHostToDevice));
-    HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph3, nullptr, 0, B_d, B_h,
-                                    Nbytes, hipMemcpyHostToDevice));
-    HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph3, nullptr, 0, C_h, C_d,
-                                    Nbytes, hipMemcpyDeviceToHost));
-    HIP_CHECK(hipGraphAddKernelNode(&kernel_vecAdd, graph3, nullptr, 0,
-                                                        &kernelNodeParams));
+    HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph3, nullptr, 0, A_d, A_h, Nbytes,
+                                      hipMemcpyHostToDevice));
+    HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph3, nullptr, 0, B_d, B_h, Nbytes,
+                                      hipMemcpyHostToDevice));
+    HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph3, nullptr, 0, C_h, C_d, Nbytes,
+                                      hipMemcpyDeviceToHost));
+    HIP_CHECK(hipGraphAddKernelNode(&kernel_vecAdd, graph3, nullptr, 0, &kernelNodeParams));
    // Create dependencies
    HIP_CHECK(hipGraphAddDependencies(graph3, &memcpy_A, &kernel_vecAdd, 1));
    HIP_CHECK(hipGraphAddDependencies(graph3, &memcpy_B, &kernel_vecAdd, 1));
    HIP_CHECK(hipGraphAddDependencies(graph3, &memcpy_C, &kernel_vecAdd, 1));
-    ret = hipGraphExecUpdate(graphExec1, graph3, &hErrorNode_out,
-                             &updateResult_out);
+    ret = hipGraphExecUpdate(graphExec1, graph3, &hErrorNode_out, &updateResult_out);
    REQUIRE(hipErrorGraphExecUpdateFailure == ret);
    HIP_CHECK(hipGraphDestroy(graph3));
  }
@@ -265,7 +239,7 @@ TEST_CASE("Unit_hipGraphExecUpdate_Functional") {
  int *A_d, *B_d, *C_d;
  int *A_h, *B_h, *C_h;
  size_t NElem{N};
-  int *hData = reinterpret_cast<int*>(malloc(Nbytes));
+  int* hData = reinterpret_cast<int*>(malloc(Nbytes));
  REQUIRE(hData != nullptr);
  memset(hData, 0, Nbytes);
  hipGraphNode_t memcpy_A, memcpy_B, memcpy_C;
@@ -280,22 +254,20 @@ TEST_CASE("Unit_hipGraphExecUpdate_Functional") {
  unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
  HIP_CHECK(hipGraphCreate(&graph, 0));
  HIP_CHECK(hipStreamCreate(&streamForGraph));
-  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph, nullptr, 0, A_d, A_h,
-                                    Nbytes, hipMemcpyHostToDevice));
-  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph, nullptr, 0, B_d, B_h,
-                                    Nbytes, hipMemcpyHostToDevice));
-  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph, nullptr, 0, C_h, C_d,
-                                    Nbytes, hipMemcpyDeviceToHost));
-  void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast<void *>(&NElem)};
-  kernelNodeParams.func =
-                   reinterpret_cast<void *>(HipTest::vector_square<int>);
+  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph, nullptr, 0, A_d, A_h, Nbytes,
+                                    hipMemcpyHostToDevice));
+  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph, nullptr, 0, B_d, B_h, Nbytes,
+                                    hipMemcpyHostToDevice));
+  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph, nullptr, 0, C_h, C_d, Nbytes,
+                                    hipMemcpyDeviceToHost));
+  void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&NElem)};
+  kernelNodeParams.func = reinterpret_cast<void*>(HipTest::vector_square<int>);
  kernelNodeParams.gridDim = dim3(blocks);
  kernelNodeParams.blockDim = dim3(threadsPerBlock);
  kernelNodeParams.sharedMemBytes = 0;
  kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs);
  kernelNodeParams.extra = nullptr;
-  HIP_CHECK(hipGraphAddKernelNode(&kernel_vecSquare, graph, nullptr, 0,
-                                                        &kernelNodeParams));
+  HIP_CHECK(hipGraphAddKernelNode(&kernel_vecSquare, graph, nullptr, 0, &kernelNodeParams));
  // Create dependencies
  HIP_CHECK(hipGraphAddDependencies(graph, &memcpy_A, &kernel_vecSquare, 1));
  HIP_CHECK(hipGraphAddDependencies(graph, &memcpy_B, &kernel_vecSquare, 1));
@@ -304,36 +276,32 @@ TEST_CASE("Unit_hipGraphExecUpdate_Functional") {
  HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
  SECTION("Update graphExec with clone graph") {
    HIP_CHECK(hipGraphClone(&clonedgraph, graph));
-    HIP_CHECK(hipGraphExecUpdate(graphExec, clonedgraph, &hErrorNode_out,
-                                 &updateResult_out));
+    HIP_CHECK(hipGraphExecUpdate(graphExec, clonedgraph, &hErrorNode_out, &updateResult_out));
  }
-  // Code for new graph creation with samilar node setup
+  // Code for new graph creation with similar node setup
  HIP_CHECK(hipGraphCreate(&graph2, 0));
-  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph2, nullptr, 0, A_d, A_h,
-                                    Nbytes, hipMemcpyHostToDevice));
-  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph2, nullptr, 0, B_d, B_h,
-                                    Nbytes, hipMemcpyHostToDevice));
-  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph2, nullptr, 0, C_h, C_d,
-                                    Nbytes, hipMemcpyDeviceToHost));
-  HIP_CHECK(hipGraphMemcpyNodeSetParams1D(memcpy_C, hData, C_d, Nbytes,
-                                          hipMemcpyDeviceToHost));
+  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph2, nullptr, 0, A_d, A_h, Nbytes,
+                                    hipMemcpyHostToDevice));
+  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph2, nullptr, 0, B_d, B_h, Nbytes,
+                                    hipMemcpyHostToDevice));
+  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph2, nullptr, 0, C_h, C_d, Nbytes,
+                                    hipMemcpyDeviceToHost));
+  HIP_CHECK(hipGraphMemcpyNodeSetParams1D(memcpy_C, hData, C_d, Nbytes, hipMemcpyDeviceToHost));
  memset(&kernelNodeParams, 0, sizeof(hipKernelNodeParams));
-  void* kernelArgs2[] = {&A_d, &B_d, &C_d, reinterpret_cast<void *>(&NElem)};
-  kernelNodeParams.func = reinterpret_cast<void *>(HipTest::vectorADD<int>);
+  void* kernelArgs2[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&NElem)};
+  kernelNodeParams.func = reinterpret_cast<void*>(HipTest::vectorADD<int>);
  kernelNodeParams.gridDim = dim3(blocks);
  kernelNodeParams.blockDim = dim3(threadsPerBlock);
  kernelNodeParams.sharedMemBytes = 0;
  kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs2);
  kernelNodeParams.extra = nullptr;
-  HIP_CHECK(hipGraphAddKernelNode(&kernel_vecAdd, graph2, nullptr, 0,
-                                                        &kernelNodeParams));
+  HIP_CHECK(hipGraphAddKernelNode(&kernel_vecAdd, graph2, nullptr, 0, &kernelNodeParams));
  // Create dependencies
  HIP_CHECK(hipGraphAddDependencies(graph2, &memcpy_A, &kernel_vecAdd, 1));
  HIP_CHECK(hipGraphAddDependencies(graph2, &memcpy_B, &kernel_vecAdd, 1));
  HIP_CHECK(hipGraphAddDependencies(graph2, &kernel_vecAdd, &memcpy_C, 1));
  // Update the graphExec graph from graph -> graph2
-  HIP_CHECK(hipGraphExecUpdate(graphExec, graph2, &hErrorNode_out,
-                               &updateResult_out));
+  HIP_CHECK(hipGraphExecUpdate(graphExec, graph2, &hErrorNode_out, &updateResult_out));
  REQUIRE(updateResult_out == hipGraphExecUpdateSuccess);
  HIP_CHECK(hipGraphLaunch(graphExec, streamForGraph));
  HIP_CHECK(hipStreamSynchronize(streamForGraph));
@@ -380,24 +348,22 @@ TEST_CASE("Unit_hipGraphExecUpdate_Negative_Functional_ParametersChanged") {
  hipGraphExecUpdateResult updateResult_out;
  HipTest::initArrays<int>(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
  HIP_CHECK(hipGraphCreate(&graph1, 0));
-  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph1, nullptr, 0, A_d, A_h,
-                                    Nbytes, hipMemcpyHostToDevice));
+  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph1, nullptr, 0, A_d, A_h, Nbytes,
+                                    hipMemcpyHostToDevice));
  HIP_CHECK(hipGraphInstantiate(&graphExec1, graph1, nullptr, nullptr, 0));
  SECTION("Update graphExec with similar graph and verify") {
    HIP_CHECK(hipGraphCreate(&graph2, 0));
-    HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph2, nullptr, 0, B_d, B_h,
-                                      Nbytes, hipMemcpyHostToDevice));
-    ret = hipGraphExecUpdate(graphExec1, graph2, &hErrorNode_out,
-                             &updateResult_out);
+    HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph2, nullptr, 0, B_d, B_h, Nbytes,
+                                      hipMemcpyHostToDevice));
+    ret = hipGraphExecUpdate(graphExec1, graph2, &hErrorNode_out, &updateResult_out);
    REQUIRE(hipSuccess == ret);
    HIP_CHECK(hipGraphDestroy(graph2));
  }
  SECTION("Update graphExec with similar graph and verify") {
    HIP_CHECK(hipGraphCreate(&graph3, 0));
-    HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph3, nullptr, 0, B_h, B_d,
-                                      Nbytes, hipMemcpyDeviceToHost));
-    ret = hipGraphExecUpdate(graphExec1, graph3, &hErrorNode_out,
-                             &updateResult_out);
+    HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph3, nullptr, 0, B_h, B_d, Nbytes,
+                                      hipMemcpyDeviceToHost));
+    ret = hipGraphExecUpdate(graphExec1, graph3, &hErrorNode_out, &updateResult_out);

    REQUIRE(hipErrorGraphExecUpdateFailure == ret);
    REQUIRE(hipGraphExecUpdateErrorParametersChanged == updateResult_out);
@@ -437,16 +403,15 @@ TEST_CASE("Unit_hipGraphExecUpdate_Negative_Functional_CountDiffer_1") {
  HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
  HIP_CHECK(hipGraphCreate(&graph1, 0));
  HIP_CHECK(hipGraphCreate(&graph2, 0));
-  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph1, nullptr, 0, A_d, A_h,
-                                    Nbytes, hipMemcpyHostToDevice));
-  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph1, nullptr, 0, B_d, B_h,
-                                    Nbytes, hipMemcpyHostToDevice));
+  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph1, nullptr, 0, A_d, A_h, Nbytes,
+                                    hipMemcpyHostToDevice));
+  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph1, nullptr, 0, B_d, B_h, Nbytes,
+                                    hipMemcpyHostToDevice));
  HIP_CHECK(hipGraphInstantiate(&graphExec1, graph1, nullptr, nullptr, 0));
  // When count of nodes directly differ in graphExec1 and graph2
-  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph2, nullptr, 0, C_h, C_d,
-                                    Nbytes, hipMemcpyDeviceToHost));
-  ret = hipGraphExecUpdate(graphExec1, graph2, &hErrorNode_out,
-                           &updateResult_out);
+  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph2, nullptr, 0, C_h, C_d, Nbytes,
+                                    hipMemcpyDeviceToHost));
+  ret = hipGraphExecUpdate(graphExec1, graph2, &hErrorNode_out, &updateResult_out);

  REQUIRE(hipErrorGraphExecUpdateFailure == ret);
 #if HT_NVIDIA
@@ -495,16 +460,15 @@ TEST_CASE("Unit_hipGraphExecUpdate_Negative_Functional_CountDiffer_2") {
  hipGraphExecUpdateResult updateResult_out;
  HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
  HIP_CHECK(hipGraphCreate(&graph1, 0));
-  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph1, nullptr, 0, A_d, A_h,
-                                    Nbytes, hipMemcpyHostToDevice));
-  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph1, nullptr, 0, B_d, B_h,
-                                    Nbytes, hipMemcpyHostToDevice));
+  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph1, nullptr, 0, A_d, A_h, Nbytes,
+                                    hipMemcpyHostToDevice));
+  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph1, nullptr, 0, B_d, B_h, Nbytes,
+                                    hipMemcpyHostToDevice));
  HIP_CHECK(hipGraphInstantiate(&graphExec1, graph1, nullptr, nullptr, 0));
  // Delete a node from the graph
  HIP_CHECK(hipGraphDestroyNode(memcpy_B));
  SECTION("When a node deleted from Graph but not from its pair GraphExec") {
-    ret = hipGraphExecUpdate(graphExec1, graph1, &hErrorNode_out,
-                             &updateResult_out);
+    ret = hipGraphExecUpdate(graphExec1, graph1, &hErrorNode_out, &updateResult_out);
    REQUIRE(hipErrorGraphExecUpdateFailure == ret);
    REQUIRE(hipGraphExecUpdateErrorTopologyChanged == updateResult_out);
 #if HT_NVIDIA
@@ -513,11 +477,10 @@ TEST_CASE("Unit_hipGraphExecUpdate_Negative_Functional_CountDiffer_2") {
  }
  SECTION("Update the GraphExec with similar graph where a node get deleted") {
    HIP_CHECK(hipGraphCreate(&graph2, 0));
-    HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph2, nullptr, 0, C_d, C_h,
-                                      Nbytes, hipMemcpyHostToDevice));
+    HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph2, nullptr, 0, C_d, C_h, Nbytes,
+                                      hipMemcpyHostToDevice));
    HIP_CHECK(hipGraphInstantiate(&graphExec2, graph2, nullptr, nullptr, 0));
-    ret = hipGraphExecUpdate(graphExec2, graph1, &hErrorNode_out,
-                             &updateResult_out);
+    ret = hipGraphExecUpdate(graphExec2, graph1, &hErrorNode_out, &updateResult_out);
 #if HT_NVIDIA
    REQUIRE(hipErrorGraphExecUpdateFailure == ret);
    REQUIRE(hipGraphExecUpdateErrorNotSupported == updateResult_out);
@@ -529,13 +492,12 @@ TEST_CASE("Unit_hipGraphExecUpdate_Negative_Functional_CountDiffer_2") {
  }
  SECTION("When A node is deleted in GraphExec but not its pair from Graph") {
    HIP_CHECK(hipGraphCreate(&graph3, 0));
-    HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph3, nullptr, 0, A_d, A_h,
-                                      Nbytes, hipMemcpyHostToDevice));
+    HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph3, nullptr, 0, A_d, A_h, Nbytes,
+                                      hipMemcpyHostToDevice));
    HIP_CHECK(hipGraphInstantiate(&graphExec3, graph3, nullptr, nullptr, 0));
-    HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph3, nullptr, 0, B_d, B_h,
-                                      Nbytes, hipMemcpyHostToDevice));
-    ret = hipGraphExecUpdate(graphExec3, graph3, &hErrorNode_out,
-                             &updateResult_out);
+    HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph3, nullptr, 0, B_d, B_h, Nbytes,
+                                      hipMemcpyHostToDevice));
+    ret = hipGraphExecUpdate(graphExec3, graph3, &hErrorNode_out, &updateResult_out);
    REQUIRE(hipErrorGraphExecUpdateFailure == ret);
 #if HT_NVIDIA
    REQUIRE(hipGraphExecUpdateErrorNotSupported == updateResult_out);
@@ -581,27 +543,26 @@ TEST_CASE("Unit_hipGraphExecUpdate_Negative_Dependent_NodesDiffer") {
  hipGraphExecUpdateResult updateResult_out;
  HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
  HIP_CHECK(hipGraphCreate(&graph1, 0));
-  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph1, nullptr, 0, A_d, A_h,
-                                    Nbytes, hipMemcpyHostToDevice));
-  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph1, nullptr, 0, B_d, B_h,
-                                    Nbytes, hipMemcpyHostToDevice));
-  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph1, nullptr, 0, C_d, C_h,
-                                    Nbytes, hipMemcpyHostToDevice));
+  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph1, nullptr, 0, A_d, A_h, Nbytes,
+                                    hipMemcpyHostToDevice));
+  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph1, nullptr, 0, B_d, B_h, Nbytes,
+                                    hipMemcpyHostToDevice));
+  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph1, nullptr, 0, C_d, C_h, Nbytes,
+                                    hipMemcpyHostToDevice));
  HIP_CHECK(hipGraphAddDependencies(graph1, &memcpy_A, &memcpy_B, 1));
  HIP_CHECK(hipGraphAddDependencies(graph1, &memcpy_B, &memcpy_C, 1));
  HIP_CHECK(hipGraphInstantiate(&graphExec, graph1, nullptr, nullptr, 0));

  HIP_CHECK(hipGraphCreate(&graph2, 0));
-  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph2, nullptr, 0, A_d, A_h,
-                                    Nbytes, hipMemcpyHostToDevice));
-  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph2, nullptr, 0, B_d, B_h,
-                                    Nbytes, hipMemcpyHostToDevice));
-  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph2, nullptr, 0, C_d, C_h,
-                                    Nbytes, hipMemcpyHostToDevice));
+  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph2, nullptr, 0, A_d, A_h, Nbytes,
+                                    hipMemcpyHostToDevice));
+  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph2, nullptr, 0, B_d, B_h, Nbytes,
+                                    hipMemcpyHostToDevice));
+  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph2, nullptr, 0, C_d, C_h, Nbytes,
+                                    hipMemcpyHostToDevice));
  HIP_CHECK(hipGraphAddDependencies(graph2, &memcpy_A, &memcpy_C, 1));
  HIP_CHECK(hipGraphAddDependencies(graph2, &memcpy_B, &memcpy_C, 1));
-  ret = hipGraphExecUpdate(graphExec, graph2, &hErrorNode_out,
-                           &updateResult_out);
+  ret = hipGraphExecUpdate(graphExec, graph2, &hErrorNode_out, &updateResult_out);

  REQUIRE(hipErrorGraphExecUpdateFailure == ret);
  REQUIRE(hipGraphExecUpdateErrorTopologyChanged == updateResult_out);
@@ -642,10 +603,10 @@ TEST_CASE("Unit_hipGraphExecUpdate_Negative_NodeType_Changed") {
  HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);

  HIP_CHECK(hipGraphCreate(&graph1, 0));
-  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph1, nullptr, 0, A_d, A_h,
-                                    Nbytes, hipMemcpyHostToDevice));
-  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph1, nullptr, 0, B_d, B_h,
-                                    Nbytes, hipMemcpyHostToDevice));
+  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph1, nullptr, 0, A_d, A_h, Nbytes,
+                                    hipMemcpyHostToDevice));
+  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph1, nullptr, 0, B_d, B_h, Nbytes,
+                                    hipMemcpyHostToDevice));
  HIP_CHECK(hipGraphAddDependencies(graph1, &memcpy_A, &memcpy_B, 1));
  HIP_CHECK(hipGraphInstantiate(&graphExec, graph1, nullptr, nullptr, 0));

@@ -658,13 +619,11 @@ TEST_CASE("Unit_hipGraphExecUpdate_Negative_NodeType_Changed") {
  memsetParams.elementSize = sizeof(char);
  memsetParams.width = Nbytes;
  memsetParams.height = 1;
-  HIP_CHECK(hipGraphAddMemsetNode(&memsetNode, graph2, nullptr, 0,
-                                  &memsetParams));
-  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph2, nullptr, 0, A_d, A_h,
-                                    Nbytes, hipMemcpyHostToDevice));
+  HIP_CHECK(hipGraphAddMemsetNode(&memsetNode, graph2, nullptr, 0, &memsetParams));
+  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph2, nullptr, 0, A_d, A_h, Nbytes,
+                                    hipMemcpyHostToDevice));
  HIP_CHECK(hipGraphAddDependencies(graph2, &memcpy_A, &memsetNode, 1));
-  ret = hipGraphExecUpdate(graphExec, graph2, &hErrorNode_out,
-                           &updateResult_out);
+  ret = hipGraphExecUpdate(graphExec, graph2, &hErrorNode_out, &updateResult_out);
  REQUIRE(hipErrorGraphExecUpdateFailure == ret);
 #if HT_NVIDIA
  REQUIRE(hipGraphExecUpdateErrorTopologyChanged == updateResult_out);
@@ -726,22 +685,21 @@ TEST_CASE("Unit_hipGraphExecUpdate_Negative_MultiDevice_Context_Changed") {
  hipStream_t stream;
  HIP_CHECK(hipStreamCreate(&stream));
  HIP_CHECK(hipGraphCreate(&graph1, 0));
-  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph1, nullptr, 0, A_d, A_h,
-                                    Nbytes, hipMemcpyHostToDevice));
-  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph1, nullptr, 0, B_d, B_h,
-                                    Nbytes, hipMemcpyHostToDevice));
-  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph1, nullptr, 0, C_h, C_d,
-                                    Nbytes, hipMemcpyDeviceToHost));
+  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph1, nullptr, 0, A_d, A_h, Nbytes,
+                                    hipMemcpyHostToDevice));
+  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph1, nullptr, 0, B_d, B_h, Nbytes,
+                                    hipMemcpyHostToDevice));
+  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph1, nullptr, 0, C_h, C_d, Nbytes,
+                                    hipMemcpyDeviceToHost));
  hipKernelNodeParams kernelNodeParams{};
-  void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast<void *>(&NElem)};
-  kernelNodeParams.func = reinterpret_cast<void *>(HipTest::vectorADD<int>);
+  void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&NElem)};
+  kernelNodeParams.func = reinterpret_cast<void*>(HipTest::vectorADD<int>);
  kernelNodeParams.gridDim = dim3(blocks);
  kernelNodeParams.blockDim = dim3(threadsPerBlock);
  kernelNodeParams.sharedMemBytes = 0;
  kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs);
  kernelNodeParams.extra = nullptr;
-  HIP_CHECK(hipGraphAddKernelNode(&kernel_vecADD, graph1, nullptr, 0,
-                                                  &kernelNodeParams));
+  HIP_CHECK(hipGraphAddKernelNode(&kernel_vecADD, graph1, nullptr, 0, &kernelNodeParams));
  HIP_CHECK(hipGraphAddDependencies(graph1, &memcpy_A, &kernel_vecADD, 1));
  HIP_CHECK(hipGraphAddDependencies(graph1, &memcpy_B, &kernel_vecADD, 1));
  HIP_CHECK(hipGraphAddDependencies(graph1, &kernel_vecADD, &memcpy_C, 1));
@@ -750,27 +708,25 @@ TEST_CASE("Unit_hipGraphExecUpdate_Negative_MultiDevice_Context_Changed") {

  HIP_CHECK(hipSetDevice(1));
  HIP_CHECK(hipGraphCreate(&graph2, 0));
-  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph2, nullptr, 0, A_d, A_h,
-                                    Nbytes, hipMemcpyHostToDevice));
-  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph2, nullptr, 0, B_d, B_h,
-                                    Nbytes, hipMemcpyHostToDevice));
-  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph2, nullptr, 0, C_h, C_d,
-                                    Nbytes, hipMemcpyDeviceToHost));
+  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph2, nullptr, 0, A_d, A_h, Nbytes,
+                                    hipMemcpyHostToDevice));
+  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph2, nullptr, 0, B_d, B_h, Nbytes,
+                                    hipMemcpyHostToDevice));
+  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph2, nullptr, 0, C_h, C_d, Nbytes,
+                                    hipMemcpyDeviceToHost));
  memset(&kernelNodeParams, 0x00, sizeof(hipKernelNodeParams));
-  void* kernelArgs1[] = {&A_d, &B_d, &C_d, reinterpret_cast<void *>(&NElem)};
-  kernelNodeParams.func = reinterpret_cast<void *>(HipTest::vectorSUB<int>);
+  void* kernelArgs1[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&NElem)};
+  kernelNodeParams.func = reinterpret_cast<void*>(HipTest::vectorSUB<int>);
  kernelNodeParams.gridDim = dim3(blocks);
  kernelNodeParams.blockDim = dim3(threadsPerBlock);
  kernelNodeParams.sharedMemBytes = 0;
  kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs1);
  kernelNodeParams.extra = nullptr;
-  HIP_CHECK(hipGraphAddKernelNode(&kernel_vecSUB, graph2, nullptr, 0,
-                                                  &kernelNodeParams));
+  HIP_CHECK(hipGraphAddKernelNode(&kernel_vecSUB, graph2, nullptr, 0, &kernelNodeParams));
  HIP_CHECK(hipGraphAddDependencies(graph2, &memcpy_A, &kernel_vecSUB, 1));
  HIP_CHECK(hipGraphAddDependencies(graph2, &memcpy_B, &kernel_vecSUB, 1));
  HIP_CHECK(hipGraphAddDependencies(graph2, &kernel_vecSUB, &memcpy_C, 1));
-  ret = hipGraphExecUpdate(graphExec, graph2, &hErrorNode_out,
-                           &updateResult_out);
+  ret = hipGraphExecUpdate(graphExec, graph2, &hErrorNode_out, &updateResult_out);

  REQUIRE(hipErrorGraphExecUpdateFailure == ret);
  REQUIRE(hipGraphExecUpdateErrorUnsupportedFunctionChange == updateResult_out);
@@ -819,49 +775,46 @@ TEST_CASE("Unit_hipGraphExecUpdate_Functional_KernelFunction_Changed") {
  hipStream_t stream;
  HIP_CHECK(hipStreamCreate(&stream));
  HIP_CHECK(hipGraphCreate(&graph1, 0));
-  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph1, nullptr, 0, A_d, A_h,
-                                    Nbytes, hipMemcpyHostToDevice));
-  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph1, nullptr, 0, B_d, B_h,
-                                    Nbytes, hipMemcpyHostToDevice));
-  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph1, nullptr, 0, C_h, C_d,
-                                    Nbytes, hipMemcpyDeviceToHost));
+  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph1, nullptr, 0, A_d, A_h, Nbytes,
+                                    hipMemcpyHostToDevice));
+  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph1, nullptr, 0, B_d, B_h, Nbytes,
+                                    hipMemcpyHostToDevice));
+  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph1, nullptr, 0, C_h, C_d, Nbytes,
+                                    hipMemcpyDeviceToHost));
  hipKernelNodeParams kernelNodeParams{};
-  void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast<void *>(&NElem)};
-  kernelNodeParams.func = reinterpret_cast<void *>(HipTest::vectorADD<int>);
+  void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&NElem)};
+  kernelNodeParams.func = reinterpret_cast<void*>(HipTest::vectorADD<int>);
  kernelNodeParams.gridDim = dim3(blocks);
  kernelNodeParams.blockDim = dim3(threadsPerBlock);
  kernelNodeParams.sharedMemBytes = 0;
  kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs);
  kernelNodeParams.extra = nullptr;
-  HIP_CHECK(hipGraphAddKernelNode(&kernel_vecADD, graph1, nullptr, 0,
-                                                  &kernelNodeParams));
+  HIP_CHECK(hipGraphAddKernelNode(&kernel_vecADD, graph1, nullptr, 0, &kernelNodeParams));
  HIP_CHECK(hipGraphAddDependencies(graph1, &memcpy_A, &kernel_vecADD, 1));
  HIP_CHECK(hipGraphAddDependencies(graph1, &memcpy_B, &kernel_vecADD, 1));
  HIP_CHECK(hipGraphAddDependencies(graph1, &kernel_vecADD, &memcpy_C, 1));
  HIP_CHECK(hipGraphInstantiate(&graphExec, graph1, nullptr, nullptr, 0));

  HIP_CHECK(hipGraphCreate(&graph2, 0));
-  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph2, nullptr, 0, A_d, A_h,
-                                    Nbytes, hipMemcpyHostToDevice));
-  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph2, nullptr, 0, B_d, B_h,
-                                    Nbytes, hipMemcpyHostToDevice));
-  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph2, nullptr, 0, C_h, C_d,
-                                    Nbytes, hipMemcpyDeviceToHost));
+  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph2, nullptr, 0, A_d, A_h, Nbytes,
+                                    hipMemcpyHostToDevice));
+  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph2, nullptr, 0, B_d, B_h, Nbytes,
+                                    hipMemcpyHostToDevice));
+  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph2, nullptr, 0, C_h, C_d, Nbytes,
+                                    hipMemcpyDeviceToHost));
  memset(&kernelNodeParams, 0x00, sizeof(hipKernelNodeParams));
-  void* kernelArgs1[] = {&A_d, &B_d, &C_d, reinterpret_cast<void *>(&NElem)};
-  kernelNodeParams.func = reinterpret_cast<void *>(HipTest::vectorSUB<int>);
+  void* kernelArgs1[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&NElem)};
+  kernelNodeParams.func = reinterpret_cast<void*>(HipTest::vectorSUB<int>);
  kernelNodeParams.gridDim = dim3(blocks);
  kernelNodeParams.blockDim = dim3(threadsPerBlock);
  kernelNodeParams.sharedMemBytes = 0;
  kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs1);
  kernelNodeParams.extra = nullptr;
-  HIP_CHECK(hipGraphAddKernelNode(&kernel_vecSUB, graph2, nullptr, 0,
-                                                  &kernelNodeParams));
+  HIP_CHECK(hipGraphAddKernelNode(&kernel_vecSUB, graph2, nullptr, 0, &kernelNodeParams));
  HIP_CHECK(hipGraphAddDependencies(graph2, &memcpy_A, &kernel_vecSUB, 1));
  HIP_CHECK(hipGraphAddDependencies(graph2, &memcpy_B, &kernel_vecSUB, 1));
  HIP_CHECK(hipGraphAddDependencies(graph2, &kernel_vecSUB, &memcpy_C, 1));
-  ret = hipGraphExecUpdate(graphExec, graph2, &hErrorNode_out,
-                           &updateResult_out);
+  ret = hipGraphExecUpdate(graphExec, graph2, &hErrorNode_out, &updateResult_out);
  REQUIRE(hipSuccess == ret);
  HIP_CHECK(hipGraphLaunch(graphExec, stream));
  HIP_CHECK(hipStreamSynchronize(stream));
@@ -19,394 +19,127 @@ THE SOFTWARE.

 #include <hip_test_common.hh>
 #include <hip_test_checkers.hh>
-#include <hip_test_kernels.hh>
-/* Test verifies hipGraphLaunch API
-Negative scenarios -
-1) Pass graphExec as nullptr and verify api returns error code.
-2) Pass pGraphExec as nullptr and stream as hipStreamPerThread and verify  api returns error code.
-3) Pass pGraphExec as empty object and verify  api returns error code.
-4) Destroy executable graph and try to launch it. Make sure api should not crash and it should returns error code.
-5) Destroy stream and try to launch respective executable graph. Make sure api should not crash and it should returns error code.
-6) Destroy actual graph created and try to launch respective executable graph.
-   Check api should execute properly without crash or error code.
-Functional Scenario -
-1) Check basic functionality with stream as hipStreamPerThread
-2) Test hipGraphLaunch call on multiple devices.
-3) Create a graph with multiple nodes. Create an executable graph.
-   Launch the executable graph 3 times in stream simultaneously.
-   Wait for stream. Validate the output. No issues should be observed
-4) Create a graph with multiple nodes. Create an executable graph.
-   Verify if an executable graph be launched on null stream.
-*/

-#define SIZE 1024
-#define TEST_LOOP_SIZE 3
+/**
+ * @addtogroup hipGraphLaunch hipGraphLaunch
+ * @{
+ * @ingroup GraphTest
+ * `hipGraphLaunch(hipGraphExec_t graphExec, hipStream_t stream)` -
+ * Launches an executable graph in a stream
+ */

-TEST_CASE("Unit_hipGraphLaunch_Negative") {
-  hipError_t ret;
-  SECTION("Pass pGraphExec as nullptr") {
-    hipStream_t stream{};
+static void HostFunctionSetToZero(void* arg) {  // host-node callback: stores 0 in the int arg points to
+  int* test_number = (int*)arg;  // arg is the opaque pointer supplied via hipHostNodeParams
+  (*test_number) = 0;
+}
+
+static void HostFunctionAddOne(void* arg) {  // host-node callback: increments the int arg points to by one
+  int* test_number = (int*)arg;  // arg is the opaque pointer supplied via hipHostNodeParams
+  (*test_number) += 1;
+}
+
+/* Create an executable graph whose launch sets the integer pointed to by 'number' to one. */
+static void CreateTestExecutableGraph(hipGraphExec_t* graph_exec, int* number) {
+  hipGraph_t graph;
+  hipGraphNode_t node_error;  // error-node output of hipGraphInstantiate; not inspected here
+
+  hipGraphNode_t node_set_zero;
+  hipHostNodeParams params_set_to_zero = {HostFunctionSetToZero, number};  // {callback, argument}
+
+  hipGraphNode_t node_add_one;
+  hipHostNodeParams params_set_add_one = {HostFunctionAddOne, number};  // same target integer
+
+  HIP_CHECK(hipGraphCreate(&graph, 0));
+
+  HIP_CHECK(hipGraphAddHostNode(&node_set_zero, graph, nullptr, 0, &params_set_to_zero));  // no dependencies
+  HIP_CHECK(hipGraphAddHostNode(&node_add_one, graph, &node_set_zero, 1, &params_set_add_one));  // depends on node_set_zero
+
+  HIP_CHECK(hipGraphInstantiate(graph_exec, graph, &node_error, nullptr, 0));
+  HIP_CHECK(hipGraphDestroy(graph));  // only the instantiated *graph_exec is handed back to the caller
+}
+
+static void HipGraphLaunch_Positive_Simple(hipStream_t stream) {  // launches the test graph on 'stream' and checks its result
+  int number = 5;  // initial value differs from the expected result (1)
+
+  hipGraphExec_t graph_exec;
+  CreateTestExecutableGraph(&graph_exec, &number);
+
+  HIP_CHECK(hipGraphLaunch(graph_exec, stream));
+  HIP_CHECK(hipStreamSynchronize(stream));  // wait before reading 'number' on the host
+  REQUIRE(number == 1);  // set-to-zero followed by add-one
+
+  HIP_CHECK(hipGraphExecDestroy(graph_exec));
+}
+
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Basic positive test for hipGraphLaunch
+ *        -# stream as a created stream
+ *        -# with stream as hipStreamPerThread
+ * Test source
+ * ------------------------
+ *    - unit/graph/hipGraphLaunch.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Unit_hipGraphLaunch_Positive") {
+  SECTION("stream as a created stream") {
+    hipStream_t stream;
+    HIP_CHECK(hipStreamCreate(&stream));
+    HipGraphLaunch_Positive_Simple(stream);  // launch, synchronize and value check happen in the helper
+    HIP_CHECK(hipStreamDestroy(stream));
+  }
+
+  SECTION("with stream as hipStreamPerThread") {
+    HipGraphLaunch_Positive_Simple(hipStreamPerThread);  // same check; no stream is created or destroyed here
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Negative parameter test for hipGraphLaunch
+ *        -# graphExec is nullptr and stream is a created stream
+ *        -# graphExec is nullptr and stream is hipStreamPerThread
+ *        -# graphExec is an empty object
+ *        -# graphExec is destroyed before calling hipGraphLaunch
+ * Test source
+ * ------------------------
+ *    - unit/graph/hipGraphLaunch.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Unit_hipGraphLaunch_Negative_Parameters") {
+  SECTION("graphExec is nullptr and stream is a created stream") {
+    hipStream_t stream;
+    hipError_t ret;
+    HIP_CHECK(hipStreamCreate(&stream));
    ret = hipGraphLaunch(nullptr, stream);
-    REQUIRE(hipErrorInvalidValue == ret);
-  }
-  SECTION("Pass pGraphExec as nullptr and stream as hipStreamPerThread") {
-    ret = hipGraphLaunch(nullptr, hipStreamPerThread);
-    REQUIRE(hipErrorInvalidValue == ret);
-  }
-  SECTION("Pass pGraphExec as empty object") {
-    hipGraphExec_t graphExec{};
-    hipStream_t stream{};
-    ret = hipGraphLaunch(graphExec, stream);
-    REQUIRE(hipErrorInvalidValue == ret);
-  }
-  SECTION("Destroy executable graph and try to launch it") {
-    constexpr size_t Nbytes = 1024;
-    hipGraph_t graph;
-    hipGraphExec_t graphExec;
-    hipStream_t stream;
-    hipGraphNode_t memsetNode;
-
-    char *devData;
-    HIP_CHECK(hipMalloc(&devData, Nbytes));
-
-    HIP_CHECK(hipGraphCreate(&graph, 0));
-    HIP_CHECK(hipStreamCreate(&stream));
-
-    hipMemsetParams memsetParams{};
-    memset(&memsetParams, 0, sizeof(memsetParams));
-    memsetParams.dst = reinterpret_cast<void*>(devData);
-    memsetParams.value = 0;
-    memsetParams.pitch = 0;
-    memsetParams.elementSize = sizeof(char);
-    memsetParams.width = Nbytes;
-    memsetParams.height = 1;
-    HIP_CHECK(hipGraphAddMemsetNode(&memsetNode, graph, nullptr, 0,
-                                    &memsetParams));
-    HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
-    HIP_CHECK(hipGraphLaunch(graphExec, stream));
-    HIP_CHECK(hipStreamSynchronize(stream));
-
-    HIP_CHECK(hipGraphExecDestroy(graphExec));
-    // Launch again after destroy graph exec object.
-    ret = hipGraphLaunch(graphExec, stream);
-    REQUIRE(hipErrorInvalidValue == ret);
-
-    HIP_CHECK(hipFree(devData));
-    HIP_CHECK(hipGraphDestroy(graph));
    HIP_CHECK(hipStreamDestroy(stream));
+    REQUIRE(ret == hipErrorInvalidValue);
  }
-/* In this case in CUDA setup this api call is giving - unknown error (999)
-   So enabling this test for both AMD and CUDA by checking with hipSuccess */
-  SECTION("Destroy stream and try to launch respective executable graph") {
-    constexpr size_t Nbytes = 1024;
-    hipGraph_t graph;
-    hipGraphExec_t graphExec;
-    hipStream_t stream;
-    hipGraphNode_t memsetNode;

-    char *devData;
-    HIP_CHECK(hipMalloc(&devData, Nbytes));
-
-    HIP_CHECK(hipGraphCreate(&graph, 0));
-    HIP_CHECK(hipStreamCreate(&stream));
-
-    hipMemsetParams memsetParams{};
-    memset(&memsetParams, 0, sizeof(memsetParams));
-    memsetParams.dst = reinterpret_cast<void*>(devData);
-    memsetParams.value = 0;
-    memsetParams.pitch = 0;
-    memsetParams.elementSize = sizeof(char);
-    memsetParams.width = Nbytes;
-    memsetParams.height = 1;
-    HIP_CHECK(hipGraphAddMemsetNode(&memsetNode, graph, nullptr, 0,
-                                    &memsetParams));
-    HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
-    HIP_CHECK(hipGraphLaunch(graphExec, stream));
-    HIP_CHECK(hipStreamSynchronize(stream));
-
-    HIP_CHECK(hipStreamDestroy(stream));
-    // Launch again after destroy stream
-    ret = hipGraphLaunch(graphExec, stream);
-    REQUIRE(hipSuccess != ret);
-
-    HIP_CHECK(hipFree(devData));
-    HIP_CHECK(hipGraphExecDestroy(graphExec));
-    HIP_CHECK(hipGraphDestroy(graph));
+  SECTION("graphExec is nullptr and stream is hipStreamPerThread") {
+    HIP_CHECK_ERROR(hipGraphLaunch(nullptr, hipStreamPerThread), hipErrorInvalidValue);
  }
-  SECTION("Destroy graph and try to launch respective executable graph") {
-    constexpr size_t Nbytes = 1024;
-    hipGraph_t graph;
-    hipGraphExec_t graphExec;
-    hipStream_t stream;
-    hipGraphNode_t memsetNode;

-    char *devData;
-    HIP_CHECK(hipMalloc(&devData, Nbytes));
+  SECTION("graphExec is an empty object") {
+    hipGraphExec_t graph_exec{};
+    HIP_CHECK_ERROR(hipGraphLaunch(graph_exec, hipStreamPerThread), hipErrorInvalidValue);
+  }

-    HIP_CHECK(hipGraphCreate(&graph, 0));
-    HIP_CHECK(hipStreamCreate(&stream));
-
-    hipMemsetParams memsetParams{};
-    memset(&memsetParams, 0, sizeof(memsetParams));
-    memsetParams.dst = reinterpret_cast<void*>(devData);
-    memsetParams.value = 0;
-    memsetParams.pitch = 0;
-    memsetParams.elementSize = sizeof(char);
-    memsetParams.width = Nbytes;
-    memsetParams.height = 1;
-    HIP_CHECK(hipGraphAddMemsetNode(&memsetNode, graph, nullptr, 0,
-                                    &memsetParams));
-    HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
-    HIP_CHECK(hipGraphLaunch(graphExec, stream));
-    HIP_CHECK(hipStreamSynchronize(stream));
-
-    HIP_CHECK(hipGraphDestroy(graph));
-    // Launch again after destroy graph
-    ret = hipGraphLaunch(graphExec, stream);
-    REQUIRE(hipSuccess == ret);
-
-    HIP_CHECK(hipFree(devData));
-    HIP_CHECK(hipGraphExecDestroy(graphExec));
-    HIP_CHECK(hipStreamDestroy(stream));
+  SECTION("graphExec is destroyed") {
+    int number = 5;
+    hipGraphExec_t graph_exec;
+    CreateTestExecutableGraph(&graph_exec, &number);
+    HIP_CHECK(hipGraphLaunch(graph_exec, hipStreamPerThread));
+    HIP_CHECK(hipStreamSynchronize(hipStreamPerThread));
+    REQUIRE(number == 1);
+    HIP_CHECK(hipGraphExecDestroy(graph_exec));
+    HIP_CHECK_ERROR(hipGraphLaunch(graph_exec, hipStreamPerThread), hipErrorInvalidValue);
  }
 }
-
-TEST_CASE("Unit_hipGraphLaunch_Functional_hipStreamPerThread") {
-  constexpr size_t N = 1024;
-  constexpr size_t Nbytes = N * sizeof(char);
-  constexpr size_t val = 0;
-  constexpr size_t updateVal = 2;
-  char *A_d{nullptr}, *B_d{nullptr}, *C_d{nullptr};
-  char *A_h{nullptr}, *B_h{nullptr};
-
-  HipTest::initArrays<char>(&A_d, &B_d, &C_d,
-                            &A_h, &B_h, nullptr, N, false);
-
-  hipGraph_t graph;
-  hipGraphExec_t graphExec;
-  hipGraphNode_t memsetNode;
-
-  HIP_CHECK(hipGraphCreate(&graph, 0));
-
-  hipMemsetParams memsetParams{};
-  memset(&memsetParams, 0, sizeof(memsetParams));
-  memsetParams.dst = reinterpret_cast<void*>(C_d);
-  memsetParams.value = val;
-  memsetParams.pitch = 0;
-  memsetParams.elementSize = sizeof(char);
-  memsetParams.width = Nbytes;
-  memsetParams.height = 1;
-  HIP_CHECK(hipGraphAddMemsetNode(&memsetNode, graph, nullptr, 0,
-                                  &memsetParams));
-
-  std::vector<hipGraphNode_t> dependencies;
-  dependencies.push_back(memsetNode);
-
-  memset(&memsetParams, 0, sizeof(memsetParams));
-  memsetParams.dst = reinterpret_cast<void*>(A_d);
-  memsetParams.value = updateVal;
-  memsetParams.pitch = 0;
-  memsetParams.elementSize = sizeof(char);
-  memsetParams.width = Nbytes;
-  memsetParams.height = 1;
-  HIP_CHECK(hipGraphAddMemsetNode(&memsetNode, graph, dependencies.data(),
-                                  dependencies.size(), &memsetParams));
-  HIP_CHECK(hipGraphMemsetNodeSetParams(memsetNode, &memsetParams));
-  dependencies.push_back(memsetNode);
-
-  HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
-  HIP_CHECK(hipGraphLaunch(graphExec, hipStreamPerThread));
-  HIP_CHECK(hipStreamSynchronize(hipStreamPerThread));
-
-  HIP_CHECK(hipMemcpy(A_h, A_d, Nbytes, hipMemcpyDeviceToHost));
-
-  // Validating the result
-  for (size_t i = 0; i < Nbytes; i++) {
-    if (A_h[i] != updateVal) {
-      WARN("Validation failed at- " << i << " A_h[i] " << A_h[i]);
-      REQUIRE(false);
-    }
-  }
-
-  HipTest::freeArrays<char>(A_d, B_d, C_d,
-                            A_h, B_h, nullptr, false);
-  HIP_CHECK(hipGraphExecDestroy(graphExec));
-  HIP_CHECK(hipGraphDestroy(graph));
-}
-
-static void hipGraphLaunch_test() {
-  constexpr size_t N = 1024;
-  constexpr size_t Nbytes = N * sizeof(char);
-  constexpr size_t val = 0;
-  constexpr size_t updateVal = 1;
-  char *A_d{nullptr}, *B_d{nullptr}, *C_d{nullptr};
-  char *A_h{nullptr}, *B_h{nullptr};
-
-  HipTest::initArrays<char>(&A_d, &B_d, &C_d,
-                            &A_h, &B_h, nullptr, N, false);
-
-  hipGraph_t graph;
-  hipGraphExec_t graphExec;
-  hipStream_t streamForGraph;
-  hipGraphNode_t memsetNode;
-
-  HIP_CHECK(hipGraphCreate(&graph, 0));
-  HIP_CHECK(hipStreamCreate(&streamForGraph));
-
-  hipMemsetParams memsetParams{};
-  memset(&memsetParams, 0, sizeof(memsetParams));
-  memsetParams.dst = reinterpret_cast<void*>(C_d);
-  memsetParams.value = val;
-  memsetParams.pitch = 0;
-  memsetParams.elementSize = sizeof(char);
-  memsetParams.width = Nbytes;
-  memsetParams.height = 1;
-  HIP_CHECK(hipGraphAddMemsetNode(&memsetNode, graph, nullptr, 0,
-                                  &memsetParams));
-
-  std::vector<hipGraphNode_t> dependencies;
-  dependencies.push_back(memsetNode);
-
-  memset(&memsetParams, 0, sizeof(memsetParams));
-  memsetParams.dst = reinterpret_cast<void*>(A_d);
-  memsetParams.value = updateVal;
-  memsetParams.pitch = 0;
-  memsetParams.elementSize = sizeof(char);
-  memsetParams.width = Nbytes;
-  memsetParams.height = 1;
-  HIP_CHECK(hipGraphAddMemsetNode(&memsetNode, graph, dependencies.data(),
-                                  dependencies.size(), &memsetParams));
-  HIP_CHECK(hipGraphMemsetNodeSetParams(memsetNode, &memsetParams));
-  dependencies.push_back(memsetNode);
-
-  HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
-  HIP_CHECK(hipGraphLaunch(graphExec, streamForGraph));
-  HIP_CHECK(hipStreamSynchronize(streamForGraph));
-
-  HIP_CHECK(hipMemcpy(A_h, A_d, Nbytes, hipMemcpyDeviceToHost));
-
-  // Validating the result
-  for (size_t i = 0; i < Nbytes; i++) {
-    if (A_h[i] != updateVal) {
-      WARN("Validation failed at- " << i << " A_h[i] " << A_h[i]);
-      REQUIRE(false);
-    }
-  }
-
-  HipTest::freeArrays<char>(A_d, B_d, C_d,
-                            A_h, B_h, nullptr, false);
-  HIP_CHECK(hipGraphExecDestroy(graphExec));
-  HIP_CHECK(hipGraphDestroy(graph));
-  HIP_CHECK(hipStreamDestroy(streamForGraph));
-}
-
-TEST_CASE("Unit_hipGraphLaunch_Functional_multidevice_test") {
-  int numDevices = 0;
-  HIP_CHECK(hipGetDeviceCount(&numDevices));
-
-  if (numDevices > 0) {
-    for (int i = 0; i < numDevices; i++) {
-      HIP_CHECK(hipSetDevice(i));
-      hipGraphLaunch_test();
-    }
-  } else {
-    SUCCEED("Skipped the testcase as there is no device to test.");
-  }
-}
-
-// Function to fill input data
-static void fillRandInpData(int *A1_h, int *A2_h, size_t N) {
-  unsigned int seed = time(nullptr);
-  for (size_t i = 0; i < N; i++) {
-    A1_h[i] = (HipTest::RAND_R(&seed) & 0xFF);
-    A2_h[i] = (HipTest::RAND_R(&seed) & 0xFF);
-  }
-}
-// Function to validate result
-static void validateOutData(int *A1_h, int *A2_h, size_t N) {
-  for (size_t i = 0; i < N; i++) {
-    int result = (A1_h[i]*A1_h[i]);
-    REQUIRE(result == A2_h[i]);
-  }
-}
-/*
- * 1.Create a graph with multiple nodes. Create an executable graph.
- * Launch the executable graph 3 times in stream simultaneously.
- * Wait for stream. Validate the output. No issues should be observed
- * 2.Create a graph with multiple nodes. Create an executable graph.
- * Verify if an executable graph be launched on null stream.
-*/
-TEST_CASE("Unit_hipGraphLaunch_Functional_MultipleLaunch") {
-  size_t memSize = SIZE;
-  constexpr auto blocksPerCU = 6;  // to hide latency
-  constexpr auto threadsPerBlock = 256;
-  unsigned blocks = HipTest::setNumBlocks(blocksPerCU,
-                            threadsPerBlock, SIZE);
-  hipGraph_t graph;
-  std::vector<hipGraphNode_t> nodeDependencies;
-
-  HIP_CHECK(hipGraphCreate(&graph, 0));
-  int *A_h{nullptr}, *A_d{nullptr}, *C_d{nullptr}, *C_h{nullptr};
-
-  HipTest::initArrays<int>(&A_d, &C_d, nullptr,
-               &A_h, &C_h, nullptr, SIZE, false);
-
-  hipGraphNode_t memcpyH2D, memcpyD2H, kernelNode;
-
-  // Create memcpy H2D nodes
-  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyH2D, graph, nullptr,
-      0, A_d, A_h, (sizeof(int)*SIZE), hipMemcpyHostToDevice));
-  nodeDependencies.push_back(memcpyH2D);
-  // Creating kernel node
-  hipKernelNodeParams kerNodeParams;
-  void* kernelArgs[] = {reinterpret_cast<void*>(&A_d),
-                        reinterpret_cast<void*>(&C_d),
-                        reinterpret_cast<void*>(&memSize)};
-  kerNodeParams.func = reinterpret_cast<void*>(HipTest::vector_square<int>);
-  kerNodeParams.gridDim = dim3(blocks);
-  kerNodeParams.blockDim = dim3(threadsPerBlock);
-  kerNodeParams.sharedMemBytes = 0;
-  kerNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs);
-  kerNodeParams.extra = nullptr;
-  HIP_CHECK(hipGraphAddKernelNode(&kernelNode, graph, nodeDependencies.data(),
-                                    nodeDependencies.size(), &kerNodeParams));
-  nodeDependencies.clear();
-  nodeDependencies.push_back(kernelNode);
-
-  // Create memcpy D2H nodes
-  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyD2H, graph, nodeDependencies.data(),
-                         nodeDependencies.size(), C_h, C_d, (sizeof(int)*SIZE),
-                         hipMemcpyDeviceToHost));
-  nodeDependencies.clear();
-
-  // Create executable graph
-  hipStream_t streamForGraph;
-  hipGraphExec_t graphExec{nullptr};
-  HIP_CHECK(hipStreamCreate(&streamForGraph));
-  HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr,
-                                nullptr, 0));
-  // Execute graph
-  SECTION("Multiple Graph Launch") {
-    for (int iter = 0; iter < TEST_LOOP_SIZE; iter++) {
-      fillRandInpData(A_h, C_h, SIZE);
-      HIP_CHECK(hipGraphLaunch(graphExec, streamForGraph));
-      HIP_CHECK(hipStreamSynchronize(streamForGraph));
-      validateOutData(A_h, C_h, SIZE);
-    }
-  }
-  SECTION("Graph launch on Null stream") {
-    for (int iter = 0; iter < TEST_LOOP_SIZE; iter++) {
-      fillRandInpData(A_h, C_h, SIZE);
-      HIP_CHECK(hipGraphLaunch(graphExec, 0));
-      HIP_CHECK(hipStreamSynchronize(0));
-      validateOutData(A_h, C_h, SIZE);
-    }
-  }
-
-  HIP_CHECK(hipGraphDestroy(graph));
-  HIP_CHECK(hipGraphExecDestroy(graphExec));
-  HIP_CHECK(hipStreamDestroy(streamForGraph));
-
-  // Free
-  HipTest::freeArrays<int>(A_d, C_d, nullptr, A_h, C_h, nullptr, false);
-}
@@ -0,0 +1,412 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_common.hh>
+#include <hip_test_checkers.hh>
+#include <hip_test_kernels.hh>
+/* Test verifies the hipGraphLaunch API.
+Negative scenarios -
+1) Pass graphExec as nullptr and verify that the API returns an error code.
+2) Pass graphExec as nullptr and stream as hipStreamPerThread and verify that the API returns an error code.
+3) Pass graphExec as an empty object and verify that the API returns an error code.
+4) Destroy the executable graph and try to launch it. The API should not crash and should return an error code.
+5) Destroy the stream and try to launch the respective executable graph. The API should not crash and should return an error code.
+6) Destroy the original graph and try to launch the respective executable graph.
+   The API should execute properly without a crash or an error code.
+Functional scenarios -
+1) Check basic functionality with stream as hipStreamPerThread.
+2) Test the hipGraphLaunch call on multiple devices.
+3) Create a graph with multiple nodes. Create an executable graph.
+   Launch the executable graph 3 times in the stream simultaneously.
+   Wait for the stream. Validate the output. No issues should be observed.
+4) Create a graph with multiple nodes. Create an executable graph.
+   Verify that the executable graph can be launched on the null stream.
+*/
+
+#define SIZE 1024
+#define TEST_LOOP_SIZE 3
+
+TEST_CASE("Unit_hipGraphLaunch_Negative") {  // hipGraphLaunch with null, empty or already-destroyed objects
+  hipError_t ret;
+  SECTION("Pass pGraphExec as nullptr") {
+    hipStream_t stream{};  // value-initialized stream handle; no stream is created
+    ret = hipGraphLaunch(nullptr, stream);
+    REQUIRE(hipErrorInvalidValue == ret);
+  }
+  SECTION("Pass pGraphExec as nullptr and stream as hipStreamPerThread") {
+    ret = hipGraphLaunch(nullptr, hipStreamPerThread);
+    REQUIRE(hipErrorInvalidValue == ret);
+  }
+  SECTION("Pass pGraphExec as empty object") {
+    hipGraphExec_t graphExec{};  // value-initialized handle; never instantiated
+    hipStream_t stream{};
+    ret = hipGraphLaunch(graphExec, stream);
+    REQUIRE(hipErrorInvalidValue == ret);
+  }
+  SECTION("Destroy executable graph and try to launch it") {
+    constexpr size_t Nbytes = 1024;
+    hipGraph_t graph;
+    hipGraphExec_t graphExec;
+    hipStream_t stream;
+    hipGraphNode_t memsetNode;
+
+    char *devData;
+    HIP_CHECK(hipMalloc(&devData, Nbytes));
+
+    HIP_CHECK(hipGraphCreate(&graph, 0));
+    HIP_CHECK(hipStreamCreate(&stream));
+
+    hipMemsetParams memsetParams{};
+    memset(&memsetParams, 0, sizeof(memsetParams));
+    memsetParams.dst = reinterpret_cast<void*>(devData);
+    memsetParams.value = 0;
+    memsetParams.pitch = 0;
+    memsetParams.elementSize = sizeof(char);
+    memsetParams.width = Nbytes;
+    memsetParams.height = 1;
+    HIP_CHECK(hipGraphAddMemsetNode(&memsetNode, graph, nullptr, 0,
+                                    &memsetParams));
+    HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
+    HIP_CHECK(hipGraphLaunch(graphExec, stream));  // first launch, with every object still alive, must succeed
+    HIP_CHECK(hipStreamSynchronize(stream));
+
+    HIP_CHECK(hipGraphExecDestroy(graphExec));
+    // Launch again after the executable graph has been destroyed.
+    ret = hipGraphLaunch(graphExec, stream);
+    REQUIRE(hipErrorInvalidValue == ret);
+
+    HIP_CHECK(hipFree(devData));
+    HIP_CHECK(hipGraphDestroy(graph));
+    HIP_CHECK(hipStreamDestroy(stream));
+  }
+/* On a CUDA setup this API call returns an unknown error (999) in this case,
+   so the test only checks that the result is not hipSuccess, for both AMD and CUDA. */
+  SECTION("Destroy stream and try to launch respective executable graph") {
+    constexpr size_t Nbytes = 1024;
+    hipGraph_t graph;
+    hipGraphExec_t graphExec;
+    hipStream_t stream;
+    hipGraphNode_t memsetNode;
+
+    char *devData;
+    HIP_CHECK(hipMalloc(&devData, Nbytes));
+
+    HIP_CHECK(hipGraphCreate(&graph, 0));
+    HIP_CHECK(hipStreamCreate(&stream));
+
+    hipMemsetParams memsetParams{};
+    memset(&memsetParams, 0, sizeof(memsetParams));
+    memsetParams.dst = reinterpret_cast<void*>(devData);
+    memsetParams.value = 0;
+    memsetParams.pitch = 0;
+    memsetParams.elementSize = sizeof(char);
+    memsetParams.width = Nbytes;
+    memsetParams.height = 1;
+    HIP_CHECK(hipGraphAddMemsetNode(&memsetNode, graph, nullptr, 0,
+                                    &memsetParams));
+    HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
+    HIP_CHECK(hipGraphLaunch(graphExec, stream));
+    HIP_CHECK(hipStreamSynchronize(stream));
+
+    HIP_CHECK(hipStreamDestroy(stream));
+    // Launch again after the stream has been destroyed.
+    ret = hipGraphLaunch(graphExec, stream);
+    REQUIRE(hipSuccess != ret);  // any error code is accepted here
+
+    HIP_CHECK(hipFree(devData));
+    HIP_CHECK(hipGraphExecDestroy(graphExec));
+    HIP_CHECK(hipGraphDestroy(graph));
+  }
+  SECTION("Destroy graph and try to launch respective executable graph") {
+    constexpr size_t Nbytes = 1024;
+    hipGraph_t graph;
+    hipGraphExec_t graphExec;
+    hipStream_t stream;
+    hipGraphNode_t memsetNode;
+
+    char *devData;
+    HIP_CHECK(hipMalloc(&devData, Nbytes));
+
+    HIP_CHECK(hipGraphCreate(&graph, 0));
+    HIP_CHECK(hipStreamCreate(&stream));
+
+    hipMemsetParams memsetParams{};
+    memset(&memsetParams, 0, sizeof(memsetParams));
+    memsetParams.dst = reinterpret_cast<void*>(devData);
+    memsetParams.value = 0;
+    memsetParams.pitch = 0;
+    memsetParams.elementSize = sizeof(char);
+    memsetParams.width = Nbytes;
+    memsetParams.height = 1;
+    HIP_CHECK(hipGraphAddMemsetNode(&memsetNode, graph, nullptr, 0,
+                                    &memsetParams));
+    HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
+    HIP_CHECK(hipGraphLaunch(graphExec, stream));
+    HIP_CHECK(hipStreamSynchronize(stream));
+
+    HIP_CHECK(hipGraphDestroy(graph));
+    // Launch again after the source graph has been destroyed.
+    ret = hipGraphLaunch(graphExec, stream);
+    REQUIRE(hipSuccess == ret);  // unlike the sections above, this launch is expected to succeed
+
+    HIP_CHECK(hipFree(devData));
+    HIP_CHECK(hipGraphExecDestroy(graphExec));
+    HIP_CHECK(hipStreamDestroy(stream));
+  }
+}
+
+TEST_CASE("Unit_hipGraphLaunch_Functional_hipStreamPerThread") {  // two chained memset nodes launched on hipStreamPerThread
+  constexpr size_t N = 1024;
+  constexpr size_t Nbytes = N * sizeof(char);
+  constexpr size_t val = 0;
+  constexpr size_t updateVal = 2;  // value written to A_d and expected back in A_h
+  char *A_d{nullptr}, *B_d{nullptr}, *C_d{nullptr};
+  char *A_h{nullptr}, *B_h{nullptr};
+
+  HipTest::initArrays<char>(&A_d, &B_d, &C_d,
+                            &A_h, &B_h, nullptr, N, false);  // presumably allocates N-element buffers - TODO confirm
+
+  hipGraph_t graph;
+  hipGraphExec_t graphExec;
+  hipGraphNode_t memsetNode;
+
+  HIP_CHECK(hipGraphCreate(&graph, 0));
+
+  hipMemsetParams memsetParams{};
+  memset(&memsetParams, 0, sizeof(memsetParams));
+  memsetParams.dst = reinterpret_cast<void*>(C_d);  // first node fills C_d with val
+  memsetParams.value = val;
+  memsetParams.pitch = 0;
+  memsetParams.elementSize = sizeof(char);
+  memsetParams.width = Nbytes;
+  memsetParams.height = 1;
+  HIP_CHECK(hipGraphAddMemsetNode(&memsetNode, graph, nullptr, 0,
+                                  &memsetParams));
+
+  std::vector<hipGraphNode_t> dependencies;
+  dependencies.push_back(memsetNode);  // the second node is added with the first as its dependency
+
+  memset(&memsetParams, 0, sizeof(memsetParams));
+  memsetParams.dst = reinterpret_cast<void*>(A_d);  // second node fills A_d with updateVal
+  memsetParams.value = updateVal;
+  memsetParams.pitch = 0;
+  memsetParams.elementSize = sizeof(char);
+  memsetParams.width = Nbytes;
+  memsetParams.height = 1;
+  HIP_CHECK(hipGraphAddMemsetNode(&memsetNode, graph, dependencies.data(),
+                                  dependencies.size(), &memsetParams));
+  HIP_CHECK(hipGraphMemsetNodeSetParams(memsetNode, &memsetParams));  // NOTE(review): re-applies the params the node was just created with
+  dependencies.push_back(memsetNode);  // NOTE(review): 'dependencies' is not read again in this test
+
+  HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
+  HIP_CHECK(hipGraphLaunch(graphExec, hipStreamPerThread));
+  HIP_CHECK(hipStreamSynchronize(hipStreamPerThread));
+
+  HIP_CHECK(hipMemcpy(A_h, A_d, Nbytes, hipMemcpyDeviceToHost));
+
+  // Validate the result: every byte copied back from A_d must equal updateVal.
+  for (size_t i = 0; i < Nbytes; i++) {
+    if (A_h[i] != updateVal) {
+      WARN("Validation failed at- " << i << " A_h[i] " << A_h[i]);
+      REQUIRE(false);
+    }
+  }
+
+  HipTest::freeArrays<char>(A_d, B_d, C_d,
+                            A_h, B_h, nullptr, false);
+  HIP_CHECK(hipGraphExecDestroy(graphExec));
+  HIP_CHECK(hipGraphDestroy(graph));
+}
+
+/**
+ * Builds and launches a two-node memset graph on a freshly created stream of
+ * the currently selected device.
+ *
+ * The first memset node writes `val` to C_d; the second depends on it and
+ * writes `updateVal` to A_d. After the stream is synchronized, A_d is copied
+ * to the host and every byte is expected to equal `updateVal`.
+ */
+static void hipGraphLaunch_test() {
+  constexpr size_t N = 1024;
+  constexpr size_t Nbytes = N * sizeof(char);
+  constexpr size_t val = 0;
+  constexpr size_t updateVal = 1;
+  char *A_d{nullptr}, *B_d{nullptr}, *C_d{nullptr};
+  char *A_h{nullptr}, *B_h{nullptr};
+
+  HipTest::initArrays<char>(&A_d, &B_d, &C_d,
+                            &A_h, &B_h, nullptr, N, false);
+
+  hipGraph_t graph;
+  hipGraphExec_t graphExec;
+  hipStream_t streamForGraph;
+  hipGraphNode_t memsetNode;
+
+  HIP_CHECK(hipGraphCreate(&graph, 0));
+  HIP_CHECK(hipStreamCreate(&streamForGraph));
+
+  // Root node: fill C_d with `val`.
+  hipMemsetParams memsetParams{};
+  memset(&memsetParams, 0, sizeof(memsetParams));
+  memsetParams.dst = reinterpret_cast<void*>(C_d);
+  memsetParams.value = val;
+  memsetParams.pitch = 0;
+  memsetParams.elementSize = sizeof(char);
+  memsetParams.width = Nbytes;
+  memsetParams.height = 1;
+  HIP_CHECK(hipGraphAddMemsetNode(&memsetNode, graph, nullptr, 0,
+                                  &memsetParams));
+
+  std::vector<hipGraphNode_t> dependencies;
+  dependencies.push_back(memsetNode);
+
+  // Dependent node: fill A_d with `updateVal` once the root node is done.
+  memset(&memsetParams, 0, sizeof(memsetParams));
+  memsetParams.dst = reinterpret_cast<void*>(A_d);
+  memsetParams.value = updateVal;
+  memsetParams.pitch = 0;
+  memsetParams.elementSize = sizeof(char);
+  memsetParams.width = Nbytes;
+  memsetParams.height = 1;
+  HIP_CHECK(hipGraphAddMemsetNode(&memsetNode, graph, dependencies.data(),
+                                  dependencies.size(), &memsetParams));
+  // Re-apply the same parameters through the setter so that API is exercised too.
+  HIP_CHECK(hipGraphMemsetNodeSetParams(memsetNode, &memsetParams));
+
+  HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
+  HIP_CHECK(hipGraphLaunch(graphExec, streamForGraph));
+  HIP_CHECK(hipStreamSynchronize(streamForGraph));
+
+  HIP_CHECK(hipMemcpy(A_h, A_d, Nbytes, hipMemcpyDeviceToHost));
+
+  // Validating the result
+  for (size_t i = 0; i < Nbytes; i++) {
+    if (A_h[i] != updateVal) {
+      // Stream the byte as an integer: A_h[i] is a char holding a small
+      // numeric value, which would otherwise print as a control character.
+      WARN("Validation failed at- " << i << " A_h[i] " << static_cast<int>(A_h[i]));
+      REQUIRE(false);
+    }
+  }
+
+  HipTest::freeArrays<char>(A_d, B_d, C_d,
+                            A_h, B_h, nullptr, false);
+  HIP_CHECK(hipGraphExecDestroy(graphExec));
+  HIP_CHECK(hipGraphDestroy(graph));
+  HIP_CHECK(hipStreamDestroy(streamForGraph));
+}
+
+// Runs the memset-graph launch scenario (hipGraphLaunch_test) once on every
+// device reported by hipGetDeviceCount.
+TEST_CASE("Unit_hipGraphLaunch_Functional_multidevice_test") {
+  int deviceCount = 0;
+  HIP_CHECK(hipGetDeviceCount(&deviceCount));
+
+  if (deviceCount <= 0) {
+    SUCCEED("Skipped the testcase as there is no device to test.");
+    return;
+  }
+
+  for (int dev = 0; dev < deviceCount; ++dev) {
+    HIP_CHECK(hipSetDevice(dev));
+    hipGraphLaunch_test();
+  }
+}
+
+// Fills both host buffers with N pseudo-random values in [0, 255].
+// The generator seed is taken from the current time.
+static void fillRandInpData(int *A1_h, int *A2_h, size_t N) {
+  unsigned int seed = time(nullptr);
+  size_t idx = 0;
+  while (idx < N) {
+    A1_h[idx] = (HipTest::RAND_R(&seed) & 0xFF);
+    A2_h[idx] = (HipTest::RAND_R(&seed) & 0xFF);
+    ++idx;
+  }
+}
+// Requires every element of A2_h to equal the square of the matching
+// element of A1_h.
+static void validateOutData(int *A1_h, int *A2_h, size_t N) {
+  for (size_t idx = 0; idx < N; ++idx) {
+    const int squared = A1_h[idx] * A1_h[idx];
+    REQUIRE(squared == A2_h[idx]);
+  }
+}
+/*
+ * 1.Create a graph with multiple nodes (H2D memcpy -> vector_square kernel ->
+ * D2H memcpy). Create an executable graph.
+ * Launch the executable graph TEST_LOOP_SIZE times on a created stream,
+ * synchronizing the stream and validating the output after every launch.
+ * No issues should be observed
+ * 2.Create a graph with multiple nodes. Create an executable graph.
+ * Verify that the executable graph can be launched on the null stream.
+*/
+TEST_CASE("Unit_hipGraphLaunch_Functional_MultipleLaunch") {
+  size_t memSize = SIZE;
+  constexpr auto blocksPerCU = 6;  // to hide latency
+  constexpr auto threadsPerBlock = 256;
+  unsigned blocks = HipTest::setNumBlocks(blocksPerCU,
+                            threadsPerBlock, SIZE);
+  hipGraph_t graph;
+  std::vector<hipGraphNode_t> nodeDependencies;
+
+  HIP_CHECK(hipGraphCreate(&graph, 0));
+  int *A_h{nullptr}, *A_d{nullptr}, *C_d{nullptr}, *C_h{nullptr};
+
+  // Only two device/host buffer pairs are needed: A (input) and C (output).
+  HipTest::initArrays<int>(&A_d, &C_d, nullptr,
+               &A_h, &C_h, nullptr, SIZE, false);
+
+  hipGraphNode_t memcpyH2D, memcpyD2H, kernelNode;
+
+  // Create memcpy H2D nodes
+  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyH2D, graph, nullptr,
+      0, A_d, A_h, (sizeof(int)*SIZE), hipMemcpyHostToDevice));
+  nodeDependencies.push_back(memcpyH2D);
+  // Creating kernel node
+  hipKernelNodeParams kerNodeParams;
+  // kernelParams holds the addresses of the arguments, so A_d, C_d and
+  // memSize must stay alive until the kernel node has been added.
+  void* kernelArgs[] = {reinterpret_cast<void*>(&A_d),
+                        reinterpret_cast<void*>(&C_d),
+                        reinterpret_cast<void*>(&memSize)};
+  kerNodeParams.func = reinterpret_cast<void*>(HipTest::vector_square<int>);
+  kerNodeParams.gridDim = dim3(blocks);
+  kerNodeParams.blockDim = dim3(threadsPerBlock);
+  kerNodeParams.sharedMemBytes = 0;
+  kerNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs);
+  kerNodeParams.extra = nullptr;
+  HIP_CHECK(hipGraphAddKernelNode(&kernelNode, graph, nodeDependencies.data(),
+                                    nodeDependencies.size(), &kerNodeParams));
+  // The D2H copy must depend only on the kernel node.
+  nodeDependencies.clear();
+  nodeDependencies.push_back(kernelNode);
+
+  // Create memcpy D2H nodes
+  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyD2H, graph, nodeDependencies.data(),
+                         nodeDependencies.size(), C_h, C_d, (sizeof(int)*SIZE),
+                         hipMemcpyDeviceToHost));
+  nodeDependencies.clear();
+
+  // Create executable graph
+  hipStream_t streamForGraph;
+  hipGraphExec_t graphExec{nullptr};
+  HIP_CHECK(hipStreamCreate(&streamForGraph));
+  HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr,
+                                nullptr, 0));
+  // Execute graph
+  // Each iteration refills the host buffers, launches the same executable
+  // graph, waits for it, and checks C_h[i] == A_h[i] * A_h[i].
+  SECTION("Multiple Graph Launch") {
+    for (int iter = 0; iter < TEST_LOOP_SIZE; iter++) {
+      fillRandInpData(A_h, C_h, SIZE);
+      HIP_CHECK(hipGraphLaunch(graphExec, streamForGraph));
+      HIP_CHECK(hipStreamSynchronize(streamForGraph));
+      validateOutData(A_h, C_h, SIZE);
+    }
+  }
+  // Same loop, but launching on and synchronizing the null stream (0).
+  SECTION("Graph launch on Null stream") {
+    for (int iter = 0; iter < TEST_LOOP_SIZE; iter++) {
+      fillRandInpData(A_h, C_h, SIZE);
+      HIP_CHECK(hipGraphLaunch(graphExec, 0));
+      HIP_CHECK(hipStreamSynchronize(0));
+      validateOutData(A_h, C_h, SIZE);
+    }
+  }
+
+  HIP_CHECK(hipGraphDestroy(graph));
+  HIP_CHECK(hipGraphExecDestroy(graphExec));
+  HIP_CHECK(hipStreamDestroy(streamForGraph));
+
+  // Free
+  HipTest::freeArrays<int>(A_d, C_d, nullptr, A_h, C_h, nullptr, false);
+}
@@ -261,9 +261,10 @@ TEST_CASE("Unit_hipGraphUpload_Functional_With_Priority_Stream") {
 1) Pass graphExec node as nullptr.
 2) Pass graphExec node as uninitialize object
 3) Pass stream as uninitialize object
+4) Pass a graphExec that was destroyed before the upload
 */

-TEST_CASE("Unit_hipGraphUpload_Negative_Argument_Check") {
+TEST_CASE("Unit_hipGraphUpload_Negative_Parameters") {
  hipGraphExec_t graphExec{};
  hipError_t ret;

@@ -271,21 +272,30 @@ TEST_CASE("Unit_hipGraphUpload_Negative_Argument_Check") {
  HIP_CHECK(hipStreamCreate(&stream));

  SECTION("Pass graphExec node as nullptr") {
-    ret = hipGraphUpload(nullptr, stream);
-    REQUIRE(hipErrorInvalidValue == ret);
+    HIP_CHECK_ERROR(hipGraphUpload(nullptr, stream), hipErrorInvalidValue);
  }
  SECTION("Pass graphExec node as uninitialize object") {
-    ret = hipGraphUpload(graphExec, stream);
-    REQUIRE(hipErrorInvalidValue == ret);
+    HIP_CHECK_ERROR(hipGraphUpload(graphExec, stream), hipErrorInvalidValue);
  }
  SECTION("Pass stream as uninitialize object") {
    hipStream_t stream1{};
    hipGraph_t graph;
    HIP_CHECK(hipGraphCreate(&graph, 0));
-    HIP_CHECK(hipGraphInstantiate(&graphExec, graph, NULL, NULL, 0));
+    HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));

    ret = hipGraphUpload(graphExec, stream1);
    REQUIRE(hipSuccess == ret);
  }
+  // The executable graph is uploaded once while valid, destroyed, and then
+  // uploaded again; the second upload is expected to fail with
+  // hipErrorInvalidValue.
+  SECTION("graphExec is destroyed") {
+    hipGraphExec_t graph_exec;
+    hipGraph_t graph;
+
+    HIP_CHECK(hipGraphCreate(&graph, 0));
+    HIP_CHECK(hipGraphInstantiate(&graph_exec, graph, nullptr, nullptr, 0));
+
+    HIP_CHECK(hipGraphUpload(graph_exec, hipStreamPerThread));
+    HIP_CHECK(hipGraphExecDestroy(graph_exec));
+    HIP_CHECK_ERROR(hipGraphUpload(graph_exec, hipStreamPerThread), hipErrorInvalidValue);
+
+    // Release the source graph created by this section so it is not leaked.
+    HIP_CHECK(hipGraphDestroy(graph));
+  }
  HIP_CHECK(hipStreamDestroy(stream));
 }
@@ -4,9 +4,23 @@ set(TEST_SRC
  hipOccupancyMaxActiveBlocksPerMultiprocessor_old.cc
  hipOccupancyMaxPotentialBlockSize.cc
  hipOccupancyMaxPotentialBlockSize_old.cc
+  hipModuleOccupancyMaxPotentialBlockSize.cc
+  hipModuleOccupancyMaxPotentialBlockSizeWithFlags.cc
+  hipModuleOccupancyMaxActiveBlocksPerMultiprocessor.cc
+  hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags.cc
  hipOccupancyMaxPotentialBlockSizeVariableSMemWithFlags.cc
 )

+# Compile simple_kernel.cc with --genco into simple_kernel.code. The module
+# occupancy tests load the kernel at run time via
+# hipModuleLoad(&module, "simple_kernel.code").
+# NOTE(review): "-o simple_kernel.code" is a relative path, so the file lands
+# in the command's working directory -- presumably CMAKE_CURRENT_BINARY_DIR,
+# matching OUTPUT; confirm.
+add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/simple_kernel.code
+                   COMMAND ${CMAKE_CXX_COMPILER} --genco --std=c++17
+                   ${CMAKE_CURRENT_SOURCE_DIR}/simple_kernel.cc
+                   -o simple_kernel.code --rocm-path=${ROCM_PATH}
+                   DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/simple_kernel.cc)
+
+# Target that drives the custom command above; part of the default (ALL) build.
+add_custom_target(simple_kernel ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/simple_kernel.code)
+
 hip_add_exe_to_target(NAME OccupancyTest
                       TEST_SRC ${TEST_SRC}
                       TEST_TARGET_NAME build_tests)
+
+# Make sure the code object is generated whenever OccupancyTest is built.
+add_dependencies(OccupancyTest simple_kernel)
@@ -0,0 +1,92 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+/*
+Testcase Scenarios :
+Unit_hipModuleOccupancyMaxActiveBlocksPerMultiprocessor_Positive_RangeValidation - Test correct
+execution of hipModuleOccupancyMaxActiveBlocksPerMultiprocessor for different parameter values
+Unit_hipModuleOccupancyMaxActiveBlocksPerMultiprocessor_Negative_Parameters - Test unsuccessful
+execution of hipModuleOccupancyMaxActiveBlocksPerMultiprocessor api when parameters are invalid
+*/
+#include "occupancy_common.hh"
+
+/**
+ * Negative test: runs the shared invalid-parameter checks
+ * (MaxActiveBlocksPerMultiprocessorNegative) against
+ * hipModuleOccupancyMaxActiveBlocksPerMultiprocessor, using SimpleKernel
+ * loaded from simple_kernel.code.
+ */
+TEST_CASE("Unit_hipModuleOccupancyMaxActiveBlocksPerMultiprocessor_Negative_Parameters") {
+  hipModule_t module;
+  hipFunction_t function;
+  int blockSize = 0;
+  int gridSize = 0;
+
+  // NOTE(review): presumably forces runtime initialization before the module
+  // API is used -- confirm.
+  HIP_CHECK(hipFree(nullptr));
+
+  HIP_CHECK(hipModuleLoad(&module, "simple_kernel.code"));
+  HIP_CHECK(hipModuleGetFunction(&function, module, "SimpleKernel"));
+
+  // Get potential blocksize
+  HIP_CHECK(hipModuleOccupancyMaxPotentialBlockSize(&gridSize, &blockSize, function, 0, 0));
+
+  // Common negative tests
+  MaxActiveBlocksPerMultiprocessorNegative(
+      [&function](int* numBlocks, int blockSize, size_t dynSharedMemPerBlk) {
+        return hipModuleOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, function, blockSize,
+                                                                  dynSharedMemPerBlk);
+      },
+      blockSize);
+
+  HIP_CHECK(hipModuleUnload(module));
+}
+
+/**
+ * Positive test: queries hipModuleOccupancyMaxActiveBlocksPerMultiprocessor for
+ * SimpleKernel with the block size suggested by
+ * hipModuleOccupancyMaxPotentialBlockSize -- once with no dynamic shared
+ * memory and once with devProp.sharedMemPerBlock bytes -- and passes the query
+ * to the shared MaxActiveBlocksPerMultiprocessor checker.
+ */
+TEST_CASE("Unit_hipModuleOccupancyMaxActiveBlocksPerMultiprocessor_Positive_RangeValidation") {
+  hipDeviceProp_t devProp;
+  hipModule_t module;
+  hipFunction_t function;
+  int blockSize = 0;
+  int gridSize = 0;
+
+  // NOTE(review): presumably forces runtime initialization before the module
+  // API is used -- confirm.
+  HIP_CHECK(hipFree(nullptr));
+
+  HIP_CHECK(hipModuleLoad(&module, "simple_kernel.code"));
+  HIP_CHECK(hipModuleGetFunction(&function, module, "SimpleKernel"));
+
+  HIP_CHECK(hipGetDeviceProperties(&devProp, 0));
+
+  SECTION("dynSharedMemPerBlk = 0") {
+    // Get potential blocksize
+    HIP_CHECK(hipModuleOccupancyMaxPotentialBlockSize(&gridSize, &blockSize, function, 0, 0));
+
+    MaxActiveBlocksPerMultiprocessor(
+        [blockSize, &function](int* numBlocks) {
+          return hipModuleOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, function, blockSize,
+                                                                    0);
+        },
+        blockSize, devProp.maxThreadsPerMultiProcessor);
+  }
+  SECTION("dynSharedMemPerBlk = sharedMemPerBlock") {
+    // Get potential blocksize
+    HIP_CHECK(hipModuleOccupancyMaxPotentialBlockSize(&gridSize, &blockSize, function,
+                                                      devProp.sharedMemPerBlock, 0));
+
+    // devProp is captured by reference, like function, to avoid copying the
+    // whole property struct into the closure.
+    MaxActiveBlocksPerMultiprocessor(
+        [blockSize, &devProp, &function](int* numBlocks) {
+          return hipModuleOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, function, blockSize,
+                                                                    devProp.sharedMemPerBlock);
+        },
+        blockSize, devProp.maxThreadsPerMultiProcessor);
+  }
+
+  HIP_CHECK(hipModuleUnload(module));
+}
@@ -0,0 +1,103 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+/*
+Testcase Scenarios :
+Unit_hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_Positive_RangeValidation - Test
+correct execution of hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags for different
+parameter values
+Unit_hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_Negative_Parameters - Test
+unsuccessful execution of hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags api when
+parameters are invalid
+*/
+#include "occupancy_common.hh"
+
+/**
+ * Negative test: runs the shared invalid-parameter checks
+ * (MaxActiveBlocksPerMultiprocessorNegative) against
+ * hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags and additionally
+ * expects hipErrorInvalidValue for a flag value other than hipOccupancyDefault.
+ */
+TEST_CASE("Unit_hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_Negative_Parameters") {
+  hipModule_t module;
+  hipFunction_t function;
+  int numBlocks = 0;
+  int blockSize = 0;
+  int gridSize = 0;
+
+  // NOTE(review): presumably forces runtime initialization before the module
+  // API is used -- confirm.
+  HIP_CHECK(hipFree(nullptr));
+
+  HIP_CHECK(hipModuleLoad(&module, "simple_kernel.code"));
+  HIP_CHECK(hipModuleGetFunction(&function, module, "SimpleKernel"));
+
+  // Get potential blocksize
+  HIP_CHECK(hipModuleOccupancyMaxPotentialBlockSize(&gridSize, &blockSize, function, 0, 0));
+
+  // Common negative tests
+  MaxActiveBlocksPerMultiprocessorNegative(
+      [&function](int* numBlocks, int blockSize, size_t dynSharedMemPerBlk) {
+        return hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
+            numBlocks, function, blockSize, dynSharedMemPerBlk, hipOccupancyDefault);
+      },
+      blockSize);
+
+  SECTION("Flag is invalid") {
+    // Only default flag is supported
+    HIP_CHECK_ERROR(hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
+                        &numBlocks, function, blockSize, 0, 2),
+                    hipErrorInvalidValue);
+  }
+
+  HIP_CHECK(hipModuleUnload(module));
+}
+
+/**
+ * Positive test: queries
+ * hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags (flag
+ * hipOccupancyDefault) for SimpleKernel with the block size suggested by
+ * hipModuleOccupancyMaxPotentialBlockSize -- once with no dynamic shared
+ * memory and once with devProp.sharedMemPerBlock bytes -- and passes the query
+ * to the shared MaxActiveBlocksPerMultiprocessor checker.
+ */
+TEST_CASE(
+    "Unit_hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_Positive_RangeValidation") {
+  hipDeviceProp_t devProp;
+  hipModule_t module;
+  hipFunction_t function;
+  int blockSize = 0;
+  int gridSize = 0;
+
+  // NOTE(review): presumably forces runtime initialization before the module
+  // API is used -- confirm.
+  HIP_CHECK(hipFree(nullptr));
+
+  HIP_CHECK(hipModuleLoad(&module, "simple_kernel.code"));
+  HIP_CHECK(hipModuleGetFunction(&function, module, "SimpleKernel"));
+
+  HIP_CHECK(hipGetDeviceProperties(&devProp, 0));
+
+  SECTION("dynSharedMemPerBlk = 0") {
+    // Get potential blocksize
+    HIP_CHECK(hipModuleOccupancyMaxPotentialBlockSize(&gridSize, &blockSize, function, 0, 0));
+
+    MaxActiveBlocksPerMultiprocessor(
+        [blockSize, &function](int* numBlocks) {
+          return hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
+              numBlocks, function, blockSize, 0, hipOccupancyDefault);
+        },
+        blockSize, devProp.maxThreadsPerMultiProcessor);
+  }
+  SECTION("dynSharedMemPerBlk = sharedMemPerBlock") {
+    // Get potential blocksize
+    HIP_CHECK(hipModuleOccupancyMaxPotentialBlockSize(&gridSize, &blockSize, function,
+                                                      devProp.sharedMemPerBlock, 0));
+
+    // devProp is captured by reference, like function, to avoid copying the
+    // whole property struct into the closure.
+    MaxActiveBlocksPerMultiprocessor(
+        [blockSize, &devProp, &function](int* numBlocks) {
+          return hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
+              numBlocks, function, blockSize, devProp.sharedMemPerBlock, hipOccupancyDefault);
+        },
+        blockSize, devProp.maxThreadsPerMultiProcessor);
+  }
+
+  HIP_CHECK(hipModuleUnload(module));
+}
@@ -0,0 +1,75 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+/*
+Testcase Scenarios :
+Unit_hipModuleOccupancyMaxPotentialBlockSize_Positive_RangeValidation - Test correct execution of
+hipModuleOccupancyMaxPotentialBlockSize for different parameter values
+Unit_hipModuleOccupancyMaxPotentialBlockSize_Negative_Parameters - Test unsuccessful execution of
+hipModuleOccupancyMaxPotentialBlockSize api when parameters are invalid
+*/
+#include "occupancy_common.hh"
+
+/**
+ * Negative test: runs the shared invalid-parameter checks
+ * (MaxPotentialBlockSizeNegative) against
+ * hipModuleOccupancyMaxPotentialBlockSize, using SimpleKernel loaded from
+ * simple_kernel.code.
+ */
+TEST_CASE("Unit_hipModuleOccupancyMaxPotentialBlockSize_Negative_Parameters") {
+  hipModule_t module;
+  hipFunction_t function;
+
+  // NOTE(review): presumably forces runtime initialization before the module
+  // API is used -- confirm.
+  HIP_CHECK(hipFree(nullptr));
+
+  HIP_CHECK(hipModuleLoad(&module, "simple_kernel.code"));
+  HIP_CHECK(hipModuleGetFunction(&function, module, "SimpleKernel"));
+
+  // Common negative tests
+  MaxPotentialBlockSizeNegative([&function](int* gridSize, int* blockSize) {
+    return hipModuleOccupancyMaxPotentialBlockSize(gridSize, blockSize, function, 0, 0);
+  });
+
+  HIP_CHECK(hipModuleUnload(module));
+}
+
+/**
+ * Positive test: passes hipModuleOccupancyMaxPotentialBlockSize queries for
+ * SimpleKernel to the shared MaxPotentialBlockSize checker together with
+ * devProp.maxThreadsPerBlock. One section uses no dynamic shared memory and no
+ * block size limit; the other uses devProp.sharedMemPerBlock and
+ * devProp.maxThreadsPerBlock.
+ */
+TEST_CASE("Unit_hipModuleOccupancyMaxPotentialBlockSize_Positive_RangeValidation") {
+  hipDeviceProp_t devProp;
+  hipModule_t module;
+  hipFunction_t function;
+
+  // NOTE(review): presumably forces runtime initialization before the module
+  // API is used -- confirm.
+  HIP_CHECK(hipFree(nullptr));
+
+  HIP_CHECK(hipModuleLoad(&module, "simple_kernel.code"));
+  HIP_CHECK(hipModuleGetFunction(&function, module, "SimpleKernel"));
+
+  HIP_CHECK(hipGetDeviceProperties(&devProp, 0));
+
+  SECTION("dynSharedMemPerBlk = 0, blockSizeLimit = 0") {
+    MaxPotentialBlockSize(
+        [&function](int* gridSize, int* blockSize) {
+          return hipModuleOccupancyMaxPotentialBlockSize(gridSize, blockSize, function, 0, 0);
+        },
+        devProp.maxThreadsPerBlock);
+  }
+
+  SECTION("dynSharedMemPerBlk = sharedMemPerBlock, blockSizeLimit = maxThreadsPerBlock") {
+    // devProp is captured by reference, like function, to avoid copying the
+    // whole property struct into the closure.
+    MaxPotentialBlockSize(
+        [&function, &devProp](int* gridSize, int* blockSize) {
+          return hipModuleOccupancyMaxPotentialBlockSize(
+              gridSize, blockSize, function, devProp.sharedMemPerBlock, devProp.maxThreadsPerBlock);
+        },
+        devProp.maxThreadsPerBlock);
+  }
+
+  HIP_CHECK(hipModuleUnload(module));
+}
@@ -0,0 +1,87 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+/*
+Testcase Scenarios :
+Unit_hipModuleOccupancyMaxPotentialBlockSizeWithFlags_Positive_RangeValidation - Test correct
+execution of hipModuleOccupancyMaxPotentialBlockSizeWithFlags for different parameter values
+Unit_hipModuleOccupancyMaxPotentialBlockSizeWithFlags_Negative_Parameters - Test unsuccessful
+execution of hipModuleOccupancyMaxPotentialBlockSizeWithFlags api when parameters are invalid
+*/
+#include "occupancy_common.hh"
+
+/**
+ * Negative test: runs the shared invalid-parameter checks
+ * (MaxPotentialBlockSizeNegative) against
+ * hipModuleOccupancyMaxPotentialBlockSizeWithFlags and additionally expects
+ * hipErrorInvalidValue for a flag value other than hipOccupancyDefault.
+ */
+TEST_CASE("Unit_hipModuleOccupancyMaxPotentialBlockSizeWithFlags_Negative_Parameters") {
+  hipModule_t module;
+  hipFunction_t function;
+  int blockSize = 0;
+  int gridSize = 0;
+
+  // NOTE(review): presumably forces runtime initialization before the module
+  // API is used -- confirm.
+  HIP_CHECK(hipFree(nullptr));
+
+  HIP_CHECK(hipModuleLoad(&module, "simple_kernel.code"));
+  HIP_CHECK(hipModuleGetFunction(&function, module, "SimpleKernel"));
+
+  // Common negative tests
+  MaxPotentialBlockSizeNegative([&function](int* gridSize, int* blockSize) {
+    return hipModuleOccupancyMaxPotentialBlockSizeWithFlags(gridSize, blockSize, function, 0, 0,
+                                                            hipOccupancyDefault);
+  });
+
+  SECTION("Flag is invalid") {
+    // Only default flag is supported
+    HIP_CHECK_ERROR(
+        hipModuleOccupancyMaxPotentialBlockSizeWithFlags(&gridSize, &blockSize, function, 0, 0, 2),
+        hipErrorInvalidValue);
+  }
+
+  HIP_CHECK(hipModuleUnload(module));
+}
+
+/**
+ * Positive test: passes hipModuleOccupancyMaxPotentialBlockSizeWithFlags
+ * queries (flag hipOccupancyDefault) for SimpleKernel to the shared
+ * MaxPotentialBlockSize checker together with devProp.maxThreadsPerBlock. One
+ * section uses no dynamic shared memory and no block size limit; the other
+ * uses devProp.sharedMemPerBlock and devProp.maxThreadsPerBlock.
+ */
+TEST_CASE("Unit_hipModuleOccupancyMaxPotentialBlockSizeWithFlags_Positive_RangeValidation") {
+  hipDeviceProp_t devProp;
+  hipModule_t module;
+  hipFunction_t function;
+
+  // NOTE(review): presumably forces runtime initialization before the module
+  // API is used -- confirm.
+  HIP_CHECK(hipFree(nullptr));
+
+  HIP_CHECK(hipModuleLoad(&module, "simple_kernel.code"));
+  HIP_CHECK(hipModuleGetFunction(&function, module, "SimpleKernel"));
+
+  HIP_CHECK(hipGetDeviceProperties(&devProp, 0));
+
+  SECTION("dynSharedMemPerBlk = 0, blockSizeLimit = 0") {
+    MaxPotentialBlockSize(
+        [&function](int* gridSize, int* blockSize) {
+          return hipModuleOccupancyMaxPotentialBlockSizeWithFlags(gridSize, blockSize, function, 0,
+                                                                  0, hipOccupancyDefault);
+        },
+        devProp.maxThreadsPerBlock);
+  }
+
+  SECTION("dynSharedMemPerBlk = sharedMemPerBlock, blockSizeLimit = maxThreadsPerBlock") {
+    // devProp is captured by reference, like function, to avoid copying the
+    // whole property struct into the closure.
+    MaxPotentialBlockSize(
+        [&function, &devProp](int* gridSize, int* blockSize) {
+          return hipModuleOccupancyMaxPotentialBlockSizeWithFlags(
+              gridSize, blockSize, function, devProp.sharedMemPerBlock, devProp.maxThreadsPerBlock,
+              hipOccupancyDefault);
+        },
+        devProp.maxThreadsPerBlock);
+  }
+
+  HIP_CHECK(hipModuleUnload(module));
+}
@@ -66,7 +66,5 @@ template <typename F> void MaxActiveBlocksPerMultiprocessorNegative(F func, int
  SECTION("numBlocks is nullptr") {
    HIP_CHECK_ERROR(func(nullptr, blockSize, 0), hipErrorInvalidValue);
  }
-  SECTION("Block size is 0") { 
-    HIP_CHECK_ERROR(func(&numBlocks, 0, 0), hipErrorInvalidValue);
-  }
+  SECTION("Block size is 0") { HIP_CHECK_ERROR(func(&numBlocks, 0, 0), hipErrorInvalidValue); }
 }
@@ -0,0 +1,25 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "hip/hip_runtime.h"
+
+// Minimal kernel: each thread copies a[threadIdx.x] into b[threadIdx.x].
+extern "C" __global__ void SimpleKernel(int* a, int* b) {
+  const int lane = threadIdx.x;
+  b[lane] = a[lane];
+}
@@ -0,0 +1,9 @@
+# Common Tests - Test independent of all platforms
+set(TEST_SRC
+  warp_shfl_xor.cc
+  warp_shfl.cc
+)
+
+# Build the WarpTest executable from the sources listed in TEST_SRC.
+# NOTE(review): presumably attaches it to the build_tests target -- confirm
+# against the hip_add_exe_to_target definition.
+hip_add_exe_to_target(NAME WarpTest
+                      TEST_SRC ${TEST_SRC}
+                      TEST_TARGET_NAME build_tests)
@@ -0,0 +1,84 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#include <hip_test_common.hh>
+#include <hip/hip_cooperative_groups.h>
+
+// Returns true when the calling thread's bit is clear in its warp's entry of
+// active_masks. The entry index is
+// (linear block rank) * (warps per block) + (warp index within the block),
+// and the bit position is the thread's rank within its warp-sized tile.
+static __device__ bool deactivate_thread(const uint64_t* const active_masks) {
+  const auto block = cooperative_groups::this_thread_block();
+  const auto warp = cooperative_groups::tiled_partition(block, warpSize);
+
+  const auto warps_per_block = (block.size() + warpSize - 1) / warpSize;
+  const auto block_rank = (blockIdx.z * gridDim.y + blockIdx.y) * gridDim.x + blockIdx.x;
+  const auto mask_slot = block_rank * warps_per_block + block.thread_rank() / warpSize;
+
+  const uint64_t lane_bit = static_cast<uint64_t>(1) << warp.thread_rank();
+  return (active_masks[mask_slot] & lane_bit) == 0;
+}
+
+// Returns a reference to one function-local std::mt19937 engine, seeded from
+// std::random_device the first time the function is called.
+static inline std::mt19937& GetRandomGenerator() {
+  static std::mt19937 engine = [] {
+    std::random_device entropy;
+    return std::mt19937(entropy());
+  }();
+  return engine;
+}
+
+// Draws one integer from std::uniform_int_distribution<T>(min, max) using the
+// engine returned by GetRandomGenerator().
+template <typename T> static inline T GenerateRandomInteger(const T min, const T max) {
+  auto& engine = GetRandomGenerator();
+  std::uniform_int_distribution<T> picker(min, max);
+  return picker(engine);
+}
+
+// Draws one value from std::uniform_real_distribution<T>(min, max) using the
+// engine returned by GetRandomGenerator().
+template <typename T> static inline T GenerateRandomReal(const T min, const T max) {
+  auto& engine = GetRandomGenerator();
+  std::uniform_real_distribution<T> picker(min, max);
+  return picker(engine);
+}
+
+// Catch2 generator for shuffle widths: yields 1 << e for each exponent e
+// produced by range(1, exponent + 1), where exponent counts how many times
+// warp_size can be shifted right by one before it becomes zero.
+inline int generate_width(int warp_size) {
+  int exponent = 0;
+  for (int remaining = warp_size >> 1; remaining != 0; remaining >>= 1) {
+    ++exponent;
+  }
+
+  return GENERATE_COPY(map([](int e) { return 1 << e; }, range(1, exponent + 1)));
+}
+
+/**
+ * Returns an active-lane bit mask for a warp; the pattern is chosen by
+ * warp_id % 5. Bit i of the result stands for lane i of the warp.
+ *
+ * @param warp_id   Warp index; only warp_id % 5 is used, to pick the pattern.
+ * @param warp_size Number of lanes per warp; used only by the half-warp patterns.
+ * @return 64-bit mask. The alternating and all-lanes patterns always fill all
+ *         64 bits, regardless of warp_size.
+ */
+inline uint64_t get_active_mask(unsigned int warp_id, unsigned int warp_size) {
+  uint64_t active_mask = 0;
+  switch (warp_id % 5) {
+    case 0:  // odd-numbered lanes (bits 1, 3, 5, ...)
+      active_mask = 0xAAAAAAAAAAAAAAAA;
+      break;
+    case 1:  // even-numbered lanes (bits 0, 2, 4, ...)
+      active_mask = 0x5555555555555555;
+      break;
+    case 2:  // first half of the warp
+      // Unsigned index: avoids comparing a signed counter with warp_size.
+      for (unsigned int i = 0; i < warp_size / 2; i++) {
+        active_mask = active_mask | (static_cast<uint64_t>(1) << i);
+      }
+      break;
+    case 3:  // second half of the warp
+      for (unsigned int i = warp_size / 2; i < warp_size; i++) {
+        active_mask = active_mask | (static_cast<uint64_t>(1) << i);
+      }
+      break;
+    case 4:  // all threads
+      active_mask = 0xFFFFFFFFFFFFFFFF;
+      break;
+  }
+  return active_mask;
+}
@@ -0,0 +1,121 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "warp_shfl_common.hh"
+
+#include <bitset>
+
+/**
+ * @addtogroup shfl shfl
+ * @{
+ * @ingroup DeviceLanguageTest
+ * `T __shfl(T var, int src_lane, int width = warpSize)` -
+ * Contains unit test for warp shfl function
+ */
+
+namespace cg = cooperative_groups;
+
+/**
+ * Test kernel for `__shfl`. Every thread that stays active reads its own element of `in`,
+ * shuffles it with the source lane stored at `src_lanes[thread rank in block % width]`, and
+ * writes the received value to its own element of `out`.
+ *
+ * @param out          Output array, indexed by the thread's rank in the grid.
+ * @param in           Input array, indexed by the thread's rank in the grid.
+ * @param active_masks Per-warp lane masks consumed by deactivate_thread().
+ * @param src_lanes    Source lane table; indices [0, width) are read.
+ * @param width        Width argument forwarded to `__shfl`.
+ */
+template <typename T>
+__global__ void shfl(T* const out, const T* const in, const uint64_t* const active_masks,
+                     const uint8_t* const src_lanes, const int width) {
+  // Deactivated threads leave before the shuffle and never write to `out`.
+  if (!deactivate_thread(active_masks)) {
+    const auto global_rank = cg::this_grid().thread_rank();
+    const auto lane_slot = cg::this_thread_block().thread_rank() % width;
+    const T value = in[global_rank];
+    out[global_rank] = __shfl(value, src_lanes[lane_slot], width);
+  }
+}
+
+/**
+ * Host-side half of the `__shfl` test. WarpShflTest::run() calls launch_kernel() to start the
+ * kernel and validate() to compare the device output with values computed on the host.
+ */
+template <typename T> class WarpShfl : public WarpShflTest<WarpShfl<T>, T> {
+ public:
+  /**
+   * Picks a width, generates one random source lane per position of a width-sized partition,
+   * uploads the table and launches the `shfl` kernel.
+   *
+   * @param arr_dev      Device output buffer.
+   * @param input_dev    Device input buffer.
+   * @param active_masks Device array holding one lane mask per warp.
+   */
+  void launch_kernel(T* const arr_dev, T* const input_dev, const uint64_t* const active_masks) {
+    width_ = generate_width(this->warp_size_);
+    INFO("Width: " << width_);
+
+    // Source lanes are drawn from [0, 2 * width_], i.e. deliberately also beyond the partition.
+    src_lanes_.resize(width_);
+    const auto random_lane = [this] {
+      return GenerateRandomInteger(0, static_cast<int>(2 * width_));
+    };
+    std::generate(src_lanes_.begin(), src_lanes_.end(), random_lane);
+
+    const auto table_bytes = width_ * sizeof(uint8_t);
+    LinearAllocGuard<uint8_t> src_lanes_dev(LinearAllocs::hipMalloc, table_bytes);
+    HIP_CHECK(
+        hipMemcpy(src_lanes_dev.ptr(), src_lanes_.data(), table_bytes, hipMemcpyHostToDevice));
+
+    shfl<<<this->grid_.grid_dim_, this->grid_.block_dim_>>>(arr_dev, input_dev, active_masks,
+                                                            src_lanes_dev.ptr(), width_);
+  }
+
+  /**
+   * Checks the kernel output. For every thread the expected value is the input element of the
+   * lane it shuffled from; std::nullopt is produced when no value can be predicted (the thread
+   * or its source lane is inactive, or the source lies past the end of the block).
+   * NOTE(review): how ArrayAllOf treats std::nullopt is defined outside this file - presumably
+   * the element is skipped; confirm against its implementation.
+   *
+   * @param arr   Host copy of the kernel output.
+   * @param input Host copy of the kernel input.
+   */
+  void validate(const T* const arr, const T* const input) {
+    ArrayAllOf(arr, this->grid_.thread_count_, [this, &input](unsigned int i) -> std::optional<T> {
+      const auto block_rank = this->grid_.thread_rank_in_block(i).value();
+      const auto lane = block_rank % this->warp_size_;
+      const auto partition_lane = block_rank % width_;
+      // The kernel passes the raw table entry; the expected source lane is that entry modulo
+      // the width.
+      const int src_lane = src_lanes_[partition_lane] % width_;
+      const int src_offset = src_lane - partition_lane;
+
+      // One mask per warp, laid out block after block.
+      const auto block_index = i / this->grid_.threads_in_block_count_;
+      const auto mask_idx = this->warps_in_block_ * block_index + block_rank / this->warp_size_;
+      const std::bitset<sizeof(uint64_t) * 8> active_mask(this->active_masks_[mask_idx]);
+
+      if (!active_mask.test(lane)) {
+        return std::nullopt;
+      }
+      if (!active_mask.test(lane + src_offset)) {
+        return std::nullopt;
+      }
+      if (block_rank + src_offset >= this->grid_.threads_in_block_count_) {
+        return std::nullopt;
+      }
+
+      return input[i + src_offset];
+    });
+  }
+
+ private:
+  std::vector<uint8_t> src_lanes_;  // host copy of the source lane table, width_ entries
+  int width_;                       // shuffle width chosen in launch_kernel()
+};
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Validates the warp shuffle behavior for all valid width sizes {2, 4, 8, 16, 32,
+ * 64(if supported)} for generated shuffle target lanes. The threads are deactivated based on the
+ * passed active mask. The test is run for all overloads of shfl.
+ * Test source
+ * ------------------------
+ *  - unit/warp/warp_shfl.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ *  - Device supports warp shuffle
+ */
+TEMPLATE_TEST_CASE("Unit_Warp_Shfl_Positive_Basic", "", int, unsigned int, long, unsigned long,
+                   long long, unsigned long long, float, double) {
+  // Look up the properties of the device the test is going to run on.
+  int current_device;
+  HIP_CHECK(hipGetDevice(&current_device));
+  hipDeviceProp_t props;
+  HIP_CHECK(hipGetDeviceProperties(&props, current_device));
+
+  // Nothing to test when the device does not report warp shuffle support.
+  const bool shuffle_supported = props.arch.hasWarpShuffle;
+  if (!shuffle_supported) {
+    HipTest::HIP_SKIP_TEST("Device doesn't support Warp Shuffle!");
+    return;
+  }
+
+  // Deterministic masks (get_active_mask) and counting input values.
+  SECTION("Shfl with specified active mask and input values") {
+    WarpShfl<TestType> test;
+    test.run(false);
+  }
+
+  // Randomly generated masks and input values.
+  SECTION("Shfl with random active mask and input values") {
+    WarpShfl<TestType> test;
+    test.run(true);
+  }
+}
@@ -0,0 +1,114 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#include "warp_common.hh"
+
+#include <cpu_grid.h>
+#include <resource_guards.hh>
+#include <utils.hh>
+
+/**
+ * CRTP base class shared by the warp shuffle tests.
+ *
+ * `Derived` has to provide two member functions that run() calls:
+ *  - `launch_kernel(T* out_dev, T* in_dev, const uint64_t* active_masks_dev)`
+ *  - `validate(const T* out_host, const T* in_host)`
+ *
+ * @tparam Derived Concrete test class deriving from WarpShflTest<Derived, T>.
+ * @tparam T       Element type that is shuffled.
+ */
+template <typename Derived, typename T> class WarpShflTest {
+ public:
+  WarpShflTest() : warp_size_{get_warp_size()} {}
+
+  /**
+   * Runs one test pass: picks grid and block dimensions, allocates the buffers, generates the
+   * lane masks and input values, launches the derived kernel and validates its output.
+   *
+   * @param random When true, lane masks and input values are random; otherwise the masks come
+   *               from get_active_mask() and the input is the sequence 1, 2, 3, ...
+   */
+  void run(bool random = false) {
+    const auto blocks = GenerateBlockDimensionsForShuffle();
+    INFO("Grid dimensions: x " << blocks.x << ", y " << blocks.y << ", z " << blocks.z);
+    const auto threads = GenerateThreadDimensionsForShuffle();
+    INFO("Block dimensions: x " << threads.x << ", y " << threads.y << ", z " << threads.z);
+    grid_ = CPUGrid(blocks, threads);
+
+    // One element of T per thread, for input and output, on both host and device.
+    const auto alloc_size = grid_.thread_count_ * sizeof(T);
+    LinearAllocGuard<T> input_dev(LinearAllocs::hipMalloc, alloc_size);
+    LinearAllocGuard<T> input(LinearAllocs::hipHostMalloc, alloc_size);
+    LinearAllocGuard<T> arr_dev(LinearAllocs::hipMalloc, alloc_size);
+    LinearAllocGuard<T> arr(LinearAllocs::hipHostMalloc, alloc_size);
+    HIP_CHECK(hipMemset(arr_dev.ptr(), 0, alloc_size));
+
+    // One 64-bit lane mask per warp; a partially filled last warp of a block counts as a warp.
+    warps_in_block_ = (grid_.threads_in_block_count_ + warp_size_ - 1) / warp_size_;
+    const auto warps_in_grid = warps_in_block_ * grid_.block_count_;
+    LinearAllocGuard<uint64_t> active_masks_dev(LinearAllocs::hipMalloc,
+                                                warps_in_grid * sizeof(uint64_t));
+    active_masks_.resize(warps_in_grid);
+
+    generate_input(input.ptr(), random);
+
+    HIP_CHECK(hipMemcpy(active_masks_dev.ptr(), active_masks_.data(),
+                        warps_in_grid * sizeof(uint64_t), hipMemcpyHostToDevice));
+    HIP_CHECK(hipMemcpy(input_dev.ptr(), input.ptr(), alloc_size, hipMemcpyHostToDevice));
+    cast_to_derived().launch_kernel(arr_dev.ptr(), input_dev.ptr(), active_masks_dev.ptr());
+    HIP_CHECK(hipGetLastError());
+    HIP_CHECK(hipMemcpy(arr.ptr(), arr_dev.ptr(), alloc_size, hipMemcpyDeviceToHost));
+    HIP_CHECK(hipDeviceSynchronize());
+
+    cast_to_derived().validate(arr.ptr(), input.ptr());
+  }
+
+ private:
+  /** Returns the warp size of the currently selected device. */
+  int get_warp_size() const {
+    int current_dev = -1;
+    HIP_CHECK(hipGetDevice(&current_dev));
+    int warp_size = 0;
+    // Query the device the test actually runs on rather than a hard-coded device 0.
+    HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, current_dev));
+    return warp_size;
+  }
+
+  /**
+   * Fills active_masks_ (already sized to one entry per warp) and `input` (one element per
+   * thread of the grid).
+   *
+   * @param input  Host buffer with room for grid_.thread_count_ elements.
+   * @param random Selects random data (true) or the deterministic patterns (false).
+   */
+  void generate_input(T* input, bool random) {
+    if (random) {
+      std::generate(active_masks_.begin(), active_masks_.end(), [] {
+        return GenerateRandomInteger<unsigned long long>(0ul, std::numeric_limits<uint64_t>().max());
+      });
+
+      if constexpr (std::is_same_v<float, T> || std::is_same_v<double, T>) {
+        // For floating-point types numeric_limits<T>::min() is the smallest positive normalized
+        // value, so the generated reals are all positive.
+        std::generate_n(input, grid_.thread_count_, [] {
+          return static_cast<T>(
+              GenerateRandomReal(std::numeric_limits<T>().min(), std::numeric_limits<T>().max()));
+        });
+      } else {
+        std::generate_n(input, grid_.thread_count_, [] {
+          return static_cast<T>(GenerateRandomInteger(std::numeric_limits<T>().min(),
+                                                      std::numeric_limits<T>().max()));
+        });
+      }
+    } else {
+      // Warp k receives the pattern get_active_mask(k, warp_size_).
+      unsigned long long int next = 0;
+      std::generate(active_masks_.begin(), active_masks_.end(),
+                    [this, &next]() { return get_active_mask(next++, warp_size_); });
+
+      // Thread k receives the value k + 1 converted to T. The former guard
+      // `static_cast<T>(i) > numeric_limits<T>::max()` could never be true and was dropped.
+      next = 0;
+      std::generate_n(input, grid_.thread_count_, [&next]() { return static_cast<T>(++next); });
+    }
+  }
+
+  // CRTP downcast: Derived inherits from this class, so static_cast is the correct conversion.
+  Derived& cast_to_derived() { return static_cast<Derived&>(*this); }
+
+ protected:
+  const int warp_size_;                 // warp size of the current device
+  CPUGrid grid_;                        // host-side description of the launch geometry
+  unsigned int warps_in_block_{0};      // warps per block, set by run()
+  std::vector<uint64_t> active_masks_;  // one lane mask per warp of the grid
+};
@@ -0,0 +1,118 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "warp_shfl_common.hh"
+
+#include <bitset>
+
+/**
+ * @addtogroup shfl_xor shfl_xor
+ * @{
+ * @ingroup DeviceLanguageTest
+ * `T __shfl_xor(T var, int lane_mask, int width = warpSize)` -
+ * Contains unit test for warp shfl_xor function
+ */
+
+namespace cg = cooperative_groups;
+
+/**
+ * Test kernel for `__shfl_xor`. Every thread that stays active reads its own element of `in`,
+ * performs the xor shuffle with the given lane mask and width, and writes the received value to
+ * its own element of `out`.
+ *
+ * @param out          Output array, indexed by the thread's rank in the grid.
+ * @param in           Input array, indexed by the thread's rank in the grid.
+ * @param active_masks Per-warp lane masks consumed by deactivate_thread().
+ * @param lane_mask    Lane mask argument forwarded to `__shfl_xor`.
+ * @param width        Width argument forwarded to `__shfl_xor`.
+ */
+template <typename T>
+__global__ void shfl_xor(T* const out, const T* const in, const uint64_t* const active_masks,
+                         const int lane_mask, const int width) {
+  // Deactivated threads leave before the shuffle and never write to `out`.
+  if (!deactivate_thread(active_masks)) {
+    const auto global_rank = cg::this_grid().thread_rank();
+    const T value = in[global_rank];
+    out[global_rank] = __shfl_xor(value, lane_mask, width);
+  }
+}
+
+/**
+ * Host-side half of the `__shfl_xor` test. WarpShflTest::run() calls launch_kernel() to start
+ * the kernel and validate() to compare the device output with values computed on the host.
+ */
+template <typename T> class WarpShflXOR : public WarpShflTest<WarpShflXOR<T>, T> {
+ public:
+  /**
+   * Picks a width and a lane mask through Catch2 generators and launches the `shfl_xor` kernel.
+   *
+   * @param arr_dev      Device output buffer.
+   * @param input_dev    Device input buffer.
+   * @param active_masks Device array holding one lane mask per warp.
+   */
+  void launch_kernel(T* const arr_dev, T* const input_dev, const uint64_t* const active_masks) {
+    width_ = generate_width(this->warp_size_);
+    INFO("Width: " << width_);
+
+    // Lane masks cover every value in [0, warp_size_).
+    lane_mask_ = GENERATE_COPY(range(0, this->warp_size_));
+    INFO("Lane mask: " << lane_mask_);
+
+    shfl_xor<<<this->grid_.grid_dim_, this->grid_.block_dim_>>>(arr_dev, input_dev, active_masks,
+                                                                lane_mask_, width_);
+  }
+
+  /**
+   * Checks the kernel output. A thread whose xor target falls into a later width-sized partition
+   * is expected to get its own value back; otherwise it is expected to receive the target lane's
+   * input. std::nullopt is produced when no value can be predicted (the thread or its target is
+   * inactive, or the target lies past the end of the block).
+   * NOTE(review): how ArrayAllOf treats std::nullopt is defined outside this file - presumably
+   * the element is skipped; confirm against its implementation.
+   *
+   * @param arr   Host copy of the kernel output.
+   * @param input Host copy of the kernel input.
+   */
+  void validate(const T* const arr, const T* const input) {
+    ArrayAllOf(arr, this->grid_.thread_count_, [this, &input](unsigned int i) -> std::optional<T> {
+      const auto block_rank = this->grid_.thread_rank_in_block(i).value();
+      const auto lane = block_rank % this->warp_size_;
+      const int warp_target = lane ^ this->lane_mask_;
+      const int target_offset = warp_target - lane;
+
+      // One mask per warp, laid out block after block.
+      const auto block_index = i / this->grid_.threads_in_block_count_;
+      const auto mask_idx = this->warps_in_block_ * block_index + block_rank / this->warp_size_;
+      const std::bitset<sizeof(uint64_t) * 8> active_mask(this->active_masks_[mask_idx]);
+
+      const auto target_partition = warp_target / width_;
+      const auto partition_rank = lane / width_;
+
+      if (!active_mask.test(lane)) {
+        return std::nullopt;
+      }
+      // Target in a later partition: the thread keeps its own value.
+      if (target_partition > partition_rank) {
+        return input[i];
+      }
+      if (!active_mask.test(lane + target_offset)) {
+        return std::nullopt;
+      }
+      if (block_rank + target_offset >= this->grid_.threads_in_block_count_) {
+        return std::nullopt;
+      }
+
+      return input[i + target_offset];
+    });
+  }
+
+ private:
+  int lane_mask_;  // xor mask chosen in launch_kernel()
+  int width_;      // shuffle width chosen in launch_kernel()
+};
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Validates the warp shuffle xor behavior for all valid width sizes {2, 4, 8, 16, 32,
+ * 64(if supported)} and for all lane mask values in [0, warp size). The threads are deactivated
+ * based on the passed active mask. The test is run for all overloads of shfl_xor.
+ * Test source
+ * ------------------------
+ *  - unit/warp/warp_shfl_xor.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ *  - Device supports warp shuffle
+ */
+TEMPLATE_TEST_CASE("Unit_Warp_Shfl_XOR_Positive_Basic", "", int, unsigned int, long, unsigned long,
+                   long long, unsigned long long, float, double) {
+  // Look up the properties of the device the test is going to run on.
+  int current_device;
+  HIP_CHECK(hipGetDevice(&current_device));
+  hipDeviceProp_t props;
+  HIP_CHECK(hipGetDeviceProperties(&props, current_device));
+
+  // Nothing to test when the device does not report warp shuffle support.
+  const bool shuffle_supported = props.arch.hasWarpShuffle;
+  if (!shuffle_supported) {
+    HipTest::HIP_SKIP_TEST("Device doesn't support Warp Shuffle!");
+    return;
+  }
+
+  // Deterministic masks (get_active_mask) and counting input values.
+  SECTION("Shfl Xor with specified active mask and input values") {
+    WarpShflXOR<TestType> test;
+    test.run(false);
+  }
+
+  // Randomly generated masks and input values.
+  SECTION("Shfl Xor with random active mask and input values") {
+    WarpShflXOR<TestType> test;
+    test.run(true);
+  }
+}