EXSWHTEC-275 - Implement tests for atomic CAS operations. (#197)

[ROCm/hip-tests commit: a8640d8c62]
2023-08-14 18:01:50 +02:00
parent 3191fecb78
commit 0f844d75a4
6 changed files with 1183 additions and 0 deletions
@@ -0,0 +1,42 @@
+# Copyright (c) 2023 Advanced Micro Devices, Inc. All Rights Reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+set(TEST_SRC
+    atomicCAS.cc
+    atomicCAS_system.cc
+)
+
+if(HIP_PLATFORM MATCHES "nvidia")
+    set_source_files_properties(atomicCAS_system.cc PROPERTIES COMPILE_FLAGS "-rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80")
+    hip_add_exe_to_target(NAME AtomicsTest
+                        TEST_SRC ${TEST_SRC}
+                        TEST_TARGET_NAME build_tests
+                        LINKER_LIBS "nvrtc -rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80")
+elseif(HIP_PLATFORM MATCHES "amd")
+    hip_add_exe_to_target(NAME AtomicsTest
+                        TEST_SRC ${TEST_SRC}
+                        TEST_TARGET_NAME build_tests
+                        LINKER_LIBS hiprtc)
+endif()
+
+add_test(NAME Unit_atomicCAS_Negative_Parameters
+         COMMAND python3 ${CMAKE_CURRENT_SOURCE_DIR}/../compileAndCaptureOutput.py
+         ${CMAKE_CURRENT_SOURCE_DIR} ${HIP_PLATFORM} ${HIP_PATH}
+         atomicCAS_negative_kernels.cc 48)
@@ -0,0 +1,449 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#include <hip_test_common.hh>
+#include <hip/hip_cooperative_groups.h>
+#include <resource_guards.hh>
+#include <cmd_options.hh>
+
+namespace cg = cooperative_groups;
+
+enum class AtomicOperation {
+  kAdd = 0,
+  kAddSystem,
+  kSub,
+  kSubSystem,
+  kInc,
+  kDec,
+  kUnsafeAdd,
+  kSafeAdd,
+  kCASAdd,
+  kCASAddSystem
+};
+
+constexpr auto kIntegerTestValue = 7;
+constexpr auto kFloatingPointTestValue = 3.125;
+constexpr auto kIncDecWraparoundValue = 1023;
+
+template <typename TestType, AtomicOperation operation>
+__host__ __device__ TestType GetTestValue() {
+  if constexpr (operation == AtomicOperation::kInc || operation == AtomicOperation::kDec) {
+    return kIncDecWraparoundValue;
+  }
+
+  return std::is_floating_point_v<TestType> ? kFloatingPointTestValue : kIntegerTestValue;
+}
+
+template <typename TestType> __device__ TestType CASAtomicAdd(TestType* address, TestType val) {
+  TestType old = *address, assumed;
+
+  do {
+    assumed = old;
+    old = atomicCAS(address, assumed, val + assumed);
+  } while (assumed != old);
+
+  return old;
+}
+
+template <typename TestType>
+__device__ TestType CASAtomicAddSystem(TestType* address, TestType val) {
+  TestType old = *address, assumed;
+
+  do {
+    assumed = old;
+    old = atomicCAS_system(address, assumed, val + assumed);
+  } while (assumed != old);
+
+  return old;
+}
+
+template <typename TestType, AtomicOperation operation>
+__device__ TestType PerformAtomicOperation(TestType* const mem) {
+  const auto val = GetTestValue<TestType, operation>();
+
+  if constexpr (operation == AtomicOperation::kAdd) {
+    return atomicAdd(mem, val);
+  } else if constexpr (operation == AtomicOperation::kAddSystem) {
+    return atomicAdd_system(mem, val);
+  } else if constexpr (operation == AtomicOperation::kSub) {
+    return atomicSub(mem, val);
+  } else if constexpr (operation == AtomicOperation::kSubSystem) {
+    return atomicSub_system(mem, val);
+  } else if constexpr (operation == AtomicOperation::kInc) {
+    return atomicInc(mem, val);
+  } else if constexpr (operation == AtomicOperation::kDec) {
+    return atomicDec(mem, val);
+  } else if constexpr (operation == AtomicOperation::kUnsafeAdd) {
+    return unsafeAtomicAdd(mem, val);
+  } else if constexpr (operation == AtomicOperation::kSafeAdd) {
+    return safeAtomicAdd(mem, val);
+  } else if constexpr (operation == AtomicOperation::kCASAdd) {
+    return CASAtomicAdd(mem, val);
+  } else if constexpr (operation == AtomicOperation::kCASAddSystem) {
+    return CASAtomicAddSystem(mem, val);
+  }
+}
+
+template <typename TestType, AtomicOperation operation, bool use_shared_mem>
+__global__ void TestKernel(TestType* const global_mem, TestType* const old_vals) {
+  __shared__ TestType shared_mem;
+
+  const auto tid = cg::this_grid().thread_rank();
+
+  TestType* const mem = use_shared_mem ? &shared_mem : global_mem;
+
+  if constexpr (use_shared_mem) {
+    if (tid == 0) mem[0] = global_mem[0];
+    __syncthreads();
+  }
+
+  old_vals[tid] = PerformAtomicOperation<TestType, operation>(mem);
+
+  if constexpr (use_shared_mem) {
+    __syncthreads();
+    if (tid == 0) global_mem[0] = mem[0];
+  }
+}
+
+template <typename TestType>
+__host__ __device__ TestType* PitchedOffset(TestType* const ptr, const unsigned int pitch,
+                                            const unsigned int idx) {
+  const auto byte_ptr = reinterpret_cast<uint8_t*>(ptr);
+  return reinterpret_cast<TestType*>(byte_ptr + idx * pitch);
+}
+
+template <typename TestType, AtomicOperation operation, bool use_shared_mem>
+__global__ void TestKernel(TestType* const global_mem, TestType* const old_vals,
+                           const unsigned int width, const unsigned pitch) {
+  extern __shared__ uint8_t shared_mem[];
+
+  const auto tid = cg::this_grid().thread_rank();
+
+  TestType* const mem = use_shared_mem ? reinterpret_cast<TestType*>(shared_mem) : global_mem;
+
+  if constexpr (use_shared_mem) {
+    if (tid < width) {
+      const auto target = PitchedOffset(mem, pitch, tid);
+      *target = *PitchedOffset(global_mem, pitch, tid);
+    };
+    __syncthreads();
+  }
+
+  old_vals[tid] =
+      PerformAtomicOperation<TestType, operation>(PitchedOffset(mem, pitch, tid % width));
+
+  if constexpr (use_shared_mem) {
+    __syncthreads();
+    if (tid < width) {
+      const auto target = PitchedOffset(global_mem, pitch, tid);
+      *target = *PitchedOffset(mem, pitch, tid);
+    };
+  }
+}
+
+struct TestParams {
+  auto ThreadCount() const {
+    return blocks.x * blocks.y * blocks.z * threads.x * threads.y * threads.z;
+  }
+
+  auto HostIterationsPerThread() const {
+    return std::max(num_devices * kernel_count * ThreadCount() / 20, width);
+  }
+
+  dim3 blocks;
+  dim3 threads;
+  unsigned int num_devices = 1u;
+  unsigned int kernel_count = 1u;
+  unsigned int width = 1u;
+  unsigned int pitch = 0u;
+  unsigned int host_thread_count = 0u;
+  LinearAllocs alloc_type;
+};
+
+template <typename TestType, AtomicOperation operation>
+std::tuple<std::vector<TestType>, std::vector<TestType>> TestKernelHostRef(const TestParams& p) {
+  const auto val = GetTestValue<TestType, operation>();
+
+  const auto total_thread_count = p.num_devices * p.kernel_count * p.ThreadCount() +
+      p.host_thread_count * p.HostIterationsPerThread();
+
+  std::vector<TestType> res_vals(p.width);
+  std::vector<TestType> old_vals;
+  old_vals.reserve(total_thread_count);
+
+  auto perform_op = [&](unsigned id) {
+    auto& res = res_vals[id % p.width];
+    old_vals.push_back(res);
+
+    if constexpr (operation == AtomicOperation::kAdd || operation == AtomicOperation::kAddSystem ||
+                  operation == AtomicOperation::kUnsafeAdd ||
+                  operation == AtomicOperation::kSafeAdd || operation == AtomicOperation::kCASAdd ||
+                  operation == AtomicOperation::kCASAddSystem) {
+      res = res + val;
+    } else if constexpr (operation == AtomicOperation::kSub ||
+                         operation == AtomicOperation::kSubSystem) {
+      res = res - val;
+    } else if constexpr (operation == AtomicOperation::kInc) {
+      res = (res >= val) ? 0 : res + 1;
+    } else if constexpr (operation == AtomicOperation::kDec) {
+      res = ((res == 0) || (res > val)) ? val : res - 1;
+    }
+  };
+
+  for (auto i = 0u; i < p.num_devices; ++i) {
+    for (auto j = 0u; j < p.kernel_count; ++j) {
+      for (auto tid = 0u; tid < p.ThreadCount(); ++tid) {
+        perform_op(tid);
+      }
+    }
+  }
+
+  for (auto i = 0u; i < p.host_thread_count; ++i) {
+    for (auto j = 0u; j < p.HostIterationsPerThread(); ++j) {
+      perform_op(j);
+    }
+  }
+
+  return {res_vals, old_vals};
+}
+
+template <typename TestType, AtomicOperation operation>
+void Verify(const TestParams& p, std::vector<TestType>& res_vals, std::vector<TestType>& old_vals) {
+  auto [expected_res_vals, expected_old_vals] = TestKernelHostRef<TestType, operation>(p);
+
+  for (auto i = 0u; i < res_vals.size(); ++i) {
+    INFO("Results index: " << i);
+    REQUIRE(expected_res_vals[i] == res_vals[i]);
+  }
+
+  std::sort(begin(old_vals), end(old_vals));
+  std::sort(begin(expected_old_vals), end(expected_old_vals));
+  for (auto i = 0u; i < old_vals.size(); ++i) {
+    INFO("Old values index: " << i);
+    REQUIRE(expected_old_vals[i] == old_vals[i]);
+  }
+}
+
+template <typename TestType, AtomicOperation operation, bool use_shared_mem>
+void LaunchKernel(const TestParams& p, hipStream_t stream, TestType* const mem_ptr,
+                  TestType* const old_vals) {
+  const auto shared_mem_size = use_shared_mem ? p.width * p.pitch : 0u;
+  if (p.width == 1 && p.pitch == sizeof(TestType))
+    TestKernel<TestType, operation, use_shared_mem>
+        <<<p.blocks, p.threads, shared_mem_size, stream>>>(mem_ptr, old_vals);
+  else
+    TestKernel<TestType, operation, use_shared_mem>
+        <<<p.blocks, p.threads, shared_mem_size, stream>>>(mem_ptr, old_vals, p.width, p.pitch);
+}
+
+template <typename TestType, AtomicOperation operation>
+void HostAtomicOperation(const unsigned int iterations, TestType* mem, TestType* const old_vals,
+                         const unsigned int width, const unsigned pitch, TestType base_val) {
+  const auto val = GetTestValue<TestType, operation>();
+
+  for (auto i = 0u; i < iterations; ++i) {
+    if constexpr (operation == AtomicOperation::kAddSystem ||
+                  operation == AtomicOperation::kCASAddSystem) {
+      old_vals[i] = __atomic_fetch_add(PitchedOffset(mem, pitch, i % width), val, __ATOMIC_RELAXED);
+    } else if constexpr (operation == AtomicOperation::kSubSystem) {
+      old_vals[i] = __atomic_fetch_sub(PitchedOffset(mem, pitch, i % width), val, __ATOMIC_RELAXED);
+    }
+  }
+}
+
+template <typename TestType, AtomicOperation operation>
+void PerformHostAtomicOperation(const TestParams& p, TestType* mem, TestType* const old_vals) {
+  if (p.host_thread_count == 0) {
+    return;
+  }
+
+  const auto host_base_val = p.num_devices * p.kernel_count * p.ThreadCount();
+
+  std::vector<std::thread> threads;
+  for (auto i = 0u; i < p.host_thread_count; ++i) {
+    const auto iterations = p.HostIterationsPerThread();
+    const auto thread_base_val = host_base_val + i * iterations;
+    threads.push_back(std::thread(HostAtomicOperation<TestType, operation>, iterations, mem,
+                                  old_vals + thread_base_val, p.width, p.pitch, thread_base_val));
+  }
+
+  for (auto& th : threads) {
+    th.join();
+  }
+}
+
+template <typename TestType, AtomicOperation operation, bool use_shared_mem>
+void TestCore(const TestParams& p) {
+  const unsigned int flags =
+      p.alloc_type == LinearAllocs::mallocAndRegister ? hipHostRegisterMapped : 0u;
+
+  const auto old_vals_alloc_size = p.kernel_count * p.ThreadCount() * sizeof(TestType);
+  std::vector<LinearAllocGuard<TestType>> old_vals_devs;
+  std::vector<StreamGuard> streams;
+  for (auto i = 0; i < p.num_devices; ++i) {
+    HIP_CHECK(hipSetDevice(i));
+    old_vals_devs.emplace_back(LinearAllocs::hipMalloc, old_vals_alloc_size);
+    for (auto j = 0; j < p.kernel_count; ++j) {
+      streams.emplace_back(Streams::created);
+    }
+  }
+
+  const auto mem_alloc_size = p.width * p.pitch;
+  LinearAllocGuard<TestType> mem_dev(p.alloc_type, mem_alloc_size, flags);
+
+  std::vector<TestType> old_vals(p.num_devices * p.kernel_count * p.ThreadCount() +
+                                 p.host_thread_count * p.HostIterationsPerThread());
+  std::vector<TestType> res_vals(p.width);
+
+  TestType* const mem_ptr =
+      p.alloc_type == LinearAllocs::hipMalloc ? mem_dev.ptr() : mem_dev.host_ptr();
+
+  HIP_CHECK(hipMemset(mem_ptr, 0, mem_alloc_size));
+
+  for (auto i = 0u; i < p.num_devices; ++i) {
+    for (auto j = 0u; j < p.kernel_count; ++j) {
+      const auto& stream = streams[i * p.kernel_count + j].stream();
+      const auto old_vals = old_vals_devs[i].ptr() + j * p.ThreadCount();
+      LaunchKernel<TestType, operation, use_shared_mem>(p, stream, mem_dev.ptr(), old_vals);
+    }
+  }
+
+  PerformHostAtomicOperation<TestType, operation>(p, mem_dev.host_ptr(), old_vals.data());
+
+  for (auto i = 0u; i < p.num_devices; ++i) {
+    const auto device_offset = i * p.kernel_count * p.ThreadCount();
+    HIP_CHECK(hipMemcpy(old_vals.data() + device_offset, old_vals_devs[i].ptr(),
+                        old_vals_alloc_size, hipMemcpyDeviceToHost));
+  }
+  HIP_CHECK(hipMemcpy2D(res_vals.data(), sizeof(TestType), mem_ptr, p.pitch, sizeof(TestType),
+                        p.width, hipMemcpyDeviceToHost));
+
+  Verify<TestType, operation>(p, res_vals, old_vals);
+}
+
+inline dim3 GenerateThreadDimensions() { return GENERATE(dim3(16), dim3(1024)); }
+
+inline dim3 GenerateBlockDimensions() {
+  int sm_count = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&sm_count, hipDeviceAttributeMultiprocessorCount, 0));
+  return GENERATE_COPY(dim3(sm_count), dim3(sm_count + sm_count / 2));
+}
+
+template <typename TestType, AtomicOperation operation>
+void SingleDeviceSingleKernelTest(const unsigned int width, const unsigned int pitch) {
+  TestParams params;
+  params.num_devices = 1;
+  params.kernel_count = 1;
+  params.threads = GenerateThreadDimensions();
+  params.width = width;
+  params.pitch = pitch;
+
+  SECTION("Global memory") {
+    params.blocks = GenerateBlockDimensions();
+    using LA = LinearAllocs;
+    for (const auto alloc_type :
+         {LA::hipMalloc, LA::hipHostMalloc, LA::hipMallocManaged, LA::mallocAndRegister}) {
+      params.alloc_type = alloc_type;
+      DYNAMIC_SECTION("Allocation type: " << to_string(alloc_type)) {
+        TestCore<TestType, operation, false>(params);
+      }
+    }
+  }
+
+  SECTION("Shared memory") {
+    params.blocks = dim3(1);
+    params.alloc_type = LinearAllocs::hipMalloc;
+    TestCore<TestType, operation, true>(params);
+  }
+}
+
+template <typename TestType, AtomicOperation operation>
+void SingleDeviceMultipleKernelTest(const unsigned int kernel_count, const unsigned int width,
+                                    const unsigned int pitch) {
+  int concurrent_kernels = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&concurrent_kernels, hipDeviceAttributeConcurrentKernels, 0));
+  if (!concurrent_kernels) {
+    HipTest::HIP_SKIP_TEST("Test requires support for concurrent kernel execution");
+    return;
+  }
+
+  TestParams params;
+  params.num_devices = 1;
+  params.kernel_count = kernel_count;
+  params.blocks = GenerateBlockDimensions();
+  params.threads = GenerateThreadDimensions();
+  params.width = width;
+  params.pitch = pitch;
+
+  using LA = LinearAllocs;
+  for (const auto alloc_type :
+       {LA::hipMalloc, LA::hipHostMalloc, LA::hipMallocManaged, LA::mallocAndRegister}) {
+    params.alloc_type = alloc_type;
+    DYNAMIC_SECTION("Allocation type: " << to_string(alloc_type)) {
+      TestCore<TestType, operation, false>(params);
+    }
+  }
+}
+
+template <typename TestType, AtomicOperation operation>
+void MultipleDeviceMultipleKernelAndHostTest(const unsigned int num_devices,
+                                             const unsigned int kernel_count,
+                                             const unsigned int width, const unsigned int pitch,
+                                             const unsigned int host_thread_count = 0u) {
+  if (num_devices > 1) {
+    if (HipTest::getDeviceCount() < num_devices) {
+      std::string msg = std::to_string(num_devices) + " devices are required";
+      HipTest::HIP_SKIP_TEST(msg.c_str());
+      return;
+    }
+  }
+
+  if (kernel_count > 1) {
+    for (auto i = 0u; i < num_devices; ++i) {
+      int concurrent_kernels = 0;
+      HIP_CHECK(hipDeviceGetAttribute(&concurrent_kernels, hipDeviceAttributeConcurrentKernels, i));
+      if (!concurrent_kernels) {
+        HipTest::HIP_SKIP_TEST("Test requires support for concurrent kernel execution");
+        return;
+      }
+    }
+  }
+
+  TestParams params;
+  params.num_devices = num_devices;
+  params.kernel_count = kernel_count;
+  params.blocks = GenerateBlockDimensions();
+  params.threads = GenerateThreadDimensions();
+  params.width = width;
+  params.pitch = pitch;
+  params.host_thread_count = host_thread_count;
+
+  using LA = LinearAllocs;
+  for (const auto alloc_type : {LA::hipHostMalloc, LA::hipMallocManaged, LA::mallocAndRegister}) {
+    params.alloc_type = alloc_type;
+    DYNAMIC_SECTION("Allocation type: " << to_string(alloc_type)) {
+      TestCore<TestType, operation, false>(params);
+    }
+  }
+}
@@ -0,0 +1,172 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "arithmetic_common.hh"
+#include "atomicCAS_negative_kernels_rtc.hh"
+
+#include <hip_test_common.hh>
+
+/**
+ * @addtogroup atomicCAS atomicCAS
+ * @{
+ * @ingroup AtomicsTest
+ */
+
+#ifdef HT_NVIDIA
+#define TYPES
+#else
+#define TYPES , float, double
+#endif
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Executes a single kernel on a single device wherein all threads will perform an atomic
+ * addition, implemented using an atomic CAS operation, on a target memory location. Each thread
+ * will add the same value to the memory location, storing the return value into a separate output
+ * array slot corresponding to it. Once complete, the output array and target memory is validated to
+ * contain all the expected values. Several memory access patterns are tested:
+ *      -# All threads exchange to a single, compile time deducible, memory location
+ *      -# Each thread targets an array containing warp_size elements, using tid % warp_size
+ *         for indexing
+ *      -# Same as the above, but the elements are spread out by L1 cache line size bytes.
+ *
+ *    - The test is run for:
+ *      - All overloads of atomicCAS
+ *      - hipMalloc, hipMallocManaged, hipHostMalloc and hipHostRegister allocated memory
+ *      - Shared memory
+ *      - Several grid and block dimension combinations (only one block is used for shared memory).
+ * Test source
+ * ------------------------
+ *    - unit/atomics/atomicCAS.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_atomicCAS_Positive", "", int, unsigned int, unsigned long long TYPES) {
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+  const auto cache_line_size = 128u;
+
+  for (auto current = 0; current < cmd_options.iterations; ++current) {
+    DYNAMIC_SECTION("Same address " << current) {
+      SingleDeviceSingleKernelTest<TestType, AtomicOperation::kCASAdd>(1, sizeof(TestType));
+    }
+
+    DYNAMIC_SECTION("Adjacent addresses " << current) {
+      SingleDeviceSingleKernelTest<TestType, AtomicOperation::kCASAdd>(warp_size, sizeof(TestType));
+    }
+
+    DYNAMIC_SECTION("Scattered addresses " << current) {
+      SingleDeviceSingleKernelTest<TestType, AtomicOperation::kCASAdd>(warp_size, cache_line_size);
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Executes a kernel two times concurrently on a single device wherein all threads will perform
+ * an atomic addition, implemented using an atomic CAS operation, on a target memory location. Each
+ * thread will add the same value to the memory location, storing the return value into a separate
+ * output array slot corresponding to it. Once complete, the output array and target memory is
+ * validated to contain all the expected values. Several memory access patterns are tested:
+ *      -# All threads exchange to a single, compile time deducible, memory location
+ *      -# Each thread targets an array containing warp_size elements, using tid % warp_size
+ *         for indexing
+ *      -# Same as the above, but the elements are spread out by L1 cache line size bytes.
+ *
+ *    - The test is run for:
+ *      - All overloads of atomicCAS
+ *      - hipMalloc, hipMallocManaged, hipHostMalloc and hipHostRegister allocated memory
+ *      - Several grid and block dimension combinations.
+ * Test source
+ * ------------------------
+ *    - unit/atomics/atomicCAS.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_atomicCAS_Positive_Multi_Kernel", "", int, unsigned int,
+                   unsigned long long TYPES) {
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+  const auto cache_line_size = 128u;
+
+  for (auto current = 0; current < cmd_options.iterations; ++current) {
+    DYNAMIC_SECTION("Same address " << current) {
+      SingleDeviceMultipleKernelTest<TestType, AtomicOperation::kCASAdd>(2, 1, sizeof(TestType));
+    }
+
+    DYNAMIC_SECTION("Adjacent addresses " << current) {
+      SingleDeviceMultipleKernelTest<TestType, AtomicOperation::kCASAdd>(2, warp_size,
+                                                                         sizeof(TestType));
+    }
+
+    DYNAMIC_SECTION("Scattered addresses " << current) {
+      SingleDeviceMultipleKernelTest<TestType, AtomicOperation::kCASAdd>(2, warp_size,
+                                                                         cache_line_size);
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *    - RTCs kernels that pass combinations of arguments of invalid types for all overloads of
+ * atomicCAS.
+ * Test source
+ * ------------------------
+ *    - unit/atomics/atomicCAS.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Unit_atomicCAS_Negative_Parameters_RTC") {
+  hiprtcProgram program{};
+
+  const auto program_source = GENERATE(kAtomicCAS_int, kAtomicCAS_uint, kAtomicCAS_ulong,
+                                       kAtomicCAS_ulonglong, kAtomicCAS_float, kAtomicCAS_double);
+  HIPRTC_CHECK(
+      hiprtcCreateProgram(&program, program_source, "atomicCAS_negative.cc", 0, nullptr, nullptr));
+  hiprtcResult result{hiprtcCompileProgram(program, 0, nullptr)};
+
+  // Get the compile log and count compiler error messages
+  size_t log_size{};
+  HIPRTC_CHECK(hiprtcGetProgramLogSize(program, &log_size));
+  std::string log(log_size, ' ');
+  HIPRTC_CHECK(hiprtcGetProgramLog(program, log.data()));
+  int error_count{0};
+
+  int expected_error_count{8};
+  std::string error_message{"error:"};
+
+  size_t n_pos = log.find(error_message, 0);
+  while (n_pos != std::string::npos) {
+    ++error_count;
+    n_pos = log.find(error_message, n_pos + 1);
+  }
+
+  HIPRTC_CHECK(hiprtcDestroyProgram(&program));
+  HIPRTC_CHECK_ERROR(result, HIPRTC_ERROR_COMPILATION);
+  REQUIRE(error_count == expected_error_count);
+}
@@ -0,0 +1,62 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_common.hh>
+
+class Dummy {
+ public:
+  __device__ Dummy() {}
+  __device__ ~Dummy() {}
+};
+
+#define ATOMIC_CAS_NEGATIVE_KERNEL(type_name)                                                      \
+  __global__ void atomicCAS_v1(type_name* address, type_name* result) {                            \
+    *result = atomicCAS(&address, 12, 13);                                                         \
+  }                                                                                                \
+  __global__ void atomicCAS_v2(type_name* address, type_name* result) {                            \
+    *result = atomicCAS(address, address, 13);                                                     \
+  }                                                                                                \
+  __global__ void atomicCAS_v3(type_name* address, type_name* result) {                            \
+    *result = atomicCAS(address, 12, address);                                                     \
+  }                                                                                                \
+  __global__ void atomicCAS_v4(Dummy* address, type_name* result) {                                \
+    *result = atomicCAS(address, 12, 13);                                                          \
+  }                                                                                                \
+  __global__ void atomicCAS_v5(char* address, type_name* result) {                                 \
+    *result = atomicCAS(address, 12, 13);                                                          \
+  }                                                                                                \
+  __global__ void atomicCAS_v6(short* address, type_name* result) {                                \
+    *result = atomicCAS(address, 12, 13);                                                          \
+  }                                                                                                \
+  __global__ void atomicCAS_v7(long* address, type_name* result) {                                 \
+    *result = atomicCAS(address, 12, 13);                                                          \
+  }                                                                                                \
+  __global__ void atomicCAS_v8(long long* address, type_name* result) {                            \
+    *result = atomicCAS(address, 12, 13);                                                          \
+  }
+
+ATOMIC_CAS_NEGATIVE_KERNEL(int)
+ATOMIC_CAS_NEGATIVE_KERNEL(unsigned int)
+ATOMIC_CAS_NEGATIVE_KERNEL(unsigned long)
+ATOMIC_CAS_NEGATIVE_KERNEL(unsigned long long)
+ATOMIC_CAS_NEGATIVE_KERNEL(float)
+ATOMIC_CAS_NEGATIVE_KERNEL(double)
@@ -0,0 +1,273 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+/*
+Negative kernels used for the atomics negative Test Cases that are using RTC.
+*/
+
+static constexpr auto kAtomicCAS_int{
+    R"(
+    class Dummy {
+     public:
+      __device__ Dummy() {}
+      __device__ ~Dummy() {}
+    };
+
+    __global__ void atomicCAS_int_v1(int* address, int* result) {
+      *result = atomicCAS(&address, 12, 13);
+    }
+
+    __global__ void atomicCAS_int_v2(int* address, int* result) {
+      *result = atomicCAS(address, address, 13);
+    }
+
+    __global__ void atomicCAS_int_v3(int* address, int* result) {
+      *result = atomicCAS(address, 12, address);
+    }
+
+    __global__ void atomicCAS_int_v4(Dummy* address, int* result) {
+      *result = atomicCAS(address, 12, 13);
+    }
+
+    __global__ void atomicCAS_int_v5(char* address, int* result) {
+      *result = atomicCAS(address, 12, 13);
+    }
+
+    __global__ void atomicCAS_int_v6(short* address, int* result) {
+      *result = atomicCAS(address, 12, 13);
+    }
+
+    __global__ void atomicCAS_int_v7(long* address, int* result) {
+      *result = atomicCAS(address, 12, 13);
+    }
+
+    __global__ void atomicCAS_int_v8(long long* address, int* result) {
+      *result = atomicCAS(address, 12, 13);
+    }
+  )"};
+
+static constexpr auto kAtomicCAS_uint{
+    R"(
+    class Dummy {
+     public:
+      __device__ Dummy() {}
+      __device__ ~Dummy() {}
+    };
+
+    __global__ void atomicCAS_uint_v1(unsigned int* address, unsigned int* result) {
+      *result = atomicCAS(&address, 12, 13);
+    }
+
+    __global__ void atomicCAS_uint_v2(unsigned int* address, unsigned int* result) {
+      *result = atomicCAS(address, address, 13);
+    }
+
+    __global__ void atomicCAS_uint_v3(unsigned int* address, unsigned int* result) {
+      *result = atomicCAS(address, 12, address);
+    }
+
+    __global__ void atomicCAS_uint_v4(Dummy* address, unsigned int* result) {
+      *result = atomicCAS(address, 12, 13);
+    }
+
+    __global__ void atomicCAS_uint_v5(char* address, unsigned int* result) {
+      *result = atomicCAS(address, 12, 13);
+    }
+
+    __global__ void atomicCAS_uint_v6(short* address, unsigned int* result) {
+      *result = atomicCAS(address, 12, 13);
+    }
+
+    __global__ void atomicCAS_uint_v7(long* address, unsigned int* result) {
+      *result = atomicCAS(address, 12, 13);
+    }
+
+    __global__ void atomicCAS_uint_v8(long long* address, unsigned int* result) {
+      *result = atomicCAS(address, 12, 13);
+    }
+  )"};
+
+static constexpr auto kAtomicCAS_ulong{
+    R"(
+    class Dummy {
+     public:
+      __device__ Dummy() {}
+      __device__ ~Dummy() {}
+    };
+
+    __global__ void atomicCAS_ulong_v1(unsigned long* address, unsigned long* result) {
+      *result = atomicCAS(&address, 12, 13);
+    }
+
+    __global__ void atomicCAS_ulong_v2(unsigned long* address, unsigned long* result) {
+      *result = atomicCAS(address, address, 13);
+    }
+
+    __global__ void atomicCAS_ulong_v3(unsigned long* address, unsigned long* result) {
+      *result = atomicCAS(address, 12, address);
+    }
+
+    __global__ void atomicCAS_ulong_v4(Dummy* address, unsigned long* result) {
+      *result = atomicCAS(address, 12, 13);
+    }
+
+    __global__ void atomicCAS_ulong_v5(char* address, unsigned long* result) {
+      *result = atomicCAS(address, 12, 13);
+    }
+
+    __global__ void atomicCAS_ulong_v6(short* address, unsigned long* result) {
+      *result = atomicCAS(address, 12, 13);
+    }
+
+    __global__ void atomicCAS_ulong_v7(long* address, unsigned long* result) {
+      *result = atomicCAS(address, 12, 13);
+    }
+
+    __global__ void atomicCAS_ulong_v8(long long* address, unsigned long* result) {
+      *result = atomicCAS(address, 12, 13);
+    }
+  )"};
+
+static constexpr auto kAtomicCAS_ulonglong{
+    R"(
+    class Dummy {
+     public:
+      __device__ Dummy() {}
+      __device__ ~Dummy() {}
+    };
+
+    __global__ void atomicCAS_ulonglong_v1(unsigned long long* address, unsigned long long* result) {
+      *result = atomicCAS(&address, 12, 13);
+    }
+
+    __global__ void atomicCAS_ulonglong_v2(unsigned long long* address, unsigned long long* result) {
+      *result = atomicCAS(address, address, 13);
+    }
+
+    __global__ void atomicCAS_ulonglong_v3(unsigned long long* address, unsigned long long* result) {
+      *result = atomicCAS(address, 12, address);
+    }
+
+    __global__ void atomicCAS_ulonglong_v4(Dummy* address, unsigned long long* result) {
+      *result = atomicCAS(address, 12, 13);
+    }
+
+    __global__ void atomicCAS_ulonglong_v5(char* address, unsigned long long* result) {
+      *result = atomicCAS(address, 12, 13);
+    }
+
+    __global__ void atomicCAS_ulonglong_v6(short* address, unsigned long long* result) {
+      *result = atomicCAS(address, 12, 13);
+    }
+
+    __global__ void atomicCAS_ulonglong_v7(long* address, unsigned long long* result) {
+      *result = atomicCAS(address, 12, 13);
+    }
+
+    __global__ void atomicCAS_ulonglong_v8(long long* address, unsigned long long* result) {
+      *result = atomicCAS(address, 12, 13);
+    }
+  )"};
+
+static constexpr auto kAtomicCAS_float{
+    R"(
+    class Dummy {
+      public:
+      __device__ Dummy() {}
+      __device__ ~Dummy() {}
+    };
+
+    __global__ void atomicCAS_float_v1(float* address, float* result) {
+      *result = atomicCAS(&address, 12, 13);
+    }
+
+    __global__ void atomicCAS_float_v2(float* address, float* result) {
+      *result = atomicCAS(address, address, 13);
+    }
+
+    __global__ void atomicCAS_float_v3(float* address, float* result) {
+      *result = atomicCAS(address, 12, address);
+    }
+
+    __global__ void atomicCAS_float_v4(Dummy* address, float* result) {
+      *result = atomicCAS(address, 12, 13);
+    }
+
+    __global__ void atomicCAS_float_v5(char* address, float* result) {
+      *result = atomicCAS(address, 12, 13);
+    }
+
+    __global__ void atomicCAS_float_v6(short* address, float* result) {
+      *result = atomicCAS(address, 12, 13);
+    }
+
+    __global__ void atomicCAS_float_v7(long* address, float* result) {
+      *result = atomicCAS(address, 12, 13);
+    }
+
+    __global__ void atomicCAS_float_v8(long long* address, float* result) {
+      *result = atomicCAS(address, 12, 13);
+    }
+  )"};
+
+static constexpr auto kAtomicCAS_double{
+    R"(
+    class Dummy {
+     public:
+      __device__ Dummy() {}
+      __device__ ~Dummy() {}
+    };
+
+    __global__ void atomicCAS_double_v1(double* address, double* result) {
+      *result = atomicCAS(&address, 12, 13);
+    }
+
+    __global__ void atomicCAS_double_v2(double* address, double* result) {
+      *result = atomicCAS(address, address, 13);
+    }
+
+    __global__ void atomicCAS_double_v3(double* address, double* result) {
+      *result = atomicCAS(address, 12, address);
+    }
+
+    __global__ void atomicCAS_double_v4(Dummy* address, double* result) {
+      *result = atomicCAS(address, 12, 13);
+    }
+
+    __global__ void atomicCAS_double_v5(char* address, double* result) {
+      *result = atomicCAS(address, 12, 13);
+    }
+
+    __global__ void atomicCAS_double_v6(short* address, double* result) {
+      *result = atomicCAS(address, 12, 13);
+    }
+
+    __global__ void atomicCAS_double_v7(long* address, double* result) {
+      *result = atomicCAS(address, 12, 13);
+    }
+
+    __global__ void atomicCAS_double_v8(long long* address, double* result) {
+      *result = atomicCAS(address, 12, 13);
+    }
+  )"};
@@ -0,0 +1,185 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "arithmetic_common.hh"
+
+#include <hip_test_common.hh>
+
+/**
+ * @addtogroup atomicCAS_system atomicCAS_system
+ * @{
+ * @ingroup AtomicsTest
+ */
+
+#ifdef HT_NVIDIA
+#define TYPES
+#else
+#define TYPES , float, double
+#endif
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Executes a kernel two times concurrently on a two devices wherein all threads will perform
+ * an atomic addition, implemented using an atomic CAS operation, on a target memory location. Each
+ * thread will add the same value to the memory location, storing the return value into a separate
+ * output array slot corresponding to it. Once complete, the output array and target memory is
+ * validated to contain all the expected values. Several memory access patterns are tested:
+ *      -# All threads exchange to a single, compile time deducible, memory location
+ *      -# Each thread targets an array containing warp_size elements, using tid % warp_size
+ *         for indexing
+ *      -# Same as the above, but the elements are spread out by L1 cache line size bytes.
+ *
+ *    - The test is run for:
+ *      - All overloads of atomicCAS_system
+ *      - hipMalloc, hipMallocManaged, hipHostMalloc and hipHostRegister allocated memory
+ *      - Several grid and block dimension combinations.
+ * Test source
+ * ------------------------
+ *    - unit/atomics/atomicCAS_system.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_atomicCAS_system_Positive_Peer_GPUs", "", int, unsigned int,
+                   unsigned long long TYPES) {
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+  const auto cache_line_size = 128u;
+
+  for (auto current = 0; current < cmd_options.iterations; ++current) {
+    DYNAMIC_SECTION("Same address " << current) {
+      MultipleDeviceMultipleKernelAndHostTest<TestType, AtomicOperation::kCASAddSystem>(
+          2, 2, 1, sizeof(TestType));
+    }
+
+    DYNAMIC_SECTION("Adjacent addresses " << current) {
+      MultipleDeviceMultipleKernelAndHostTest<TestType, AtomicOperation::kCASAddSystem>(
+          2, 2, warp_size, sizeof(TestType));
+    }
+
+    DYNAMIC_SECTION("Scattered addresses " << current) {
+      MultipleDeviceMultipleKernelAndHostTest<TestType, AtomicOperation::kCASAddSystem>(
+          2, 2, warp_size, cache_line_size);
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Executes a kernel on a single device wherein all threads will perform
+ * an atomic addition, implemented using an atomic CAS operation, on a target memory location.
+ * Each thread will add the same value to the memory location, storing the return value into a
+ * separate output array slot corresponding to it. While the kernel is running, the host
+ * performs atomic additions, in 4 threads, on the same memory location(s). Once complete, the
+ * output array and target memory is validated to contain all the expected values. Several
+ * memory access patterns are tested:
+ *      -# All threads exchange to a single, compile time deducible, memory location
+ *      -# Each thread targets an array containing warp_size elements, using tid % warp_size
+ *         for indexing
+ *      -# Same as the above, but the elements are spread out by L1 cache line size bytes.
+ *
+ *    - The test is run for:
+ *      - All overloads of atomicCAS_system
+ *      - hipMalloc, hipMallocManaged, hipHostMalloc and hipHostRegister allocated memory
+ *      - Several grid and block dimension combinations.
+ * Test source
+ * ------------------------
+ *    - unit/atomics/atomicCAS_system.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_atomicCAS_system_Positive_Host_And_GPU", "", int, unsigned int,
+                   unsigned long long TYPES) {
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+  const auto cache_line_size = 128u;
+
+  for (auto current = 0; current < cmd_options.iterations; ++current) {
+    DYNAMIC_SECTION("Same address " << current) {
+      MultipleDeviceMultipleKernelAndHostTest<TestType, AtomicOperation::kCASAddSystem>(
+          1, 1, 1, sizeof(TestType), 4);
+    }
+
+    DYNAMIC_SECTION("Adjacent addresses " << current) {
+      MultipleDeviceMultipleKernelAndHostTest<TestType, AtomicOperation::kCASAddSystem>(
+          1, 1, warp_size, sizeof(TestType), 4);
+    }
+
+    DYNAMIC_SECTION("Scattered addresses " << current) {
+      MultipleDeviceMultipleKernelAndHostTest<TestType, AtomicOperation::kCASAddSystem>(
+          1, 1, warp_size, cache_line_size, 4);
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Executes a kernel two times on two devices wherein all threads will perform
+ * an atomic addition, implemented using an atomic CAS operation, on a target memory location.
+ * Each thread will add the same value to the memory location, storing the return value into a
+ * separate output array slot corresponding to it. While the kernel is running, the host
+ * performs atomic additions, in 4 threads, on the same memory location(s). Once complete, the
+ * output array and target memory is validated to contain all the expected values. Several
+ * memory access patterns are tested:
+ *      -# All threads exchange to a single, compile time deducible, memory location
+ *      -# Each thread targets an array containing warp_size elements, using tid % warp_size
+ *         for indexing
+ *      -# Same as the above, but the elements are spread out by L1 cache line size bytes.
+ *
+ *    - The test is run for:
+ *      - All overloads of atomicCAS_system
+ *      - hipMalloc, hipMallocManaged, hipHostMalloc and hipHostRegister allocated memory
+ *      - Several grid and block dimension combinations.
+ * Test source
+ * ------------------------
+ *    - unit/atomics/atomicCAS_system.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_atomicCAS_system_Positive_Host_And_Peer_GPUs", "", int, unsigned int,
+                   unsigned long long TYPES) {
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+  const auto cache_line_size = 128u;
+
+  for (auto current = 0; current < cmd_options.iterations; ++current) {
+    DYNAMIC_SECTION("Same address " << current) {
+      MultipleDeviceMultipleKernelAndHostTest<TestType, AtomicOperation::kCASAddSystem>(
+          2, 2, 1, sizeof(TestType), 4);
+    }
+
+    DYNAMIC_SECTION("Adjacent addresses " << current) {
+      MultipleDeviceMultipleKernelAndHostTest<TestType, AtomicOperation::kCASAddSystem>(
+          2, 2, warp_size, sizeof(TestType), 4);
+    }
+
+    DYNAMIC_SECTION("Scattered addresses " << current) {
+      MultipleDeviceMultipleKernelAndHostTest<TestType, AtomicOperation::kCASAddSystem>(
+          2, 2, warp_size, cache_line_size, 4);
+    }
+  }
+}