EXSWHTEC-300 - Extend tests for atomic bitwise operations #288

Change-Id: I499a1c57400f13ebdf056d093228501f4eb2cb31
2023-12-28 17:54:17 +01:00
@@ -35,6 +35,9 @@ set(TEST_SRC
    unsafeAtomicMax.cc
    atomicExch.cc
    atomicExch_system.cc
+    __hip_atomic_fetch_and.cc
+    __hip_atomic_fetch_or.cc
+    __hip_atomic_fetch_xor.cc
 )

 if(HIP_PLATFORM MATCHES "nvidia")
@@ -0,0 +1,187 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "bitwise_common.hh"
+
+#include <hip_test_common.hh>
+
+/**
+ * @addtogroup __hip_atomic_fetch_and __hip_atomic_fetch_and
+ * @{
+ * @ingroup AtomicsTest
+ */
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs a builtin atomic AND with memory scope WAVEFRONT from multiple threads on the same
+ * address.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/__hip_atomic_fetch_and.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_and_Positive_Wavefront_SameAddress", "", int,
+                   unsigned int, unsigned long, unsigned long long) {
+  for (auto current = 0; current < cmd_options.iterations; ++current) {
+    DYNAMIC_SECTION("Same address " << current) {
+      Bitwise::SingleDeviceSingleKernelTest<TestType, Bitwise::AtomicOperation::kBuiltinAnd,
+                                            __HIP_MEMORY_SCOPE_WAVEFRONT>(1, sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs a builtin atomic AND with memory scope WAVEFRONT from multiple threads on adjacent
+ * addresses.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/__hip_atomic_fetch_and.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_and_Positive_Wavefront_Adjacent_Addresses", "", int,
+                   unsigned int, unsigned long, unsigned long long) {
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+
+  for (auto current = 0; current < cmd_options.iterations; ++current) {
+    DYNAMIC_SECTION("Adjacent address " << current) {
+      Bitwise::SingleDeviceSingleKernelTest<TestType, Bitwise::AtomicOperation::kBuiltinAnd,
+                                            __HIP_MEMORY_SCOPE_WAVEFRONT>(warp_size,
+                                                                          sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs a builtin atomic AND with memory scope WAVEFRONT from multiple threads on scattered
+ * addresses.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/__hip_atomic_fetch_and.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_and_Positive_Wavefront_Scattered_Addresses", "", int,
+                   unsigned int, unsigned long, unsigned long long) {
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+  const auto cache_line_size = 128u;
+
+  for (auto current = 0; current < cmd_options.iterations; ++current) {
+    DYNAMIC_SECTION("Scattered address " << current) {
+      Bitwise::SingleDeviceSingleKernelTest<TestType, Bitwise::AtomicOperation::kBuiltinAnd,
+                                            __HIP_MEMORY_SCOPE_WAVEFRONT>(warp_size,
+                                                                          cache_line_size);
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs a builtin atomic AND with memory scope WORKGROUP from multiple threads on the same
+ * address.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/__hip_atomic_fetch_and.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_and_Positive_Workgroup_SameAddress", "", int,
+                   unsigned int, unsigned long, unsigned long long) {
+  for (auto current = 0; current < cmd_options.iterations; ++current) {
+    DYNAMIC_SECTION("Same address " << current) {
+      Bitwise::SingleDeviceSingleKernelTest<TestType, Bitwise::AtomicOperation::kBuiltinAnd,
+                                            __HIP_MEMORY_SCOPE_WORKGROUP>(1, sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs a builtin atomic AND with memory scope WORKGROUP from multiple threads on adjacent
+ * addresses.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/__hip_atomic_fetch_and.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_and_Positive_Workgroup_Adjacent_Addresses", "", int,
+                   unsigned int, unsigned long, unsigned long long) {
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+
+  for (auto current = 0; current < cmd_options.iterations; ++current) {
+    DYNAMIC_SECTION("Adjacent address " << current) {
+      Bitwise::SingleDeviceSingleKernelTest<TestType, Bitwise::AtomicOperation::kBuiltinAnd,
+                                            __HIP_MEMORY_SCOPE_WORKGROUP>(warp_size,
+                                                                          sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs a builtin atomic AND with memory scope WORKGROUP from multiple threads on scattered
+ * addresses.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/__hip_atomic_fetch_and.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_and_Positive_Workgroup_Scattered_Addresses", "", int,
+                   unsigned int, unsigned long, unsigned long long) {
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+  const auto cache_line_size = 128u;
+
+  for (auto current = 0; current < cmd_options.iterations; ++current) {
+    DYNAMIC_SECTION("Scattered address " << current) {
+      Bitwise::SingleDeviceSingleKernelTest<TestType, Bitwise::AtomicOperation::kBuiltinAnd,
+                                            __HIP_MEMORY_SCOPE_WORKGROUP>(warp_size,
+                                                                          cache_line_size);
+    }
+  }
+}
@@ -0,0 +1,187 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "bitwise_common.hh"
+
+#include <hip_test_common.hh>
+
+/**
+ * @addtogroup __hip_atomic_fetch_or __hip_atomic_fetch_or
+ * @{
+ * @ingroup AtomicsTest
+ */
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs a builtin atomic OR with memory scope WAVEFRONT from multiple threads on the same
+ * address.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/__hip_atomic_fetch_or.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_or_Positive_Wavefront_SameAddress", "", int,
+                   unsigned int, unsigned long, unsigned long long) {
+  for (auto current = 0; current < cmd_options.iterations; ++current) {
+    DYNAMIC_SECTION("Same address " << current) {
+      Bitwise::SingleDeviceSingleKernelTest<TestType, Bitwise::AtomicOperation::kBuiltinOr,
+                                            __HIP_MEMORY_SCOPE_WAVEFRONT>(1, sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs a builtin atomic OR with memory scope WAVEFRONT from multiple threads on adjacent
+ * addresses.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/__hip_atomic_fetch_or.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_or_Positive_Wavefront_Adjacent_Addresses", "", int,
+                   unsigned int, unsigned long, unsigned long long) {
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+
+  for (auto current = 0; current < cmd_options.iterations; ++current) {
+    DYNAMIC_SECTION("Adjacent address " << current) {
+      Bitwise::SingleDeviceSingleKernelTest<TestType, Bitwise::AtomicOperation::kBuiltinOr,
+                                            __HIP_MEMORY_SCOPE_WAVEFRONT>(warp_size,
+                                                                          sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs a builtin atomic OR with memory scope WAVEFRONT from multiple threads on scattered
+ * addresses.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/__hip_atomic_fetch_or.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_or_Positive_Wavefront_Scattered_Addresses", "", int,
+                   unsigned int, unsigned long, unsigned long long) {
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+  const auto cache_line_size = 128u;
+
+  for (auto current = 0; current < cmd_options.iterations; ++current) {
+    DYNAMIC_SECTION("Scattered address " << current) {
+      Bitwise::SingleDeviceSingleKernelTest<TestType, Bitwise::AtomicOperation::kBuiltinOr,
+                                            __HIP_MEMORY_SCOPE_WAVEFRONT>(warp_size,
+                                                                          cache_line_size);
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs a builtin atomic OR with memory scope WORKGROUP from multiple threads on the same
+ * address.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/__hip_atomic_fetch_or.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_or_Positive_Workgroup_SameAddress", "", int,
+                   unsigned int, unsigned long, unsigned long long) {
+  for (auto current = 0; current < cmd_options.iterations; ++current) {
+    DYNAMIC_SECTION("Same address " << current) {
+      Bitwise::SingleDeviceSingleKernelTest<TestType, Bitwise::AtomicOperation::kBuiltinOr,
+                                            __HIP_MEMORY_SCOPE_WORKGROUP>(1, sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs a builtin atomic OR with memory scope WORKGROUP from multiple threads on adjacent
+ * addresses.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/__hip_atomic_fetch_or.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_or_Positive_Workgroup_Adjacent_Addresses", "", int,
+                   unsigned int, unsigned long, unsigned long long) {
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+
+  for (auto current = 0; current < cmd_options.iterations; ++current) {
+    DYNAMIC_SECTION("Adjacent address " << current) {
+      Bitwise::SingleDeviceSingleKernelTest<TestType, Bitwise::AtomicOperation::kBuiltinOr,
+                                            __HIP_MEMORY_SCOPE_WORKGROUP>(warp_size,
+                                                                          sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs a builtin atomic OR with memory scope WORKGROUP from multiple threads on scattered
+ * addresses.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/__hip_atomic_fetch_or.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_or_Positive_Workgroup_Scattered_Addresses", "", int,
+                   unsigned int, unsigned long, unsigned long long) {
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+  const auto cache_line_size = 128u;
+
+  for (auto current = 0; current < cmd_options.iterations; ++current) {
+    DYNAMIC_SECTION("Scattered address " << current) {
+      Bitwise::SingleDeviceSingleKernelTest<TestType, Bitwise::AtomicOperation::kBuiltinOr,
+                                            __HIP_MEMORY_SCOPE_WORKGROUP>(warp_size,
+                                                                          cache_line_size);
+    }
+  }
+}
@@ -0,0 +1,187 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "bitwise_common.hh"
+
+#include <hip_test_common.hh>
+
+/**
+ * @addtogroup __hip_atomic_fetch_xor __hip_atomic_fetch_xor
+ * @{
+ * @ingroup AtomicsTest
+ */
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs a builtin atomic XOR with memory scope WAVEFRONT from multiple threads on the same
+ * address.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/__hip_atomic_fetch_xor.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_xor_Positive_Wavefront_SameAddress", "", int,
+                   unsigned int, unsigned long, unsigned long long) {
+  for (auto current = 0; current < cmd_options.iterations; ++current) {
+    DYNAMIC_SECTION("Same address " << current) {
+      Bitwise::SingleDeviceSingleKernelTest<TestType, Bitwise::AtomicOperation::kBuiltinXor,
+                                            __HIP_MEMORY_SCOPE_WAVEFRONT>(1, sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs a builtin atomic XOR with memory scope WAVEFRONT from multiple threads on adjacent
+ * addresses.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/__hip_atomic_fetch_xor.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_xor_Positive_Wavefront_Adjacent_Addresses", "", int,
+                   unsigned int, unsigned long, unsigned long long) {
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+
+  for (auto current = 0; current < cmd_options.iterations; ++current) {
+    DYNAMIC_SECTION("Adjacent address " << current) {
+      Bitwise::SingleDeviceSingleKernelTest<TestType, Bitwise::AtomicOperation::kBuiltinXor,
+                                            __HIP_MEMORY_SCOPE_WAVEFRONT>(warp_size,
+                                                                          sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs a builtin atomic XOR with memory scope WAVEFRONT from multiple threads on scattered
+ * addresses.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/__hip_atomic_fetch_xor.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_xor_Positive_Wavefront_Scattered_Addresses", "", int,
+                   unsigned int, unsigned long, unsigned long long) {
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+  const auto cache_line_size = 128u;
+
+  for (auto current = 0; current < cmd_options.iterations; ++current) {
+    DYNAMIC_SECTION("Scattered address " << current) {
+      Bitwise::SingleDeviceSingleKernelTest<TestType, Bitwise::AtomicOperation::kBuiltinXor,
+                                            __HIP_MEMORY_SCOPE_WAVEFRONT>(warp_size,
+                                                                          cache_line_size);
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs a builtin atomic XOR with memory scope WORKGROUP from multiple threads on the same
+ * address.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/__hip_atomic_fetch_xor.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_xor_Positive_Workgroup_SameAddress", "", int,
+                   unsigned int, unsigned long, unsigned long long) {
+  for (auto current = 0; current < cmd_options.iterations; ++current) {
+    DYNAMIC_SECTION("Same address " << current) {
+      Bitwise::SingleDeviceSingleKernelTest<TestType, Bitwise::AtomicOperation::kBuiltinXor,
+                                            __HIP_MEMORY_SCOPE_WORKGROUP>(1, sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs a builtin atomic XOR with memory scope WORKGROUP from multiple threads on adjacent
+ * addresses.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/__hip_atomic_fetch_xor.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_xor_Positive_Workgroup_Adjacent_Addresses", "", int,
+                   unsigned int, unsigned long, unsigned long long) {
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+
+  for (auto current = 0; current < cmd_options.iterations; ++current) {
+    DYNAMIC_SECTION("Adjacent address " << current) {
+      Bitwise::SingleDeviceSingleKernelTest<TestType, Bitwise::AtomicOperation::kBuiltinXor,
+                                            __HIP_MEMORY_SCOPE_WORKGROUP>(warp_size,
+                                                                          sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs a builtin atomic XOR with memory scope WORKGROUP from multiple threads on scattered
+ * addresses.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/__hip_atomic_fetch_xor.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_xor_Positive_Workgroup_Scattered_Addresses", "", int,
+                   unsigned int, unsigned long, unsigned long long) {
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+  const auto cache_line_size = 128u;
+
+  for (auto current = 0; current < cmd_options.iterations; ++current) {
+    DYNAMIC_SECTION("Scattered address " << current) {
+      Bitwise::SingleDeviceSingleKernelTest<TestType, Bitwise::AtomicOperation::kBuiltinXor,
+                                            __HIP_MEMORY_SCOPE_WORKGROUP>(warp_size,
+                                                                          cache_line_size);
+    }
+  }
+}
@@ -22,10 +22,10 @@ THE SOFTWARE.

 #pragma once

+#include <cmd_options.hh>
 #include <hip_test_common.hh>
 #include <hip/hip_cooperative_groups.h>
 #include <resource_guards.hh>
-#include <cmd_options.hh>

 namespace cg = cooperative_groups;

@@ -37,6 +37,9 @@ enum class AtomicOperation {
  kOrSystem,
  kXor,
  kXorSystem,
+  kBuiltinAnd,
+  kBuiltinOr,
+  kBuiltinXor
 };

 constexpr auto kMask = 0xAAAA;
@@ -52,7 +55,7 @@ __host__ __device__ TestType GetTestValue() {
  return kTestValue;
 }

-template <typename TestType, AtomicOperation operation>
+template <typename TestType, AtomicOperation operation, int memory_scope = __HIP_MEMORY_SCOPE_AGENT>
 __device__ TestType PerformAtomicOperation(TestType* const mem) {
  const auto mask = kMask;

@@ -68,10 +71,17 @@ __device__ TestType PerformAtomicOperation(TestType* const mem) {
    return atomicXor(mem, mask);
  } else if constexpr (operation == AtomicOperation::kXorSystem) {
    return atomicXor_system(mem, mask);
+  } else if constexpr (operation == AtomicOperation::kBuiltinAnd) {
+    return __hip_atomic_fetch_and(mem, mask, __ATOMIC_RELAXED, memory_scope);
+  } else if constexpr (operation == AtomicOperation::kBuiltinOr) {
+    return __hip_atomic_fetch_or(mem, mask, __ATOMIC_RELAXED, memory_scope);
+  } else if constexpr (operation == AtomicOperation::kBuiltinXor) {
+    return __hip_atomic_fetch_xor(mem, mask, __ATOMIC_RELAXED, memory_scope);
  }
 }

-template <typename TestType, AtomicOperation operation, bool use_shared_mem>
+template <typename TestType, AtomicOperation operation, bool use_shared_mem,
+          int memory_scope = __HIP_MEMORY_SCOPE_AGENT>
 __global__ void TestKernel(TestType* const global_mem, TestType* const old_vals) {
  __shared__ TestType shared_mem;

@@ -84,7 +94,7 @@ __global__ void TestKernel(TestType* const global_mem, TestType* const old_vals)
    __syncthreads();
  }

-  old_vals[tid] = PerformAtomicOperation<TestType, operation>(mem);
+  old_vals[tid] = PerformAtomicOperation<TestType, operation, memory_scope>(mem);

  if constexpr (use_shared_mem) {
    __syncthreads();
@@ -99,7 +109,16 @@ __host__ __device__ TestType* PitchedOffset(TestType* const ptr, const unsigned
  return reinterpret_cast<TestType*>(byte_ptr + idx * pitch);
 }

-template <typename TestType, AtomicOperation operation, bool use_shared_mem>
+__device__ void GenerateMemoryTraffic(uint8_t* const begin_addr, uint8_t* const end_addr) {
+  for (volatile uint8_t* addr = begin_addr; addr != end_addr; ++addr) {
+    uint8_t val = *addr;
+    val ^= 0xAB;
+    *addr = val;
+  }
+}
+
+template <typename TestType, AtomicOperation operation, bool use_shared_mem,
+          int memory_scope = __HIP_MEMORY_SCOPE_AGENT>
 __global__ void TestKernel(TestType* const global_mem, TestType* const old_vals,
                           const unsigned int width, const unsigned pitch) {
  extern __shared__ uint8_t shared_mem[];
@@ -116,8 +135,18 @@ __global__ void TestKernel(TestType* const global_mem, TestType* const old_vals,
    __syncthreads();
  }

-  old_vals[tid] =
-      PerformAtomicOperation<TestType, operation>(PitchedOffset(mem, pitch, tid % width));
+  const auto n = cooperative_groups::this_grid().size() - width;
+
+  TestType* atomic_addr = PitchedOffset(mem, pitch, tid % width);
+
+  if (tid < n) {
+    old_vals[tid] = PerformAtomicOperation<TestType, operation, memory_scope>(
+        PitchedOffset(mem, pitch, tid % width));
+  } else {
+    uint8_t* const begin_addr = reinterpret_cast<uint8_t*>(atomic_addr + 1);
+    uint8_t* const end_addr = reinterpret_cast<uint8_t*>(atomic_addr) + pitch;
+    GenerateMemoryTraffic(begin_addr, end_addr);
+  }

  if constexpr (use_shared_mem) {
    __syncthreads();
@@ -157,13 +186,16 @@ std::tuple<std::vector<TestType>, std::vector<TestType>> TestKernelHostRef(const
    auto& res = res_vals[tid % p.width];
    old_vals.push_back(res);

-    if constexpr (operation == AtomicOperation::kAnd || operation == AtomicOperation::kAndSystem) {
+    if constexpr (operation == AtomicOperation::kAnd || operation == AtomicOperation::kAndSystem ||
+                  operation == AtomicOperation::kBuiltinAnd) {
      res = res & mask;
    } else if constexpr (operation == AtomicOperation::kOr ||
-                         operation == AtomicOperation::kOrSystem) {
+                         operation == AtomicOperation::kOrSystem ||
+                         operation == AtomicOperation::kBuiltinOr) {
      res = res | mask;
    } else if constexpr (operation == AtomicOperation::kXor ||
-                         operation == AtomicOperation::kXorSystem) {
+                         operation == AtomicOperation::kXorSystem ||
+                         operation == AtomicOperation::kBuiltinXor) {
      res = res ^ mask;
    }
  }
@@ -188,19 +220,21 @@ void Verify(const TestParams& p, std::vector<TestType>& res_vals, std::vector<Te
  }
 }

-template <typename TestType, AtomicOperation operation, bool use_shared_mem>
+template <typename TestType, AtomicOperation operation, bool use_shared_mem,
+          int memory_scope = __HIP_MEMORY_SCOPE_AGENT>
 void LaunchKernel(const TestParams& p, hipStream_t stream, TestType* const mem_ptr,
                  TestType* const old_vals) {
  const auto shared_mem_size = use_shared_mem ? p.width * p.pitch : 0u;
  if (p.width == 1 && p.pitch == sizeof(TestType))
-    TestKernel<TestType, operation, use_shared_mem>
+    TestKernel<TestType, operation, use_shared_mem, memory_scope>
        <<<p.blocks, p.threads, shared_mem_size, stream>>>(mem_ptr, old_vals);
  else
-    TestKernel<TestType, operation, use_shared_mem>
+    TestKernel<TestType, operation, use_shared_mem, memory_scope>
        <<<p.blocks, p.threads, shared_mem_size, stream>>>(mem_ptr, old_vals, p.width, p.pitch);
 }

-template <typename TestType, AtomicOperation operation, bool use_shared_mem>
+template <typename TestType, AtomicOperation operation, bool use_shared_mem,
+          int memory_scope = __HIP_MEMORY_SCOPE_AGENT>
 void TestCore(const TestParams& p) {
  const auto old_vals_alloc_size = p.kernel_count * p.ThreadCount() * sizeof(TestType);
  std::vector<LinearAllocGuard<TestType>> old_vals_devs;
@@ -232,7 +266,8 @@ void TestCore(const TestParams& p) {
    for (auto j = 0u; j < p.kernel_count; ++j) {
      const auto& stream = streams[i * p.kernel_count + j].stream();
      const auto old_vals = old_vals_devs[i].ptr() + j * p.ThreadCount();
-      LaunchKernel<TestType, operation, use_shared_mem>(p, stream, mem_dev.ptr(), old_vals);
+      LaunchKernel<TestType, operation, use_shared_mem, memory_scope>(p, stream, mem_dev.ptr(),
+                                                                      old_vals);
    }
  }

@@ -247,17 +282,48 @@ void TestCore(const TestParams& p) {
  Verify<TestType, operation>(p, res_vals, old_vals);
 }

-template <typename TestType, AtomicOperation operation>
+inline dim3 GenerateThreadDimensions() { return GENERATE(dim3(16), dim3(1024)); }
+
+inline dim3 GenerateBlockDimensions() {
+  int sm_count = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&sm_count, hipDeviceAttributeMultiprocessorCount, 0));
+  return GENERATE_COPY(dim3(sm_count), dim3(sm_count + sm_count / 2));
+}
+
+template <typename TestType, AtomicOperation operation, int memory_scope = __HIP_MEMORY_SCOPE_AGENT>
 void SingleDeviceSingleKernelTest(const unsigned int width, const unsigned int pitch) {
  TestParams params;
  params.num_devices = 1;
  params.kernel_count = 1;
-  params.threads = GENERATE(dim3(1023));
+  if constexpr ((operation == AtomicOperation::kBuiltinAnd ||
+                 operation == AtomicOperation::kBuiltinOr ||
+                 operation == AtomicOperation::kBuiltinXor) &&
+                memory_scope == __HIP_MEMORY_SCOPE_SINGLETHREAD) {
+    params.threads = 1;
+  } else if constexpr ((operation == AtomicOperation::kBuiltinAnd ||
+                        operation == AtomicOperation::kBuiltinOr ||
+                        operation == AtomicOperation::kBuiltinXor) &&
+                       memory_scope == __HIP_MEMORY_SCOPE_WAVEFRONT) {
+    int warp_size = 0;
+    HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+    params.threads = dim3(warp_size);
+  } else {
+    params.threads = GenerateThreadDimensions();
+  }
  params.width = width;
  params.pitch = pitch;

  SECTION("Global memory") {
-    params.blocks = GENERATE(dim3(3));
+    if constexpr ((operation == AtomicOperation::kBuiltinAnd ||
+                   operation == AtomicOperation::kBuiltinOr ||
+                   operation == AtomicOperation::kBuiltinXor) &&
+                  (memory_scope == __HIP_MEMORY_SCOPE_SINGLETHREAD ||
+                   memory_scope == __HIP_MEMORY_SCOPE_WAVEFRONT ||
+                   memory_scope == __HIP_MEMORY_SCOPE_WORKGROUP)) {
+      params.blocks = dim3(1);
+    } else {
+      params.blocks = GenerateBlockDimensions();
+    }
    using LA = LinearAllocs;
    for (const auto alloc_type :
         {LA::hipMalloc, LA::hipHostMalloc, LA::hipMallocManaged, LA::mallocAndRegister}) {
@@ -288,8 +354,8 @@ void SingleDeviceMultipleKernelTest(const unsigned int kernel_count, const unsig
  TestParams params;
  params.num_devices = 1;
  params.kernel_count = kernel_count;
-  params.blocks = GENERATE(dim3(3));
-  params.threads = GENERATE(dim3(1023));
+  params.blocks = GenerateBlockDimensions();
+  params.threads = GenerateThreadDimensions();
  params.width = width;
  params.pitch = pitch;

@@ -329,8 +395,8 @@ void MultipleDeviceMultipleKernelTest(const unsigned int num_devices,
  TestParams params;
  params.num_devices = num_devices;
  params.kernel_count = kernel_count;
-  params.blocks = GENERATE(dim3(3));
-  params.threads = GENERATE(dim3(1023));
+  params.blocks = GenerateBlockDimensions();
+  params.threads = GenerateThreadDimensions();
  params.width = width;
  params.pitch = pitch;

@@ -338,8 +404,9 @@ void MultipleDeviceMultipleKernelTest(const unsigned int num_devices,
  for (const auto alloc_type : {LA::hipHostMalloc, LA::hipMallocManaged, LA::mallocAndRegister}) {
    params.alloc_type = alloc_type;
    DYNAMIC_SECTION("Allocation type: " << to_string(alloc_type)) {
-      TestCore<TestType, operation, false>(params);
+      TestCore<TestType, operation, false, __HIP_MEMORY_SCOPE_SYSTEM>(params);
    }
  }
 }
+
 }  // namespace Bitwise