From fa5ba557a5ea7edbb15932a6ca40e584dc55fb4f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mirza=20Halil=C4=8Devi=C4=87?=
 <109971222+mirza-halilcevic@users.noreply.github.com>
Date: Thu, 28 Dec 2023 17:54:17 +0100
Subject: [PATCH] EXSWHTEC-300 - Extend tests for atomic bitwise operations
 #288

Change-Id: I499a1c57400f13ebdf056d093228501f4eb2cb31
---
 catch/unit/atomics/CMakeLists.txt            |   3 +
 catch/unit/atomics/__hip_atomic_fetch_and.cc | 187 +++++++++++++++++++
 catch/unit/atomics/__hip_atomic_fetch_or.cc  | 187 +++++++++++++++++++
 catch/unit/atomics/__hip_atomic_fetch_xor.cc | 187 +++++++++++++++++++
 catch/unit/atomics/bitwise_common.hh         | 113 ++++++++---
 5 files changed, 654 insertions(+), 23 deletions(-)
 create mode 100644 catch/unit/atomics/__hip_atomic_fetch_and.cc
 create mode 100644 catch/unit/atomics/__hip_atomic_fetch_or.cc
 create mode 100644 catch/unit/atomics/__hip_atomic_fetch_xor.cc

diff --git a/catch/unit/atomics/CMakeLists.txt b/catch/unit/atomics/CMakeLists.txt
index f18abbf3e5..bfe6e6bf59 100644
--- a/catch/unit/atomics/CMakeLists.txt
+++ b/catch/unit/atomics/CMakeLists.txt
@@ -35,6 +35,9 @@ set(TEST_SRC
     unsafeAtomicMax.cc
     atomicExch.cc
     atomicExch_system.cc
+    __hip_atomic_fetch_and.cc
+    __hip_atomic_fetch_or.cc
+    __hip_atomic_fetch_xor.cc
 )
 
 if(HIP_PLATFORM MATCHES "nvidia")
diff --git a/catch/unit/atomics/__hip_atomic_fetch_and.cc b/catch/unit/atomics/__hip_atomic_fetch_and.cc
new file mode 100644
index 0000000000..51fd37bf59
--- /dev/null
+++ b/catch/unit/atomics/__hip_atomic_fetch_and.cc
@@ -0,0 +1,187 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "bitwise_common.hh"
+
+#include <hip_test_common.hh>
+
+/**
+ * @addtogroup __hip_atomic_fetch_and __hip_atomic_fetch_and
+ * @{
+ * @ingroup AtomicsTest
+ */
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs a builtin atomic AND with memory scope WAVEFRONT from multiple threads on the same
+ * address.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/__hip_atomic_fetch_and.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_and_Positive_Wavefront_SameAddress", "", int,
+                   unsigned int, unsigned long, unsigned long long) {
+  for (auto current = 0; current < cmd_options.iterations; ++current) {
+    DYNAMIC_SECTION("Same address " << current) {
+      Bitwise::SingleDeviceSingleKernelTest<TestType, Bitwise::AtomicOperation::kBuiltinAnd,
+                                            __HIP_MEMORY_SCOPE_WAVEFRONT>(1, sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs a builtin atomic AND with memory scope WAVEFRONT from multiple threads on adjacent
+ * addresses.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/__hip_atomic_fetch_and.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_and_Positive_Wavefront_Adjacent_Addresses", "", int,
+                   unsigned int, unsigned long, unsigned long long) {
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+
+  for (auto current = 0; current < cmd_options.iterations; ++current) {
+    DYNAMIC_SECTION("Adjacent address " << current) {
+      Bitwise::SingleDeviceSingleKernelTest<TestType, Bitwise::AtomicOperation::kBuiltinAnd,
+                                            __HIP_MEMORY_SCOPE_WAVEFRONT>(warp_size,
+                                                                          sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs a builtin atomic AND with memory scope WAVEFRONT from multiple threads on scattered
+ * addresses.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/__hip_atomic_fetch_and.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_and_Positive_Wavefront_Scattered_Addresses", "", int,
+                   unsigned int, unsigned long, unsigned long long) {
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+  const auto cache_line_size = 128u;
+
+  for (auto current = 0; current < cmd_options.iterations; ++current) {
+    DYNAMIC_SECTION("Scattered address " << current) {
+      Bitwise::SingleDeviceSingleKernelTest<TestType, Bitwise::AtomicOperation::kBuiltinAnd,
+                                            __HIP_MEMORY_SCOPE_WAVEFRONT>(warp_size,
+                                                                          cache_line_size);
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs a builtin atomic AND with memory scope WORKGROUP from multiple threads on the same
+ * address.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/__hip_atomic_fetch_and.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_and_Positive_Workgroup_SameAddress", "", int,
+                   unsigned int, unsigned long, unsigned long long) {
+  for (auto current = 0; current < cmd_options.iterations; ++current) {
+    DYNAMIC_SECTION("Same address " << current) {
+      Bitwise::SingleDeviceSingleKernelTest<TestType, Bitwise::AtomicOperation::kBuiltinAnd,
+                                            __HIP_MEMORY_SCOPE_WORKGROUP>(1, sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs a builtin atomic AND with memory scope WORKGROUP from multiple threads on adjacent
+ * addresses.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/__hip_atomic_fetch_and.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_and_Positive_Workgroup_Adjacent_Addresses", "", int,
+                   unsigned int, unsigned long, unsigned long long) {
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+
+  for (auto current = 0; current < cmd_options.iterations; ++current) {
+    DYNAMIC_SECTION("Adjacent address " << current) {
+      Bitwise::SingleDeviceSingleKernelTest<TestType, Bitwise::AtomicOperation::kBuiltinAnd,
+                                            __HIP_MEMORY_SCOPE_WORKGROUP>(warp_size,
+                                                                          sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs a builtin atomic AND with memory scope WORKGROUP from multiple threads on scattered
+ * addresses.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/__hip_atomic_fetch_and.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_and_Positive_Workgroup_Scattered_Addresses", "", int,
+                   unsigned int, unsigned long, unsigned long long) {
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+  const auto cache_line_size = 128u;
+
+  for (auto current = 0; current < cmd_options.iterations; ++current) {
+    DYNAMIC_SECTION("Scattered address " << current) {
+      Bitwise::SingleDeviceSingleKernelTest<TestType, Bitwise::AtomicOperation::kBuiltinAnd,
+                                            __HIP_MEMORY_SCOPE_WORKGROUP>(warp_size,
+                                                                          cache_line_size);
+    }
+  }
+}
diff --git a/catch/unit/atomics/__hip_atomic_fetch_or.cc b/catch/unit/atomics/__hip_atomic_fetch_or.cc
new file mode 100644
index 0000000000..000df50f80
--- /dev/null
+++ b/catch/unit/atomics/__hip_atomic_fetch_or.cc
@@ -0,0 +1,187 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "bitwise_common.hh"
+
+#include <hip_test_common.hh>
+
+/**
+ * @addtogroup __hip_atomic_fetch_or __hip_atomic_fetch_or
+ * @{
+ * @ingroup AtomicsTest
+ */
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs a builtin atomic OR with memory scope WAVEFRONT from multiple threads on the same
+ * address.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/__hip_atomic_fetch_or.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_or_Positive_Wavefront_SameAddress", "", int,
+                   unsigned int, unsigned long, unsigned long long) {
+  for (auto current = 0; current < cmd_options.iterations; ++current) {
+    DYNAMIC_SECTION("Same address " << current) {
+      Bitwise::SingleDeviceSingleKernelTest<TestType, Bitwise::AtomicOperation::kBuiltinOr,
+                                            __HIP_MEMORY_SCOPE_WAVEFRONT>(1, sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs a builtin atomic OR with memory scope WAVEFRONT from multiple threads on adjacent
+ * addresses.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/__hip_atomic_fetch_or.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_or_Positive_Wavefront_Adjacent_Addresses", "", int,
+                   unsigned int, unsigned long, unsigned long long) {
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+
+  for (auto current = 0; current < cmd_options.iterations; ++current) {
+    DYNAMIC_SECTION("Adjacent address " << current) {
+      Bitwise::SingleDeviceSingleKernelTest<TestType, Bitwise::AtomicOperation::kBuiltinOr,
+                                            __HIP_MEMORY_SCOPE_WAVEFRONT>(warp_size,
+                                                                          sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs a builtin atomic OR with memory scope WAVEFRONT from multiple threads on scattered
+ * addresses.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/__hip_atomic_fetch_or.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_or_Positive_Wavefront_Scattered_Addresses", "", int,
+                   unsigned int, unsigned long, unsigned long long) {
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+  const auto cache_line_size = 128u;
+
+  for (auto current = 0; current < cmd_options.iterations; ++current) {
+    DYNAMIC_SECTION("Scattered address " << current) {
+      Bitwise::SingleDeviceSingleKernelTest<TestType, Bitwise::AtomicOperation::kBuiltinOr,
+                                            __HIP_MEMORY_SCOPE_WAVEFRONT>(warp_size,
+                                                                          cache_line_size);
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs a builtin atomic OR with memory scope WORKGROUP from multiple threads on the same
+ * address.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/__hip_atomic_fetch_or.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_or_Positive_Workgroup_SameAddress", "", int,
+                   unsigned int, unsigned long, unsigned long long) {
+  for (auto current = 0; current < cmd_options.iterations; ++current) {
+    DYNAMIC_SECTION("Same address " << current) {
+      Bitwise::SingleDeviceSingleKernelTest<TestType, Bitwise::AtomicOperation::kBuiltinOr,
+                                            __HIP_MEMORY_SCOPE_WORKGROUP>(1, sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs a builtin atomic OR with memory scope WORKGROUP from multiple threads on adjacent
+ * addresses.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/__hip_atomic_fetch_or.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_or_Positive_Workgroup_Adjacent_Addresses", "", int,
+                   unsigned int, unsigned long, unsigned long long) {
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+
+  for (auto current = 0; current < cmd_options.iterations; ++current) {
+    DYNAMIC_SECTION("Adjacent address " << current) {
+      Bitwise::SingleDeviceSingleKernelTest<TestType, Bitwise::AtomicOperation::kBuiltinOr,
+                                            __HIP_MEMORY_SCOPE_WORKGROUP>(warp_size,
+                                                                          sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs a builtin atomic OR with memory scope WORKGROUP from multiple threads on scattered
+ * addresses.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/__hip_atomic_fetch_or.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_or_Positive_Workgroup_Scattered_Addresses", "", int,
+                   unsigned int, unsigned long, unsigned long long) {
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+  const auto cache_line_size = 128u;
+
+  for (auto current = 0; current < cmd_options.iterations; ++current) {
+    DYNAMIC_SECTION("Scattered address " << current) {
+      Bitwise::SingleDeviceSingleKernelTest<TestType, Bitwise::AtomicOperation::kBuiltinOr,
+                                            __HIP_MEMORY_SCOPE_WORKGROUP>(warp_size,
+                                                                          cache_line_size);
+    }
+  }
+}
diff --git a/catch/unit/atomics/__hip_atomic_fetch_xor.cc b/catch/unit/atomics/__hip_atomic_fetch_xor.cc
new file mode 100644
index 0000000000..0f3f3f3743
--- /dev/null
+++ b/catch/unit/atomics/__hip_atomic_fetch_xor.cc
@@ -0,0 +1,187 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "bitwise_common.hh"
+
+#include <hip_test_common.hh>
+
+/**
+ * @addtogroup __hip_atomic_fetch_xor __hip_atomic_fetch_xor
+ * @{
+ * @ingroup AtomicsTest
+ */
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs a builtin atomic XOR with memory scope WAVEFRONT from multiple threads on the same
+ * address.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/__hip_atomic_fetch_xor.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_xor_Positive_Wavefront_SameAddress", "", int,
+                   unsigned int, unsigned long, unsigned long long) {
+  for (auto current = 0; current < cmd_options.iterations; ++current) {
+    DYNAMIC_SECTION("Same address " << current) {
+      Bitwise::SingleDeviceSingleKernelTest<TestType, Bitwise::AtomicOperation::kBuiltinXor,
+                                            __HIP_MEMORY_SCOPE_WAVEFRONT>(1, sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs a builtin atomic XOR with memory scope WAVEFRONT from multiple threads on adjacent
+ * addresses.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/__hip_atomic_fetch_xor.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_xor_Positive_Wavefront_Adjacent_Addresses", "", int,
+                   unsigned int, unsigned long, unsigned long long) {
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+
+  for (auto current = 0; current < cmd_options.iterations; ++current) {
+    DYNAMIC_SECTION("Adjacent address " << current) {
+      Bitwise::SingleDeviceSingleKernelTest<TestType, Bitwise::AtomicOperation::kBuiltinXor,
+                                            __HIP_MEMORY_SCOPE_WAVEFRONT>(warp_size,
+                                                                          sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs a builtin atomic XOR with memory scope WAVEFRONT from multiple threads on scattered
+ * addresses.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/__hip_atomic_fetch_xor.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_xor_Positive_Wavefront_Scattered_Addresses", "", int,
+                   unsigned int, unsigned long, unsigned long long) {
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+  const auto cache_line_size = 128u;
+
+  for (auto current = 0; current < cmd_options.iterations; ++current) {
+    DYNAMIC_SECTION("Scattered address " << current) {
+      Bitwise::SingleDeviceSingleKernelTest<TestType, Bitwise::AtomicOperation::kBuiltinXor,
+                                            __HIP_MEMORY_SCOPE_WAVEFRONT>(warp_size,
+                                                                          cache_line_size);
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs a builtin atomic XOR with memory scope WORKGROUP from multiple threads on the same
+ * address.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/__hip_atomic_fetch_xor.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_xor_Positive_Workgroup_SameAddress", "", int,
+                   unsigned int, unsigned long, unsigned long long) {
+  for (auto current = 0; current < cmd_options.iterations; ++current) {
+    DYNAMIC_SECTION("Same address " << current) {
+      Bitwise::SingleDeviceSingleKernelTest<TestType, Bitwise::AtomicOperation::kBuiltinXor,
+                                            __HIP_MEMORY_SCOPE_WORKGROUP>(1, sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs a builtin atomic XOR with memory scope WORKGROUP from multiple threads on adjacent
+ * addresses.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/__hip_atomic_fetch_xor.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_xor_Positive_Workgroup_Adjacent_Addresses", "", int,
+                   unsigned int, unsigned long, unsigned long long) {
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+
+  for (auto current = 0; current < cmd_options.iterations; ++current) {
+    DYNAMIC_SECTION("Adjacent address " << current) {
+      Bitwise::SingleDeviceSingleKernelTest<TestType, Bitwise::AtomicOperation::kBuiltinXor,
+                                            __HIP_MEMORY_SCOPE_WORKGROUP>(warp_size,
+                                                                          sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs a builtin atomic XOR with memory scope WORKGROUP from multiple threads on scattered
+ * addresses.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/__hip_atomic_fetch_xor.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_xor_Positive_Workgroup_Scattered_Addresses", "", int,
+                   unsigned int, unsigned long, unsigned long long) {
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+  const auto cache_line_size = 128u;
+
+  for (auto current = 0; current < cmd_options.iterations; ++current) {
+    DYNAMIC_SECTION("Scattered address " << current) {
+      Bitwise::SingleDeviceSingleKernelTest<TestType, Bitwise::AtomicOperation::kBuiltinXor,
+                                            __HIP_MEMORY_SCOPE_WORKGROUP>(warp_size,
+                                                                          cache_line_size);
+    }
+  }
+}
diff --git a/catch/unit/atomics/bitwise_common.hh b/catch/unit/atomics/bitwise_common.hh
index 9e71c99cb0..887d25d4f9 100644
--- a/catch/unit/atomics/bitwise_common.hh
+++ b/catch/unit/atomics/bitwise_common.hh
@@ -22,10 +22,10 @@ THE SOFTWARE.
 
 #pragma once
 
+#include <cmd_options.hh>
 #include <hip_test_common.hh>
 #include <hip/hip_cooperative_groups.h>
 #include <resource_guards.hh>
-#include <cmd_options.hh>
 
 namespace cg = cooperative_groups;
 
@@ -37,6 +37,9 @@ enum class AtomicOperation {
   kOrSystem,
   kXor,
   kXorSystem,
+  kBuiltinAnd,
+  kBuiltinOr,
+  kBuiltinXor
 };
 
 constexpr auto kMask = 0xAAAA;
@@ -52,7 +55,7 @@ __host__ __device__ TestType GetTestValue() {
   return kTestValue;
 }
 
-template <typename TestType, AtomicOperation operation>
+template <typename TestType, AtomicOperation operation, int memory_scope = __HIP_MEMORY_SCOPE_AGENT>
 __device__ TestType PerformAtomicOperation(TestType* const mem) {
   const auto mask = kMask;
 
@@ -68,10 +71,17 @@ __device__ TestType PerformAtomicOperation(TestType* const mem) {
     return atomicXor(mem, mask);
   } else if constexpr (operation == AtomicOperation::kXorSystem) {
     return atomicXor_system(mem, mask);
+  } else if constexpr (operation == AtomicOperation::kBuiltinAnd) {
+    return __hip_atomic_fetch_and(mem, mask, __ATOMIC_RELAXED, memory_scope);
+  } else if constexpr (operation == AtomicOperation::kBuiltinOr) {
+    return __hip_atomic_fetch_or(mem, mask, __ATOMIC_RELAXED, memory_scope);
+  } else if constexpr (operation == AtomicOperation::kBuiltinXor) {
+    return __hip_atomic_fetch_xor(mem, mask, __ATOMIC_RELAXED, memory_scope);
   }
 }
 
-template <typename TestType, AtomicOperation operation, bool use_shared_mem>
+template <typename TestType, AtomicOperation operation, bool use_shared_mem,
+          int memory_scope = __HIP_MEMORY_SCOPE_AGENT>
 __global__ void TestKernel(TestType* const global_mem, TestType* const old_vals) {
   __shared__ TestType shared_mem;
 
@@ -84,7 +94,7 @@ __global__ void TestKernel(TestType* const global_mem, TestType* const old_vals)
     __syncthreads();
   }
 
-  old_vals[tid] = PerformAtomicOperation<TestType, operation>(mem);
+  old_vals[tid] = PerformAtomicOperation<TestType, operation, memory_scope>(mem);
 
   if constexpr (use_shared_mem) {
     __syncthreads();
@@ -99,7 +109,16 @@ __host__ __device__ TestType* PitchedOffset(TestType* const ptr, const unsigned
   return reinterpret_cast<TestType*>(byte_ptr + idx * pitch);
 }
 
-template <typename TestType, AtomicOperation operation, bool use_shared_mem>
+__device__ void GenerateMemoryTraffic(uint8_t* const begin_addr, uint8_t* const end_addr) {
+  for (volatile uint8_t* addr = begin_addr; addr != end_addr; ++addr) {
+    uint8_t val = *addr;
+    val ^= 0xAB;
+    *addr = val;
+  }
+}
+
+template <typename TestType, AtomicOperation operation, bool use_shared_mem,
+          int memory_scope = __HIP_MEMORY_SCOPE_AGENT>
 __global__ void TestKernel(TestType* const global_mem, TestType* const old_vals,
                            const unsigned int width, const unsigned pitch) {
   extern __shared__ uint8_t shared_mem[];
@@ -116,8 +135,18 @@ __global__ void TestKernel(TestType* const global_mem, TestType* const old_vals,
     __syncthreads();
   }
 
-  old_vals[tid] =
-      PerformAtomicOperation<TestType, operation>(PitchedOffset(mem, pitch, tid % width));
+  const auto n = cooperative_groups::this_grid().size() - width;
+
+  TestType* atomic_addr = PitchedOffset(mem, pitch, tid % width);
+
+  if (tid < n) {
+    old_vals[tid] = PerformAtomicOperation<TestType, operation, memory_scope>(
+        PitchedOffset(mem, pitch, tid % width));
+  } else {
+    uint8_t* const begin_addr = reinterpret_cast<uint8_t*>(atomic_addr + 1);
+    uint8_t* const end_addr = reinterpret_cast<uint8_t*>(atomic_addr) + pitch;
+    GenerateMemoryTraffic(begin_addr, end_addr);
+  }
 
   if constexpr (use_shared_mem) {
     __syncthreads();
@@ -157,13 +186,16 @@ std::tuple<std::vector<TestType>, std::vector<TestType>> TestKernelHostRef(const
     auto& res = res_vals[tid % p.width];
     old_vals.push_back(res);
 
-    if constexpr (operation == AtomicOperation::kAnd || operation == AtomicOperation::kAndSystem) {
+    if constexpr (operation == AtomicOperation::kAnd || operation == AtomicOperation::kAndSystem ||
+                  operation == AtomicOperation::kBuiltinAnd) {
       res = res & mask;
     } else if constexpr (operation == AtomicOperation::kOr ||
-                         operation == AtomicOperation::kOrSystem) {
+                         operation == AtomicOperation::kOrSystem ||
+                         operation == AtomicOperation::kBuiltinOr) {
       res = res | mask;
     } else if constexpr (operation == AtomicOperation::kXor ||
-                         operation == AtomicOperation::kXorSystem) {
+                         operation == AtomicOperation::kXorSystem ||
+                         operation == AtomicOperation::kBuiltinXor) {
       res = res ^ mask;
     }
   }
@@ -188,19 +220,21 @@ void Verify(const TestParams& p, std::vector<TestType>& res_vals, std::vector<Te
   }
 }
 
-template <typename TestType, AtomicOperation operation, bool use_shared_mem>
+template <typename TestType, AtomicOperation operation, bool use_shared_mem,
+          int memory_scope = __HIP_MEMORY_SCOPE_AGENT>
 void LaunchKernel(const TestParams& p, hipStream_t stream, TestType* const mem_ptr,
                   TestType* const old_vals) {
   const auto shared_mem_size = use_shared_mem ? p.width * p.pitch : 0u;
   if (p.width == 1 && p.pitch == sizeof(TestType))
-    TestKernel<TestType, operation, use_shared_mem>
+    TestKernel<TestType, operation, use_shared_mem, memory_scope>
         <<<p.blocks, p.threads, shared_mem_size, stream>>>(mem_ptr, old_vals);
   else
-    TestKernel<TestType, operation, use_shared_mem>
+    TestKernel<TestType, operation, use_shared_mem, memory_scope>
         <<<p.blocks, p.threads, shared_mem_size, stream>>>(mem_ptr, old_vals, p.width, p.pitch);
 }
 
-template <typename TestType, AtomicOperation operation, bool use_shared_mem>
+template <typename TestType, AtomicOperation operation, bool use_shared_mem,
+          int memory_scope = __HIP_MEMORY_SCOPE_AGENT>
 void TestCore(const TestParams& p) {
   const auto old_vals_alloc_size = p.kernel_count * p.ThreadCount() * sizeof(TestType);
   std::vector<LinearAllocGuard<TestType>> old_vals_devs;
@@ -232,7 +266,8 @@ void TestCore(const TestParams& p) {
     for (auto j = 0u; j < p.kernel_count; ++j) {
       const auto& stream = streams[i * p.kernel_count + j].stream();
       const auto old_vals = old_vals_devs[i].ptr() + j * p.ThreadCount();
-      LaunchKernel<TestType, operation, use_shared_mem>(p, stream, mem_dev.ptr(), old_vals);
+      LaunchKernel<TestType, operation, use_shared_mem, memory_scope>(p, stream, mem_dev.ptr(),
+                                                                      old_vals);
     }
   }
 
@@ -247,17 +282,48 @@ void TestCore(const TestParams& p) {
   Verify<TestType, operation>(p, res_vals, old_vals);
 }
 
-template <typename TestType, AtomicOperation operation>
+inline dim3 GenerateThreadDimensions() { return GENERATE(dim3(16), dim3(1024)); }
+
+inline dim3 GenerateBlockDimensions() {
+  int sm_count = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&sm_count, hipDeviceAttributeMultiprocessorCount, 0));
+  return GENERATE_COPY(dim3(sm_count), dim3(sm_count + sm_count / 2));
+}
+
+template <typename TestType, AtomicOperation operation, int memory_scope = __HIP_MEMORY_SCOPE_AGENT>
 void SingleDeviceSingleKernelTest(const unsigned int width, const unsigned int pitch) {
   TestParams params;
   params.num_devices = 1;
   params.kernel_count = 1;
-  params.threads = GENERATE(dim3(1023));
+  if constexpr ((operation == AtomicOperation::kBuiltinAnd ||
+                 operation == AtomicOperation::kBuiltinOr ||
+                 operation == AtomicOperation::kBuiltinXor) &&
+                memory_scope == __HIP_MEMORY_SCOPE_SINGLETHREAD) {
+    params.threads = 1;
+  } else if constexpr ((operation == AtomicOperation::kBuiltinAnd ||
+                        operation == AtomicOperation::kBuiltinOr ||
+                        operation == AtomicOperation::kBuiltinXor) &&
+                       memory_scope == __HIP_MEMORY_SCOPE_WAVEFRONT) {
+    int warp_size = 0;
+    HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+    params.threads = dim3(warp_size);
+  } else {
+    params.threads = GenerateThreadDimensions();
+  }
   params.width = width;
   params.pitch = pitch;
 
   SECTION("Global memory") {
-    params.blocks = GENERATE(dim3(3));
+    if constexpr ((operation == AtomicOperation::kBuiltinAnd ||
+                   operation == AtomicOperation::kBuiltinOr ||
+                   operation == AtomicOperation::kBuiltinXor) &&
+                  (memory_scope == __HIP_MEMORY_SCOPE_SINGLETHREAD ||
+                   memory_scope == __HIP_MEMORY_SCOPE_WAVEFRONT ||
+                   memory_scope == __HIP_MEMORY_SCOPE_WORKGROUP)) {
+      params.blocks = dim3(1);
+    } else {
+      params.blocks = GenerateBlockDimensions();
+    }
     using LA = LinearAllocs;
     for (const auto alloc_type :
          {LA::hipMalloc, LA::hipHostMalloc, LA::hipMallocManaged, LA::mallocAndRegister}) {
@@ -288,8 +354,8 @@ void SingleDeviceMultipleKernelTest(const unsigned int kernel_count, const unsig
   TestParams params;
   params.num_devices = 1;
   params.kernel_count = kernel_count;
-  params.blocks = GENERATE(dim3(3));
-  params.threads = GENERATE(dim3(1023));
+  params.blocks = GenerateBlockDimensions();
+  params.threads = GenerateThreadDimensions();
   params.width = width;
   params.pitch = pitch;
 
@@ -329,8 +395,8 @@ void MultipleDeviceMultipleKernelTest(const unsigned int num_devices,
   TestParams params;
   params.num_devices = num_devices;
   params.kernel_count = kernel_count;
-  params.blocks = GENERATE(dim3(3));
-  params.threads = GENERATE(dim3(1023));
+  params.blocks = GenerateBlockDimensions();
+  params.threads = GenerateThreadDimensions();
   params.width = width;
   params.pitch = pitch;
 
@@ -338,8 +404,9 @@ void MultipleDeviceMultipleKernelTest(const unsigned int num_devices,
   for (const auto alloc_type : {LA::hipHostMalloc, LA::hipMallocManaged, LA::mallocAndRegister}) {
     params.alloc_type = alloc_type;
     DYNAMIC_SECTION("Allocation type: " << to_string(alloc_type)) {
-      TestCore<TestType, operation, false>(params);
+      TestCore<TestType, operation, false, __HIP_MEMORY_SCOPE_SYSTEM>(params);
     }
   }
 }
+
 }  // namespace Bitwise