SWDEV-453422: introduce warp sync tests

The following builtins are tested: - __all_sync, __any_sync, __ballot_sync and __activemask - __match_any_sync and __match_all_sync - __shfl_sync, __shfl_up_sync, __shfl_down_sync, and __shfl_xor_sync The tests for shfl (all variants) were manually validated on a CUDA setup. NOTE: - Unit_hipVoteSync_All temporarily disabled on Windows (SWDEV-452308). - All new tests temporarily disabled on CUDA (SWDEV-453145). Change-Id: I84b205a88aa24219d199c760793e2f19f0cf8f13 [ROCm/hip-tests commit: 53f0a9bd01]
2024-03-21 16:28:33 +05:30
Commit ec1f43cd05
@@ -1435,6 +1435,8 @@
        "=== SWDEV-453453 : Below tests failed in stress test on 22/03/24 ===",
        "Unit_hipDeviceGetGraphMemAttribute_Functional",
        "Unit_hipDeviceGetGraphMemAttribute_Functional_Multi_Device",
+        "=== SWDEV-452308 : Windows failure, should be fixed for ROCm 6.3. ===",
+        "Unit_hipVoteSync_All",
    #endif
        "End of json"
    ]
@@ -170,6 +170,21 @@ static void initHipCtx(hipCtx_t* pcontext) {
 #define HIP_ARRAY hipArray_t
 #endif

+static inline int getWarpSize() {
+#if HT_NVIDIA
+  return 32;
+#elif HT_AMD
+  int device = -1;
+  int warpSize = -1;
+  HIP_CHECK(hipGetDevice(&device));
+  HIP_CHECK(hipDeviceGetAttribute(&warpSize, hipDeviceAttributeWarpSize, device));
+  return warpSize;
+#else
+  std::cout<<"Have to be either Nvidia or AMD platform, asserting"<<std::endl;
+  assert(false);
+#endif
+}
+
 static inline bool IsGfx11() {
 #if HT_NVIDIA
  return false;
@@ -22,7 +22,6 @@
 set(TEST_SRC
    hipMemFaultStackAllocation.cc
    hipLaunchBounds.cc
-    hipShflTests.cc
    hipDynamicShared.cc
    hipDynamicShared2.cc
    hipEmptyKernel.cc
@@ -43,7 +42,6 @@ endif()
 # only for AMD
 if(HIP_PLATFORM MATCHES "amd")
  set(AMD_SRC
-      hipShflUpDownTest.cc
      hipExtLaunchKernelGGL.cc
  )
  set(TEST_SRC ${TEST_SRC} ${AMD_SRC})
@@ -15,6 +15,7 @@ set(AMD_TEST_SRC
    hipRtcBfloat16.cc
    linker.cc
    shfl.cc
+    shfl_sync.cc
    stdheaders.cc
    hiprtc_MathConstants_HeaderTst.cc
    hiprtc_VectorTypes_HeaderTst.cc
@@ -31,8 +31,7 @@ THE SOFTWARE.
 #include <iostream>
 #include <iterator>
 #include <vector>
-
-static constexpr int n = 32;
+#include "shfl.hh"

 static constexpr auto shfl {
 R"(
@@ -63,45 +62,6 @@ __global__ void shflXorSum(T* a, int size) {
 }
 )"};

-void getFactor(int& fact) { fact = 101; }
-void getFactor(__half& fact) { fact = 2.5; }
-
-template <typename T> T sum(T* a) {
-  T cpuSum = 0;
-  T factor;
-  getFactor(factor);
-  for (int i = 0; i < n; i++) {
-    a[i] = i + factor;
-    cpuSum += a[i];
-  }
-  return cpuSum;
-}
-
-template <typename T> bool compare(T gpuSum, T cpuSum) {
-  if (gpuSum != cpuSum) {
-    return true;
-  }
-  return false;
-}
-
-template <> __half sum(__half* a) {
-  __half cpuSum = 0;
-  __half factor;
-  getFactor(factor);
-  for (int i = 0; i < n; i++) {
-    a[i] = i + __half2float(factor);
-    cpuSum = __half2float(cpuSum) + __half2float(a[i]);
-  }
-  return cpuSum;
-}
-
-template <> bool compare(__half gpuSum, __half cpuSum) {
-  if (__half2float(gpuSum) != __half2float(cpuSum)) {
-    return true;
-  }
-  return false;
-}
-
 template <typename T>
 void runTestShfl(int option) {
  using namespace std;
@@ -0,0 +1,64 @@
+/*
+Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+static constexpr int n = 32;
+
+inline void getFactor(int& fact) { fact = 101; }
+inline void getFactor(__half& fact) { fact = 2.5; }
+
+template <typename T> inline T sum(T* a) {
+  T cpuSum = 0;
+  T factor;
+  getFactor(factor);
+  for (int i = 0; i < n; i++) {
+    a[i] = i + factor;
+    cpuSum += a[i];
+  }
+  return cpuSum;
+}
+
+template <typename T> inline bool compare(T gpuSum, T cpuSum) {
+  if (gpuSum != cpuSum) {
+    return true;
+  }
+  return false;
+}
+
+template <> inline __half sum(__half* a) {
+  __half cpuSum = 0;
+  __half factor;
+  getFactor(factor);
+  for (int i = 0; i < n; i++) {
+    a[i] = i + __half2float(factor);
+    cpuSum = __half2float(cpuSum) + __half2float(a[i]);
+  }
+  return cpuSum;
+}
+
+template <> inline bool compare(__half gpuSum, __half cpuSum) {
+  if (__half2float(gpuSum) != __half2float(cpuSum)) {
+    return true;
+  }
+  return false;
+}
@@ -0,0 +1,161 @@
+/*
+Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_common.hh>
+
+#include <hip/hiprtc.h>
+#include <hip/hip_runtime.h>
+#include <hip/hip_fp16.h>
+#include <cassert>
+#include <cstddef>
+#include <memory>
+#include <iostream>
+#include <iterator>
+#include <vector>
+#include "shfl.hh"
+
+static constexpr auto shfl {
+  R"(
+template <typename T>
+__global__ void shflUpSum(T* a, int size) {
+  T val = a[threadIdx.x];
+  auto all_threads = __activemask();
+  for (unsigned int i = size / 2; i > 0; i /= 2) {
+    val += __shfl_up_sync(all_threads, val, i, size);
+  }
+  a[threadIdx.x] = val;
+}
+
+template <typename T>
+__global__ void shflDownSum(T* a, int size) {
+  T val = a[threadIdx.x];
+  auto all_threads = __activemask();
+  for (int i = size / 2; i > 0; i /= 2) {
+    val += __shfl_down_sync(all_threads, val, i, size);
+  }
+  a[threadIdx.x] = val;
+}
+
+template <typename T>
+__global__ void shflXorSum(T* a, int size) {
+  T val = a[threadIdx.x];
+  auto all_threads = __activemask();
+  for (int i = size/2; i > 0; i /= 2) {
+    val += __shfl_xor_sync(all_threads, val, i, size);
+  }
+  a[threadIdx.x] = val;
+}
+)"};
+
+template <typename T>
+void runTestShflSync(int option) {
+  using namespace std;
+  hiprtcProgram prog;
+  hiprtcCreateProgram(&prog,      // prog
+                      shfl,       // buffer
+                      "shfl_sync.cu",  // name
+                      0, nullptr, nullptr);
+
+  string str;
+  switch(option) {
+  case 1:
+  str = "shflUpSum<__half>"; break;
+  case 2:
+  str = "shflDownSum<__half>"; break;
+  case 3:
+  str = "shflXorSum<__half>"; break;
+  default:
+  INFO("Options 1,2,3 are supported, but the passed option is: " << option);
+  REQUIRE(false);
+  }
+
+  hiprtcAddNameExpression(prog, str.c_str());
+  const char* options[] = { "-DHIP_ENABLE_WARP_SYNC_BUILTINS" };
+  hiprtcResult compileResult{hiprtcCompileProgram(prog, 1, options)};
+  size_t logSize;
+  HIPRTC_CHECK(hiprtcGetProgramLogSize(prog, &logSize));
+  if (logSize) {
+    string log(logSize, '\0');
+    HIPRTC_CHECK(hiprtcGetProgramLog(prog, &log[0]));
+    std::cout << log << '\n';
+  }
+  REQUIRE(compileResult == HIPRTC_SUCCESS);
+  size_t codeSize;
+  HIPRTC_CHECK(hiprtcGetCodeSize(prog, &codeSize));
+
+  vector<char> code(codeSize);
+  HIPRTC_CHECK(hiprtcGetCode(prog, code.data()));
+
+  // Do hip malloc first so that we donot need to do a cuInit manually before calling hipModule APIs
+  size_t bufferSize = n * sizeof(T);
+
+  T a[n];
+  T cpuSum = sum(a);
+  T* d_a;
+  HIP_CHECK(hipMalloc(&d_a, bufferSize));
+
+  hipModule_t module;
+  hipFunction_t kernel;
+  HIP_CHECK(hipModuleLoadData(&module, code.data()));
+  const char* name;
+  hiprtcGetLoweredName(prog, str.c_str(), &name);
+  HIP_CHECK(hipModuleGetFunction(&kernel, module, name));
+
+  HIP_CHECK(hipMemcpy(d_a, &a, bufferSize, hipMemcpyDefault));
+
+  struct {
+    T* a_;
+    int b_;
+  } args{d_a, n};
+
+  auto size = sizeof(args);
+  void* config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, &args, HIP_LAUNCH_PARAM_BUFFER_SIZE, &size,
+                    HIP_LAUNCH_PARAM_END};
+
+  HIP_CHECK(hipModuleLaunchKernel(kernel, 1, 1, 1, n, 1, 1, 0, nullptr, nullptr, config));
+
+  HIP_CHECK(hipMemcpy(&a, d_a, bufferSize, hipMemcpyDefault));
+  bool result;
+  switch (option) {
+  case 1: //shflUpSum
+  result = compare(a[n - 1], cpuSum); break;
+  case 2: //shflDownSum
+  case 3: //shflXorSum
+  result = compare(a[0], cpuSum); break;
+  }
+
+  if (result) {
+    HIP_CHECK(hipFree(d_a));
+    REQUIRE(false);
+  }
+
+  HIP_CHECK(hipFree(d_a));
+  HIP_CHECK(hipModuleUnload(module));
+  HIPRTC_CHECK(hiprtcDestroyProgram(&prog));
+
+}
+
+TEST_CASE("Unit_hiprtc_half_shuffle_sync") {
+  runTestShflSync<__half>(1);
+  runTestShflSync<__half>(2);
+  runTestShflSync<__half>(3);
+}
@@ -11,6 +11,18 @@ if(HIP_PLATFORM MATCHES "amd")
    warp_shfl_xor.cc
    warp_shfl_up.cc
    warp_shfl_down.cc
+    hipShflUpDownTest.cc
+    hipShflTests.cc
+    # FIXME: The tests for sync intrinsics are temporarily disabled on CUDA
+    # because they depend on __match_any_sync, which is not available on older
+    # NVIDIA devices. (SWDEV-453145)
+    hipMatchSyncAllTests.cc
+    hipMatchSyncAnyTests.cc
+    hipShflSyncDownTests.cc
+    hipShflSyncUpTests.cc
+    hipShflSyncXorTests.cc
+    hipShflSyncTests.cc
+    hipVoteSyncTests.cc
  )
 endif()

@@ -0,0 +1,281 @@
+/*
+Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "warp_common.hh"
+#include <hip_test_common.hh>
+
+template <typename T>
+__global__ void matchAll_1(T *Input, unsigned long long *Output, int *Predicate) {
+  auto tid = threadIdx.x;
+
+  Output[tid] = __match_all_sync(AllThreads, Input[tid], &Predicate[tid]);
+}
+
+template <typename T>
+static void runTestMatchAll_1() {
+  const int size = 64;
+  T Input[size] = {(T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5,
+                   (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5,
+                   (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5,
+                   (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5,
+                   (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5,
+                   (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5,
+                   (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5,
+                   (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5,};
+  unsigned long long Output[size];
+  unsigned long long Expected[size];
+  std::fill_n(Expected, size, -1);
+
+  int Predicate[size];
+  int ExpPredicate[size];
+  std::fill_n(ExpPredicate, size, true);
+
+  expandPrecision(Input, size);
+
+  int warpSize = getWarpSize();
+
+  T* d_Input;
+  unsigned long long* d_Output;
+  int *d_Predicate;
+  HIP_CHECK(hipMalloc(&d_Input, sizeof(T) * size));
+  HIP_CHECK(hipMalloc(&d_Output, 8 * size));
+  HIP_CHECK(hipMalloc(&d_Predicate, 4 * size));
+
+  HIP_CHECK(hipMemcpy(d_Input, &Input, sizeof(T) * size, hipMemcpyDefault));
+  hipLaunchKernelGGL(matchAll_1<T>, 1, warpSize, 0, 0, d_Input, d_Output, d_Predicate);
+
+  HIP_CHECK(hipMemcpy(&Output, d_Output, 8 * size, hipMemcpyDefault));
+  for (int i = 0; i != warpSize; ++i) {
+    REQUIRE(compareMaskEqual(Output, Expected, i, warpSize));
+  }
+
+  HIP_CHECK(hipMemcpy(&Predicate, d_Predicate, 4 * size, hipMemcpyDefault));
+  for (int i = 0; i != warpSize; ++i) {
+    REQUIRE(Predicate[i] == ExpPredicate[i]);
+  }
+}
+
+template <typename T>
+__global__ void matchAll_2(T *Input, unsigned long long *Output, int *Predicate) {
+  auto tid = threadIdx.x;
+
+  Output[tid] = __match_all_sync(AllThreads, Input[tid], &Predicate[tid]);
+}
+
+template <typename T>
+static void runTestMatchAll_2() {
+  const int size = 64;
+  T Input[size] = {(T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5,
+                   (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5,
+                   (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5,
+                   (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5,
+                   (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5,
+                   (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5,
+                   (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5,
+                   (T)-5, (T)-5, (T)-5, (T)-500, (T)-5, (T)-5, (T)-5, (T)-5,};
+  unsigned long long Output[size];
+  unsigned long long Expected[size];
+
+  int warpSize = getWarpSize();
+
+  if (warpSize == 32)
+    std::fill_n(Expected, size, -1);
+  else
+    std::fill_n(Expected, size, 0);
+
+  int Predicate[size];
+  int ExpPredicate[size];
+  if (warpSize == 32)
+    std::fill_n(ExpPredicate, size, true);
+  else
+    std::fill_n(ExpPredicate, size, false);
+
+  expandPrecision(Input, size);
+
+  T* d_Input;
+  unsigned long long* d_Output;
+  int *d_Predicate;
+  HIP_CHECK(hipMalloc(&d_Input, sizeof(T) * size));
+  HIP_CHECK(hipMalloc(&d_Output, 8 * size));
+  HIP_CHECK(hipMalloc(&d_Predicate, 4 * size));
+
+  HIP_CHECK(hipMemcpy(d_Input, &Input, sizeof(T) * size, hipMemcpyDefault));
+  hipLaunchKernelGGL(matchAll_2<T>, 1, warpSize, 0, 0, d_Input, d_Output, d_Predicate);
+
+  HIP_CHECK(hipMemcpy(&Output, d_Output, 8 * size, hipMemcpyDefault));
+  for (int i = 0; i != warpSize; ++i) {
+    REQUIRE(compareMaskEqual(Output, Expected, i, warpSize));
+  }
+
+  HIP_CHECK(hipMemcpy(&Predicate, d_Predicate, 4 * size, hipMemcpyDefault));
+  for (int i = 0; i != warpSize; ++i) {
+    REQUIRE(Predicate[i] == ExpPredicate[i]);
+  }
+}
+
+template <typename T>
+__global__ void matchAll_3(T *Input, unsigned long long *Output, int *Predicate) {
+  auto tid = threadIdx.x;
+  // It's okay to use the non-sync__ match, because the purpose of the test is
+  // to exercise the mask argument on the sync version.
+  auto mask = __match_any_sync(AllThreads, tid / 12);
+
+  Output[tid] = __match_all_sync(mask, Input[tid], &Predicate[tid]);
+}
+
+template <typename T>
+static void runTestMatchAll_3() {
+  const int size = 64;
+  T Input[size] = {(T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5,
+                   (T)-5, (T)-5, (T)-5, (T)-5, (T)-500, (T)-5, (T)-5, (T)-5,
+                   (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5,
+                   (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5,
+                   (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-500, (T)-5, (T)-5,
+                   (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5,
+                   (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5,
+                   (T)-5, (T)-5, (T)-5, (T)-500, (T)-5, (T)-5, (T)-5, (T)-5,};
+  unsigned long long Output[size];
+  unsigned long long Expected[size] = {
+    0xfff, 0xfff, 0xfff, 0xfff,
+    0xfff, 0xfff, 0xfff, 0xfff,
+    0xfff, 0xfff, 0xfff, 0xfff,
+    0, 0, 0, 0,
+    0, 0, 0, 0,
+    0, 0, 0, 0,
+    0xfff000000, 0xfff000000, 0xfff000000, 0xfff000000,
+    0xfff000000, 0xfff000000, 0xfff000000, 0xfff000000,
+    0xfff000000, 0xfff000000, 0xfff000000, 0xfff000000,
+    0, 0, 0, 0,
+    0, 0, 0, 0,
+    0, 0, 0, 0,
+    0, 0, 0, 0,
+    0, 0, 0, 0,
+    0, 0, 0, 0,
+    0xf000000000000000, 0xf000000000000000, 0xf000000000000000, 0xf000000000000000
+  };
+
+  int Predicate[size];
+  int ExpPredicate[size] {
+    true, true, true, true,
+    true, true, true, true,
+    true, true, true, true,
+    false, false, false, false,
+    false, false, false, false,
+    false, false, false, false,
+    true, true, true, true,
+    true, true, true, true,
+    true, true, true, true,
+    false, false, false, false,
+    false, false, false, false,
+    false, false, false, false,
+    false, false, false, false,
+    false, false, false, false,
+    false, false, false, false,
+    true, true, true, true
+  };
+
+  expandPrecision(Input, size);
+
+  int warpSize = getWarpSize();
+
+  T* d_Input;
+  unsigned long long* d_Output;
+  int *d_Predicate;
+  HIP_CHECK(hipMalloc(&d_Input, sizeof(T) * size));
+  HIP_CHECK(hipMalloc(&d_Output, 8 * size));
+  HIP_CHECK(hipMalloc(&d_Predicate, 4 * size));
+
+  HIP_CHECK(hipMemcpy(d_Input, &Input, sizeof(T) * size, hipMemcpyDefault));
+  hipLaunchKernelGGL(matchAll_3<T>, 1, warpSize, 0, 0, d_Input, d_Output, d_Predicate);
+
+  HIP_CHECK(hipMemcpy(&Output, d_Output, 8 * size, hipMemcpyDefault));
+  for (int i = 0; i != warpSize; ++i) {
+    REQUIRE(compareMaskEqual(Output, Expected, i, warpSize));
+  }
+
+  HIP_CHECK(hipMemcpy(&Predicate, d_Predicate, 4 * size, hipMemcpyDefault));
+  for (int i = 0; i != warpSize; ++i) {
+    REQUIRE(Predicate[i] == ExpPredicate[i]);
+  }
+}
+
+/**
+ * @addtogroup __match_sync
+ * @{
+ * @ingroup MatchSyncTest
+ * `unsigned long long __match_all_sync(unsigned long long mask, T value, int *pred)` -
+ * Contains warp __match sync functions.
+ * @}
+ */
+
+/**
+ * Test Description
+ * ------------------------
+ * - Test case to verify __match_sync warp functions for different datatypes.
+
+ * Test source
+ * ------------------------
+ *    - catch/unit/kernel/hipMatchSyncAllTests.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.6
+ */
+
+TEST_CASE("Unit_hipMatchSync_All") {
+  SECTION("run test for int") {
+    runTestMatchAll_1<int>();
+    runTestMatchAll_2<int>();
+    runTestMatchAll_3<int>();
+  }
+  SECTION("run test for unsigned int") {
+    runTestMatchAll_1<unsigned int>();
+    runTestMatchAll_2<unsigned int>();
+    runTestMatchAll_3<unsigned int>();
+  }
+  SECTION("run test for long") {
+    runTestMatchAll_1<long>();
+    runTestMatchAll_2<long>();
+    runTestMatchAll_3<long>();
+  }
+  SECTION("run test for unsigned long") {
+    runTestMatchAll_1<unsigned long>();
+    runTestMatchAll_2<unsigned long>();
+    runTestMatchAll_3<unsigned long>();
+  }
+  SECTION("run test for long long") {
+    runTestMatchAll_1<long long>();
+    runTestMatchAll_2<long long>();
+    runTestMatchAll_3<long long>();
+  }
+  SECTION("run test for unsigned long long") {
+    runTestMatchAll_1<unsigned long long>();
+    runTestMatchAll_2<unsigned long long>();
+    runTestMatchAll_3<unsigned long long>();
+  }
+  SECTION("run test for float") {
+    runTestMatchAll_1<float>();
+    runTestMatchAll_2<float>();
+    runTestMatchAll_3<float>();
+  }
+  SECTION("run test for double") {
+    runTestMatchAll_1<double>();
+    runTestMatchAll_2<double>();
+    runTestMatchAll_3<double>();
+  }
+}
@@ -0,0 +1,196 @@
+/*
+Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "warp_common.hh"
+#include <hip_test_common.hh>
+
+template <typename T>
+__global__ void matchAny_1(T *Input, unsigned long long *Output) {
+  auto tid = threadIdx.x;
+  Output[tid] = __match_any_sync(AllThreads, Input[tid]);
+}
+
+template <typename T>
+static void runTestMatchAny_1() {
+  const int size = 64;
+  T Input[size] = {0, 1, (T)-2, (T)-3, 4, 5, 6, (T)-7,
+                   (T)-3, 4, 5, 6, (T)-7, 0, 1, (T)-2,
+                   4, 5, 6, (T)-7, 0, 1, (T)-2, (T)-3,
+                   6, (T)-7, 0, 1, (T)-2, (T)-3, 4, 5,
+                   0, 1, (T)-2, (T)-3, 4, 5, 6, (T)-7,
+                   (T)-3, 4, 5, 6, (T)-7, 0, 1, (T)-2,
+                   4, 5, 6, (T)-7, 0, 1, (T)-2, (T)-3,
+                   6, (T)-7, 0, 1, (T)-2, (T)-3, 4, 5};
+  unsigned long long Output[size];
+  unsigned long long Expected[size] = {
+    0x0410200104102001, 0x0820400208204002, 0x1040800410408004, 0x2080010820800108,
+    0x4001021040010210, 0x8002042080020420, 0x0104084001040840, 0x0208108002081080,
+    0x2080010820800108, 0x4001021040010210, 0x8002042080020420, 0x0104084001040840,
+    0x0208108002081080, 0x0410200104102001, 0x0820400208204002, 0x1040800410408004,
+    0x4001021040010210, 0x8002042080020420, 0x0104084001040840, 0x0208108002081080,
+    0x0410200104102001, 0x0820400208204002, 0x1040800410408004, 0x2080010820800108,
+    0x0104084001040840, 0x0208108002081080, 0x0410200104102001, 0x0820400208204002,
+    0x1040800410408004, 0x2080010820800108, 0x4001021040010210, 0x8002042080020420,
+    0x0410200104102001, 0x0820400208204002, 0x1040800410408004, 0x2080010820800108,
+    0x4001021040010210, 0x8002042080020420, 0x0104084001040840, 0x0208108002081080,
+    0x2080010820800108, 0x4001021040010210, 0x8002042080020420, 0x0104084001040840,
+    0x0208108002081080, 0x0410200104102001, 0x0820400208204002, 0x1040800410408004,
+    0x4001021040010210, 0x8002042080020420, 0x0104084001040840, 0x0208108002081080,
+    0x0410200104102001, 0x0820400208204002, 0x1040800410408004, 0x2080010820800108,
+    0x0104084001040840, 0x0208108002081080, 0x0410200104102001, 0x0820400208204002,
+    0x1040800410408004, 0x2080010820800108, 0x4001021040010210, 0x8002042080020420
+  };
+
+  expandPrecision(Input, size);
+
+  int warpSize = getWarpSize();
+
+  T* d_Input;
+  unsigned long long* d_Output;
+  HIP_CHECK(hipMalloc(&d_Input, sizeof(T) * size));
+  HIP_CHECK(hipMalloc(&d_Output, 8 * size));
+
+  HIP_CHECK(hipMemcpy(d_Input, &Input, sizeof(T) * size, hipMemcpyDefault));
+  hipLaunchKernelGGL(matchAny_1<T>, 1, warpSize, 0, 0, d_Input, d_Output);
+
+  HIP_CHECK(hipMemcpy(&Output, d_Output, 8 * size, hipMemcpyDefault));
+  for (int i = 0; i != warpSize; ++i) {
+    REQUIRE(compareMaskEqual(Output, Expected, i, warpSize));
+  }
+}
+
+template <typename T>
+__global__ void matchAny_2(T *Input, unsigned long long *Output) {
+  auto tid = threadIdx.x;
+  // It's okay to use the non-sync__ match, because the purpose of the test is
+  // to exercise the mask argument on the sync version.
+  auto mask = __match_any_sync(AllThreads, tid / 12);
+  Output[tid] = __match_any_sync(mask, Input[tid]);
+}
+
+template <typename T>
+static void runTestMatchAny_2() {
+  const int size = 64;
+  T Input[size] = {0, 1, (T)-2, (T)-3, 4, 5, 6, (T)-7,
+                   (T)-3, 4, 5, 6, (T)-7, 0, 1, (T)-2,
+                   4, 5, 6, (T)-7, 0, 1, (T)-2, (T)-3,
+                   6, (T)-7, 0, 1, (T)-2, (T)-3, 4, 5,
+                   0, 1, (T)-2, (T)-3, 4, 5, 6, (T)-7,
+                   (T)-3, 4, 5, 6, (T)-7, 0, 1, (T)-2,
+                   4, 5, 6, (T)-7, 0, 1, (T)-2, (T)-3,
+                   6, (T)-7, 0, 1, (T)-2, (T)-3, 4, 5};
+  unsigned long long Output[size];
+  unsigned long long Expected[size] = {
+    0x0000000000000001, 0x0000000000000002, 0x0000000000000004, 0x0000000000000108,
+    0x0000000000000210, 0x0000000000000420, 0x0000000000000840, 0x0000000000000080,
+    0x0000000000000108, 0x0000000000000210, 0x0000000000000420, 0x0000000000000840,
+
+    0x0000000000081000, 0x0000000000102000, 0x0000000000204000, 0x0000000000408000,
+    0x0000000000010000, 0x0000000000020000, 0x0000000000040000, 0x0000000000081000,
+    0x0000000000102000, 0x0000000000204000, 0x0000000000408000, 0x0000000000800000,
+
+    0x0000000001000000, 0x0000000002000000, 0x0000000104000000, 0x0000000208000000,
+    0x0000000410000000, 0x0000000820000000, 0x0000000040000000, 0x0000000080000000,
+    0x0000000104000000, 0x0000000208000000, 0x0000000410000000, 0x0000000820000000,
+
+    0x0000021000000000, 0x0000042000000000, 0x0000084000000000, 0x0000108000000000,
+    0x0000010000000000, 0x0000021000000000, 0x0000042000000000, 0x0000084000000000,
+    0x0000108000000000, 0x0000200000000000, 0x0000400000000000, 0x0000800000000000,
+
+    0x0001000000000000, 0x0002000000000000, 0x0104000000000000, 0x0208000000000000,
+    0x0410000000000000, 0x0820000000000000, 0x0040000000000000, 0x0080000000000000,
+    0x0104000000000000, 0x0208000000000000, 0x0410000000000000, 0x0820000000000000,
+
+    0x1000000000000000, 0x2000000000000000, 0x4000000000000000, 0x8000000000000000
+  };
+
+  expandPrecision(Input, size);
+
+  int warpSize = getWarpSize();
+
+  T* d_Input;
+  unsigned long long* d_Output;
+  HIP_CHECK(hipMalloc(&d_Input, sizeof(T) * size));
+  HIP_CHECK(hipMalloc(&d_Output, 8 * size));
+
+  HIP_CHECK(hipMemcpy(d_Input, &Input, sizeof(T) * size, hipMemcpyDefault));
+  hipLaunchKernelGGL(matchAny_2<T>, 1, warpSize, 0, 0, d_Input, d_Output);
+
+  HIP_CHECK(hipMemcpy(&Output, d_Output, 8 * size, hipMemcpyDefault));
+  for (int i = 0; i != warpSize; ++i) {
+    REQUIRE(compareMaskEqual(Output, Expected, i, warpSize));
+  }
+}
+
+/**
+ * @addtogroup __match_sync
+ * @{
+ * @ingroup MatchSyncTest
+ * `unsigned long long __match_any_sync(unsigned long long mask, T value, int *pred)` -
+ * Contains warp __match sync functions.
+ * @}
+ */
+
+/**
+ * Test Description
+ * ------------------------
+ * - Test case to verify __match_sync warp functions for different datatypes.
+
+ * Test source
+ * ------------------------
+ *    - catch/unit/kernel/hipMatchSyncAnyTests.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.6
+ */
+
+TEST_CASE("Unit_hipMatchSync_Any") {
+  SECTION("run test for int") {
+    runTestMatchAny_1<int>();
+    runTestMatchAny_2<int>();
+  }
+  SECTION("run test for unsigned int") {
+    runTestMatchAny_1<unsigned int>();
+    runTestMatchAny_2<unsigned int>();
+  }
+  SECTION("run test for long") {
+    runTestMatchAny_1<long>();
+    runTestMatchAny_2<long>();
+  }
+  SECTION("run test for unsigned long") {
+    runTestMatchAny_1<unsigned long>();
+    runTestMatchAny_2<unsigned long>();
+  }
+  SECTION("run test for long long") {
+    runTestMatchAny_1<long long>();
+    runTestMatchAny_2<long long>();
+  }
+  SECTION("run test for unsigned long long") {
+    runTestMatchAny_1<unsigned long long>();
+    runTestMatchAny_2<unsigned long long>();
+  }
+  SECTION("run test for float") {
+    runTestMatchAny_1<float>();
+    runTestMatchAny_2<float>();
+  }
+  SECTION("run test for double") {
+    runTestMatchAny_1<double>();
+    runTestMatchAny_2<double>();
+  }
+}
@@ -0,0 +1,251 @@
+/*
+Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "warp_common.hh"
+#include <hip_test_common.hh>
+
+// For all threads in the warp, shfl the value "down" by three threads. To
+// account for the end of the warp, we set the delta to zero near the warp-32
+// boundary. This also works for warp-64 since it is a multiple.
+template <typename T>
+__global__ void shflDown_1(T* Input, T *Output) {
+  auto tid = threadIdx.x;
+  int srcLane = (tid % 32 + 3 < 32) ? 3 : 0;
+  Output[tid] = __shfl_down_sync(AllThreads, Input[tid], srcLane);
+}
+
+template <typename T>
+static void runTestShflDown_1() {
+  const int size = 64;
+  T Input[size];
+  T Output[size];
+  int Values[size] = {3, 4, 5, -6, 7, 8, -9, 10,
+                      11, 12, 13, -14, 15, 16, 17, -18,
+                      19, 20, -21, 22, 23, 24, 25, 26,
+                      -27, 28, 29, 30, 31, 29, 30, 31,
+                      35, -36, 37, 38, -39, 40, 41, 42,
+                      43, -44, -45, 46, 47, 48, 49, 50,
+                      -51, 52, 53, -54, 55, 56, 57, -58,
+                      59, 60, 61, 62, -63, 61, 62, -63};
+  T Expected[size];
+
+  initializeInput(Input, size);
+  initializeExpected(Expected, Values, size);
+
+  int warpSize = getWarpSize();
+
+  T* d_Input;
+  T* d_Output;
+  HIP_CHECK(hipMalloc(&d_Input, sizeof(T) * size));
+  HIP_CHECK(hipMalloc(&d_Output, sizeof(T) * size));
+
+  HIP_CHECK(hipMemcpy(d_Input, &Input, sizeof(T) * size, hipMemcpyDefault));
+  hipLaunchKernelGGL(shflDown_1<T>, 1, warpSize, 0, 0, d_Input, d_Output);
+
+  HIP_CHECK(hipMemcpy(&Output, d_Output, sizeof(T) * size, hipMemcpyDefault));
+  for (int i = 0; i != warpSize; ++i) {
+    REQUIRE(compareEqual(Output[i], Expected[i]));
+  }
+}
+
+// Use the mask argument to divide the warp into groups of 12 threads, and then
+// shfl "down" by three threads. Account for the boundary within a group as well
+// as withing a warp-32.
+
+template <typename T>
+__global__ void shflDown_2(T* Input, T *Output) {
+  auto tid = threadIdx.x;
+  auto mask = __match_any_sync(AllThreads, tid / 12);
+  int srcLane = ((tid % 32 + 3 >= 32) || (tid % 12 + 3 >= 12)) ? 0 : 3;
+  Output[tid] = __shfl_down_sync(mask, Input[tid], srcLane);
+}
+
+template <typename T>
+static void runTestShflDown_2() {
+  const int size = 64;
+  T Input[size];
+  T Output[size];
+  int Values[size] = {3, 4, 5, -6, 7, 8, -9, 10,
+                      11, -9, 10, 11, 15, 16, 17, -18,
+                      19, 20, -21, 22, 23, -21, 22, 23,
+                      -27, 28, 29, 30, 31, 29, 30, 31,
+                      35, 33, 34, 35, -39, 40, 41, 42,
+                      43, -44, -45, 46, 47, -45, 46, 47,
+                      -51, 52, 53, -54, 55, 56, 57, -58,
+                      59, 57, -58, 59, -63, 61, 62, -63};
+  T Expected[size];
+
+  initializeInput(Input, size);
+  initializeExpected(Expected, Values, size);
+
+  int warpSize = getWarpSize();
+
+  T* d_Input;
+  T* d_Output;
+  HIP_CHECK(hipMalloc(&d_Input, sizeof(T) * size));
+  HIP_CHECK(hipMalloc(&d_Output, sizeof(T) * size));
+
+  HIP_CHECK(hipMemcpy(d_Input, &Input, sizeof(T) * size, hipMemcpyDefault));
+  hipLaunchKernelGGL(shflDown_2<T>, 1, warpSize, 0, 0, d_Input, d_Output);
+
+  HIP_CHECK(hipMemcpy(&Output, d_Output, sizeof(T) * size, hipMemcpyDefault));
+  for (int i = 0; i != warpSize; ++i) {
+    REQUIRE(compareEqual(Output[i], Expected[i]));
+  }
+}
+
+template <typename T>
+__global__ void shflDown_3(T* Input, T *Output) {
+  auto tid = threadIdx.x;
+  auto mask = __match_any_sync(AllThreads, tid / 12);
+  int srcLane = ((tid % 12 + 3 >= 12) || (tid % 8 + 3 >= 8)) ? 0 : 3;
+  Output[tid] = __shfl_down_sync(mask, Input[tid], srcLane, 8);
+}
+
+template <typename T>
+static void runTestShflDown_3() {
+  const int size = 64;
+  T Input[size];
+  T Output[size];
+  int Values[size] = {3, 4, 5, -6, 7,
+                      5, -6, 7,                        // cannot cross 8
+                      11, -9, 10, 11,              // cannot cross 12
+                      15,
+                      13, -14, 15,                 // cannot cross 8
+                      19, 20, -21, 22,
+                      23, -21, 22, 23,             // canot cross 12
+                      // pattern repeats
+                      -27, 28, 29, 30, 31,
+                      29, 30, 31,
+                      35, 33, 34, 35,
+                      -39,
+                      37, 38, -39,
+                      43, -44, -45, 46,
+                      47, -45, 46, 47,
+                      // pattern repeats
+                      -51, 52, 53, -54, 55,
+                      53, -54, 55,
+                      59, 57, -58, 59,
+                      -63,
+                      61, 62, -63};
+  T Expected[size];
+
+  initializeInput(Input, size);
+  initializeExpected(Expected, Values, size);
+
+  int warpSize = getWarpSize();
+
+  T* d_Input;
+  T* d_Output;
+  HIP_CHECK(hipMalloc(&d_Input, sizeof(T) * size));
+  HIP_CHECK(hipMalloc(&d_Output, sizeof(T) * size));
+
+  HIP_CHECK(hipMemcpy(d_Input, &Input, sizeof(T) * size, hipMemcpyDefault));
+  hipLaunchKernelGGL(shflDown_3<T>, 1, warpSize, 0, 0, d_Input, d_Output);
+
+  HIP_CHECK(hipMemcpy(&Output, d_Output, sizeof(T) * size, hipMemcpyDefault));
+  for (int i = 0; i != warpSize; ++i) {
+    REQUIRE(compareEqual(Output[i], Expected[i]));
+  }
+}
+
+/**
+ * @addtogroup __shfl_sync
+ * @{
+ * @ingroup ShflSyncTest
+ * `T  __shfl_down_sync(unsigned long long mask, T var, int delta, int width=warpSize)` -
+ * Contains warp __shfl sync functions.
+ * @}
+ */
+
+/**
+ * Test Description
+ * ------------------------
+ * - Test case to verify __shfl_down_sync warp functions for different datatypes.
+
+ * Test source
+ * ------------------------
+ *    - catch/unit/kernel/hipShflSyncDownTests.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.6
+ */
+
+TEST_CASE("Unit_hipShflSync_Down") {
+  SECTION("run test for short") {
+    runTestShflDown_1<short>();
+    runTestShflDown_2<short>();
+    runTestShflDown_3<short>();
+  }
+  SECTION("run test for unsigned short") {
+    runTestShflDown_1<unsigned short>();
+    runTestShflDown_2<unsigned short>();
+    runTestShflDown_3<unsigned short>();
+  }
+  SECTION("run test for int") {
+    runTestShflDown_1<int>();
+    runTestShflDown_2<int>();
+    runTestShflDown_3<int>();
+  }
+  SECTION("run test for unsigned int") {
+    runTestShflDown_1<unsigned int>();
+    runTestShflDown_2<unsigned int>();
+    runTestShflDown_3<unsigned int>();
+  }
+  SECTION("run test for long") {
+    runTestShflDown_1<long>();
+    runTestShflDown_2<long>();
+    runTestShflDown_3<long>();
+  }
+  SECTION("run test for unsigned long") {
+    runTestShflDown_1<unsigned long>();
+    runTestShflDown_2<unsigned long>();
+    runTestShflDown_3<unsigned long>();
+  }
+  SECTION("run test for long long") {
+    runTestShflDown_1<long long>();
+    runTestShflDown_2<long long>();
+    runTestShflDown_3<long long>();
+  }
+  SECTION("run test for unsigned long long") {
+    runTestShflDown_1<unsigned long long>();
+    runTestShflDown_2<unsigned long long>();
+    runTestShflDown_3<unsigned long long>();
+  }
+  SECTION("run test for float") {
+    runTestShflDown_1<float>();
+    runTestShflDown_2<float>();
+    runTestShflDown_3<float>();
+  }
+  SECTION("run test for double") {
+    runTestShflDown_1<double>();
+    runTestShflDown_2<double>();
+    runTestShflDown_3<double>();
+  }
+  SECTION("run test for __half") {
+    runTestShflDown_1<__half>();
+    runTestShflDown_2<__half>();
+    runTestShflDown_3<__half>();
+  }
+  SECTION("run test for __half2") {
+    runTestShflDown_1<__half2>();
+    runTestShflDown_2<__half2>();
+    runTestShflDown_3<__half2>();
+  }
+}
@@ -0,0 +1,176 @@
+/*
+Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "warp_common.hh"
+#include <hip_test_common.hh>
+
+template <typename T>
+__global__ void shfl_1(T *Input, T *Output) {
+  int tid = threadIdx.x;
+  // Creates groups consisting of every fourth thread.
+  auto mask = __match_any_sync(AllThreads, tid % 4);
+  int srcLane = tid % 4;
+
+  // Each group reads from the first active thread within that group.
+  Output[tid] = __shfl_sync(mask, Input[tid], srcLane);
+}
+
+template <typename T>
+static void runTestShfl_1() {
+  const int size = 64;
+  T Input[size];
+  T Output[size];
+  T Expected[size];
+  int Values[size] = {0, -1, 2, 3, 0, -1, 2, 3,
+                      0, -1, 2, 3, 0, -1, 2, 3,
+                      0, -1, 2, 3, 0, -1, 2, 3,
+                      0, -1, 2, 3, 0, -1, 2, 3,
+                      0, -1, 2, 3, 0, -1, 2, 3,
+                      0, -1, 2, 3, 0, -1, 2, 3,
+                      0, -1, 2, 3, 0, -1, 2, 3,
+                      0, -1, 2, 3, 0, -1, 2, 3};
+
+  initializeInput(Input, size);
+  initializeExpected(Expected, Values, size);
+
+  int warpSize = getWarpSize();
+
+  T* d_Input;
+  T* d_Output;
+  HIP_CHECK(hipMalloc(&d_Input, sizeof(T) * size));
+  HIP_CHECK(hipMalloc(&d_Output, sizeof(T) * size));
+
+  HIP_CHECK(hipMemcpy(d_Input, &Input, sizeof(T) * size, hipMemcpyDefault));
+  hipLaunchKernelGGL(shfl_1<T>, 1, warpSize, 0, 0, d_Input, d_Output);
+
+  HIP_CHECK(hipMemcpy(&Output, d_Output, sizeof(T) * size, hipMemcpyDefault));
+  for (int i = 0; i != warpSize; ++i) {
+    REQUIRE(compareEqual(Output[i], Expected[i]));
+  }
+}
+
+template <typename T>
+__global__ void shfl_2(T *Input, T *Output) {
+  int tid = threadIdx.x;
+  auto mask = __match_any_sync(AllThreads, tid % 4);
+  int srcLane = tid % 4;
+
+  // Each subgroup of eight reads from the first active thread within that
+  // subgroup.
+  Output[tid] = __shfl_sync(mask, Input[tid], srcLane, 8);
+}
+
+template <typename T>
+static void runTestShfl_2() {
+  const int size = 64;
+  T Input[size];
+  T Output[size];
+  T Expected[size];
+  int Values[size] = {0, -1, 2, 3, 0, -1, 2, 3,
+                      8, -9, 10, 11, 8, -9, 10, 11,
+                      16, 17, -18, 19, 16, 17, -18, 19,
+                      24, 25, 26, -27, 24, 25, 26, -27,
+                      -32, 33, 34, 35, -32, 33, 34, 35,
+                      40, 41, 42, 43, 40, 41, 42, 43,
+                      48, 49, 50, -51, 48, 49, 50, -51,
+                      56, 57, -58, 59, 56, 57, -58, 59};
+
+  initializeInput(Input, size);
+  initializeExpected(Expected, Values, size);
+
+  int warpSize = getWarpSize();
+
+  T* d_Input;
+  T* d_Output;
+  HIP_CHECK(hipMalloc(&d_Input, sizeof(T) * size));
+  HIP_CHECK(hipMalloc(&d_Output, sizeof(T) * size));
+
+  HIP_CHECK(hipMemcpy(d_Input, &Input, sizeof(T) * size, hipMemcpyDefault));
+  hipLaunchKernelGGL(shfl_2<T>, 1, warpSize, 0, 0, d_Input, d_Output);
+
+  HIP_CHECK(hipMemcpy(&Output, d_Output, sizeof(T) * size, hipMemcpyDefault));
+  for (int i = 0; i != warpSize; ++i) {
+    REQUIRE(compareEqual(Output[i], Expected[i]));
+  }
+}
+
+/**
+ * @addtogroup __shfl_sync
+ * @{
+ * @ingroup ShflSyncTest
+ * `T  __shfl_sync(unsigned long long mask, T var, int srcLane, int width=warpSize)` -
+ * Contains warp __shfl sync functions.
+ * @}
+ */
+
+/**
+ * Test Description
+ * ------------------------
+ * - Test case to verify __shfl_sync warp functions for different datatypes.
+
+ * Test source
+ * ------------------------
+ *    - catch/unit/kernel/hipShflSyncTests.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.6
+ */
+
+TEST_CASE("Unit_hipShflSync") {
+  SECTION("run test for short") {
+    runTestShfl_1<short>();
+    runTestShfl_2<short>();
+  }
+  SECTION("run test for unsigned short") {
+    runTestShfl_1<unsigned short>();
+    runTestShfl_2<unsigned short>();
+  }
+  SECTION("run test for int") {
+    runTestShfl_1<int>();
+    runTestShfl_2<int>();
+  }
+  SECTION("run test for unsigned int") {
+    runTestShfl_1<unsigned int>();
+    runTestShfl_2<unsigned int>();
+  }
+  SECTION("run test for long") {
+    runTestShfl_1<long>();
+    runTestShfl_2<long>();
+  }
+  SECTION("run test for unsigned long") {
+    runTestShfl_1<unsigned long>();
+    runTestShfl_2<unsigned long>();
+  }
+  SECTION("run test for long long") {
+    runTestShfl_1<long long>();
+    runTestShfl_2<long long>();
+  }
+  SECTION("run test for unsigned long long") {
+    runTestShfl_1<unsigned long long>();
+    runTestShfl_2<unsigned long long>();
+  }
+  SECTION("run test for float") {
+    runTestShfl_1<float>();
+    runTestShfl_2<float>();
+  }
+  SECTION("run test for double") {
+    runTestShfl_1<double>();
+    runTestShfl_2<double>();
+  }
+}
@@ -0,0 +1,243 @@
+/*
+Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "warp_common.hh"
+#include <hip_test_common.hh>
+
+template <typename T>
+__global__ void shflUp_1(T* Input, T *Output) {
+  auto tid = threadIdx.x;
+  int srcLane = (tid > 3) ? 3 : 0;
+  Output[tid] = __shfl_up_sync(AllThreads, Input[tid], srcLane);
+}
+
+template <typename T>
+static void runTestShflUp_1() {
+  const int size = 64;
+  T Input[size];
+  T Output[size];
+  T Expected[size];
+  int Values[] = {0, -1, 2, 3, -1, 2, 3, 4, 5, -6, 7, 8,
+                  -9, 10, 11, 12, 13, -14, 15, 16, 17, -18, 19, 20,
+                  -21, 22, 23, 24, 25, 26, -27, 28, 29, 30, 31, -32,
+                  33, 34, 35, -36, 37, 38, -39, 40, 41, 42, 43, -44,
+                  -45, 46, 47, 48, 49, 50, -51, 52, 53, -54, 55, 56,
+                  57, -58, 59, 60};
+
+  initializeInput(Input, size);
+  initializeExpected(Expected, Values, size);
+
+  int warpSize = getWarpSize();
+
+  T* d_Input;
+  T* d_Output;
+  HIP_CHECK(hipMalloc(&d_Input, sizeof(T) * size));
+  HIP_CHECK(hipMalloc(&d_Output, sizeof(T) * size));
+
+  HIP_CHECK(hipMemcpy(d_Input, &Input, sizeof(T) * size, hipMemcpyDefault));
+  hipLaunchKernelGGL(shflUp_1<T>, 1, warpSize, 0, 0, d_Input, d_Output);
+
+  HIP_CHECK(hipMemcpy(&Output, d_Output, sizeof(T) * size, hipMemcpyDefault));
+  for (int i = 0; i != warpSize; ++i) {
+    REQUIRE(compareEqual(Output[i], Expected[i]));
+  }
+}
+
+template <typename T>
+__global__ void shflUp_2(T* Input, T *Output) {
+  auto tid = threadIdx.x;
+  auto mask = __match_any_sync(AllThreads, tid / 12);
+  int srcLane = (tid % 12) < 3 ? 0 : 3;
+  Output[tid] = __shfl_up_sync(mask, Input[tid], srcLane);
+}
+
+template <typename T>
+static void runTestShflUp_2() {
+  const int size = 64;
+  T Input[size];
+  T Output[size];
+  T Expected[size];
+  int Values[size] = {0, -1, 2, 0, -1, 2, 3, 4, 5, -6, 7, 8,
+                      12, 13, -14, 12, 13, -14, 15, 16, 17, -18, 19, 20,
+                      24, 25, 26, 24, 25, 26, -27, 28, 29, 30, 31, -32,
+                      -36, 37, 38, -36, 37, 38, -39, 40, 41, 42, 43, -44,
+                      48, 49, 50, 48, 49, 50, -51, 52, 53, -54, 55, 56,
+                      60, 61, 62, 60};
+
+  initializeInput(Input, size);
+  initializeExpected(Expected, Values, size);
+
+  int warpSize = getWarpSize();
+
+  T* d_Input;
+  T* d_Output;
+  HIP_CHECK(hipMalloc(&d_Input, sizeof(T) * size));
+  HIP_CHECK(hipMalloc(&d_Output, sizeof(T) * size));
+
+  HIP_CHECK(hipMemcpy(d_Input, &Input, sizeof(T) * size, hipMemcpyDefault));
+  hipLaunchKernelGGL(shflUp_2<T>, 1, warpSize, 0, 0, d_Input, d_Output);
+
+  HIP_CHECK(hipMemcpy(&Output, d_Output, sizeof(T) * size, hipMemcpyDefault));
+  for (int i = 0; i != warpSize; ++i) {
+    REQUIRE(compareEqual(Output[i], Expected[i]));
+  }
+}
+
+template <typename T>
+__global__ void shflUp_3(T* Input, T *Output) {
+  auto tid = threadIdx.x;
+  auto mask = __match_any_sync(AllThreads, tid / 12);
+  int srcLane = (tid % 12) < 3 ? 0 : 3;
+  Output[tid] = __shfl_up_sync(mask, Input[tid], srcLane, 8);
+}
+
+template <typename T>
+static void runTestShflUp_3() {
+  const int size = 64;
+  T Input[size];
+  T Output[size];
+  T Expected[size];
+  int Values[size] = {0, -1, 2,                           // cannot cross mod-12
+                      0, -1, 2, 3, 4,
+                      8, -9, 10,                      // cannot cross mod-8
+                      8,
+                      12, 13, -14,                    // cannot cross mod-12
+                      12,
+                      16, 17, -18,                    // cannot cross mod-8
+                      16, 17, -18, 19, 20,
+                      // pattern repeats
+                      24, 25, 26,
+                      24, 25, 26, -27, 28,
+                      -32, 33, 34,
+                      -32,
+                      -36, 37, 38,
+                      -36,
+                      40, 41, 42,
+                      40, 41, 42, 43, -44,
+                      // pattern repeats
+                      48, 49, 50,
+                      48, 49, 50, -51, 52,
+                      56, 57, -58,
+                      56,
+                      60, 61, 62,
+                      60};
+
+  initializeInput(Input, size);
+  initializeExpected(Expected, Values, size);
+
+  int warpSize = getWarpSize();
+
+  T* d_Input;
+  T* d_Output;
+  HIP_CHECK(hipMalloc(&d_Input, sizeof(T) * size));
+  HIP_CHECK(hipMalloc(&d_Output, sizeof(T) * size));
+
+  HIP_CHECK(hipMemcpy(d_Input, &Input, sizeof(T) * size, hipMemcpyDefault));
+  hipLaunchKernelGGL(shflUp_3<T>, 1, warpSize, 0, 0, d_Input, d_Output);
+
+  HIP_CHECK(hipMemcpy(&Output, d_Output, sizeof(T) * size, hipMemcpyDefault));
+  for (int i = 0; i != warpSize; ++i) {
+    REQUIRE(compareEqual(Output[i], Expected[i]));
+  }
+}
+
+/**
+ * @addtogroup __shfl_sync
+ * @{
+ * @ingroup ShflSyncTest
+ * `T  __shfl_up_sync(unsigned long long mask, T var, int delta, int width=warpSize)` -
+ * Contains warp __shfl sync functions.
+ * @}
+ */
+
+/**
+ * Test Description
+ * ------------------------
+ * - Test case to verify __shfl_up_sync warp functions for different datatypes.
+
+ * Test source
+ * ------------------------
+ *    - catch/unit/kernel/hipShflSyncUpTests.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.6
+ */
+
+TEST_CASE("Unit_hipShflSync_Up") {
+  SECTION("run test for short") {
+    runTestShflUp_1<short>();
+    runTestShflUp_2<short>();
+    runTestShflUp_3<short>();
+  }
+  SECTION("run test for unsigned short") {
+    runTestShflUp_1<unsigned short>();
+    runTestShflUp_2<unsigned short>();
+    runTestShflUp_3<unsigned short>();
+  }
+  SECTION("run test for int") {
+    runTestShflUp_1<int>();
+    runTestShflUp_2<int>();
+    runTestShflUp_3<int>();
+  }
+  SECTION("run test for unsigned int") {
+    runTestShflUp_1<unsigned int>();
+    runTestShflUp_2<unsigned int>();
+    runTestShflUp_3<unsigned int>();
+  }
+  SECTION("run test for long") {
+    runTestShflUp_1<long>();
+    runTestShflUp_2<long>();
+    runTestShflUp_3<long>();
+  }
+  SECTION("run test for unsigned long") {
+    runTestShflUp_1<unsigned long>();
+    runTestShflUp_2<unsigned long>();
+    runTestShflUp_3<unsigned long>();
+  }
+  SECTION("run test for long long") {
+    runTestShflUp_1<long long>();
+    runTestShflUp_2<long long>();
+    runTestShflUp_3<long long>();
+  }
+  SECTION("run test for unsigned long long") {
+    runTestShflUp_1<unsigned long long>();
+    runTestShflUp_2<unsigned long long>();
+    runTestShflUp_3<unsigned long long>();
+  }
+  SECTION("run test for float") {
+    runTestShflUp_1<float>();
+    runTestShflUp_2<float>();
+    runTestShflUp_3<float>();
+  }
+  SECTION("run test for double") {
+    runTestShflUp_1<double>();
+    runTestShflUp_2<double>();
+    runTestShflUp_3<double>();
+  }
+  SECTION("run test for __half") {
+    runTestShflUp_1<__half>();
+    runTestShflUp_2<__half>();
+    runTestShflUp_3<__half>();
+  }
+  SECTION("run test for __half2") {
+    runTestShflUp_1<__half2>();
+    runTestShflUp_2<__half2>();
+    runTestShflUp_3<__half2>();
+  }
+}
@@ -0,0 +1,234 @@
+/*
+Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "warp_common.hh"
+#include <hip_test_common.hh>
+
+template <typename T>
+__global__ void shflXor_1(T* Input, T *Output) {
+  auto tid = threadIdx.x;
+  Output[tid] = __shfl_xor_sync(AllThreads, Input[tid], 16);
+}
+
+template <typename T>
+static void runTestShflXor_1() {
+  const int size = 64;
+  T Input[size];
+  T Output[size];
+  int Values[size] = {16, 17, -18, 19, 20, -21, 22, 23,
+                      24, 25, 26, -27, 28, 29, 30, 31,
+                      0, -1, 2, 3, 4, 5, -6, 7,
+                      8, -9, 10, 11, 12, 13, -14, 15,
+                      48, 49, 50, -51, 52, 53, -54, 55,
+                      56, 57, -58, 59, 60, 61, 62, -63,
+                      -32, 33, 34, 35, -36, 37, 38, -39,
+                      40, 41, 42, 43, -44, -45, 46, 47};
+  T Expected[size];
+
+  initializeInput(Input, size);
+  initializeExpected(Expected, Values, size);
+
+  int warpSize = getWarpSize();
+
+  T* d_Input;
+  T* d_Output;
+  HIP_CHECK(hipMalloc(&d_Input, sizeof(T) * size));
+  HIP_CHECK(hipMalloc(&d_Output, sizeof(T) * size));
+
+  HIP_CHECK(hipMemcpy(d_Input, &Input, sizeof(T) * size, hipMemcpyDefault));
+  hipLaunchKernelGGL(shflXor_1<T>, 1, warpSize, 0, 0, d_Input, d_Output);
+
+  HIP_CHECK(hipMemcpy(&Output, d_Output, sizeof(T) * size, hipMemcpyDefault));
+  for (int i = 0; i != warpSize; ++i) {
+    REQUIRE(compareEqual(Output[i], Expected[i]));
+  }
+}
+
+template <typename T>
+__global__ void shflXor_2(T* Input, T *Output) {
+  unsigned tid = threadIdx.x;
+  auto mask = __match_any_sync(AllThreads, tid / 12);
+  int laneMask = 4;
+  int section = tid % 24;
+  if (section > 7 && section < 16)
+    laneMask = 0;
+  Output[tid] = __shfl_xor_sync(mask, Input[tid], laneMask);
+}
+
+template <typename T>
+static void runTestShflXor_2() {
+  const int size = 64;
+  T Input[size];
+  T Output[size];
+  int Values[size] = {4, 5, -6, 7, 0, -1, 2, 3,
+                      8, -9, 10, 11, 12, 13, -14, 15,      // disabled around mid mod-24
+                      20, -21, 22, 23, 16, 17, -18, 19,
+                      28, 29, 30, 31, 24, 25, 26, -27,
+                      -32, 33, 34, 35, -36, 37, 38, -39,   // disabled around mid mod-24
+                      -44, -45, 46, 47, 40, 41, 42, 43,
+                      52, 53, -54, 55, 48, 49, 50, -51,
+                      56, 57, -58, 59, 60, 61, 62, -63};   // disabled around mid mod-24
+  T Expected[size];
+
+  initializeInput(Input, size);
+  initializeExpected(Expected, Values, size);
+
+
+  int warpSize = getWarpSize();
+
+  T* d_Input;
+  T* d_Output;
+  HIP_CHECK(hipMalloc(&d_Input, sizeof(T) * size));
+  HIP_CHECK(hipMalloc(&d_Output, sizeof(T) * size));
+
+  HIP_CHECK(hipMemcpy(d_Input, &Input, sizeof(T) * size, hipMemcpyDefault));
+  hipLaunchKernelGGL(shflXor_2<T>, 1, warpSize, 0, 0, d_Input, d_Output);
+
+  HIP_CHECK(hipMemcpy(&Output, d_Output, sizeof(T) * size, hipMemcpyDefault));
+  for (int i = 0; i != warpSize; ++i) {
+    REQUIRE(compareEqual(Output[i], Expected[i]));
+  }
+}
+
+template <typename T>
+__global__ void shflXor_3(T* Input, T *Output) {
+  auto tid = threadIdx.x;
+  auto mask = __match_any_sync(AllThreads, tid / 16);
+  Output[tid] = __shfl_xor_sync(mask, Input[tid], 4, 8);
+}
+
+template <typename T>
+static void runTestShflXor_3() {
+  const int size = 64;
+  T Input[size];
+  T Output[size];
+  int Values[size] = {4, 5, -6, 7, 0, -1, 2, 3,
+                      12, 13, -14, 15, 8, -9, 10, 11,
+                      20, -21, 22, 23, 16, 17, -18, 19,
+                      28, 29, 30, 31, 24, 25, 26, -27,
+                      -36, 37, 38, -39, -32, 33, 34, 35,
+                      -44, -45, 46, 47, 40, 41, 42, 43,
+                      52, 53, -54, 55, 48, 49, 50, -51,
+                      60, 61, 62, -63, 56, 57, -58, 59};
+  T Expected[size];
+
+  initializeInput(Input, size);
+  initializeExpected(Expected, Values, size);
+
+
+  int warpSize = getWarpSize();
+
+  T* d_Input;
+  T* d_Output;
+  HIP_CHECK(hipMalloc(&d_Input, sizeof(T) * size));
+  HIP_CHECK(hipMalloc(&d_Output, sizeof(T) * size));
+
+  HIP_CHECK(hipMemcpy(d_Input, &Input, sizeof(T) * size, hipMemcpyDefault));
+  hipLaunchKernelGGL(shflXor_3<T>, 1, warpSize, 0, 0, d_Input, d_Output);
+
+  HIP_CHECK(hipMemcpy(&Output, d_Output, sizeof(T) * size, hipMemcpyDefault));
+  for (int i = 0; i != warpSize; ++i) {
+    REQUIRE(compareEqual(Output[i], Expected[i]));
+  }
+}
+
+/**
+ * @addtogroup __shfl_sync
+ * @{
+ * @ingroup ShflSyncTest
+ * `T  __shfl_xor_sync(unsigned long long mask, T var, unsigned laneMask, int width=warpSize)` -
+ * Contains warp __shfl sync functions.
+ * @}
+ */
+
+/**
+ * Test Description
+ * ------------------------
+ * - Test case to verify __shfl_xor_sync warp functions for different datatypes.
+
+ * Test source
+ * ------------------------
+ *    - catch/unit/kernel/hipShflSyncXorTests.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.6
+ */
+
+TEST_CASE("Unit_hipShflSync_Xor") {
+  SECTION("run test for short") {
+    runTestShflXor_1<short>();
+    runTestShflXor_2<short>();
+    runTestShflXor_3<short>();
+  }
+  SECTION("run test for unsigned short") {
+    runTestShflXor_1<unsigned short>();
+    runTestShflXor_2<unsigned short>();
+    runTestShflXor_3<unsigned short>();
+  }
+  SECTION("run test for int") {
+    runTestShflXor_1<int>();
+    runTestShflXor_2<int>();
+    runTestShflXor_3<int>();
+  }
+  SECTION("run test for unsigned int") {
+    runTestShflXor_1<unsigned int>();
+    runTestShflXor_2<unsigned int>();
+    runTestShflXor_3<unsigned int>();
+  }
+  SECTION("run test for long") {
+    runTestShflXor_1<long>();
+    runTestShflXor_2<long>();
+    runTestShflXor_3<long>();
+  }
+  SECTION("run test for unsigned long") {
+    runTestShflXor_1<unsigned long>();
+    runTestShflXor_2<unsigned long>();
+    runTestShflXor_3<unsigned long>();
+  }
+  SECTION("run test for long long") {
+    runTestShflXor_1<long long>();
+    runTestShflXor_2<long long>();
+    runTestShflXor_3<long long>();
+  }
+  SECTION("run test for unsigned long long") {
+    runTestShflXor_1<unsigned long long>();
+    runTestShflXor_2<unsigned long long>();
+    runTestShflXor_3<unsigned long long>();
+  }
+  SECTION("run test for float") {
+    runTestShflXor_1<float>();
+    runTestShflXor_2<float>();
+    runTestShflXor_3<float>();
+  }
+  SECTION("run test for double") {
+    runTestShflXor_1<double>();
+    runTestShflXor_2<double>();
+    runTestShflXor_3<double>();
+  }
+  SECTION("run test for __half") {
+    runTestShflXor_1<__half>();
+    runTestShflXor_2<__half>();
+    runTestShflXor_3<__half>();
+  }
+  SECTION("run test for __half2") {
+    runTestShflXor_1<__half2>();
+    runTestShflXor_2<__half2>();
+    runTestShflXor_3<__half2>();
+  }
+}
@@ -0,0 +1,595 @@
+/*
+Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "warp_common.hh"
+#include <hip_test_common.hh>
+
+__global__ void any_1(int *Input, int *Output) {
+  auto tid = threadIdx.x;
+  Output[tid] = __any_sync(AllThreads, Input[tid]);
+}
+
+static void runTestAny_1() {
+  const int size = 64;
+  int Input[size] = {0, };
+  int Output[size];
+  int Expected[size] = {0, };
+
+  int* d_Input;
+  int* d_Output;
+  HIP_CHECK(hipMalloc(&d_Input, 4 * size));
+  HIP_CHECK(hipMalloc(&d_Output, 4 * size));
+
+  int warpSize = getWarpSize();
+
+  HIP_CHECK(hipMemcpy(d_Input, &Input, 4 * size, hipMemcpyDefault));
+  hipLaunchKernelGGL(any_1, 1, warpSize, 0, 0, d_Input, d_Output);
+
+  HIP_CHECK(hipMemcpy(&Output, d_Output, 4 * size, hipMemcpyDefault));
+  for (int i = 0; i != warpSize; ++i) {
+    REQUIRE(Output[i] == Expected[i]);
+  }
+}
+
+__global__ void any_2(int *Input, int *Output) {
+  auto tid = threadIdx.x;
+  Output[tid] = __any_sync(AllThreads, Input[tid]);
+}
+
+static void runTestAny_2_w64() {
+  const int size = 64;
+  int Input[size] = {0, };
+  int Output[size];
+  int Expected[size] = {0, };
+
+  Input[60] = 1;
+
+  int warpSize = getWarpSize();
+  if (warpSize == 64)
+    std::fill_n(Expected, size, 1);
+
+  int* d_Input;
+  int* d_Output;
+  HIP_CHECK(hipMalloc(&d_Input, 4 * size));
+  HIP_CHECK(hipMalloc(&d_Output, 4 * size));
+
+  HIP_CHECK(hipMemcpy(d_Input, &Input, 4 * size, hipMemcpyDefault));
+  hipLaunchKernelGGL(any_2, 1, warpSize, 0, 0, d_Input, d_Output);
+
+  HIP_CHECK(hipMemcpy(&Output, d_Output, 4 * size, hipMemcpyDefault));
+  for (int i = 0; i != warpSize; ++i) {
+    REQUIRE(Output[i] == Expected[i]);
+  }
+}
+
+static void runTestAny_2_w32() {
+  const int size = 64;
+  int Input[size] = {0, };
+  int Output[size];
+  int Expected[size] = {0, };
+
+  Input[30] = 1;
+  std::fill_n(Expected, size, 1);
+
+  int* d_Input;
+  int* d_Output;
+  HIP_CHECK(hipMalloc(&d_Input, 4 * size));
+  HIP_CHECK(hipMalloc(&d_Output, 4 * size));
+
+  int warpSize = getWarpSize();
+
+  HIP_CHECK(hipMemcpy(d_Input, &Input, 4 * size, hipMemcpyDefault));
+  hipLaunchKernelGGL(any_2, 1, warpSize, 0, 0, d_Input, d_Output);
+
+  HIP_CHECK(hipMemcpy(&Output, d_Output, 4 * size, hipMemcpyDefault));
+  for (int i = 0; i != warpSize; ++i) {
+    REQUIRE(Output[i] == Expected[i]);
+  }
+}
+
+__global__ void any_3(int *Input, int *Output) {
+  auto tid = threadIdx.x;
+  auto mask = __match_any_sync(AllThreads, tid/12);
+  Output[tid] = __any_sync(mask, Input[tid]);
+}
+
+static void runTestAny_3() {
+  const int size = 64;
+  int Input[size] = {0, 0, 0, 0, 0, 0, 0, 0,
+                     0, 0, 0, 0, 0, 0, 1, 0,
+                     0, 0, 0, 0, 0, 0, 0, 0,
+                     0, 0, 1, 0, 0, 0, 0, 0,
+                     0, 0, 0, 0, 0, 0, 0, 0,
+                     0, 0, 0, 0, 0, 0, 0, 0,
+                     0, 0, 1, 0, 0, 0, 0, 0,
+                     0, 0, 0, 0, 0, 1, 0, 0};
+
+  int Output[size];
+  int Expected[size] = {0, 0, 0, 0, 0, 0, 0, 0,
+                        0, 0, 0, 0,
+                        1, 1, 1, 1, 1, 1, 1, 1,
+                        1, 1, 1, 1,
+                        1, 1, 1, 1, 1, 1, 1, 1,
+                        1, 1, 1, 1,
+                        0, 0, 0, 0, 0, 0, 0, 0,
+                        0, 0, 0, 0,
+                        1, 1, 1, 1, 1, 1, 1, 1,
+                        1, 1, 1, 1,
+                        1, 1, 1, 1};
+  int* d_Input;
+  int* d_Output;
+  HIP_CHECK(hipMalloc(&d_Input, 4 * size));
+  HIP_CHECK(hipMalloc(&d_Output, 4 * size));
+
+  int warpSize = getWarpSize();
+
+  HIP_CHECK(hipMemcpy(d_Input, &Input, 4 * size, hipMemcpyDefault));
+  hipLaunchKernelGGL(any_3, 1, warpSize, 0, 0, d_Input, d_Output);
+
+  HIP_CHECK(hipMemcpy(&Output, d_Output, 4 * size, hipMemcpyDefault));
+  for (int i = 0; i != warpSize; ++i) {
+    REQUIRE(Output[i] == Expected[i]);
+  }
+}
+
+__global__ void any_4(int *Input, int *Output) {
+  auto tid = threadIdx.x;
+  unsigned long long masks[2] = { Every5thBut9th, Every9thBit };
+
+  Output[tid] = -1;
+  if (tid % 5 == 0 || tid % 9 == 0)
+    Output[tid] = __any_sync(masks[tid % 9 == 0], Input[tid]);
+}
+
+static void runTestAny_4() {
+  const int size = 64;
+  int Input[size] = {0, };
+  Input[5] = 1;
+
+  int Output[size];
+  int Expected[size];
+
+  for (int i = 0; i != size; ++i) {
+    if (i % 9 == 0) {
+      Expected[i] = 0;
+      continue;
+    }
+
+    if (i % 5 == 0) {
+      Expected[i] = 1;
+      continue;
+    }
+
+    Expected[i] = -1;
+  }
+
+  int* d_Input;
+  int* d_Output;
+  HIP_CHECK(hipMalloc(&d_Input, 4 * size));
+  HIP_CHECK(hipMalloc(&d_Output, 4 * size));
+
+  int warpSize = getWarpSize();
+
+  HIP_CHECK(hipMemcpy(d_Input, &Input, 4 * size, hipMemcpyDefault));
+  hipLaunchKernelGGL(any_4, 1, warpSize, 0, 0, d_Input, d_Output);
+
+  HIP_CHECK(hipMemcpy(&Output, d_Output, 4 * size, hipMemcpyDefault));
+  for (int i = 0; i != warpSize; ++i) {
+    REQUIRE(Output[i] == Expected[i]);
+  }
+}
+
+__global__ void all_1(int *Input, int *Output) {
+  auto tid = threadIdx.x;
+  Output[tid] = __all_sync(AllThreads, Input[tid]);
+}
+
+static void runTestAll_1_w64() {
+  const int size = 64;
+  int Input[size] = {1, 1, 1, 1, 1, 1, 1, 1,
+                     1, 1, 1, 1, 1, 1, 1, 1,
+                     1, 1, 1, 1, 1, 1, 1, 1,
+                     1, 1, 1, 1, 1, 1, 1, 1,
+                     1, 1, 1, 1, 1, 1, 1, 1,
+                     1, 1, 1, 1, 1, 1, 1, 1,
+                     1, 1, 1, 1, 1, 1, 1, 1,
+                     1, 1, 1, 1, 1, 0, 1, 1};
+
+  int Output[size];
+  int Expected[size] = {0, };
+
+  int warpSize = getWarpSize();
+  if (warpSize == 32)
+    std::fill_n(Expected, size, 1);
+
+  int* d_Input;
+  int* d_Output;
+  HIP_CHECK(hipMalloc(&d_Input, 4 * size));
+  HIP_CHECK(hipMalloc(&d_Output, 4 * size));
+
+  HIP_CHECK(hipMemcpy(d_Input, &Input, 4 * size, hipMemcpyDefault));
+  hipLaunchKernelGGL(all_1, 1, warpSize, 0, 0, d_Input, d_Output);
+
+  HIP_CHECK(hipMemcpy(&Output, d_Output, 4 * size, hipMemcpyDefault));
+  for (int i = 0; i != warpSize; ++i) {
+    REQUIRE(Output[i] == Expected[i]);
+  }
+}
+
+static void runTestAll_1_w32() {
+  const int size = 64;
+  int Input[size] = {1, 1, 1, 1, 1, 1, 1, 1,
+                     1, 1, 1, 1, 1, 1, 1, 1,
+                     1, 1, 0, 1, 1, 1, 1, 1,
+                     1, 1, 1, 1, 1, 1, 1, 1,
+                     1, 1, 1, 1, 1, 1, 1, 1,
+                     1, 1, 1, 1, 1, 1, 1, 1,
+                     1, 1, 1, 1, 1, 1, 1, 1,
+                     1, 1, 1, 1, 1, 1, 1, 1};
+
+  int Output[size];
+  int Expected[size] = {0, };
+
+  int warpSize = getWarpSize();
+
+  int* d_Input;
+  int* d_Output;
+  HIP_CHECK(hipMalloc(&d_Input, 4 * size));
+  HIP_CHECK(hipMalloc(&d_Output, 4 * size));
+
+  HIP_CHECK(hipMemcpy(d_Input, &Input, 4 * size, hipMemcpyDefault));
+  hipLaunchKernelGGL(all_1, 1, warpSize, 0, 0, d_Input, d_Output);
+
+  HIP_CHECK(hipMemcpy(&Output, d_Output, 4 * size, hipMemcpyDefault));
+  for (int i = 0; i != warpSize; ++i) {
+    REQUIRE(Output[i] == Expected[i]);
+  }
+}
+
+__global__ void all_2(int *Input, int *Output) {
+  auto tid = threadIdx.x;
+  Output[tid] = __all_sync(AllThreads, Input[tid]);
+}
+
+static void runTestAll_2() {
+  const int size = 64;
+  int Input[size] = {1, 1, 1, 1, 1, 1, 1, 1,
+                     1, 1, 1, 1, 1, 1, 1, 1,
+                     1, 1, 1, 1, 1, 1, 1, 1,
+                     1, 1, 1, 1, 1, 1, 1, 1,
+                     1, 1, 1, 1, 1, 1, 1, 1,
+                     1, 1, 1, 1, 1, 1, 1, 1,
+                     1, 1, 1, 1, 1, 1, 1, 1,
+                     1, 1, 1, 1, 1, 1, 1, 1};
+
+  int Output[size];
+  int Expected[size] = {1, 1, 1, 1, 1, 1, 1, 1,
+                        1, 1, 1, 1, 1, 1, 1, 1,
+                        1, 1, 1, 1, 1, 1, 1, 1,
+                        1, 1, 1, 1, 1, 1, 1, 1,
+                        1, 1, 1, 1, 1, 1, 1, 1,
+                        1, 1, 1, 1, 1, 1, 1, 1,
+                        1, 1, 1, 1, 1, 1, 1, 1,
+                        1, 1, 1, 1, 1, 1, 1, 1};
+
+  int* d_Input;
+  int* d_Output;
+  HIP_CHECK(hipMalloc(&d_Input, 4 * size));
+  HIP_CHECK(hipMalloc(&d_Output, 4 * size));
+
+  int warpSize = getWarpSize();
+
+  HIP_CHECK(hipMemcpy(d_Input, &Input, 4 * size, hipMemcpyDefault));
+  hipLaunchKernelGGL(all_2, 1, warpSize, 0, 0, d_Input, d_Output);
+
+  HIP_CHECK(hipMemcpy(&Output, d_Output, 4 * size, hipMemcpyDefault));
+  for (int i = 0; i != warpSize; ++i) {
+    REQUIRE(Output[i] == Expected[i]);
+  }
+}
+
+__global__ void all_3(int *Input, int *Output) {
+  auto tid = threadIdx.x;
+  auto mask = __match_any_sync(AllThreads, tid/12);
+  Output[tid] = __all_sync(mask, Input[tid]);
+}
+
+static void runTestAll_3() {
+  const int size = 64;
+  int Input[size] = {1, 1, 1, 1, 1, 1, 1, 1,
+                     1, 1, 1, 1, 1, 1, 0, 1,
+                     1, 1, 1, 1, 1, 1, 1, 1,
+                     1, 1, 0, 1, 1, 1, 1, 1,
+                     1, 1, 1, 1, 1, 1, 1, 1,
+                     1, 1, 1, 1, 1, 1, 1, 1,
+                     1, 1, 0, 1, 1, 1, 1, 1,
+                     1, 1, 1, 1, 1, 0, 1, 1};
+
+  int Output[size];
+  int Expected[size] = {1, 1, 1, 1, 1, 1, 1, 1,
+                        1, 1, 1, 1,
+                        0, 0, 0, 0, 0, 0, 0, 0,
+                        0, 0, 0, 0,
+                        0, 0, 0, 0, 0, 0, 0, 0,
+                        0, 0, 0, 0,
+                        1, 1, 1, 1, 1, 1, 1, 1,
+                        1, 1, 1, 1,
+                        0, 0, 0, 0, 0, 0, 0, 0,
+                        0, 0, 0, 0,
+                        0, 0, 0, 0};
+  int* d_Input;
+  int* d_Output;
+  HIP_CHECK(hipMalloc(&d_Input, 4 * size));
+  HIP_CHECK(hipMalloc(&d_Output, 4 * size));
+
+  int warpSize = getWarpSize();
+
+  HIP_CHECK(hipMemcpy(d_Input, &Input, 4 * size, hipMemcpyDefault));
+  hipLaunchKernelGGL(all_3, 1, warpSize, 0, 0, d_Input, d_Output);
+
+  HIP_CHECK(hipMemcpy(&Output, d_Output, 4 * size, hipMemcpyDefault));
+  for (int i = 0; i != warpSize; ++i) {
+    REQUIRE(Output[i] == Expected[i]);
+  }
+}
+
+__global__ void all_4(int *Input, int *Output) {
+  auto tid = threadIdx.x;
+  unsigned long long masks[2] = { Every5thBut9th, Every9thBit };
+
+  Output[tid] = -1;
+  if (tid % 5 == 0 || tid % 9 == 0)
+    Output[tid] = __all_sync(masks[tid % 9 == 0], Input[tid]);
+}
+
+static void runTestAll_4() {
+  const int size = 64;
+  int Input[size];
+  std::fill_n(Input, size, 1);
+  Input[5] = 0;
+
+  int Output[size];
+  int Expected[size];
+
+  for (int i = 0; i != size; ++i) {
+    if (i % 9 == 0) {
+      Expected[i] = 1;
+      continue;
+    }
+
+    if (i % 5 == 0) {
+      Expected[i] = 0;
+      continue;
+    }
+
+    Expected[i] = -1;
+  }
+
+  int* d_Input;
+  int* d_Output;
+  HIP_CHECK(hipMalloc(&d_Input, 4 * size));
+  HIP_CHECK(hipMalloc(&d_Output, 4 * size));
+
+  int warpSize = getWarpSize();
+
+  HIP_CHECK(hipMemcpy(d_Input, &Input, 4 * size, hipMemcpyDefault));
+  hipLaunchKernelGGL(all_4, 1, warpSize, 0, 0, d_Input, d_Output);
+
+  HIP_CHECK(hipMemcpy(&Output, d_Output, 4 * size, hipMemcpyDefault));
+  for (int i = 0; i != warpSize; ++i) {
+    REQUIRE(Output[i] == Expected[i]);
+  }
+}
+
+__global__ void ballot_1(int *Input, unsigned long long *Output) {
+  auto tid = threadIdx.x;
+  Output[tid] = __ballot_sync(AllThreads, Input[tid]);
+}
+
+static void runTestBallot_1() {
+  const int size = 64;
+  int Input[size] = {0, 1, 0, 0, 1, 1, 1, 0,
+                     0, 1, 1, 1, 0, 0, 1, 0,
+                     1, 1, 1, 0, 0, 1, 0, 0,
+                     1, 0, 0, 1, 0, 0, 1, 1,
+                     0, 1, 0, 0, 1, 1, 1, 0,
+                     0, 1, 1, 1, 0, 0, 1, 0,
+                     1, 1, 1, 0, 0, 1, 0, 0,
+                     1, 0, 0, 1, 0, 0, 1, 1};
+  unsigned long long Output[size];
+  unsigned long long Expected[size] = {
+    0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72,
+    0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72,
+    0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72,
+    0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72,
+    0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72,
+    0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72,
+    0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72,
+    0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72,
+    0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72,
+    0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72,
+    0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72,
+    0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72,
+    0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72,
+    0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72,
+    0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72,
+    0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72
+  };
+
+  int* d_Input;
+  unsigned long long* d_Output;
+  HIP_CHECK(hipMalloc(&d_Input, 4 * size));
+  HIP_CHECK(hipMalloc(&d_Output, 8 * size));
+
+  int warpSize = getWarpSize();
+
+  HIP_CHECK(hipMemcpy(d_Input, &Input, 4 * size, hipMemcpyDefault));
+  hipLaunchKernelGGL(ballot_1, 1, warpSize, 0, 0, d_Input, d_Output);
+
+  HIP_CHECK(hipMemcpy(&Output, d_Output, 8 * size, hipMemcpyDefault));
+  for (int i = 0; i != warpSize; ++i) {
+    REQUIRE(compareMaskEqual(Output, Expected, i, warpSize));
+  }
+}
+
+__global__ void ballot_2(int *Input, unsigned long long *Output) {
+  auto tid = threadIdx.x;
+  auto mask = __match_any_sync(AllThreads, tid / 12);
+  Output[tid] = __ballot_sync(mask, Input[tid]);
+}
+
+static void runTestBallot_2() {
+  const int size = 64;
+  int Input[size] = {0, 1, 0, 0, 1, 1, 1, 0,
+                     0, 1, 1, 1, 0, 0, 1, 0,
+                     1, 1, 1, 0, 0, 1, 0, 0,
+                     1, 0, 0, 1, 0, 0, 1, 1,
+                     0, 1, 0, 0, 1, 1, 1, 0,
+                     0, 1, 1, 1, 0, 0, 1, 0,
+                     1, 1, 1, 0, 0, 1, 0, 0,
+                     1, 0, 0, 1, 0, 0, 1, 1};
+  unsigned long long Output[size];
+  unsigned long long Expected[size] = {
+    0x0000000000000e72, 0x0000000000000e72, 0x0000000000000e72, 0x0000000000000e72,
+    0x0000000000000e72, 0x0000000000000e72, 0x0000000000000e72, 0x0000000000000e72,
+    0x0000000000000e72, 0x0000000000000e72, 0x0000000000000e72, 0x0000000000000e72,
+    0x0000000000274000, 0x0000000000274000, 0x0000000000274000, 0x0000000000274000,
+    0x0000000000274000, 0x0000000000274000, 0x0000000000274000, 0x0000000000274000,
+    0x0000000000274000, 0x0000000000274000, 0x0000000000274000, 0x0000000000274000,
+    0x00000002c9000000, 0x00000002c9000000, 0x00000002c9000000, 0x00000002c9000000,
+    0x00000002c9000000, 0x00000002c9000000, 0x00000002c9000000, 0x00000002c9000000,
+    0x00000002c9000000, 0x00000002c9000000, 0x00000002c9000000, 0x00000002c9000000,
+    0x00004e7000000000, 0x00004e7000000000, 0x00004e7000000000, 0x00004e7000000000,
+    0x00004e7000000000, 0x00004e7000000000, 0x00004e7000000000, 0x00004e7000000000,
+    0x00004e7000000000, 0x00004e7000000000, 0x00004e7000000000, 0x00004e7000000000,
+    0x0927000000000000, 0x0927000000000000, 0x0927000000000000, 0x0927000000000000,
+    0x0927000000000000, 0x0927000000000000, 0x0927000000000000, 0x0927000000000000,
+    0x0927000000000000, 0x0927000000000000, 0x0927000000000000, 0x0927000000000000,
+    0xc000000000000000, 0xc000000000000000, 0xc000000000000000, 0xc000000000000000};
+
+  int* d_Input;
+  unsigned long long* d_Output;
+  HIP_CHECK(hipMalloc(&d_Input, 4 * size));
+  HIP_CHECK(hipMalloc(&d_Output, 8 * size));
+
+  int warpSize = getWarpSize();
+
+  HIP_CHECK(hipMemcpy(d_Input, &Input, 4 * size, hipMemcpyDefault));
+  hipLaunchKernelGGL(ballot_2, 1, warpSize, 0, 0, d_Input, d_Output);
+
+  HIP_CHECK(hipMemcpy(&Output, d_Output, 8 * size, hipMemcpyDefault));
+  for (int i = 0; i != warpSize; ++i) {
+    REQUIRE(compareMaskEqual(Output, Expected, i, warpSize));
+  }
+}
+
+__global__ void ballot_3(int *Input, unsigned long long *Output) {
+  auto tid = threadIdx.x;
+  unsigned long long masks[2] = { Every5thBut9th, Every9thBit };
+
+  Output[tid] = -1;
+  if (tid % 5 == 0 || tid % 9 == 0)
+    Output[tid] = __ballot_sync(masks[tid % 9 == 0], Input[tid]);
+}
+
+static void runTestBallot_3() {
+  const int size = 64;
+  int Input[size];
+  std::fill_n(Input, size, 1);
+
+  unsigned long long Output[size];
+  unsigned long long Expected[size];
+
+  for (int i = 0; i != size; ++i) {
+    if (i % 9 == 0) {
+      Expected[i] = Every9thBit;
+      continue;
+    }
+
+    if (i % 5 == 0) {
+      Expected[i] = Every5thBut9th;
+      continue;
+    }
+
+    Expected[i] = -1;
+  }
+
+  int* d_Input;
+  unsigned long long* d_Output;
+  HIP_CHECK(hipMalloc(&d_Input, 4 * size));
+  HIP_CHECK(hipMalloc(&d_Output, 8 * size));
+
+  int warpSize = getWarpSize();
+
+  HIP_CHECK(hipMemcpy(d_Input, &Input, 4 * size, hipMemcpyDefault));
+  hipLaunchKernelGGL(ballot_3, 1, warpSize, 0, 0, d_Input, d_Output);
+
+  HIP_CHECK(hipMemcpy(&Output, d_Output, 8 * size, hipMemcpyDefault));
+  for (int i = 0; i != warpSize; ++i) {
+    REQUIRE(compareMaskEqual(Output, Expected, i, warpSize));
+  }
+}
+
+/**
+ * @addtogroup __vote_sync
+ * @{
+ * @ingroup VoteSyncTest
+ *
+ *   - `unsigned long long __any_sync(unsigned long long mask, int predicate)`
+ *   - `unsigned long long __all_sync(unsigned long long mask, int predicate)`
+ *   - `unsigned long long __ballot_sync(unsigned long long mask, int predicate)`
+ *
+ * Contains warp vote sync functions.
+ * @}
+ */
+
+/**
+ * Test Description
+ * ------------------------
+ * - Test cases to verify warp vote functions.
+
+ * Test source
+ * ------------------------
+ *    - catch/unit/kernel/hipVoteSyncTests.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.6
+ */
+
+TEST_CASE("Unit_hipVoteSync_Any") {
+  runTestAny_1();
+  runTestAny_2_w64();
+  runTestAny_2_w32();
+  runTestAny_3();
+  runTestAny_4();
+}
+
+TEST_CASE("Unit_hipVoteSync_All") {
+  runTestAll_1_w64();
+  runTestAll_1_w32();
+  runTestAll_2();
+  runTestAll_3();
+  runTestAll_4();
+}
+
+TEST_CASE("Unit_hipVoteSync_Ballot") {
+  runTestBallot_1();
+  runTestBallot_2();
+  runTestBallot_3();
+}
@@ -19,11 +19,18 @@ THE SOFTWARE.

 #pragma once

+#define HIP_ENABLE_WARP_SYNC_BUILTINS
+
 #include <hip_test_common.hh>
 #include <hip/hip_cooperative_groups.h>
 #include <hip/hip_fp16.h>

-static __device__ bool deactivate_thread(const uint64_t* const active_masks) {
+const unsigned long long Every5thBit = 0x1084210842108421;
+const unsigned long long Every9thBit = 0x8040201008040201;
+const unsigned long long Every5thBut9th = Every5thBit & ~Every9thBit;
+const unsigned long long AllThreads = ~0;
+
+inline __device__ bool deactivate_thread(const uint64_t* const active_masks) {
  const auto warp =
      cooperative_groups::tiled_partition(cooperative_groups::this_thread_block(), warpSize);
  const auto block = cooperative_groups::this_thread_block();
@@ -34,17 +41,17 @@ static __device__ bool deactivate_thread(const uint64_t* const active_masks) {
  return !(active_masks[idx] & (static_cast<uint64_t>(1) << warp.thread_rank()));
 }

-static inline std::mt19937& GetRandomGenerator() {
+inline std::mt19937& GetRandomGenerator() {
  static std::mt19937 mt(std::random_device{}());
  return mt;
 }

-template <typename T> static inline T GenerateRandomInteger(const T min, const T max) {
+template <typename T> inline T GenerateRandomInteger(const T min, const T max) {
  std::uniform_int_distribution<T> dist(min, max);
  return dist(GetRandomGenerator());
 }

-template <typename T> static inline T GenerateRandomReal(const T min, const T max) {
+template <typename T> inline T GenerateRandomReal(const T min, const T max) {
  std::uniform_real_distribution<T> dist(min, max);
  return dist(GetRandomGenerator());
 }
@@ -83,3 +90,78 @@ inline uint64_t get_active_mask(unsigned int warp_id, unsigned int warp_size) {
  }
  return active_mask;
 }
+
+template <typename T, std::enable_if_t<std::is_integral<T>::value, bool> = true>
+inline T expandPrecision(int X) { return X; }
+
+template <typename T, std::enable_if_t<std::is_floating_point<T>::value, bool> = true>
+inline T expandPrecision(int X) {
+  return X * 3.141592653589793115997963468544185161590576171875;
+}
+
+template <typename T, std::enable_if_t<std::is_same<T, __half>::value, bool> = true>
+inline __half expandPrecision(int X) {
+  return (__half)expandPrecision<float>(X);
+}
+
+template <typename T, std::enable_if_t<std::is_same<T, __half2>::value, bool> = true>
+inline __half2 expandPrecision(int X) {
+  __half H = expandPrecision<float>(X);
+  return {H, H};
+}
+
+template <typename T, std::enable_if_t<std::is_integral<T>::value, bool> = true>
+inline void expandPrecision(T* Array, int size) {
+  (void)Array;
+  (void)size;
+}
+
+template <typename T, std::enable_if_t<std::is_floating_point<T>::value, bool> = true>
+inline void expandPrecision(T *Array, int size) {
+  for (int i = 0; i != size; ++i) {
+    Array[i] *= 3.141592653589793115997963468544185161590576171875;
+  }
+}
+
+template <typename T>
+inline void initializeInput(T *Input, int size) {
+  int Values[] = {0, -1, 2, 3, 4, 5, -6, 7,
+                  8, -9, 10, 11, 12, 13, -14, 15,
+                  16, 17, -18, 19, 20, -21, 22, 23,
+                  24, 25, 26, -27, 28, 29, 30, 31,
+                  -32, 33, 34, 35, -36, 37, 38, -39,
+                  40, 41, 42, 43, -44, -45, 46, 47,
+                  48, 49, 50, -51, 52, 53, -54, 55,
+                  56, 57, -58, 59, 60, 61, 62, -63};
+
+  for (int i = 0; i != size; ++i) {
+    Input[i] = expandPrecision<T>(Values[i]);
+  }
+}
+
+template <typename T>
+inline void initializeExpected(T *Expected, int *Values, int size) {
+  for (int i = 0; i != size; ++i) {
+    Expected[i] = expandPrecision<T>(Values[i]);
+  }
+}
+
+template <typename T>
+inline bool compareEqual(T X, T Y) { return X == Y; }
+
+template <>
+inline bool compareEqual(__half X, __half Y) {
+  return __half2float(X) == __half2float(Y);
+}
+
+template <>
+inline bool compareEqual(__half2 X, __half2 Y) {
+  return compareEqual(X.x, Y.x) && compareEqual(X.y, Y.y);
+}
+
+inline bool compareMaskEqual(unsigned long long *Actual, unsigned long long *Expected,
+                       int i, int warpSize) {
+  if (warpSize == 32)
+    return (unsigned)Actual[i] == (unsigned)Expected[i];
+  return Actual[i] == Expected[i];
+}