diff --git a/catch/unit/deviceLib/CMakeLists.txt b/catch/unit/deviceLib/CMakeLists.txt
index 357a68e4a3..ce11801786 100644
--- a/catch/unit/deviceLib/CMakeLists.txt
+++ b/catch/unit/deviceLib/CMakeLists.txt
@@ -7,11 +7,18 @@ set(TEST_SRC
     ffs.cc
     funnelshift.cc
     brev.cc
+    popc.cc
+    ldg.cc
+    syncthreadsand.cc
+    syncthreadscount.cc
+    syncthreadsor.cc
+    threadfence_system.cc
 )
 
 # AMD only tests
 set(AMD_TEST_SRC
     vectorTypesDevice.cc
+    mbcnt.cc
     bitExtract.cc
     bitInsert.cc
     floatTM.cc
diff --git a/catch/unit/deviceLib/ldg.cc b/catch/unit/deviceLib/ldg.cc
new file mode 100644
index 0000000000..61456ee940
--- /dev/null
+++ b/catch/unit/deviceLib/ldg.cc
@@ -0,0 +1,250 @@
+/*
+Copyright (c) 2021 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_common.hh>
+
+
+constexpr int WIDTH = 16;
+constexpr int HEIGHT = 16;
+constexpr int NUM = WIDTH * HEIGHT;
+
+constexpr int THREADS_PER_BLOCK_X = 8;
+constexpr int THREADS_PER_BLOCK_Y = 8;
+
+template <typename T>
+__global__ void vectoradd_float(T* a, const T* bm, int width, int height)
+
+{
+  int x = blockDim.x * blockIdx.x + threadIdx.x;
+  int y = blockDim.y * blockIdx.y + threadIdx.y;
+
+  int i = y * width + x;
+  if (i < (width * height)) {
+    a[i] = __ldg(&bm[i]);
+  }
+}
+
+int2 make_vector2(int a) { return make_int2(a, a); }
+
+char2 make_vector2(signed char a) { return make_char2(a, a); }
+
+char4 make_vector4(signed char a) { return make_char4(a, a, a, a); }
+
+short2 make_vector2(short a) { return make_short2(a, a); }
+
+ushort2 make_vector2(unsigned short a) { return make_ushort2(a, a); }
+
+short4 make_vector4(short a) { return make_short4(a, a, a, a); }
+
+int4 make_vector4(int a) { return make_int4(a, a, a, a); }
+
+uint2 make_vector2(unsigned int a) { return make_uint2(a, a); }
+
+uint4 make_vector4(unsigned int a) { return make_uint4(a, a, a, a); }
+
+float2 make_vector2(float a) { return make_float2(a, a); }
+
+float4 make_vector4(float a) { return make_float4(a, a, a, a); }
+
+uchar2 make_vector2(unsigned char a) { return make_uchar2(a, a); }
+
+uchar4 make_vector4(unsigned char a) { return make_uchar4(a, a, a, a); }
+
+double2 make_vector2(double a) { return make_double2(a, a); }
+
+
+template <typename T, typename U> int dataTypesRun() {
+  T* hostA;
+  T* hostB;
+
+  T* deviceA;
+  T* deviceB;
+
+  int i;
+  int errors;
+
+  hostA = (T*)malloc(NUM * sizeof(T));
+  hostB = (T*)malloc(NUM * sizeof(T));
+
+  // initialize the input data
+  for (i = 0; i < NUM; i++) {
+    hostB[i] = (U)i;
+  }
+
+  HIP_CHECK(hipMalloc((void**)&deviceA, NUM * sizeof(T)));
+  HIP_CHECK(hipMalloc((void**)&deviceB, NUM * sizeof(T)));
+
+  HIP_CHECK(hipMemcpy(deviceB, hostB, NUM * sizeof(T), hipMemcpyHostToDevice));
+
+
+  hipLaunchKernelGGL(vectoradd_float,
+                     dim3(WIDTH / THREADS_PER_BLOCK_X, HEIGHT / THREADS_PER_BLOCK_Y),
+                     dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, deviceA,
+                     static_cast<const T*>(deviceB), WIDTH, HEIGHT);
+
+
+  HIP_CHECK(hipMemcpy(hostA, deviceA, NUM * sizeof(T), hipMemcpyDeviceToHost));
+
+  // verify the results
+  errors = 0;
+  for (i = 0; i < NUM; i++) {
+    if (hostA[i] != (hostB[i])) {
+      errors++;
+    }
+  }
+
+  HIP_CHECK(hipFree(deviceA));
+  HIP_CHECK(hipFree(deviceB));
+
+  free(hostA);
+  free(hostB);
+
+  return errors;
+}
+
+
+template <typename T, typename U> int dataTypesRun2() {
+  T* hostA;
+  T* hostB;
+
+  T* deviceA;
+  T* deviceB;
+
+  int i;
+  int errors;
+
+  hostA = (T*)malloc(NUM * sizeof(T));
+  hostB = (T*)malloc(NUM * sizeof(T));
+
+  // initialize the input data
+  for (i = 0; i < NUM; i++) {
+    hostB[i] = make_vector2((U)i);
+  }
+
+  HIP_CHECK(hipMalloc((void**)&deviceA, NUM * sizeof(T)));
+  HIP_CHECK(hipMalloc((void**)&deviceB, NUM * sizeof(T)));
+
+  HIP_CHECK(hipMemcpy(deviceB, hostB, NUM * sizeof(T), hipMemcpyHostToDevice));
+  hipLaunchKernelGGL(vectoradd_float,
+                     dim3(WIDTH / THREADS_PER_BLOCK_X, HEIGHT / THREADS_PER_BLOCK_Y),
+                     dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, deviceA,
+                     static_cast<const T*>(deviceB), WIDTH, HEIGHT);
+
+
+  HIP_CHECK(hipMemcpy(hostA, deviceA, NUM * sizeof(T), hipMemcpyDeviceToHost));
+
+  // verify the results
+  errors = 0;
+  for (i = 0; i < NUM; i++) {
+    if (hostA[i].x != (hostB[i].x) && hostA[i].y != (hostB[i].y)) {
+      errors++;
+    }
+  }
+
+  HIP_CHECK(hipFree(deviceA));
+  HIP_CHECK(hipFree(deviceB));
+
+  free(hostA);
+  free(hostB);
+
+  return errors;
+}
+
+
+template <typename T, typename U> int dataTypesRun4() {
+  T* hostA;
+  T* hostB;
+
+  T* deviceA;
+  T* deviceB;
+
+  int i;
+  int errors;
+
+  hostA = (T*)malloc(NUM * sizeof(T));
+  hostB = (T*)malloc(NUM * sizeof(T));
+
+  // initialize the input data
+  for (i = 0; i < NUM; i++) {
+    hostB[i] = make_vector4((U)i);
+  }
+
+  HIP_CHECK(hipMalloc((void**)&deviceA, NUM * sizeof(T)));
+  HIP_CHECK(hipMalloc((void**)&deviceB, NUM * sizeof(T)));
+
+  HIP_CHECK(hipMemcpy(deviceB, hostB, NUM * sizeof(T), hipMemcpyHostToDevice));
+
+
+  hipLaunchKernelGGL(vectoradd_float,
+                     dim3(WIDTH / THREADS_PER_BLOCK_X, HEIGHT / THREADS_PER_BLOCK_Y),
+                     dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, deviceA,
+                     static_cast<const T*>(deviceB), WIDTH, HEIGHT);
+
+
+  HIP_CHECK(hipMemcpy(hostA, deviceA, NUM * sizeof(T), hipMemcpyDeviceToHost));
+
+  // verify the results
+  errors = 0;
+  for (i = 0; i < NUM; i++) {
+    if (hostA[i].x != (hostB[i].x) && hostA[i].y != (hostB[i].y) && hostA[i].z != (hostB[i].z) &&
+        hostA[i].w != (hostB[i].w)) {
+      errors++;
+    }
+  }
+
+  HIP_CHECK(hipFree(deviceA));
+  HIP_CHECK(hipFree(deviceB));
+
+  free(hostA);
+  free(hostB);
+
+  return errors;
+}
+
+TEST_CASE("Unit_ldg") {
+  using namespace std;
+
+  int errors;
+
+  errors = dataTypesRun<char, char>() | dataTypesRun<short, short>() | dataTypesRun<int, int>() |
+      dataTypesRun<long, long>() | dataTypesRun<long long, long long>() |
+      dataTypesRun<signed char, signed char>() | dataTypesRun<unsigned char, unsigned char>() |
+      dataTypesRun<unsigned short, unsigned short>() | dataTypesRun<unsigned int, unsigned int>() |
+      dataTypesRun<unsigned long, unsigned long>() |
+      dataTypesRun<unsigned long long, unsigned long long>() | dataTypesRun<float, float>() |
+      dataTypesRun<double, double>();
+
+  REQUIRE(errors == 0);
+
+  errors = dataTypesRun2<int2, int>() | dataTypesRun2<short2, short>() |
+      dataTypesRun2<ushort2, unsigned short>() | dataTypesRun2<char2, signed char>() |
+      dataTypesRun2<uchar2, unsigned char>() | dataTypesRun2<uint2, unsigned int>() |
+      dataTypesRun2<float2, float>() | dataTypesRun2<double2, double>();
+
+  REQUIRE(errors == 0);
+
+  errors = dataTypesRun4<int4, int>() | dataTypesRun4<char4, signed char>() |
+      dataTypesRun4<uchar4, unsigned char>() | dataTypesRun4<short4, short>() |
+      dataTypesRun4<uint4, unsigned int>() | dataTypesRun4<float4, float>();
+
+  REQUIRE(errors == 0);
+}
\ No newline at end of file
diff --git a/catch/unit/deviceLib/mbcnt.cc b/catch/unit/deviceLib/mbcnt.cc
new file mode 100644
index 0000000000..eeeb0f1c54
--- /dev/null
+++ b/catch/unit/deviceLib/mbcnt.cc
@@ -0,0 +1,103 @@
+/*
+Copyright (c) 2021 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_common.hh>
+
+#include <stdio.h>
+#include <algorithm>
+#include <stdlib.h>
+#include <hip/hip_runtime.h>
+#include <hip/device_functions.h>
+
+__global__ void HIP_kernel(unsigned int* mbcnt_lo, unsigned int* mbcnt_hi, unsigned int* lane_id) {
+  int x = blockDim.x * blockIdx.x + threadIdx.x;
+  mbcnt_lo[x] = __builtin_amdgcn_mbcnt_lo(0xFFFFFFFF, 0);
+  mbcnt_hi[x] = __builtin_amdgcn_mbcnt_hi(0xFFFFFFFF, 0);
+  lane_id[x] = __lane_id();
+}
+
+
+TEST_CASE("Unit_mbcnt") {
+  using namespace std;
+  unsigned int* device_mbcnt_lo;
+  unsigned int* device_mbcnt_hi;
+  unsigned int* device_lane_id;
+
+  hipDeviceProp_t devProp;
+  HIP_CHECK(hipGetDeviceProperties(&devProp, 0));
+  INFO("System minor : " << devProp.minor);
+  INFO("System major : " << devProp.major);
+  INFO("agent prop name : " << devProp.name);
+
+  INFO("hip Device prop succeeded");
+
+  constexpr unsigned int num_waves_per_block = 2;
+  const unsigned int wave_size = devProp.warpSize;
+  const unsigned int num_threads_per_block = wave_size * num_waves_per_block;
+  const unsigned int num_blocks = 2;
+  const unsigned int num_threads = num_threads_per_block * num_blocks;
+  const size_t buffer_size = num_threads * sizeof(unsigned int);
+
+  HIP_CHECK(hipMalloc((void**)&device_mbcnt_lo, buffer_size));
+  HIP_CHECK(hipMalloc((void**)&device_mbcnt_hi, buffer_size));
+  HIP_CHECK(hipMalloc((void**)&device_lane_id, buffer_size));
+
+  hipLaunchKernelGGL(HIP_kernel, dim3(num_blocks), dim3(num_threads_per_block), 0, 0,
+                     device_mbcnt_lo, device_mbcnt_hi, device_lane_id);
+
+  unsigned int* host_mbcnt_lo = (unsigned int*)malloc(buffer_size);
+  unsigned int* host_mbcnt_hi = (unsigned int*)malloc(buffer_size);
+  unsigned int* host_lane_id = (unsigned int*)malloc(buffer_size);
+
+  HIP_CHECK(hipMemcpy(host_mbcnt_lo, device_mbcnt_lo, buffer_size, hipMemcpyDeviceToHost));
+  HIP_CHECK(hipMemcpy(host_mbcnt_hi, device_mbcnt_hi, buffer_size, hipMemcpyDeviceToHost));
+  HIP_CHECK(hipMemcpy(host_lane_id, device_lane_id, buffer_size, hipMemcpyDeviceToHost));
+
+  // verify the results
+  int mbcnt_lo_errors = 0;
+  int mbcnt_hi_errors = 0;
+  int lane_id_errors = 0;
+  for (unsigned int i = 0; i < num_threads; i++) {
+    unsigned int this_lane_id = i % wave_size;
+    unsigned int this_mbcnt_lo = this_lane_id >= 32 ? 32 : this_lane_id;
+    unsigned int this_mbcnt_hi = this_lane_id < 32 ? 0 : (this_lane_id - 32);
+
+    if (host_mbcnt_lo[i] != this_mbcnt_lo) mbcnt_lo_errors++;
+
+    if (host_mbcnt_hi[i] != this_mbcnt_hi) mbcnt_hi_errors++;
+
+    if (host_lane_id[i] != this_lane_id) lane_id_errors++;
+  }
+
+
+  HIP_CHECK(hipFree(device_mbcnt_lo));
+  HIP_CHECK(hipFree(device_mbcnt_hi));
+  HIP_CHECK(hipFree(device_lane_id));
+
+  free(host_mbcnt_lo);
+  free(host_mbcnt_hi);
+  free(host_lane_id);
+
+  REQUIRE(mbcnt_lo_errors == 0);
+  REQUIRE(mbcnt_hi_errors == 0);
+  REQUIRE(lane_id_errors == 0);
+}
\ No newline at end of file
diff --git a/catch/unit/deviceLib/popc.cc b/catch/unit/deviceLib/popc.cc
new file mode 100644
index 0000000000..e84d715ede
--- /dev/null
+++ b/catch/unit/deviceLib/popc.cc
@@ -0,0 +1,136 @@
+/*
+Copyright (c) 2021 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_common.hh>
+#include <cstdio>
+
+constexpr int WIDTH = 16;
+constexpr int HEIGHT = 16;
+constexpr int NUM = WIDTH * HEIGHT;
+
+constexpr int THREADS_PER_BLOCK_X = 8;
+constexpr int THREADS_PER_BLOCK_Y = 8;
+
+// CPU implementation of popcount
+template <typename T> unsigned int popcountCPU(T value) {
+  unsigned int ret = 0;
+  while (value) {
+    if (value & 0x1) ++ret;
+    value >>= 1;
+  }
+  return ret;
+}
+
+__global__ void HIP_kernel(unsigned int* a, unsigned int* b, unsigned int* c,
+                           unsigned long long int* d, int width, int height) {
+  int x = blockDim.x * blockIdx.x + threadIdx.x;
+  int y = blockDim.y * blockIdx.y + threadIdx.y;
+
+  int i = y * width + x;
+  if (i < (width * height)) {
+    a[i] = __popc(b[i]);
+    c[i] = __popcll(d[i]);
+  }
+}
+
+
+TEST_CASE("Unit_popc") {
+  using namespace std;
+  unsigned int* hostA;
+  unsigned int* hostB;
+  unsigned int* hostC;
+  unsigned long long int* hostD;
+
+  unsigned int* deviceA;
+  unsigned int* deviceB;
+  unsigned int* deviceC;
+  unsigned long long int* deviceD;
+
+  hipDeviceProp_t devProp;
+  HIP_CHECK(hipGetDeviceProperties(&devProp, 0));
+  INFO("System minor : " << devProp.minor);
+  INFO("System major : " << devProp.major);
+  INFO("agent prop name : " << devProp.name);
+
+  INFO("hip Device prop succeeded");
+
+
+  int i;
+  int errors;
+
+  hostA = (unsigned int*)malloc(NUM * sizeof(unsigned int));
+  hostB = (unsigned int*)malloc(NUM * sizeof(unsigned int));
+  hostC = (unsigned int*)malloc(NUM * sizeof(unsigned int));
+  hostD = (unsigned long long int*)malloc(NUM * sizeof(unsigned long long int));
+
+  // initialize the input data
+  for (i = 0; i < NUM; i++) {
+    hostB[i] = i;
+    hostD[i] = 1099511627776 - i;
+  }
+
+  HIP_CHECK(hipMalloc((void**)&deviceA, NUM * sizeof(unsigned int)));
+  HIP_CHECK(hipMalloc((void**)&deviceB, NUM * sizeof(unsigned int)));
+  HIP_CHECK(hipMalloc((void**)&deviceC, NUM * sizeof(unsigned int)));
+  HIP_CHECK(hipMalloc((void**)&deviceD, NUM * sizeof(unsigned long long int)));
+
+  HIP_CHECK(hipMemcpy(deviceB, hostB, NUM * sizeof(unsigned int), hipMemcpyHostToDevice));
+  HIP_CHECK(hipMemcpy(deviceD, hostD, NUM * sizeof(unsigned long long int), hipMemcpyHostToDevice));
+
+
+  hipLaunchKernelGGL(HIP_kernel, dim3(WIDTH / THREADS_PER_BLOCK_X, HEIGHT / THREADS_PER_BLOCK_Y),
+                     dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, deviceA, deviceB,
+                     deviceC, deviceD, WIDTH, HEIGHT);
+
+
+  HIP_CHECK(hipMemcpy(hostA, deviceA, NUM * sizeof(unsigned int), hipMemcpyDeviceToHost));
+  HIP_CHECK(hipMemcpy(hostC, deviceC, NUM * sizeof(unsigned int), hipMemcpyDeviceToHost));
+
+  // verify the results
+  errors = 0;
+  for (i = 0; i < NUM; i++) {
+    if (hostA[i] != popcountCPU(hostB[i])) {
+      errors++;
+    }
+  }
+
+  CHECK(errors == 0);
+
+  for (i = 0; i < NUM; i++) {
+    if (hostC[i] != popcountCPU(hostD[i])) {
+      errors++;
+    }
+  }
+
+
+  HIP_CHECK(hipFree(deviceA));
+  HIP_CHECK(hipFree(deviceB));
+  HIP_CHECK(hipFree(deviceC));
+  HIP_CHECK(hipFree(deviceD));
+
+  free(hostA);
+  free(hostB);
+  free(hostC);
+  free(hostD);
+
+  REQUIRE(errors == 0);
+}
\ No newline at end of file
diff --git a/catch/unit/deviceLib/syncthreadsand.cc b/catch/unit/deviceLib/syncthreadsand.cc
new file mode 100644
index 0000000000..83c2b21f8f
--- /dev/null
+++ b/catch/unit/deviceLib/syncthreadsand.cc
@@ -0,0 +1,127 @@
+/*
+Copyright (c) 2021 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_common.hh>
+#include <hip/hip_runtime.h>
+
+
+static __global__ void kernel_syncthreads_and(int* syncTestD, int* allThreadsZeroD,
+                                              int* allThreadsOneD, int* oneThreadZeroD,
+                                              int* allThreadsMinusOneD) {
+  int blockSize = blockDim.x;
+  int predicate = 10;
+  // First block index starts with 0, and second block index starts
+  // with blockSize
+  int i = (blockIdx.x == 0) ? threadIdx.x : blockSize + threadIdx.x;
+
+  // At very first, we need to ensure work-group level syncronization
+  // properly happened, don't bother about predicate testing for now.
+  // Thread 0 and thread 1 writes to shared memory. After call to api,
+  // every thread reads shared memory, and store product for verification
+  __shared__ int sm[2];
+  if (threadIdx.x == 0)
+    sm[0] = 10;
+  else if (threadIdx.x == 1)
+    sm[1] = 20;
+  __syncthreads_and(predicate);
+  syncTestD[i] = sm[0] * sm[1];
+
+  // All threads pass 0 as predicate value, result should be 0
+  predicate = 0;
+  allThreadsZeroD[i] = __syncthreads_and(predicate);
+
+  // All threads pass 1 as predicate value, result should be 1
+  predicate = 1;
+  allThreadsOneD[i] = __syncthreads_and(predicate);
+
+  // Thread 0 pass 0, and all other threads 1 as predicate value,
+  // result should be 0
+  predicate = (threadIdx.x == 0) ? 0 : 1;
+  oneThreadZeroD[i] = __syncthreads_and(predicate);
+
+  // All threads pass -1 as predicate value, result should be 1
+  predicate = -1;
+  allThreadsMinusOneD[i] = __syncthreads_and(predicate);
+}
+
+static void test_syncthreads_and(int blockSize) {
+  int nBytes = sizeof(int) * 2 * blockSize;
+  int *syncTestD, *syncTestH;
+  int *allThreadsZeroD, *allThreadsZeroH;
+  int *allThreadsOneD, *allThreadsOneH;
+  int *oneThreadZeroD, *oneThreadZeroH;
+  int *allThreadsMinusOneD, *allThreadsMinusOneH;
+
+  // Allocate device memory
+  HIP_CHECK(hipMalloc((void**)&syncTestD, nBytes));
+  HIP_CHECK(hipMalloc((void**)&allThreadsZeroD, nBytes));
+  HIP_CHECK(hipMalloc((void**)&allThreadsOneD, nBytes));
+  HIP_CHECK(hipMalloc((void**)&oneThreadZeroD, nBytes));
+  HIP_CHECK(hipMalloc((void**)&allThreadsMinusOneD, nBytes));
+
+  // Allocate host memory
+  HIP_CHECK(hipHostMalloc((void**)&syncTestH, nBytes));
+  HIP_CHECK(hipHostMalloc((void**)&allThreadsZeroH, nBytes));
+  HIP_CHECK(hipHostMalloc((void**)&allThreadsOneH, nBytes));
+  HIP_CHECK(hipHostMalloc((void**)&oneThreadZeroH, nBytes));
+  HIP_CHECK(hipHostMalloc((void**)&allThreadsMinusOneH, nBytes));
+
+  // Launch Kernel
+  hipLaunchKernelGGL(kernel_syncthreads_and, 2, blockSize, 0, 0, syncTestD, allThreadsZeroD,
+                     allThreadsOneD, oneThreadZeroD, allThreadsMinusOneD);
+
+  // Copy result from device to host
+  HIP_CHECK(hipMemcpy(syncTestH, syncTestD, nBytes, hipMemcpyDeviceToHost));
+  HIP_CHECK(hipMemcpy(allThreadsZeroH, allThreadsZeroD, nBytes, hipMemcpyDeviceToHost));
+  HIP_CHECK(hipMemcpy(allThreadsOneH, allThreadsOneD, nBytes, hipMemcpyDeviceToHost));
+  HIP_CHECK(hipMemcpy(oneThreadZeroH, oneThreadZeroD, nBytes, hipMemcpyDeviceToHost));
+  HIP_CHECK(hipMemcpy(allThreadsMinusOneH, allThreadsMinusOneD, nBytes, hipMemcpyDeviceToHost));
+
+  // Validate results for both blocks together
+  for (int i = 0; i < 2 * blockSize; ++i) {
+    REQUIRE(syncTestH[i] == 200);
+    REQUIRE(allThreadsZeroH[i] == 0);
+    REQUIRE(allThreadsOneH[i] == 1);
+    REQUIRE(oneThreadZeroH[i] == 0);
+    REQUIRE(allThreadsMinusOneH[i] == 1);
+  }
+
+  // Free device memory
+  HIP_CHECK(hipFree(syncTestD));
+  HIP_CHECK(hipFree(allThreadsZeroD));
+  HIP_CHECK(hipFree(allThreadsOneD));
+  HIP_CHECK(hipFree(oneThreadZeroD));
+  HIP_CHECK(hipFree(allThreadsMinusOneD));
+
+  // Free host memory
+  HIP_CHECK(hipHostFree(syncTestH));
+  HIP_CHECK(hipHostFree(allThreadsZeroH));
+  HIP_CHECK(hipHostFree(allThreadsOneH));
+  HIP_CHECK(hipHostFree(oneThreadZeroH));
+  HIP_CHECK(hipHostFree(allThreadsMinusOneH));
+}
+
+TEST_CASE("Unit_syncthreads_and") {
+  int blockSizes[] = {10, 40, 70, 130, 240, 723, 32, 64, 128, 256, 512, 1024};
+  for (unsigned long i = 0; i < (sizeof(blockSizes) / sizeof(blockSizes[0])); ++i)
+    test_syncthreads_and(blockSizes[i]);
+}
diff --git a/catch/unit/deviceLib/syncthreadscount.cc b/catch/unit/deviceLib/syncthreadscount.cc
new file mode 100644
index 0000000000..ec8140ccac
--- /dev/null
+++ b/catch/unit/deviceLib/syncthreadscount.cc
@@ -0,0 +1,138 @@
+/*
+Copyright (c) 2021 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_common.hh>
+#include <hip/hip_runtime.h>
+
+
+static __global__ void kernel_syncthreads_count(int* syncTestD, int* allThreadsZeroD,
+                                                int* allThreadsOneD, int* oddThreadsOneD,
+                                                int* allThreadsMinusOneD, int* allThreadsIdD) {
+  int blockSize = blockDim.x;
+  int predicate = 10;
+  // First block index starts with 0, and second block index starts
+  // with blockSize
+  int i = (blockIdx.x == 0) ? threadIdx.x : blockSize + threadIdx.x;
+
+  // At very first, we need to ensure work-group level syncronization
+  // properly happened, don't bother about predicate testing for now.
+  // Thread 0 and thread 1 writes to shared memory. After call to api,
+  // every thread reads shared memory, and store sum for verification
+  __shared__ int sm[2];
+  if (threadIdx.x == 0)
+    sm[0] = 10;
+  else if (threadIdx.x == 1)
+    sm[1] = 20;
+  __syncthreads_count(predicate);
+  syncTestD[i] = sm[0] + sm[1];
+
+  // All threads pass 0 as predicate value, result should be 0
+  predicate = 0;
+  allThreadsZeroD[i] = __syncthreads_count(predicate);
+
+  // All threads pass 1 as predicate value, result should be blockSize
+  predicate = 1;
+  allThreadsOneD[i] = __syncthreads_count(predicate);
+
+  // Odd numbered threads pass 1, and even numbered threads pass 0, as
+  // predicate value, result should be blockSize / 2
+  predicate = threadIdx.x % 2;
+  oddThreadsOneD[i] = __syncthreads_count(predicate);
+
+  // All threads pass -1 as predicate value, result should blockSize
+  predicate = -1;
+  allThreadsMinusOneD[i] = __syncthreads_count(predicate);
+
+  // Each thread pass its ID as predicate value, result should be blockSize - 1
+  predicate = threadIdx.x;
+  allThreadsIdD[i] = __syncthreads_count(predicate);
+}
+
+void test_syncthreads_count(int blockSize) {
+  int nBytes = sizeof(int) * 2 * blockSize;
+  int *syncTestD, *syncTestH;
+  int *allThreadsZeroD, *allThreadsZeroH;
+  int *allThreadsOneD, *allThreadsOneH;
+  int *oddThreadsOneD, *oddThreadsOneH;
+  int *allThreadsMinusOneD, *allThreadsMinusOneH;
+  int *allThreadsIdD, *allThreadsIdH;
+
+  // Allocate device memory
+  HIP_CHECK(hipMalloc((void**)&syncTestD, nBytes));
+  HIP_CHECK(hipMalloc((void**)&allThreadsZeroD, nBytes));
+  HIP_CHECK(hipMalloc((void**)&allThreadsOneD, nBytes));
+  HIP_CHECK(hipMalloc((void**)&oddThreadsOneD, nBytes));
+  HIP_CHECK(hipMalloc((void**)&allThreadsMinusOneD, nBytes));
+  HIP_CHECK(hipMalloc((void**)&allThreadsIdD, nBytes));
+
+  // Allocate host memory
+  HIP_CHECK(hipHostMalloc((void**)&syncTestH, nBytes));
+  HIP_CHECK(hipHostMalloc((void**)&allThreadsZeroH, nBytes));
+  HIP_CHECK(hipHostMalloc((void**)&allThreadsOneH, nBytes));
+  HIP_CHECK(hipHostMalloc((void**)&oddThreadsOneH, nBytes));
+  HIP_CHECK(hipHostMalloc((void**)&allThreadsMinusOneH, nBytes));
+  HIP_CHECK(hipHostMalloc((void**)&allThreadsIdH, nBytes));
+
+  // Launch Kernel
+  hipLaunchKernelGGL(kernel_syncthreads_count, 2, blockSize, 0, 0, syncTestD, allThreadsZeroD,
+                     allThreadsOneD, oddThreadsOneD, allThreadsMinusOneD, allThreadsIdD);
+
+  // Copy result from device to host
+  HIP_CHECK(hipMemcpy(syncTestH, syncTestD, nBytes, hipMemcpyDeviceToHost));
+  HIP_CHECK(hipMemcpy(allThreadsZeroH, allThreadsZeroD, nBytes, hipMemcpyDeviceToHost));
+  HIP_CHECK(hipMemcpy(allThreadsOneH, allThreadsOneD, nBytes, hipMemcpyDeviceToHost));
+  HIP_CHECK(hipMemcpy(oddThreadsOneH, oddThreadsOneD, nBytes, hipMemcpyDeviceToHost));
+  HIP_CHECK(hipMemcpy(allThreadsMinusOneH, allThreadsMinusOneD, nBytes, hipMemcpyDeviceToHost));
+  HIP_CHECK(hipMemcpy(allThreadsIdH, allThreadsIdD, nBytes, hipMemcpyDeviceToHost));
+
+  // Validate results for both the blocks together
+  for (int i = 0; i < 2 * blockSize; ++i) {
+    REQUIRE(syncTestH[i] == 30);
+    REQUIRE(allThreadsZeroH[i] == 0);
+    REQUIRE(allThreadsOneH[i] == blockSize);
+    REQUIRE(oddThreadsOneH[i] == blockSize / 2);
+    REQUIRE(allThreadsMinusOneH[i] == blockSize);
+    REQUIRE(allThreadsIdH[i] == (blockSize - 1));
+  }
+
+  // Free device memory
+  HIP_CHECK(hipFree(syncTestD));
+  HIP_CHECK(hipFree(allThreadsZeroD));
+  HIP_CHECK(hipFree(allThreadsOneD));
+  HIP_CHECK(hipFree(oddThreadsOneD));
+  HIP_CHECK(hipFree(allThreadsMinusOneD));
+  HIP_CHECK(hipFree(allThreadsIdD));
+
+  // Free host memory
+  HIP_CHECK(hipHostFree(syncTestH));
+  HIP_CHECK(hipHostFree(allThreadsZeroH));
+  HIP_CHECK(hipHostFree(allThreadsOneH));
+  HIP_CHECK(hipHostFree(oddThreadsOneH));
+  HIP_CHECK(hipHostFree(allThreadsMinusOneH));
+  HIP_CHECK(hipHostFree(allThreadsIdH));
+}
+
+TEST_CASE("Unit_syncthreads_count") {
+  int blockSizes[] = {10, 40, 70, 130, 240, 723, 32, 64, 128, 256, 512, 1024};
+  for (unsigned long i = 0; i < (sizeof(blockSizes) / sizeof(blockSizes[0])); ++i)
+    test_syncthreads_count(blockSizes[i]);
+}
diff --git a/catch/unit/deviceLib/syncthreadsor.cc b/catch/unit/deviceLib/syncthreadsor.cc
new file mode 100644
index 0000000000..5b67432357
--- /dev/null
+++ b/catch/unit/deviceLib/syncthreadsor.cc
@@ -0,0 +1,128 @@
+/*
+Copyright (c) 2021 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_common.hh>
+#include <hip/hip_runtime.h>
+
+
+static __global__ void kernel_syncthreads_or(int* syncTestD, int* allThreadsZeroD,
+                                             int* allThreadsOneD, int* oneThreadOneD,
+                                             int* allThreadsMinusOneD) {
+  int blockSize = blockDim.x;
+  int predicate = 10;
+  // First block index starts with 0, and second block index starts
+  // with blockSize
+  int i = (blockIdx.x == 0) ? threadIdx.x : blockSize + threadIdx.x;
+
+  // At very first, we need to ensure work-group level syncronization
+  // properly happened, don't bother about predicate testing for now.
+  // Thread 0 and thread 1 writes to shared memory. After call to api,
+  // every thread reads shared memory, and store subtraction for verification
+  __shared__ int sm[2];
+  if (threadIdx.x == 0)
+    sm[0] = 10;
+  else if (threadIdx.x == 1)
+    sm[1] = 20;
+  __syncthreads_or(predicate);
+  syncTestD[i] = sm[1] - sm[0];
+
+  // All threads pass 0 as predicate value, result should be 0
+  predicate = 0;
+  allThreadsZeroD[i] = __syncthreads_or(predicate);
+
+  // All threads pass 1 as predicate value, result should be 1
+  predicate = 1;
+  allThreadsOneD[i] = __syncthreads_or(predicate);
+
+  // Thread 0 pass 1, and all other threads 0 as predicate value,
+  // result should be 1
+  predicate = (threadIdx.x == 0) ? 1 : 0;
+  oneThreadOneD[i] = __syncthreads_or(predicate);
+
+  // All threads pass -1 as predicate value, result should be 1
+  predicate = -1;
+  allThreadsMinusOneD[i] = __syncthreads_or(predicate);
+}
+
+static void test_syncthreads_or(int blockSize) {
+  int nBytes = sizeof(int) * 2 * blockSize;
+  int *syncTestD, *syncTestH;
+  int *allThreadsZeroD, *allThreadsZeroH;
+  int *allThreadsOneD, *allThreadsOneH;
+  int *oneThreadOneD, *oneThreadOneH;
+  int *allThreadsMinusOneD, *allThreadsMinusOneH;
+
+  // Allocate device memory
+  HIP_CHECK(hipMalloc((void**)&syncTestD, nBytes));
+  HIP_CHECK(hipMalloc((void**)&allThreadsZeroD, nBytes));
+  HIP_CHECK(hipMalloc((void**)&allThreadsOneD, nBytes));
+  HIP_CHECK(hipMalloc((void**)&oneThreadOneD, nBytes));
+  HIP_CHECK(hipMalloc((void**)&allThreadsMinusOneD, nBytes));
+
+  // Allocate host memory
+  HIP_CHECK(hipHostMalloc((void**)&syncTestH, nBytes));
+  HIP_CHECK(hipHostMalloc((void**)&allThreadsZeroH, nBytes));
+  HIP_CHECK(hipHostMalloc((void**)&allThreadsOneH, nBytes));
+  HIP_CHECK(hipHostMalloc((void**)&oneThreadOneH, nBytes));
+  HIP_CHECK(hipHostMalloc((void**)&allThreadsMinusOneH, nBytes));
+
+  // Launch Kernel
+  hipLaunchKernelGGL(kernel_syncthreads_or, 2, blockSize, 0, 0, syncTestD, allThreadsZeroD,
+                     allThreadsOneD, oneThreadOneD, allThreadsMinusOneD);
+
+  // Copy result from device to host
+  HIP_CHECK(hipMemcpy(syncTestH, syncTestD, nBytes, hipMemcpyDeviceToHost));
+  HIP_CHECK(hipMemcpy(allThreadsZeroH, allThreadsZeroD, nBytes, hipMemcpyDeviceToHost));
+  HIP_CHECK(hipMemcpy(allThreadsOneH, allThreadsOneD, nBytes, hipMemcpyDeviceToHost));
+  HIP_CHECK(hipMemcpy(oneThreadOneH, oneThreadOneD, nBytes, hipMemcpyDeviceToHost));
+  HIP_CHECK(hipMemcpy(allThreadsMinusOneH, allThreadsMinusOneD, nBytes, hipMemcpyDeviceToHost));
+
+  // Validate results for both blocks together
+  for (int i = 0; i < 2 * blockSize; ++i) {
+    REQUIRE(syncTestH[i] == 10);
+    REQUIRE(allThreadsZeroH[i] == 0);
+    REQUIRE(allThreadsOneH[i] == 1);
+    REQUIRE(oneThreadOneH[i] == 1);
+    REQUIRE(allThreadsMinusOneH[i] == 1);
+  }
+
+  // Free device memory
+  HIP_CHECK(hipFree(syncTestD));
+  HIP_CHECK(hipFree(allThreadsZeroD));
+  HIP_CHECK(hipFree(allThreadsOneD));
+  HIP_CHECK(hipFree(oneThreadOneD));
+  HIP_CHECK(hipFree(allThreadsMinusOneD));
+
+  // Free host memory
+  HIP_CHECK(hipHostFree(syncTestH));
+  HIP_CHECK(hipHostFree(allThreadsZeroH));
+  HIP_CHECK(hipHostFree(allThreadsOneH));
+  HIP_CHECK(hipHostFree(oneThreadOneH));
+  HIP_CHECK(hipHostFree(allThreadsMinusOneH));
+}
+
+
+TEST_CASE("Unit_syncthreads_or") {
+  int blockSizes[] = {10, 40, 70, 130, 240, 723, 32, 64, 128, 256, 512, 1024};
+  for (unsigned long i = 0; i < (sizeof(blockSizes) / sizeof(blockSizes[0])); ++i)
+    test_syncthreads_or(blockSizes[i]);
+}
diff --git a/catch/unit/deviceLib/threadfence_system.cc b/catch/unit/deviceLib/threadfence_system.cc
new file mode 100644
index 0000000000..a90c511ed7
--- /dev/null
+++ b/catch/unit/deviceLib/threadfence_system.cc
@@ -0,0 +1,112 @@
+/*
+Copyright (c) 2021 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_common.hh>
+#include <hip/hip_runtime.h>
+
+#include <atomic>
+#include <thread>
+
+__host__ __device__ void fence_system() {
+#ifdef __HIP_DEVICE_COMPILE__
+  __threadfence_system();
+#else
+  std::atomic_thread_fence(std::memory_order_seq_cst);
+#endif
+}
+
+__host__ __device__ void round_robin(const int id, const int num_dev, const int num_iter,
+                                     volatile int* data, volatile int* flag) {
+  for (int i = 0; i < num_iter; i++) {
+    while (*flag % num_dev != id) fence_system();  // invalid the cache for read
+
+    (*data)++;
+    fence_system();  // make sure the store to data is sequenced before the store to flag
+    (*flag)++;
+    fence_system();  // invalid the cache to flush out flag
+  }
+}
+
+__global__ void gpu_round_robin(const int id, const int num_dev, const int num_iter,
+                                volatile int* data, volatile int* flag) {
+  round_robin(id, num_dev, num_iter, data, flag);
+}
+
+TEST_CASE("Unit_threadfence_system") {
+  int num_gpus = 0;
+  HIP_CHECK(hipGetDeviceCount(&num_gpus));
+  REQUIRE(num_gpus > 0);
+
+  volatile int* data;
+  if (hipHostMalloc(&data, sizeof(int), hipHostMallocCoherent) != hipSuccess) {
+    SUCCEED("Memory allocation failed. Skip test. Is SVM atomic supported?");
+  }
+
+  constexpr int init_data = 1000;
+  *data = init_data;
+
+  volatile int* flag;
+  if (hipHostMalloc(&flag, sizeof(int), hipHostMallocCoherent) != hipSuccess) {
+    SUCCEED("Memory allocation failed. Skip test. Is SVM atomic supported?");
+  }
+  *flag = 0;
+
+  // number of rounds per device
+  constexpr int num_iter = 1000;
+
+  // one CPU thread + 1 kernel/GPU
+  const int num_dev = num_gpus + 1;
+
+  int next_id = 0;
+  std::vector<std::thread> threads;
+
+  // create a CPU thread for the round_robin
+  threads.push_back(std::thread(round_robin, next_id++, num_dev, num_iter, data, flag));
+
+  // run one thread per GPU
+  dim3 dim_block(1, 1, 1);
+  dim3 dim_grid(1, 1, 1);
+
+  // launch one kernel per device for the round robin
+  for (; next_id < num_dev; ++next_id) {
+    threads.push_back(std::thread([=]() {
+      HIP_CHECK(hipSetDevice(next_id - 1));
+      hipLaunchKernelGGL(gpu_round_robin, dim_grid, dim_block, 0, 0x0, next_id, num_dev, num_iter,
+                         data, flag);
+      HIP_CHECK(hipDeviceSynchronize());
+    }));
+  }
+
+  for (auto& t : threads) {
+    t.join();
+  }
+
+  int expected_data = init_data + num_dev * num_iter;
+  int expected_flag = num_dev * num_iter;
+
+  bool passed = *data == expected_data && *flag == expected_flag;
+
+  HIP_CHECK(hipHostFree((void*)data));
+  HIP_CHECK(hipHostFree((void*)flag));
+
+  REQUIRE(passed == true);
+}