SWDEV-388833 - [catch2][dtest] Sync tests migrated from dtests to catch2

Change-Id: I73664579ab9475e51db436b57ca6538aa7a8d5d7 [ROCm/hip-tests commit: 902c993615]
2023-11-15 14:55:06 +05:30
@@ -54,3 +54,4 @@ add_subdirectory(clock)
 add_subdirectory(vulkan_interop)
 add_subdirectory(gl_interop) # Disabled on NVIDIA due to defect - EXSWHTEC-246
 endif()
+add_subdirectory(synchronization)
@@ -0,0 +1,25 @@
+# Common Tests - Test independent of all platforms
+set(TEST_SRC
+    copy_coherency.cc
+)
+add_custom_target(memcpyInt.hsaco COMMAND ${CMAKE_CXX_COMPILER} --genco ${OFFLOAD_ARCH_STR}
+                  ${CMAKE_CURRENT_SOURCE_DIR}/memcpyIntDevice.cpp -o
+                  ${CMAKE_CURRENT_BINARY_DIR}/../synchronization/memcpyInt.hsaco -I
+                  ${CMAKE_CURRENT_SOURCE_DIR}/../../../../include/ -I
+                  ${CMAKE_CURRENT_SOURCE_DIR}/../../include -L
+                  ${HIP_PATH}/${CMAKE_INSTALL_LIBDIR}/../../include --rocm-path=${ROCM_PATH})
+# only for AMD
+if(HIP_PLATFORM MATCHES "amd")
+  set(AMD_SRC
+    cache_coherency_cpu_gpu.cc
+    cache_coherency_gpu_gpu.cc
+  )
+  set(TEST_SRC ${TEST_SRC} ${AMD_SRC})
+endif()
+
+hip_add_exe_to_target(NAME synchronizationTests
+                      TEST_SRC ${TEST_SRC}
+                      TEST_TARGET_NAME build_tests
+                      COMPILE_OPTIONS -std=c++14)
+add_dependencies(synchronizationTests memcpyInt.hsaco)
+
@@ -0,0 +1,282 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+// Simple test for Fine Grained CPU-GPU coherency.
+
+#include <hip_test_kernels.hh>
+#include <hip_test_common.hh>
+
+typedef _Atomic(unsigned int) atomic_uint;
+
+// Helper function to spin on address until address equals value.
+// If the address holds the value of -1, abort because the other thread failed.
+__device__ int
+gpu_spin_loop_or_abort_on_negative_one(unsigned int* address,
+                                       unsigned int value) {
+  unsigned int compare;
+  bool check = false;
+  do {
+    compare = value;
+    check = __opencl_atomic_compare_exchange_strong(
+      reinterpret_cast<atomic_uint*>(address), /*expected=*/ &compare,
+       /*desired=*/ value, __ATOMIC_ACQUIRE, __ATOMIC_ACQUIRE,
+      /*scope=*/ __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES);
+    if (compare == -1)
+      return -1;
+  } while (!check);
+  return 0;
+}
+
+// This kernel requires a single block, single thread dispatch.
+__global__ void
+gpu_kernel(int *A, int *B, int *X, int *Y, size_t N,
+           unsigned int *AA1, unsigned int *AA2,
+           unsigned int *BA1, unsigned int *BA2, unsigned int *dresult) {
+  for (size_t i = 0; i < N; i++) {
+    // Store data into A, system fence, and atomically mark flag.
+    // This guarantees this global write is visible by device 1.
+    A[i] = X[i];
+    __opencl_atomic_fetch_add(reinterpret_cast<atomic_uint*>(AA1), 1,
+                      __ATOMIC_RELEASE, __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES);
+    // Wait on device 1's global write to B.
+    if (gpu_spin_loop_or_abort_on_negative_one(BA1, i+1) == -1) {
+      *dresult = -1;
+      break;
+    }
+
+    // Check device 1 properly stored Y into B.
+    bool stored_data_matches = (B[i] == Y[i]);
+    if (!stored_data_matches) {
+      // If the data does not match, alert other thread and abort.
+      printf("FAIL: at i=%zu, B[i]=%d, which does not match Y[i]=%d.\n",
+             i, B[i], Y[i]);
+      __opencl_atomic_exchange(reinterpret_cast<atomic_uint*>(AA2), -1,
+                    __ATOMIC_RELEASE, __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES);
+      *dresult = -1;
+    }
+    // Otherwise tell the other thread to continue.
+    __opencl_atomic_fetch_add(reinterpret_cast<atomic_uint*>(AA2), 1,
+                    __ATOMIC_RELEASE, __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES);
+    // Wait on kernel gpu_cache1 to finish checking X is stored in A.
+    if (gpu_spin_loop_or_abort_on_negative_one(BA2, i+1) == -1) {
+      *dresult = -1;
+      break;
+    }
+  }
+  *dresult = 0;
+}
+
+__host__ int
+cpu_spin_loop_or_abort_on_negative_one(unsigned int* address,
+                                       unsigned int value) {
+  unsigned int compare;
+  bool check = false;
+  do {
+    compare = value;
+    check = __atomic_compare_exchange_n(
+      address, /*expected=*/ &compare, /*desired=*/ value,
+      /*weak=*/ false, __ATOMIC_ACQUIRE, __ATOMIC_ACQUIRE);
+    if (compare == -1)
+      return -1;
+  } while (!check);
+  return 0;
+}
+
+// This host thread runs only on a single CPU thread.
+__host__ void
+cpu_thread(int *A, int *B, int *X, int *Y, size_t N,
+           unsigned int *AA1, unsigned int *AA2,
+           unsigned int *BA1, unsigned int *BA2, unsigned int *hresult) {
+  for (size_t i = 0; i < N; i++) {
+    B[i] = Y[i];
+    __atomic_fetch_add(BA1, 1, __ATOMIC_RELEASE);
+    if (cpu_spin_loop_or_abort_on_negative_one(AA1, i+1) == -1) {
+      *hresult = -1;
+      break;
+    }
+
+    bool stored_data_matches = (A[i] == X[i]);
+    if (!stored_data_matches) {
+      printf("FAIL: at i=%zu, A[i]=%d, which does not match X[i]=%d.\n",
+             i, A[i], X[i]);
+      __atomic_exchange_n(BA2, -1, __ATOMIC_RELEASE);
+      *hresult = -1;
+      break;
+    }
+    __atomic_fetch_add(BA2, 1, __ATOMIC_RELEASE);
+    if (cpu_spin_loop_or_abort_on_negative_one(AA2, i+1) == -1) {
+      *hresult = -1;
+      break;
+    }
+  }
+  *hresult = 0;
+}
+
+static bool cpu_to_gpu_coherency() {
+  int *A_d, *B_d, *X_d, *Y_d;
+  int *A_res, *A_h, *B_h, *X_h, *Y_h;
+  unsigned int hresult, dresult;
+  size_t N = 1024;
+  size_t Nbytes = N * sizeof(int);
+  int numDevices = 0;
+
+  HIP_CHECK(hipGetDeviceCount(&numDevices));
+  if (numDevices < 1) {
+    HipTest::HIP_SKIP_TEST("Skipping because devices < 1");
+    return 0;
+  }
+
+  // Skip this test if feature is not supported.
+  static int device0 = 0;
+  hipDeviceProp_t props;
+  HIP_CHECK(hipGetDeviceProperties(&props, device0));
+  if (strncmp(props.gcnArchName, "gfx90a", 6) != 0 &&
+      strncmp(props.gcnArchName, "gfx940", 6) != 0) {
+    printf("info: skipping test on devices other than gfx90a and gfx940.\n");
+    return true;
+  }
+
+  // Allocate Host Side Memory. Coherent Fine-grained Memory for array B.
+  printf("info: allocate host mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
+  HIP_CHECK(hipHostMalloc(&B_h, Nbytes,
+                         (hipHostMallocCoherent | hipHostMallocMapped)));
+  HIP_CHECK(hipHostGetDevicePointer(reinterpret_cast<void**>(&B_d), B_h, 0));
+  X_h = reinterpret_cast<int*>(malloc(Nbytes));
+  HIP_CHECK(X_h == 0 ? hipErrorOutOfMemory : hipSuccess);
+  Y_h = reinterpret_cast<int*>(malloc(Nbytes));
+  HIP_CHECK(Y_h == 0 ? hipErrorOutOfMemory : hipSuccess);
+
+  // Initialize the arrays and atomic variables.
+  for (size_t i = 0; i < N; i++) {
+    X_h[i] = 100000000 + i;
+    Y_h[i] = 300000000 + i;
+  }
+
+  // Initialize shared atomic flags between CPU and GPU.
+  unsigned int *AA1_h, *AA2_h, *BA1_h, *BA2_h;
+  unsigned int *AA1_d, *AA2_d, *BA1_d, *BA2_d;
+  HIP_CHECK(hipHostMalloc(&AA1_h, sizeof(unsigned int), hipHostMallocCoherent));
+  HIP_CHECK(hipHostGetDevicePointer(reinterpret_cast<void**>(&AA1_d),
+                                    AA1_h, 0));
+  *AA1_h = 0;
+  HIP_CHECK(hipHostMalloc(&AA2_h, sizeof(unsigned int), hipHostMallocCoherent));
+  HIP_CHECK(hipHostGetDevicePointer(reinterpret_cast<void**>(&AA2_d),
+                                    AA2_h, 0));
+  *AA2_h = 0;
+  HIP_CHECK(hipHostMalloc(&BA1_h, sizeof(unsigned int), hipHostMallocCoherent));
+  HIP_CHECK(hipHostGetDevicePointer(reinterpret_cast<void**>(&BA1_d),
+                                    BA1_h, 0));
+  *BA1_h = 0;
+  HIP_CHECK(hipHostMalloc(&BA2_h, sizeof(unsigned int), hipHostMallocCoherent));
+  HIP_CHECK(hipHostGetDevicePointer(reinterpret_cast<void**>(&BA2_d),
+                                    BA2_h, 0));
+  *BA2_h = 0;
+
+  // Skip the first stream, ensure stream is non-blocking.
+  hipStream_t stream[2];
+  HIP_CHECK(hipStreamCreate(&stream[0]));
+  HIP_CHECK(hipSetDevice(0));
+  HIP_CHECK(hipStreamCreateWithFlags(&stream[1], hipStreamNonBlocking));
+
+  // Allocate Device Side Memory. Coherent Fine-grained Memory for array A.
+  printf("info: allocate device 0 mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
+  hipError_t status = hipExtMallocWithFlags(reinterpret_cast<void**>(&A_d),
+                                           Nbytes, hipDeviceMallocFinegrained);
+  REQUIRE(status == hipSuccess);
+  // SVM memory - host pointer is the same as device pointer to array A.
+  A_h = A_d;
+  HIP_CHECK(hipMalloc(&X_d, Nbytes));
+  HIP_CHECK(hipMalloc(&Y_d, Nbytes));
+
+  HIP_CHECK(hipMemcpy(X_d, X_h, Nbytes, hipMemcpyHostToDevice));
+  HIP_CHECK(hipMemcpy(Y_d, Y_h, Nbytes, hipMemcpyHostToDevice));
+
+  // Launch the GPU kernel.
+  const unsigned blocks = 1;
+  const unsigned threadsPerBlock = 1;
+  hipLaunchKernelGGL(gpu_kernel, dim3(blocks), dim3(threadsPerBlock),
+                     0, stream[1],
+                     A_d, B_d, X_d, Y_d, N,
+                     AA1_d, AA2_d, BA1_d, BA2_d, &dresult);
+  // Check if launch failed.
+  HIP_CHECK(hipGetLastError());
+  REQUIRE(dresult == 0);
+
+  // Do not sync the launched stream, instead run the cpu_thread.
+  std::thread host_thread(cpu_thread,
+                          A_h, B_h, X_h, Y_h, N,
+                          AA1_h, AA2_h, BA1_h, BA2_h, &hresult);
+  host_thread.detach();
+  REQUIRE(hresult == 0);
+  // Wait for Device side to finish.
+  HIP_CHECK(hipStreamSynchronize(stream[1]));
+
+  // Evaluate the resultant arrays A and B.
+  A_res = reinterpret_cast<int*>(malloc(Nbytes));
+  HIP_CHECK(A_res == 0 ? hipErrorOutOfMemory : hipSuccess);
+  HIP_CHECK(hipMemcpy(A_res, A_d, Nbytes, hipMemcpyDeviceToHost));
+
+  for (size_t i = 0; i < N; i++)  {
+    REQUIRE(A_res[i] == (100000000 + i));
+    REQUIRE(B_h[i] == (300000000 + i));
+  }
+
+  // Free all the device and host memory allocated.
+  HIP_CHECK(hipFree(A_d));
+  HIP_CHECK(hipFree(X_d));
+  HIP_CHECK(hipFree(Y_d));
+  HIP_CHECK(hipHostFree(AA1_h));
+  HIP_CHECK(hipHostFree(AA2_h));
+  HIP_CHECK(hipHostFree(BA1_h));
+  HIP_CHECK(hipHostFree(BA2_h));
+  HIP_CHECK(hipHostFree(B_h));
+  free(X_h);
+  free(Y_h);
+  free(A_res);
+
+  return true;
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *    - This test runs on devices where XGMI enables fine-grained communication
+ * between GPUs. This performs a message passing test.
+ * Array A is allocated on Device 0, and remotely on host.
+ * Device 0 also increments atomic ints AA1 and AA2.
+ * Array B is allocated on host, and remotely on Device 0.
+ * Host also increments atomic ints BA1 and BA2.
+ * Kernel will launch on Device 0, and store array X into array A.
+ * Host Thread will store array Y into array B.
+ * Kernel will validate that the correct values of array Y are stored in B.
+ * Host Thread will validate that the correct values of array X are stored in A.
+
+ * Test source
+ * ------------------------
+ *    - catch/unit/synchronization/cache_coherency_cpu_gpu.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.5
+ *    - Test to be run only on AMD.
+ */
+
+TEST_CASE("Unit_cache_coherency_cpu_gpu") {
+  bool passed = true;
+  // Coherency between CPU and GPU sharing host and device memory.
+  REQUIRE(passed == cpu_to_gpu_coherency());
+}
@@ -0,0 +1,294 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+// Simple test for Fine Grained GPU-GPU coherency.
+
+#include <hip_test_kernels.hh>
+#include <hip_test_common.hh>
+
+typedef _Atomic(unsigned int) atomic_uint;
+
+// Helper function to spin on address until address equals value.
+// If the address holds the value of -1, abort because the other thread failed.
+__device__ int
+gpu_spin_loop_or_abort_on_negative_one(unsigned int* address,
+                                       unsigned int value) {
+  unsigned int compare;
+  bool check = false;
+  do {
+    compare = value;
+    check = __opencl_atomic_compare_exchange_strong(
+      reinterpret_cast<atomic_uint*>(address), /*expected=*/ &compare,
+       /*desired=*/ value, __ATOMIC_ACQUIRE, __ATOMIC_ACQUIRE,
+      /*scope=*/ __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES);
+    if (compare == -1)
+      return -1;
+  } while (!check);
+  return 0;
+}
+
+// This kernel requires a single block, single thread dispatch.
+__global__ void
+gpu_cache0(int *A, int *B, int *X, int *Y, size_t N,
+           unsigned int *AA1, unsigned int *AA2,
+           unsigned int *BA1, unsigned int *BA2, unsigned int *cache0_result) {
+  for (size_t i = 0; i < N; i++) {
+    // Store data into A, system fence, and atomically mark flag.
+    // This guarantees this global write is visible by device 1.
+    A[i] = X[i];
+    __opencl_atomic_fetch_add(reinterpret_cast<atomic_uint*>(AA1), 1,
+                    __ATOMIC_RELEASE, __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES);
+    // Wait on device 1's global write to B.
+    if (gpu_spin_loop_or_abort_on_negative_one(BA1, i+1) == -1) {
+      *cache0_result = -1;
+      break;
+    }
+
+    // Check device 1 properly stored Y into B.
+    bool stored_data_matches = (B[i] == Y[i]);
+    if (!stored_data_matches) {
+      // If the data does not match, alert other thread and abort.
+      printf("FAIL: at i=%zu, B[i]=%d, which does not match Y[i]=%d.\n",
+             i, B[i], Y[i]);
+      __opencl_atomic_exchange(reinterpret_cast<atomic_uint*>(AA2), -1,
+                    __ATOMIC_RELEASE, __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES);
+      *cache0_result = -1;
+    }
+    // Otherwise tell the other thread to continue.
+    __opencl_atomic_fetch_add(reinterpret_cast<atomic_uint*>(AA2), 1,
+                    __ATOMIC_RELEASE, __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES);
+    // Wait on kernel gpu_cache1 to finish checking X is stored in A.
+    if (gpu_spin_loop_or_abort_on_negative_one(BA2, i+1) == -1) {
+      *cache0_result = -1;
+      break;
+    }
+  }
+  *cache0_result = 0;
+}
+
+// This kernel requires a single block, single thread dispatch.
+__global__ void
+gpu_cache1(int *A, int *B, int *X, int *Y, size_t N,
+           unsigned int *AA1, unsigned int *AA2,
+           unsigned int *BA1, unsigned int *BA2, unsigned int *cache1_result) {
+  for (size_t i = 0; i < N; i++) {
+    B[i] = Y[i];
+    __opencl_atomic_fetch_add(reinterpret_cast<atomic_uint*>(BA1), 1,
+                __ATOMIC_RELEASE, __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES);
+    if (gpu_spin_loop_or_abort_on_negative_one(AA1, i+1) == -1) {
+      *cache1_result = -1;
+      break;
+    }
+
+    bool stored_data_matches = (A[i] == X[i]);
+    if (!stored_data_matches) {
+      printf("FAIL: at i=%zu, A[i]=%d, which does not match X[i]=%d.\n",
+             i, A[i], X[i]);
+      __opencl_atomic_exchange(reinterpret_cast<atomic_uint*>(BA2), -1,
+                    __ATOMIC_RELEASE, __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES);
+      *cache1_result = -1;
+    }
+    __opencl_atomic_fetch_add(reinterpret_cast<atomic_uint*>(BA2), 1,
+                    __ATOMIC_RELEASE, __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES);
+    if (gpu_spin_loop_or_abort_on_negative_one(AA2, i+1) == -1) {
+      *cache1_result = -1;
+      break;
+    }
+  }
+  *cache1_result = 0;
+}
+
+static bool gpu_to_gpu_coherency() {
+  int *A_d, *B_d, *X_d0, *X_d1, *Y_d0, *Y_d1;
+  int *A_h, *B_h, *X_h, *Y_h;
+  unsigned int cache0_result, cache1_result;
+  size_t N = 1024;
+  size_t Nbytes = N * sizeof(int);
+  int numDevices = 0;
+  int numTestDevices = 2;
+
+  HIP_CHECK(hipGetDeviceCount(&numDevices));
+  if (numDevices < numTestDevices) {
+    HipTest::HIP_SKIP_TEST("Skipping because devices < 2");
+    return 0;
+  }
+
+  // Skip this test if either device does not support this feature.
+  hipDeviceProp_t props0, props1;
+  HIP_CHECK(hipGetDeviceProperties(&props0, 0));
+  HIP_CHECK(hipGetDeviceProperties(&props1, 1));
+  if ((strncmp(props0.gcnArchName, "gfx90a", 6) != 0 ||
+       strncmp(props1.gcnArchName, "gfx90a", 6) != 0) &&
+      (strncmp(props0.gcnArchName, "gfx940", 6) != 0 ||
+       strncmp(props1.gcnArchName, "gfx940", 6) != 0)) {
+    printf("info: skipping test on devices other than gfx90a and gfx940.\n");
+    return true;
+  }
+
+  // Allocate Host Side Memory.
+  printf("info: allocate host mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
+  A_h = reinterpret_cast<int*>(malloc(Nbytes));
+  HIP_CHECK(A_h == 0 ? hipErrorOutOfMemory : hipSuccess);
+  B_h = reinterpret_cast<int*>(malloc(Nbytes));
+  HIP_CHECK(B_h == 0 ? hipErrorOutOfMemory : hipSuccess);
+  X_h = reinterpret_cast<int*>(malloc(Nbytes));
+  HIP_CHECK(X_h == 0 ? hipErrorOutOfMemory : hipSuccess);
+  Y_h = reinterpret_cast<int*>(malloc(Nbytes));
+  HIP_CHECK(Y_h == 0 ? hipErrorOutOfMemory : hipSuccess);
+
+  // Initialize the arrays and atomic variables.
+  for (size_t i = 0; i < N; i++) {
+    X_h[i] = 100000000 + i;
+    Y_h[i] = 300000000 + i;
+  }
+
+  // Initialize shared atomic flags on host coherent memory.
+  unsigned int *AA1_h, *AA2_h, *BA1_h, *BA2_h;
+  unsigned int *AA1_d, *AA2_d, *BA1_d, *BA2_d;
+  HIP_CHECK(hipHostMalloc(&AA1_h, sizeof(unsigned int), hipHostMallocCoherent));
+  HIP_CHECK(hipHostGetDevicePointer(reinterpret_cast<void**>(&AA1_d),
+                                     AA1_h, 0));
+  *AA1_h = 0;
+  HIP_CHECK(hipHostMalloc(&AA2_h, sizeof(unsigned int), hipHostMallocCoherent));
+  HIP_CHECK(hipHostGetDevicePointer(reinterpret_cast<void**>(&AA2_d),
+                                     AA2_h, 0));
+  *AA2_h = 0;
+  HIP_CHECK(hipHostMalloc(&BA1_h, sizeof(unsigned int), hipHostMallocCoherent));
+  HIP_CHECK(hipHostGetDevicePointer(reinterpret_cast<void**>(&BA1_d),
+                                     BA1_h, 0));
+  *BA1_h = 0;
+  HIP_CHECK(hipHostMalloc(&BA2_h, sizeof(unsigned int), hipHostMallocCoherent));
+  HIP_CHECK(hipHostGetDevicePointer(reinterpret_cast<void**>(&BA2_d),
+                                     BA2_h, 0));
+  *BA2_h = 0;
+
+  // Skip the first stream.
+  hipStream_t stream[3];
+  HIP_CHECK(hipStreamCreate(&stream[0]));
+
+  // Set-up Device 0.
+  HIP_CHECK(hipSetDevice(0));
+  // Enable P2P access to Device 1.
+  HIP_CHECK(hipDeviceEnablePeerAccess(1, 0));
+  HIP_CHECK(hipStreamCreateWithFlags(&stream[1], hipStreamNonBlocking));
+  // Allocating Coherent Memory for Array A_d on Device 0.
+  printf("info: allocate device 0 mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
+  hipError_t status = hipExtMallocWithFlags(reinterpret_cast<void**>(&A_d),
+                                           Nbytes, hipDeviceMallocFinegrained);
+  REQUIRE(status == hipSuccess);
+  HIP_CHECK(hipMalloc(&X_d0, Nbytes));
+  HIP_CHECK(hipMalloc(&Y_d0, Nbytes));
+
+  // Set-up Device 1.
+  HIP_CHECK(hipSetDevice(1));
+  // Enable P2P access to Device 0.
+  HIP_CHECK(hipDeviceEnablePeerAccess(0, 0));
+  HIP_CHECK(hipStreamCreateWithFlags(&stream[2], hipStreamNonBlocking));
+  // Allocating Coherent Memory for Array B_d on Device 1.
+  printf("info: allocate device 1 mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
+  status = hipExtMallocWithFlags(reinterpret_cast<void**>(&B_d),
+                                 Nbytes, hipDeviceMallocFinegrained);
+  REQUIRE(status == hipSuccess);
+  HIP_CHECK(hipMalloc(&X_d1, Nbytes));
+  HIP_CHECK(hipMalloc(&Y_d1, Nbytes));
+
+  // Transfer initialized data onto the device arrays.
+  HIP_CHECK(hipMemcpy(X_d0, X_h, Nbytes, hipMemcpyHostToDevice));
+  HIP_CHECK(hipMemcpy(X_d1, X_h, Nbytes, hipMemcpyHostToDevice));
+  HIP_CHECK(hipMemcpy(Y_d0, Y_h, Nbytes, hipMemcpyHostToDevice));
+  HIP_CHECK(hipMemcpy(Y_d1, Y_h, Nbytes, hipMemcpyHostToDevice));
+
+  // Prepare and launch the device kernels.
+  const unsigned blocks = 1;
+  const unsigned threadsPerBlock = 1;
+  HIP_CHECK(hipSetDevice(0));
+  hipLaunchKernelGGL(gpu_cache0, dim3(blocks), dim3(threadsPerBlock),
+                     0, stream[1],
+                     A_d, B_d, X_d0, Y_d0, N,
+                     AA1_d, AA2_d, BA1_d, BA2_d, &cache0_result);
+  // Check if launch failed.
+  HIP_CHECK(hipGetLastError());
+  REQUIRE(cache0_result == 0);
+  HIP_CHECK(hipSetDevice(1));
+  hipLaunchKernelGGL(gpu_cache1, dim3(blocks), dim3(threadsPerBlock),
+                     0, stream[2],
+                     A_d, B_d, X_d1, Y_d1, N,
+                     AA1_d, AA2_d, BA1_d, BA2_d, &cache1_result);
+  HIP_CHECK(hipGetLastError());
+  REQUIRE(cache1_result == 0);
+
+  // Wait for kernels on both devices.
+  HIP_CHECK(hipStreamSynchronize(stream[1]));
+  HIP_CHECK(hipStreamSynchronize(stream[2]));
+
+  // Evaluate the resultant arrays A and B.
+  HIP_CHECK(hipMemcpy(A_h, A_d, Nbytes, hipMemcpyDeviceToHost));
+  HIP_CHECK(hipMemcpy(B_h, B_d, Nbytes, hipMemcpyDeviceToHost));
+
+  for (size_t i = 0; i < N; i++)  {
+    REQUIRE(A_h[i] == (100000000 + i));
+    REQUIRE(B_h[i] == (300000000 + i));
+  }
+
+  // Free all the device and host memory allocated.
+  HIP_CHECK(hipFree(A_d));
+  HIP_CHECK(hipFree(B_d));
+  HIP_CHECK(hipFree(X_d0));
+  HIP_CHECK(hipFree(Y_d0));
+  HIP_CHECK(hipFree(X_d1));
+  HIP_CHECK(hipFree(Y_d1));
+  HIP_CHECK(hipHostFree(AA1_h));
+  HIP_CHECK(hipHostFree(AA2_h));
+  HIP_CHECK(hipHostFree(BA1_h));
+  HIP_CHECK(hipHostFree(BA2_h));
+  free(A_h);
+  free(B_h);
+  free(X_h);
+  free(Y_h);
+
+  return true;
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *    - This test runs on devices where XGMI enables fine-grained communication
+ * between GPUs. This performs a message passing test.
+ * Array A is allocated on Device 0, and remotely on Device 1.
+ * Device 0 also increments atomic ints AA1 and AA2.
+ * Array B is allocated on Device 1, and remotely on Device 0.
+ * Device 1 also increments atomic ints BA1 and BA2.
+ * Kernel 0 will launch on Device 0, and store array X into array A.
+ * Kernel 1 will launch on Device 1, and store array Y into array B.
+ * Kernel 0 will validate that the correct values of array Y are stored in B.
+ * Kernel 1 will validate that the correct values of array X are stored in A.
+
+ * Test source
+ * ------------------------
+ *    - catch/unit/synchronization/cache_coherency_gpu_gpu.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.5
+ *    - Test to be run only on AMD.
+ */
+
+TEST_CASE("Unit_cache_coherency_gpu_gpu") {
+  bool passed = true;
+  // Coherency between GPUs accessing local or remote FB.
+  REQUIRE(passed == gpu_to_gpu_coherency());
+}
@@ -0,0 +1,340 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_kernels.hh>
+#include <hip_test_common.hh>
+
+unsigned threadsPerBlock = 256;
+unsigned blocksPerCU = 6;
+
+class MemcpyFunction {
+ public:
+    MemcpyFunction(const char* fileName, const char* functionName) {
+      load(fileName, functionName);
+    }
+    void load(const char* fileName, const char* functionName);
+    void launch(int* dst, const int* src, size_t numElements, hipStream_t s);
+
+ private:
+    hipFunction_t _function;
+    hipModule_t _module;
+};
+
+
+void MemcpyFunction::load(const char* fileName, const char* functionName) {
+    HIP_CHECK(hipModuleLoad(&_module, fileName));
+    HIP_CHECK(hipModuleGetFunction(&_function, _module, functionName));
+}
+
+void MemcpyFunction::launch(int* dst, const int* src, size_t numElements, hipStream_t s) { // NOLINT
+  struct {
+    int* _dst;
+    const int* _src;
+    size_t _numElements;
+  } args;
+
+  args._dst = dst;
+  args._src = src;
+  args._numElements = numElements;
+
+  size_t size = sizeof(args);
+  void* config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, &args,
+                    HIP_LAUNCH_PARAM_BUFFER_SIZE, &size, HIP_LAUNCH_PARAM_END};
+  unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock,
+                    numElements);
+  HIP_CHECK(hipModuleLaunchKernel(_function, blocks, 1, 1, threadsPerBlock,
+            1, 1, 0, s, NULL,
+            reinterpret_cast<void**>(&config)));
+}
+
+bool g_warnOnFail = true;
+int g_elementSizes[] = {128 * 1000, 256 * 1000, 16 * 1000 * 1000};
+
+// Set value of array to specified 32-bit integer:
+__global__ void memsetIntKernel(int* ptr, const int val, size_t numElements) {
+  int gid = (blockIdx.x * blockDim.x + threadIdx.x);
+  int stride = blockDim.x * gridDim.x;
+  for (size_t i = gid; i < numElements; i += stride) {
+    ptr[i] = val;
+  }
+}
+
+__global__ void memcpyIntKernel(int* dst, const int* src, size_t numElements) {
+  int gid = (blockIdx.x * blockDim.x + threadIdx.x);
+  int stride = blockDim.x * gridDim.x;
+  for (size_t i = gid; i < numElements; i += stride) {
+      dst[i] = src[i];
+  }
+}
+
+// Check arrays in reverse order, to more easily detect cases where
+// the copy is "partially" done.
+void checkReverse(const int* ptr, int numElements, int expected) {
+  int mismatchCnt = 0;
+  for (int i = numElements - 1; i >= 0; i--) {
+    if (!g_warnOnFail) {
+      REQUIRE(ptr[i] == expected);
+    }
+    if (++mismatchCnt >= 10) {
+        break;
+    }
+  }
+}
+
+#define ENUM_CASE_STR(x)                                                      \
+    case x:                                                                   \
+        return #x
+
+enum CmdType { COPY, KERNEL, MODULE_KERNEL, MAX_CmdType };
+
+const char* CmdTypeStr(CmdType c) {
+    switch (c) {
+        ENUM_CASE_STR(COPY);
+        ENUM_CASE_STR(KERNEL);
+        ENUM_CASE_STR(MODULE_KERNEL);
+        default:
+            return "UNKNOWN";
+    }
+}
+
+enum SyncType {
+  NONE,
+  EVENT_QUERY,
+  EVENT_SYNC,
+  STREAM_WAIT_EVENT,
+  STREAM_QUERY,
+  STREAM_SYNC,
+  DEVICE_SYNC,
+  MAX_SyncType
+};
+
+const char* SyncTypeStr(SyncType s) {
+  switch (s) {
+    ENUM_CASE_STR(NONE);
+    ENUM_CASE_STR(EVENT_QUERY);
+    ENUM_CASE_STR(EVENT_SYNC);
+    ENUM_CASE_STR(STREAM_WAIT_EVENT);
+    ENUM_CASE_STR(STREAM_QUERY);
+    ENUM_CASE_STR(STREAM_SYNC);
+    ENUM_CASE_STR(DEVICE_SYNC);
+    default:
+      return "UNKNOWN";
+  }
+}
+
+void runCmd(CmdType cmd, int* dst, const int* src, hipStream_t s,
+             size_t numElements) {
+  switch (cmd) {
+    case COPY:
+      HIP_CHECK(
+        hipMemcpyAsync(dst, src, numElements * sizeof(int),
+                        hipMemcpyDeviceToDevice, s));
+      break;
+    case KERNEL: {
+      unsigned blocks = HipTest::setNumBlocks(blocksPerCU,
+                                 threadsPerBlock, numElements);
+      hipLaunchKernelGGL(memcpyIntKernel, dim3(blocks), dim3(threadsPerBlock),
+                          0, s, dst, src, numElements);
+    } break;
+    case MODULE_KERNEL: {
+      MemcpyFunction g_moduleMemcpy("memcpyInt.hsaco", "memcpyIntKernel");
+      g_moduleMemcpy.launch(dst, src, numElements, s);
+    } break;
+    default:
+      printf("Info:unknown cmd=%d type", cmd);
+  }
+}
+
+void resetInputs(int* Ad, int* Bd, int* Ch,
+                 size_t numElements, int expected) {
+  unsigned blocks = HipTest::setNumBlocks(blocksPerCU,
+                                          threadsPerBlock, numElements);
+  hipLaunchKernelGGL(memsetIntKernel, dim3(blocks), dim3(threadsPerBlock),
+                      0, hipStream_t(0), Ad, expected, numElements);
+  // poison with bad value to ensure is overwritten correctly
+  hipLaunchKernelGGL(memsetIntKernel, dim3(blocks), dim3(threadsPerBlock),
+                      0, hipStream_t(0), Bd, 0xDEADBEEF, numElements);
+  hipLaunchKernelGGL(memsetIntKernel, dim3(blocks), dim3(threadsPerBlock),
+                      0, hipStream_t(0), Bd, 0xF000BA55, numElements);
+  memset(Ch, 13, numElements * sizeof(int));
+  HIP_CHECK(hipDeviceSynchronize());
+}
+
+// Intended to test proper synchronization and cache flushing
+// between CMDA and CMDB. CMD are of type CmdType. All command copy memory,
+// using either hipMemcpyAsync or kernel implementations.
+// Some form of synchronization is applied. Then cmdB copies from Bd to Cd.
+// CmdA copies from Ad to Bd, Cd is then copied to host Ch using a memory copy.
+// Correct result at the end is that Ch contains the
+// contents originally in Ad (integer 0x42)
+
+void runTestImpl(CmdType cmdAType, SyncType syncType, CmdType cmdBType,
+                 hipStream_t stream1, hipStream_t stream2, int numElements,
+                 int* Ad, int* Bd, int* Cd, int* Ch, int expected) {
+  hipEvent_t e;
+  HIP_CHECK(hipEventCreateWithFlags(&e, 0));
+
+  resetInputs(Ad, Bd, Ch, numElements, expected);
+
+  const size_t sizeElements = numElements * sizeof(int);
+  fprintf(stderr, "test: runTest with %zu bytes (%6.2f MB) cmdA=%s; sync=%s; cmdB=%s\n", // NOLINT
+          sizeElements, static_cast<double>(sizeElements / 1024.0),
+          CmdTypeStr(cmdAType), SyncTypeStr(syncType), CmdTypeStr(cmdBType));
+
+  /*if (SKIP_MODULE_KERNEL && ((cmdAType == MODULE_KERNEL) || (cmdBType == MODULE_KERNEL))) { // NOLINT
+    fprintf(stderr, "warn: skipping since test infra does not yet support modules\n"); // NOLINT
+    return;
+  }*/
+
+  // Step A:
+  runCmd(cmdAType, Bd, Ad, stream1, numElements);
+
+  // Sync in-between?
+  switch (syncType) {
+    case NONE:
+      break;
+    case EVENT_QUERY: {
+      hipError_t st = hipErrorNotReady;
+      HIP_CHECK(hipEventRecord(e, stream1));
+      do {
+          st = hipEventQuery(e);
+      } while (st == hipErrorNotReady);
+      HIP_CHECK(st);
+    } break;
+    case EVENT_SYNC:
+      HIP_CHECK(hipEventRecord(e, stream1));
+      HIP_CHECK(hipEventSynchronize(e));
+      break;
+    case STREAM_WAIT_EVENT:
+      HIP_CHECK(hipEventRecord(e, stream1));
+      HIP_CHECK(hipStreamWaitEvent(stream2, e, 0));
+      break;
+    case STREAM_QUERY: {
+      hipError_t st = hipErrorNotReady;
+      do {
+          st = hipStreamQuery(stream1);
+      } while (st == hipErrorNotReady);
+      HIP_CHECK(st);
+    } break;
+    case STREAM_SYNC:
+      HIP_CHECK(hipStreamSynchronize(stream1));
+      break;
+    case DEVICE_SYNC:
+      HIP_CHECK(hipDeviceSynchronize());
+      break;
+    default:
+      fprintf(stderr, "warning: unknown sync type=%s", SyncTypeStr(syncType));
+      return;
+  }
+  runCmd(cmdBType, Cd, Bd, stream2, numElements);
+
+  // Copy back to host, use async copy to avoid any extra synchronization
+  //  that might mask issues.
+  HIP_CHECK(hipMemcpyAsync(Ch, Cd, sizeElements, hipMemcpyDeviceToHost,
+                            stream2));
+  HIP_CHECK(hipStreamSynchronize(stream2));
+
+  checkReverse(Ch, numElements, expected);
+
+  HIP_CHECK(hipEventDestroy(e));
+}
+
+void testWrapper(size_t numElements) {
+  const size_t sizeElements = numElements * sizeof(int);
+  const int expected = 0x42;
+  int *Ad, *Bd, *Cd, *Ch;
+
+  HIP_CHECK(hipMalloc(&Ad, sizeElements));
+  HIP_CHECK(hipMalloc(&Bd, sizeElements));
+  HIP_CHECK(hipMalloc(&Cd, sizeElements));
+  HIP_CHECK(hipHostMalloc(&Ch, sizeElements));
+
+  hipStream_t stream1, stream2;
+
+  HIP_CHECK(hipStreamCreate(&stream1));
+  HIP_CHECK(hipStreamCreate(&stream2));
+  HIP_CHECK(hipDeviceSynchronize());
+
+  runTestImpl(COPY, EVENT_SYNC, KERNEL, stream1, stream2, numElements,
+              Ad, Bd, Cd, Ch, expected);
+
+  for (int cmdA = 0; cmdA < MAX_CmdType; cmdA++) {
+    for (int cmdB = 0; cmdB < MAX_CmdType; cmdB++) {
+      for (int syncMode = 0; syncMode < MAX_SyncType; syncMode++) {
+        switch (syncMode) {
+          // case NONE::
+          case EVENT_QUERY:
+          case EVENT_SYNC:
+          case STREAM_WAIT_EVENT:
+          // case STREAM_QUERY:
+          case STREAM_SYNC:
+          case DEVICE_SYNC:
+            runTestImpl(CmdType(cmdA), SyncType(syncMode), CmdType(cmdB),
+                      stream1, stream2, numElements, Ad, Bd, Cd, Ch, expected);
+            break;
+          default:
+            break;
+        }
+      }
+    }
+  }
+
+#if 0
+  runTestImpl(COPY, STREAM_SYNC, MODULE_KERNEL, stream1, stream2,
+              numElements, Ad, Bd, Cd, Ch, expected);
+  runTestImpl(COPY, STREAM_SYNC, KERNEL, stream1, stream2, numElements,
+              Ad, Bd, Cd, Ch, expected);
+  runTestImpl(COPY, STREAM_WAIT_EVENT, MODULE_KERNEL, stream1, stream2,
+               numElements, Ad, Bd, Cd, Ch, expected);
+  runTestImpl(COPY, STREAM_WAIT_EVENT, KERNEL, stream1, stream2, numElements,
+              Ad, Bd, Cd, Ch, expected);
+#endif
+
+  HIP_CHECK(hipFree(Ad));
+  HIP_CHECK(hipFree(Bd));
+  HIP_CHECK(hipFree(Cd));
+  HIP_CHECK(hipHostFree(Ch));
+
+  HIP_CHECK(hipStreamDestroy(stream1));
+  HIP_CHECK(hipStreamDestroy(stream2));
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Test cache management (fences) and synchronization between
+ * kernel and copy commands. Exhaustively tests 3 command types
+ * (copy, kernel, module kernel), many sync types (see SyncType), followed by
+ *  another command, across a sweep of data sizes designed to stress
+ * various levels of the memory hierarchy.
+
+ * Test source
+ * ------------------------
+ *    - catch/unit/synchronization/copy_coherency.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.5
+ */
+
+TEST_CASE("Unit_Copy_Coherency") {
+  for (int index = 0; index < sizeof(g_elementSizes) / sizeof(int); index++) {
+    size_t numElements = g_elementSizes[index];
+    testWrapper(numElements);
+  }
+}
@@ -0,0 +1,10 @@
+#include <hip/hip_runtime.h>
+extern "C" __global__ void memcpyIntKernel(int* dst, const int* src,
+                                           size_t numElements) {
+    int gid = (blockIdx.x * blockDim.x + threadIdx.x);
+    int stride = blockDim.x * gridDim.x;
+    for (size_t i = gid; i < numElements; i += stride) {
+        dst[i] = src[i];
+    }
+};
+