SWDEV-313387 - Port CTS SVM atomic tests to Hip

Change-Id: I32c9bed860ddf4fe1d7bba21dce9bd728168c398 [ROCm/hip-tests commit: f425bee1dc]
2023-09-07 18:27:56 -04:00
parent 9ead75d6a1
commit 3bcee40ac1
@@ -178,5 +178,16 @@ hip_add_exe_to_target(NAME MemoryTest2
  TEST_TARGET_NAME build_tests COMMON_SHARED_SRC ${COMMON_SHARED_SRC})

 if(HIP_PLATFORM MATCHES "amd")
+  set(TEST_SRC  # source files compiled into the SVMAtomicTest executable declared below
+    hipSVMTestByteGranularity.cpp
+    hipSVMTestFineGrainMemoryConsistency.cpp
+    hipSVMTestFineGrainSyncBuffers.cpp
+    hipSVMTestSharedAddressSpaceFineGrain.cpp
+  )
+
+  hip_add_exe_to_target(NAME SVMAtomicTest  # declared only inside the amd platform branch
+    TEST_SRC ${TEST_SRC}
+    TEST_TARGET_NAME build_tests COMMON_SHARED_SRC ${COMMON_SHARED_SRC})
+
  add_dependencies(build_tests hipHostRegisterPerf)
 endif()
@@ -0,0 +1,141 @@
+// Copyright (c) 2017 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+/*
+ * Modifications Copyright (C)2023 Advanced
+ * Micro Devices, Inc. All rights reserved.
+ */
+#ifndef __COMMON_H__
+#define __COMMON_H__
+
+#include <vector>
+#include <string>
+
+#if (defined(_WIN32) || defined(_WIN64)) && defined(_MSC_VER)
+    #include <windows.h>
+#endif
+
+// SVM atomic wrappers.
+// Platforms that support SVM atomics (atomics that work across the host and devices) need to
+// implement these host-side functions correctly. Platforms that do not support SVM atomics can
+// simply implement these functions as empty stubs since the functions will not be called. For now
+// only MSVC on Windows and GCC-compatible compilers are implemented; add others as needed.
+unsigned int inline AtomicLoad32(unsigned int* pValue) {  // atomic read of *pValue
+#if (defined(_WIN32) || defined(_WIN64)) && defined(_MSC_VER)
+  return (unsigned int)InterlockedExchangeAdd((LONG*)pValue, 0l);  // adding 0 acts as a load
+#elif defined(__GNUC__)
+  return __sync_add_and_fetch(pValue, 0);  // adding 0 leaves the value unchanged
+#else
+  return -1;  // stub for unsupported compilers; -1 converts to the largest unsigned int
+#endif
+}
+
+// Atomically adds operand to *object and returns the value held before the addition.
+unsigned int inline AtomicFetchAdd32(unsigned int* object, int operand) {
+#if (defined(_WIN32) || defined(_WIN64)) && defined(_MSC_VER)
+  return InterlockedExchangeAdd((LONG*)object, operand);
+#elif defined(__GNUC__)
+  return __sync_fetch_and_add(object, operand);
+#else
+  return -1;  // stub for unsupported compilers; no addition is performed
+#endif
+}
+
+template <typename T>
+T inline AtomicFetchAdd64(T* object, T operand) {  // atomic fetch-and-add, returns the old value
+#if (defined(_WIN32) || defined(_WIN64)) && defined(_MSC_VER)
+  return (T)InterlockedExchangeAdd64((LONG64*)object, (LONG64)operand);
+#elif defined(__GNUC__)
+  return (T)__sync_fetch_and_add((intptr_t*)object, (intptr_t)operand);  // done at intptr_t width
+#else
+  return -1;  // stub for unsupported compilers; no addition is performed
+#endif
+}
+
+unsigned int inline AtomicExchange32(unsigned int* object, unsigned int desired) {  // atomic swap
+#if (defined(_WIN32) || defined(_WIN64)) && defined(_MSC_VER)
+  return (unsigned int)InterlockedExchange((LONG*)object, (LONG)desired);  // returns old value
+#elif defined(__GNUC__)
+  return __sync_lock_test_and_set(object, desired);  // NOTE(review): GCC documents acquire-only
+#else
+  return -1;  // stub for unsupported compilers; nothing is exchanged
+#endif
+}
+
+// Atomically stores `expected` into *a and returns the value *a held before the store.
+// Despite its name, `expected` is the new value to write; nothing is compared.
+template <typename T>
+T inline AtomicExchange64(T* a, T expected) {
+#if defined(_MSC_VER) || (defined(__INTEL_COMPILER) && defined(WIN32))
+  return (T)InterlockedExchangePointer((PVOID volatile*)a, (PVOID)expected);
+#elif defined(__GNUC__)
+  // NOTE(review): GCC documents this builtin as an acquire barrier, not a full barrier.
+  return (T)__sync_lock_test_and_set((long long*)a, (long long)expected);
+#else
+  // Unsupported compiler: stub, not expected to be called. The previous fallback assigned to an
+  // undeclared `tmp` and returned nothing, so it could not compile; return an all-ones value
+  // in the same way the 32-bit wrappers return -1.
+  (void)a;
+  (void)expected;
+  return (T)-1;
+#endif
+}
+
+// Atomic compare-and-swap on *a.
+// If *a equals *expected, stores `desired` into *a and returns true. Otherwise copies the value
+// found in *a into *expected and returns false, so the caller can retry with the fresh value.
+template <typename T>
+bool AtomicCompareExchange64(T* a, T* expected, T desired)
+{
+#if defined( _MSC_VER ) || (defined( __INTEL_COMPILER ) && defined(WIN32))
+  T tmp = (T)InterlockedCompareExchange64((LONG64 *)a, (LONG64)desired,
+                                          *(LONG64 *)expected);
+#elif defined(__GNUC__)
+  T tmp = (T)__sync_val_compare_and_swap((intptr_t*)a, (intptr_t)(*expected),
+                                         (intptr_t)desired);
+#else
+  // Unsupported compiler: stub, not expected to be called. `tmp` has to be declared in this
+  // branch as well, otherwise the code after #endif refers to an undeclared name.
+  (void)a;
+  (void)desired;
+  T tmp = (T)0;
+#endif
+  if (tmp == *expected) {
+    return true;  // the value observed in *a matched, so the swap took place
+  }
+  *expected = tmp;  // report the value actually observed
+  return false;
+}
+
+inline void* align_malloc(size_t size, size_t alignment) {  // allocates size bytes, aligned
+#if defined(_WIN32) && defined(_MSC_VER)
+  return _aligned_malloc(size, alignment);
+#elif defined(__linux__) || defined(linux) || defined(__APPLE__)
+  void* ptr = NULL;
+#if defined(__ANDROID__)
+  ptr = memalign(alignment, size);  // note the argument order: alignment first
+  if (ptr) return ptr;
+#else
+  if (alignment < sizeof(void*)) {  // posix_memalign needs at least pointer-size alignment
+    alignment = sizeof(void*);
+  }
+  if (0 == posix_memalign(&ptr, alignment, size)) return ptr;  // 0 means success
+#endif
+  return NULL;  // reached only when the allocation above failed
+#elif defined(__MINGW32__)
+  return __mingw_aligned_malloc(size, alignment);
+#else
+#error "Please add support OS for aligned malloc"
+#endif
+}
+
+inline void align_free(void* ptr) {  // releases memory obtained from align_malloc
+#if defined(_WIN32) && defined(_MSC_VER)
+  _aligned_free(ptr);  // pairs with _aligned_malloc
+#elif defined(__linux__) || defined(linux) || defined(__APPLE__)
+  return free(ptr);  // memalign / posix_memalign memory is released with free
+#elif defined(__MINGW32__)
+  return __mingw_aligned_free(ptr);  // pairs with __mingw_aligned_malloc
+#else
+#error "Please add support OS for aligned free"
+#endif
+}
+
+#endif    // #ifndef __COMMON_H__
+
@@ -0,0 +1,154 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+/*
+ * Modifications Copyright (C)2023 Advanced
+ * Micro Devices, Inc. All rights reserved.
+ */
+
+#include <hip_test_common.hh>
+#include <hip/hip_runtime_api.h>
+#include <resource_guards.hh>
+#include <utils.hh>
+
+// Writes this participant's id into every byte of `a` that it owns. Ownership is round robin:
+// byte i belongs to participant (i % num_id), where num_id is the number of participants (the
+// devices plus one for the host code) and id is the index of the participant running this
+// kernel. Example: with 2 devices and the host, the buffer reads 0, 1, 2, 0, 1, 2, ... once
+// every participant has written its id.
+__global__ void write_owned_locations(char* a, unsigned int num_id, unsigned int id) {
+  const size_t gid = blockIdx.x * blockDim.x + threadIdx.x;
+  const int owner = gid % num_id;
+  if (id != owner) return;  // the byte belongs to another participant, leave it untouched
+  a[gid] = id;
+}
+
+// Checks that this device sees the byte-sized writes made by every other participant: each thread
+// sums the num_devices consecutive bytes starting at its own index and compares the sum with
+// 0 + 1 + ... + (num_devices - 1); mismatches are counted in *error_count. Launch with a reduced
+// grid so that a[i + j] never reads past the end of the buffer. Example: with two devices plus
+// the host (3 participants) the buffer is 0,1,2,0,1,2... and every window of 3 bytes sums to 3.
+__global__ void sum_neighbor_locations(char* a, unsigned int num_devices,
+                                       unsigned int* error_count) {
+  size_t i = blockIdx.x * blockDim.x + threadIdx.x;
+  unsigned int expected_sum = (num_devices * (num_devices - 1)) / 2;  // 0 + 1 + ... + (n - 1)
+  unsigned int sum = 0;
+  for (unsigned int j = 0; j < num_devices; j++) {
+    sum += a[i + j];  // add this byte and its neighbors to the right
+  }
+  if (sum != expected_sum)
+    atomicAdd_system(error_count, 1u); // system-scope increment, like OpenCL atomic_inc()
+}
+
+/**
+* Test Description
+* ------------------------
+* - The suite will test the following functions,
+      hipHostMalloc() with following flags,
+        hipHostMallocCoherent(CL_MEM_SVM_FINE_GRAIN_BUFFER + CL_MEM_SVM_ATOMICS)
+        hipHostMallocNonCoherent(CL_MEM_SVM_FINE_GRAIN_BUFFER)
+      atomicAdd_system()(in kernel)
+      hipStreamCreate()
+      hipStreamSynchronize()
+*   It will demonstrate use of SVM's atomics to do fine grain synchronization among
+*   devices with each stream on each device. The result will be verified on the host.
+* Test source
+* ------------------------
+* - catch/unit/memory/hipSVMTestByteGranularity.cpp
+* Test requirements
+* ------------------------
+*  - Host specific (WINDOWS and LINUX)
+*  - Fine grain access and atomics supported on devices
+*  - HIP_VERSION >= 5.7
+*/
+// Byte-granularity visibility test: every device and the host write the bytes they own in one
+// shared host buffer, then every device and the host check that all of those writes are visible.
+TEST_CASE("test_svm_byte_granularity") {
+  const unsigned int num_elements = 2048;
+  int num_devices = 0;
+  HIP_CHECK(hipGetDeviceCount(&num_devices));
+  const unsigned int device_count = static_cast<unsigned int>(num_devices);
+  const unsigned int num_devices_plus_host = device_count + 1;
+  std::vector<hipStream_t> streams(device_count);
+
+  for (int d = 0; d < num_devices; d++) {
+    HIP_CHECK(hipSetDevice(d));
+    HIP_CHECK(hipStreamCreate(&streams[d]));
+  }
+  HIP_CHECK(hipSetDevice(0));
+  char* pA = nullptr;
+  // hipHostMallocNonCoherent means CL_MEM_SVM_FINE_GRAIN_BUFFER
+  HIP_CHECK(hipHostMalloc(&pA, sizeof(char) * num_elements, hipHostMallocNonCoherent));
+
+  // One error counter per device. Each kernel only touches the first element of its counter,
+  // so a single unsigned int per device is enough.
+  std::vector<unsigned int*> error_counts(device_count, nullptr);
+  for (unsigned int d = 0; d < device_count; d++) {
+    // hipHostMallocCoherent means CL_MEM_SVM_FINE_GRAIN_BUFFER + CL_MEM_SVM_ATOMICS.
+    // The kernels increment the counter with a system-scope atomic and the host reads it.
+    HIP_CHECK(hipHostMalloc(&error_counts[d], sizeof(unsigned int), hipHostMallocCoherent));
+    *error_counts[d] = 0;
+  }
+  for (unsigned int i = 0; i < num_elements; i++) pA[i] = -1;
+
+  // get all the devices going simultaneously; device d writes id d (ids start at 0).
+  for (unsigned int d = 0; d < device_count; d++) {
+    write_owned_locations<<<num_elements, 1, 0, streams[d]>>>(pA, num_devices_plus_host, d);
+    HIP_CHECK(hipGetLastError());
+  }
+  const unsigned int host_id = device_count;  // host code will take the id above the devices.
+  for (unsigned int i = host_id; i < num_elements; i += num_devices_plus_host) {
+    pA[i] = host_id;
+  }
+
+  for (unsigned int d = 0; d < device_count; d++) {
+    HIP_CHECK(hipStreamSynchronize(streams[d]));
+  }
+
+  // now check that each device can see the byte writes made by the other devices.
+  // adjusted so sum_neighbor_locations doesn't read past end of buffer
+  const size_t adjusted_num_elements = num_elements - device_count;
+  for (unsigned int d = 0; d < device_count; d++) {
+    sum_neighbor_locations<<<adjusted_num_elements, 1, 0, streams[d]>>>(pA, num_devices_plus_host,
+                                                                     error_counts[d]);
+    HIP_CHECK(hipGetLastError());
+  }
+
+  for (unsigned int d = 0; d < device_count; d++) {
+    HIP_CHECK(hipStreamSynchronize(streams[d]));
+  }
+  // see if any of the devices found errors
+  for (unsigned int d = 0; d < device_count; d++) {
+    if (*error_counts[d] > 0) {
+      fprintf(stderr, "*error_counts[%u] = %u\n", d, *error_counts[d]);
+      REQUIRE(false);
+    }
+  }
+  const unsigned int expected = (num_devices_plus_host * (num_devices_plus_host - 1)) / 2;
+  // check that host can see the byte writes made by the devices.
+  for (unsigned int i = 0; i < num_elements - num_devices_plus_host; i++) {
+    unsigned int sum = 0;
+    for (unsigned int j = 0; j < num_devices_plus_host; j++) sum += pA[i + j];
+    if (sum != expected) {
+      // The message previously had no trailing newline and ran into the following output.
+      fprintf(stderr, "[%u]: sum %u != expected %u\n", i, sum, expected);
+      REQUIRE(false);
+    }
+  }
+  for (unsigned int d = 0; d < device_count; d++) {
+    HIP_CHECK(hipStreamDestroy(streams[d]));
+    HIP_CHECK(hipHostFree(error_counts[d]));
+  }
+  HIP_CHECK(hipHostFree(pA));
+  REQUIRE(true);
+}
@@ -0,0 +1,261 @@
+// Copyright (c) 2017 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+/*
+ * Modifications Copyright (C)2023 Advanced
+ * Micro Devices, Inc. All rights reserved.
+ */
+#include <chrono>
+#include <thread>
+#include <hip_test_common.hh>
+#include <hip/hip_runtime_api.h>
+#include <utils.hh>
+#include "hipSVMCommon.h"
+//#define DEBUG_ATOMIC  // To provide additional data for debugging
+#ifdef DEBUG_ATOMIC
+//#define DEBUG_ATOMIC_PRINT_THREAD
+#endif
+
+typedef struct BinNode {  // one linked-list node of a hash-table bin
+#ifdef DEBUG_ATOMIC
+  unsigned int n;  // index of this node in the node array (debug only)
+  unsigned int d;  // id of the participant that inserted the node (debug only)
+  unsigned int i;  // index of the input element the node was created from (debug only)
+#endif
+  unsigned int value;  // the inserted input value
+  struct BinNode* pNext;  // next node in the same bin
+} BinNode;
+
+__global__ void build_hash_table_on_device(unsigned int* input, size_t inputSize,
+                                          BinNode* pNodes,
+                                          unsigned int* pNumNodes, unsigned int numBins,
+                                          unsigned int dev) {  // thread i inserts input[i]
+  size_t i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i >= inputSize) return;  // surplus threads of the last block do nothing
+
+  unsigned int n = atomicAdd_system((unsigned int*)pNumNodes, 1u);  // claim the next free node
+  BinNode* pNew = &pNodes[n];
+  unsigned int b = input[i] % numBins;  // bin index; pNodes[b].pNext is used as that bin's head
+
+  pNew->value = input[i];
+#ifdef DEBUG_ATOMIC
+  pNew->d = dev;
+  pNew->i = i;
+  pNew->n = n;
+#endif
+  unsigned long long next = 0;
+  unsigned long long old = atomicAdd_system((unsigned long long*)&(pNodes[b].pNext),
+                                            0ull);  // add 0 to read the head atomically
+  do {
+    next = old;
+    // Link the new node to the head value just observed, then try to swing the head to it.
+    // (atomic form of the plain store: pNew->pNext = (BinNode*)next;)
+    atomicExch((unsigned long long*)&(pNew->pNext), next);
+    old = atomicCAS_system((unsigned long long *)&(pNodes[b].pNext), next,
+                           (unsigned long long )pNew);
+  } while (old != next);  // the head changed in the meantime: retry with the value just seen
+#ifdef DEBUG_ATOMIC_PRINT_THREAD
+  printf("k%u: i=%zu, n=%u, pNew=%p(n=%2u, d=%u, i=%4u, value=%4u, next=%p), pNodes[%u]=%p,"
+      " old=%p, input[%zu]=%u\n", dev, i, n,
+      pNew, pNew->n, pNew->d, pNew->i, pNew->value, pNew->pNext, b, &pNodes[b], (void*)old,
+      i, input[i]);
+#else
+  (void)dev;  // dev is only used by the debug code paths
+#endif
+}
+
+void build_hash_table_on_host(unsigned int* input, size_t inputSize, BinNode* pNodes,
+                              unsigned int* pNumNodes, unsigned int numBins,
+                              unsigned int dev) {
+  // Spin while *pNumNodes still equals numBins, i.e. until another participant claimed a node.
+  while (numBins == AtomicLoad32(pNumNodes));
+  for(unsigned int i = 0; i < inputSize; i++)
+  {
+    unsigned int n = AtomicFetchAdd32(pNumNodes, 1u);  // claim the next free node
+    BinNode* pNew = &pNodes[n];
+    unsigned int b = input[i] % numBins;  // bin index; pNodes[b].pNext is used as the bin head
+#ifdef DEBUG_ATOMIC
+    pNew->d = dev;
+    pNew->i = i;
+    pNew->n = n;
+#endif
+    pNew->value = input[i];
+    BinNode* next = AtomicFetchAdd64(&pNodes[b].pNext, (BinNode*)0ll);  // add 0 == atomic load
+    do {
+      AtomicExchange64(&(pNew->pNext), next);  // link the new node to the observed head
+      // Always insert at the head; a failed compare-exchange refreshes `next` for the retry.
+    } while (!AtomicCompareExchange64(&(pNodes[b].pNext), &next,
+                                                (BinNode*)pNew));
+#ifdef DEBUG_ATOMIC_PRINT_THREAD
+      fprintf(stderr,
+        "k%u: i=%u, n=%u, pNew=%p(n=%2u, d=%u, i=%4u, value=%4u, next=%p), pNodes[%u]=%p, "
+        "input[%u]=%u\n",
+        dev, i, n, pNew, pNew->n, pNew->d, pNew->i, pNew->value, pNew->pNext, b, &pNodes[b],
+        i, input[i]);
+#else
+    (void)dev;  // dev is only used by the debug code paths
+#endif
+  }
+}
+
+// Builds one shared hash table concurrently from every device and from a host thread, then walks
+// every bin on the host to verify the result.
+//   streams     - one stream per device; streams[d] receives the kernel tagged with id d
+//   num_devices - number of devices taking part (the host is one additional participant)
+//   numBins     - number of bins; the first numBins nodes of the pool hold the list heads
+//   num_pixels  - number of input values; every participant inserts all of them
+void launch_kernels_and_verify(std::vector<hipStream_t> &streams, unsigned int num_devices,
+  unsigned int numBins, size_t num_pixels) {
+  unsigned int* pInputImage = nullptr;
+  BinNode* pNodes = nullptr;
+  unsigned int* pNumNodes = nullptr;
+  unsigned int total_items = num_pixels * (num_devices + 1);
+  HIP_CHECK(hipHostMalloc(&pInputImage, sizeof(unsigned int) * num_pixels, hipHostMallocCoherent));
+  HIP_CHECK(
+      hipHostMalloc(&pNodes, sizeof(BinNode) * (total_items + numBins), hipHostMallocCoherent));
+  HIP_CHECK(hipHostMalloc(&pNumNodes, sizeof(unsigned int), hipHostMallocCoherent));
+
+  *pNumNodes = numBins;  // using the first numBins nodes to hold the list heads.
+  for (unsigned int i = 0; i < numBins; i++) pNodes[i].pNext = nullptr;
+  for (unsigned int i = 0; i < num_pixels; i++) pInputImage[i] = i;
+
+  // Get all the devices going simultaneously, each device (and the host) will insert
+  // all the pixels.
+  for (unsigned int d = 0; d < num_devices; d++) {
+    build_hash_table_on_device<<<(num_pixels + 255) / 256, 256, 0, streams[d]>>>(
+        pInputImage, num_pixels, pNodes, pNumNodes, numBins, d);
+    HIP_CHECK(hipGetLastError());
+  }
+
+  // Run the host inserter and one waiter per stream on worker threads. Catch2 assertion macros
+  // are not thread safe, so the waiters only record the HIP status; it is checked on this
+  // thread after all workers have been joined.
+  std::vector<hipError_t> sync_status(num_devices, hipSuccess);
+  std::vector<std::thread> threads;
+  threads.emplace_back(build_hash_table_on_host, pInputImage, num_pixels, pNodes,
+                       pNumNodes, numBins, num_devices);
+  for (unsigned int d = 0; d < num_devices; d++) {
+    threads.emplace_back(
+        [](hipStream_t s, hipError_t* result) {
+          *result = hipStreamSynchronize(s);  // To work around batch dispatching on Windows
+        }, streams[d], &sync_status[d]);
+  }
+  for (std::thread& t : threads) {
+    t.join();
+  }
+  for (unsigned int d = 0; d < num_devices; d++) {
+    HIP_CHECK(sync_status[d]);
+  }
+
+  for (unsigned int d = 0; d < num_devices; d++) {
+    HIP_CHECK(hipSetDevice(d));
+    HIP_CHECK(hipDeviceSynchronize());
+  }
+  HIP_CHECK(hipSetDevice(0));
+  unsigned int num_items = 0;
+  // check correctness of each bin in the hash table.
+  for (unsigned int i = 0; i < numBins; i++) {
+    BinNode *pNode = pNodes[i].pNext;
+    unsigned int num_items_bin = 0;
+    unsigned int total_num_items_bin =
+        (num_pixels % numBins <= i) ? (num_pixels / numBins) : (num_pixels / numBins + 1);
+    total_num_items_bin *= (num_devices + 1);  // The item number of the list in i-th bin
+    while (pNode) {
+#ifdef DEBUG_ATOMIC_PRINT_THREAD
+      fprintf(stderr, "v%u/%u: %u, pNode=%p(n=%2u, d=%u, i=%4u, value=%4u, next=%p)\n", i, numBins,
+              num_items_bin, pNode, pNode->n, pNode->d, pNode->i, pNode->value, pNode->pNext);
+#endif
+      if ((pNode->value % numBins) != i) {
+        fprintf(stderr,
+                "Something went wrong at i=%u, item is in wrong hash bucket:" \
+                "pNode->value=%u, numBins=%u\n",  i, pNode->value, numBins);
+        REQUIRE(false);
+      }
+      num_items++;
+      num_items_bin++;
+      // Also bounds the walk: a list longer than expected (e.g. a cycle) fails here.
+      if (num_items_bin > total_num_items_bin) {
+        fprintf(stderr,
+                "Something went wrong at i=%u/%u, num_items_bin(%u)>total_num_items_bin(%u)\n",
+                i, numBins, num_items_bin, total_num_items_bin);
+        REQUIRE(false);
+      }
+      pNode = pNode->pNext;
+    }
+    // A short bin is only reported here; the total count check below turns it into a failure.
+    if (num_items_bin != total_num_items_bin) {
+      fprintf(stderr,
+              "Something went wrong at i=%u/%u, num_items_bin(%u)!=total_num_items_bin(%u)\n",
+              i, numBins, num_items_bin, total_num_items_bin);
+    }
+  }
+  HIP_CHECK(hipHostFree(pInputImage));
+  HIP_CHECK(hipHostFree(pNodes));
+  HIP_CHECK(hipHostFree(pNumNodes));
+
+  // each device and the host inserted all of the pixels, check that none are missing.
+  if (num_items != total_items) {
+    fprintf(stderr, "The hash table is not correct, num items %u != expected num items: %u\n",
+            num_items, total_items);
+    REQUIRE(false); // test did not pass
+  }
+  REQUIRE(true);
+}
+
+/**
+* Test Description
+* ------------------------
+* - The suite will test the following functions,
+      hipHostMalloc() with following flags,
+        hipHostMallocCoherent(CL_MEM_SVM_FINE_GRAIN_BUFFER + CL_MEM_SVM_ATOMICS)
+      atomicAdd_system()(in kernel)
+      atomicCAS_system()(in kernel)
+      atomicExch()(in kernel)
+      InterlockedExchangeAdd()(in WINDOWS host)
+      __sync_add_and_fetch()(in LINUX host)
+      InterlockedExchangeAdd64()(in WINDOWS host)
+      InterlockedExchangePointer()(in WINDOWS host)
+      __sync_lock_test_and_set()(in LINUX host)
+      InterlockedCompareExchange64()(in WINDOWS host)
+      __sync_val_compare_and_swap()(in LINUX host)
+      hipDeviceSynchronize()
+*   It will demonstrate use of SVM's atomics to do fine grain synchronization among
+*   devices and the host.
+*   Concept: Each device and the host simultaneously insert values into a single hash table.
+*   Each bin in the hash table is a linked list.  Each bin is protected against simultaneous
+*   update using a lock free technique.  The correctness of the list is verified on the host.
+* Test source
+* ------------------------
+* - catch/unit/memory/hipSVMTestFineGrainMemoryConsistency.cpp
+* Test requirements
+* ------------------------
+*  - Host specific (WINDOWS and LINUX)
+*  - Fine grain access and atomics supported on devices and host
+*  - HIP_VERSION >= 5.7
+*/
+// Creates one stream per device, then runs the hash-table build and verification once for each
+// bin count listed below, reusing the same streams.
+TEST_CASE("test_svm_fine_grain_memory_consistency") {
+  const int num_elements = 2167;
+  int num_devices = 0;
+  HIP_CHECK(hipGetDeviceCount(&num_devices));
+
+  std::vector<hipStream_t> streams(num_devices);
+  for (int dev = 0; dev < num_devices; ++dev) {
+    HIP_CHECK(hipSetDevice(dev));
+    HIP_CHECK(hipStreamCreate(&streams[dev]));
+  }
+  HIP_CHECK(hipSetDevice(0));
+
+  // Bin counts, in the order they are exercised:
+  //    1 - all work groups in all devices and the host code hammer on this one lock.
+  //    2 - 2 locks within the same cache line get hit from different devices and the host.
+  //   29 - locks span a few cache lines.
+  const unsigned int bin_counts[] = {1, 2, 29};
+  for (const unsigned int bins : bin_counts) {
+    launch_kernels_and_verify(streams, num_devices, bins, num_elements);
+  }
+
+  for (hipStream_t& s : streams) {
+    HIP_CHECK(hipStreamDestroy(s));
+  }
+}
@@ -0,0 +1,129 @@
+// Copyright (c) 2017 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+/*
+ * Modifications Copyright (C)2023 Advanced
+ * Micro Devices, Inc. All rights reserved.
+ */
+
+#include <chrono>
+#include <thread>
+#include <hip_test_common.hh>
+#include <hip/hip_runtime_api.h>
+#include <utils.hh>
+#include "hipSVMCommon.h"
+#define MAX_TARGETS 1024
+
+// Each thread inspects one element of `image`. On a match with `target` it reserves the next
+// result slot by incrementing *numTargetsFound and, if that slot is below MAX_TARGETS, publishes
+// its element index into targetLocations with a system-scope exchange.
+__global__ void find_targets(unsigned int* image, unsigned int target,
+                             unsigned int* numTargetsFound,
+                             unsigned int* targetLocations) {
+  const size_t pixel = blockIdx.x * blockDim.x + threadIdx.x;
+  if (image[pixel] != target) return;  // nothing to report for this element
+
+  const unsigned int slot = atomicAdd((unsigned int*)numTargetsFound, 1u);
+  if (slot < MAX_TARGETS) {
+    atomicExch_system((unsigned int *)&targetLocations[slot], (unsigned int)pixel);
+  }
+}
+
+// Stand-in for the follow-up work done on a found target: it only reports the location.
+void spawnAnalysisTask(int location) {
+  fprintf(stdout, "found target at location %d\n", location);
+}
+
+/**
+* Test Description
+* ------------------------
+* - The suite will test the following functions,
+      hipHostMalloc() with following flags,
+        hipHostMallocCoherent(CL_MEM_SVM_FINE_GRAIN_BUFFER + CL_MEM_SVM_ATOMICS)
+        hipHostMallocNonCoherent(CL_MEM_SVM_FINE_GRAIN_BUFFER)
+      atomicAdd()(in kernel)
+      atomicExch_system()(in kernel)
+      InterlockedExchangeAdd()(in WINDOWS host)
+      __sync_add_and_fetch()(in LINUX host)
+      hipStreamCreate()
+      hipEventCreate()
+      hipEventRecord()
+      hipEventQuery()
+*   It will demonstrate use of SVM's atomics to do fine grain synchronization between
+*   a device and the host. The result will be verified on the host.
+*   Concept: a device kernel is used to search an input image for regions that match a
+*   target pattern. The device immediately notifies the host when it finds a target
+*   (via an atomic operation that works across host and devices). The host is then able
+*   to spawn a task that further analyzes the target while the device continues searching
+*   for more targets.
+* Test source
+* ------------------------
+* - catch/unit/memory/hipSVMTestFineGrainSyncBuffers.cpp
+* Test requirements
+* ------------------------
+*  - Host specific (WINDOWS and LINUX)
+*  - Fine grain access and atomics supported on device and host
+*  - HIP_VERSION >= 5.7
+*/
+TEST_CASE("test_svm_fine_grain_sync_buffers") {
+  size_t num_pixels = 1024 * 1024 * 2;  // a multiple of the 256-thread block size used below
+  hipStream_t stream;
+  HIP_CHECK(hipSetDevice(0));
+  HIP_CHECK(hipStreamCreate(&stream));
+  hipEvent_t event;
+  HIP_CHECK(hipEventCreate(&event));
+  unsigned int *pInputImage, *pNumTargetsFound, *pTargetLocations;
+  HIP_CHECK(hipHostMalloc(&pInputImage, sizeof(unsigned int) * num_pixels, hipHostMallocNonCoherent));
+  HIP_CHECK(hipHostMalloc(&pNumTargetsFound, sizeof(unsigned int), hipHostMallocCoherent));
+  HIP_CHECK(hipHostMalloc(&pTargetLocations, sizeof(int) * MAX_TARGETS, hipHostMallocCoherent));
+  unsigned int targetDescriptor = 777;  // the value the kernel searches for
+  *pNumTargetsFound = 0;
+
+  unsigned int i;
+  for(i = 0; i < MAX_TARGETS; i++) pTargetLocations[i] = -1;  // mark every slot as unused
+  for(i = 0; i < num_pixels; i++) pInputImage[i] = 0;
+  pInputImage[0] = targetDescriptor;  // three targets are planted: first element,
+  pInputImage[3] = targetDescriptor;  // element 3,
+  pInputImage[num_pixels - 1] = targetDescriptor;  // and the last element
+
+  find_targets<<<(num_pixels + 255) / 256, 256, 0, stream>>>(pInputImage, targetDescriptor,
+                                                             pNumTargetsFound, pTargetLocations);
+  HIP_CHECK(hipGetLastError());
+  HIP_CHECK(hipEventRecord(event, stream));  // the event is polled below to detect completion
+
+  i=0;  // from here on i is the next result slot the host expects to be filled
+  hipError_t status = hipSuccess;
+  unsigned int loc = 0;
+  // Poll the result slots while the kernel may still be running; handle each target found.
+  do {
+    status = hipEventQuery(event);
+    if (status != hipErrorNotReady && status != hipSuccess) {
+      fprintf(stderr, "Unexpected status = %d\n", status);
+      REQUIRE(false);
+    }
+    loc = AtomicLoad32(&pTargetLocations[i]);
+    if (loc != -1)  // -1 indicates slot not used yet.
+    {
+      spawnAnalysisTask(loc); // stand-in for further analysis of the target
+      i++;
+    }
+  } while (status == hipErrorNotReady ||
+           AtomicLoad32(&pTargetLocations[i]) != -1);
+
+  HIP_CHECK(hipHostFree(pInputImage));
+  HIP_CHECK(hipHostFree(pNumTargetsFound));
+  HIP_CHECK(hipHostFree(pTargetLocations));
+  HIP_CHECK(hipEventDestroy(event));
+  HIP_CHECK(hipStreamDestroy(stream));
+  REQUIRE(i == 3);  // exactly the three planted targets must have been reported
+}
@@ -0,0 +1,297 @@
+// Copyright (c) 2017 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+/*
+ * Modifications Copyright (C)2023 Advanced
+ * Micro Devices, Inc. All rights reserved.
+ */
+
+#include <hip_test_common.hh>
+#include <hip/hip_runtime_api.h>
+#include <utils.hh>
+#include "hipSVMCommon.h"
+
+// Element of the singly linked lists that the kernels and host helpers in this file build and
+// verify inside one shared array of nodes.
+struct Node {
+  unsigned int global_id;         // index of the list this node belongs to
+  unsigned int position_in_list;  // 0 for the head, then 1, 2, ... along the list
+  Node* pNext;                    // following node; the create helpers leave it unset on the tail
+};
+
+/**
+ * Kernel: each thread builds one singly linked list of list_length nodes inside pNodes.
+ *
+ * Thread i uses pNodes[i] as the head of its list and claims every further node by atomically
+ * incrementing *allocation_index. The host must therefore initialize *allocation_index to the
+ * number of work-items N before the launch, because the first N nodes are the list heads.
+ * There is no bounds check on the thread index: every launched thread writes a head node.
+ */
+__global__ void create_linked_lists_on_device(Node* pNodes,
+                                    unsigned int* allocation_index,
+                                    unsigned int list_length) {
+  const size_t list_id = blockIdx.x * blockDim.x + threadIdx.x;
+
+  // Head node of this thread's list.
+  Node* tail = &pNodes[list_id];
+  tail->global_id = list_id;
+  tail->position_in_list = 0;
+
+  unsigned int position = 1;
+  while (position < list_length) {
+    // Claim the next free slot of the shared node array.
+    Node* fresh = &pNodes[atomicAdd(allocation_index, 1u)];
+    fresh->global_id = list_id;
+    fresh->position_in_list = position;
+    tail->pNext = fresh;  // append behind the current tail
+    tail = fresh;         // the appended node becomes the tail
+    ++position;
+  }
+}
+
+/**
+ * Kernel: each thread walks the list headed at pNodes[i] and atomically adds one to
+ * *num_correct for every node that carries the expected list id and position. The walk of a
+ * list ends at its first mismatching node or after list_length nodes.
+ */
+__global__ void verify_linked_lists_on_device(Node* pNodes, unsigned int* num_correct,
+    unsigned int list_length) {
+  const size_t list_id = blockIdx.x * blockDim.x + threadIdx.x;
+  Node* cursor = &pNodes[list_id];
+
+  for (unsigned int position = 0; position < list_length;
+       ++position, cursor = cursor->pNext) {
+    const bool id_matches = (cursor->global_id == list_id);
+    if (!(id_matches && cursor->position_in_list == position)) {
+      break;  // give up on this list at the first bad node
+    }
+    atomicAdd(num_correct, 1u);
+  }
+}
+
+/**
+ * Builds num_lists linked lists of list_length nodes each on the host.
+ *
+ * pNodes[0 .. num_lists) are the list heads; all remaining nodes are handed out sequentially
+ * starting at index num_lists.
+ */
+void create_linked_lists_on_host(Node* pNodes, unsigned int num_lists, unsigned int list_length) {
+  unsigned int next_free = num_lists;  // slots below num_lists are reserved for the heads
+  for (unsigned int list_id = 0; list_id < num_lists; ++list_id) {
+    Node* tail = pNodes + list_id;
+    tail->global_id = list_id;
+    tail->position_in_list = 0;
+
+    for (unsigned int position = 1; position < list_length; ++position) {
+      Node* fresh = pNodes + next_free;  // take the next unused node
+      ++next_free;
+      fresh->global_id = list_id;
+      fresh->position_in_list = position;
+      tail->pNext = fresh;  // append behind the current tail
+      tail = fresh;         // the appended node becomes the tail
+    }
+  }
+}
+
+/**
+ * Walks every list on the host and fails the test unless all nodes are correct.
+ *
+ * A node counts as correct when its global_id equals the index of the list being walked and
+ * its position_in_list equals its distance from the head. The test fails (REQUIRE(false)) when
+ * the number of correct nodes differs from list_length * num_lists.
+ *
+ * @param pNodes       node array; the first num_lists entries are the list heads
+ * @param num_lists    number of lists to verify
+ * @param list_length  expected number of nodes per list
+ */
+void verify_linked_lists_on_host(Node* pNodes, unsigned int num_lists, unsigned int list_length) {
+  unsigned int numCorrect = 0;
+  for (unsigned int i = 0; i < num_lists; i++) {
+    Node* pNode = &pNodes[i];
+    // j is unsigned so that the comparisons against list_length and position_in_list do not
+    // mix signed and unsigned operands.
+    for (unsigned int j = 0; j < list_length; j++) {
+      if (pNode->global_id != i || pNode->position_in_list != j) {
+        break;  // stop walking this list at the first mismatching node
+      }
+      numCorrect++;
+      pNode = pNode->pNext;
+    }
+  }
+  if (numCorrect != list_length * num_lists) {
+    fprintf(stderr, "Failed\n");
+    REQUIRE(false);
+  }
+}
+
+/**
+ * Host-side launcher for the list-building kernel.
+ *
+ * Seeds *pAllocator with numLists (the first numLists nodes are the list heads and thus already
+ * taken), launches one thread per list in blocks of 256 on the given stream, and waits for the
+ * stream to finish before returning.
+ */
+void create_linked_lists_on_device(hipStream_t stream, Node* pNodes,
+                                          unsigned int* pAllocator, unsigned int numLists,
+                                          unsigned int ListLength) {
+  constexpr unsigned int kThreadsPerBlock = 256;
+  const unsigned int numBlocks = (numLists + kThreadsPerBlock - 1) / kThreadsPerBlock;
+
+  // Node allocation inside the kernel starts right behind the head nodes.
+  *pAllocator = numLists;
+
+  create_linked_lists_on_device<<<numBlocks, kThreadsPerBlock, 0, stream>>>(pNodes, pAllocator,
+                                                                            ListLength);
+  HIP_CHECK(hipGetLastError());
+  HIP_CHECK(hipStreamSynchronize(stream));
+}
+
+/**
+ * Host-side launcher for the list-verification kernel.
+ *
+ * Zeroes *pNumCorrect, launches one thread per list in blocks of 256 on the given stream, waits
+ * for the stream to finish, and fails the test (REQUIRE(false)) unless the kernel counted
+ * exactly ListLength * numLists correct nodes.
+ *
+ * @param stream       stream the kernel is launched on
+ * @param pNodes       node array; the first numLists entries are the list heads
+ * @param pNumCorrect  counter written by the host and incremented by the kernel
+ * @param numLists     number of lists to verify
+ * @param ListLength   expected number of nodes per list
+ */
+void verify_linked_lists_on_device(hipStream_t stream, Node* pNodes,
+                                          unsigned int* pNumCorrect, unsigned int numLists,
+                                          unsigned int ListLength) {
+  *pNumCorrect = 0;  // reset the counter the kernel accumulates into
+
+  verify_linked_lists_on_device<<<(numLists + 255) / 256, 256, 0, stream>>>(pNodes, pNumCorrect,
+                                                                     ListLength);
+
+  HIP_CHECK(hipGetLastError());
+  HIP_CHECK(hipStreamSynchronize(stream));
+
+  // Keep the count unsigned: it is compared against an unsigned product below.
+  const unsigned int correct_count = *pNumCorrect;
+  if (correct_count != ListLength * numLists) {
+    fprintf(stderr, "Failed\n");
+    REQUIRE(false);
+  }
+}
+
+/**
+* Test Description
+* ------------------------
+* - The suite will test the following functions,
+      hipHostMalloc() with following flags,
+        hipHostMallocNonCoherent(CL_MEM_SVM_FINE_GRAIN_BUFFER)
+      atomicAdd()(in kernel)
+      hipStreamCreate()
+      hipStreamSynchronize()
+*   It checks that all devices and the host share a common address space when using
+*   host buffers allocated with hipHostMalloc().
+*   Concept: linked lists are created on one device (or the host) and their correctness is then
+*   verified on another device (or the host). This is repeated for every creator/verifier
+*   combination of the devices and the host that exist within the platform. The test passes
+*   only if every combination passes.
+* Test source
+* ------------------------
+* - catch/unit/memory/hipSVMTestSharedAddressSpaceFineGrain.cpp
+* Test requirements
+* ------------------------
+*  - Host specific (WINDOWS and LINUX)
+*  - Fine grain access supported on devices and host
+*  - HIP_VERSION >= 5.7
+*/
+TEST_CASE("test_svm_shared_address_space_fine_grain_buffers") {
+  const unsigned int num_elements = 1024;
+
+  int num_devices = 0;
+  HIP_CHECK(hipGetDeviceCount(&num_devices));
+  // Index num_devices (one past the last device) stands for the host in the loops below.
+  const int num_devices_plus_host = num_devices + 1;
+
+  // One stream per device.
+  std::vector<hipStream_t> streams(num_devices);
+  for (int d = 0; d < num_devices; ++d) {
+    HIP_CHECK(hipSetDevice(d));
+    HIP_CHECK(hipStreamCreate(&streams[d]));
+  }
+  HIP_CHECK(hipSetDevice(0));
+
+  const unsigned int numLists = num_elements;
+  const unsigned int ListLength = 32;
+
+  Node* pNodes = nullptr;               // storage for all list nodes
+  unsigned int* pAllocator = nullptr;   // next-free-node index used by the create kernel
+  unsigned int* pNumCorrect = nullptr;  // correct-node counter filled by the verify kernel
+  HIP_CHECK(hipHostMalloc(&pNodes, sizeof(Node) * ListLength * numLists, hipHostMallocNonCoherent));
+  HIP_CHECK(hipHostMalloc(&pAllocator, sizeof(unsigned int), hipHostMallocNonCoherent));
+  HIP_CHECK(hipHostMalloc(&pNumCorrect, sizeof(unsigned int), hipHostMallocNonCoherent));
+
+  // ci is the creation index and vi the verification index; each selects a device stream or,
+  // when equal to num_devices, the host. The lists are rebuilt for every (ci, vi) pair.
+  for (int ci = 0; ci < num_devices_plus_host; ++ci) {
+    const bool create_on_host = (ci == num_devices);
+    for (int vi = 0; vi < num_devices_plus_host; ++vi) {
+      const bool verify_on_host = (vi == num_devices);
+
+      if (create_on_host) {
+        create_linked_lists_on_host(pNodes, numLists, ListLength);
+      } else {
+        create_linked_lists_on_device(streams[ci], pNodes, pAllocator, numLists, ListLength);
+      }
+
+      if (verify_on_host) {
+        verify_linked_lists_on_host(pNodes, numLists, ListLength);
+      } else {
+        verify_linked_lists_on_device(streams[vi], pNodes, pNumCorrect, numLists, ListLength);
+      }
+    }
+  }
+
+  HIP_CHECK(hipHostFree(pNodes));
+  HIP_CHECK(hipHostFree(pAllocator));
+  HIP_CHECK(hipHostFree(pNumCorrect));
+  for (hipStream_t stream : streams) {
+    HIP_CHECK(hipStreamDestroy(stream));
+  }
+  REQUIRE(true);
+}
+
+/**
+* Test Description
+* ------------------------
+* - The suite will test the following functions,
+      align_malloc()
+      atomicAdd()(in kernel)
+      hipStreamCreate()
+      hipStreamSynchronize()
+*   It will test that all devices and the host share a common address space using fine-grain mode
+*   with regular host buffers.
+*   Concept: This is done by creating a linked list on a device and then verifying the
+*   correctness of the list on another device or the host.  This basic test is performed for all
+*   combinations of devices and the host that exist within the platform.  The test passes only if
+*   every combination passes.
+*   NOTE: the test body currently prints an "ignored" message and returns immediately (blocked
+*   by SWDEV-422544), so none of the steps described above are executed yet.
+* Test source
+* ------------------------
+* - catch/unit/memory/hipSVMTestSharedAddressSpaceFineGrain.cpp
+* Test requirements
+* ------------------------
+*  - Host specific (WINDOWS and LINUX)
+*  - System fine grain access supported on devices
+*  - HIP_VERSION >= 5.7
+*/
+TEST_CASE("test_svm_shared_address_space_fine_grain_system") {
+  fprintf(stderr, "test_svm_shared_address_space_fine_grain_system ignored\n");
+  return;// blocked by SWDEV-422544 (add HIP flag for APU device); all code below is unreachable until this return is removed
+  const unsigned int num_elements = 1024;
+  int num_devices = 0;
+  HIP_CHECK(hipGetDeviceCount(&num_devices));
+  int num_devices_plus_host = num_devices + 1;  // index num_devices stands for the host
+  std::vector<hipStream_t> streams(num_devices);  // one stream per device
+
+  for (int d = 0; d < num_devices; d++) {
+    HIP_CHECK(hipSetDevice(d));
+    HIP_CHECK(hipStreamCreate(&streams[d]));
+  }
+  HIP_CHECK(hipSetDevice(0));
+
+  unsigned int numLists = num_elements;
+  unsigned int ListLength = 32;
+
+  // This allocation holds the linked list nodes. NOTE(review): align_malloc/align_free are not
+  // defined in this file; presumably they come from hipSVMCommon.h with a 128-byte alignment
+  // argument -- confirm.
+  Node* pNodes = (Node*)align_malloc(numLists * ListLength * sizeof(Node), 128);
+  // This allocation holds an index into the nodes buffer; it is used for node allocation.
+  unsigned int* pAllocator = (unsigned int*)align_malloc(sizeof(unsigned int), 128);
+  // This allocation holds the count of correct nodes, which is computed by the verify kernel.
+  unsigned int* pNumCorrect = (unsigned int*)align_malloc(sizeof(unsigned int), 128);
+
+  // ci is CreationIndex: index of the device stream (or the host) that creates the linked lists.
+  for (int ci = 0; ci < num_devices_plus_host; ci++) {
+    // vi is VerificationIndex: index of the device stream (or the host) that verifies the lists.
+    for (int vi = 0; vi < num_devices_plus_host; vi++) {
+      if (ci == num_devices) // the last index represents the host, note the num_devices + 1 above.
+      {
+        create_linked_lists_on_host(pNodes, numLists, ListLength);
+      } else {
+        create_linked_lists_on_device(streams[ci], pNodes, pAllocator, numLists,
+                                             ListLength);
+      }
+
+      if (vi == num_devices) {
+        verify_linked_lists_on_host(pNodes, numLists, ListLength);
+      } else {
+        verify_linked_lists_on_device(streams[vi], pNodes, pNumCorrect, numLists,
+                                             ListLength);
+      }
+    }
+  }
+  align_free(pNodes);
+  align_free(pAllocator);
+  align_free(pNumCorrect);
+  for (int d = 0; d < num_devices; d++) {
+    HIP_CHECK(hipStreamDestroy(streams[d]));
+  }
+  REQUIRE(true);  // reached only if every create/verify combination above passed
+}