Merge pull request #31 from BKP/ipc_bringup_fine_unit_09-26-24

Add IPC Simple Buffer Fine-grained Unit Tests
2024-10-01 15:12:30 -05:00
@@ -21,7 +21,7 @@ cmake \
    -DUSE_IPC=ON \
    -DUSE_THREADS=ON \
    -DUSE_WF_COAL=OFF \
-    -DUSE_COHERENT_HEAP=ON \
+    -DUSE_COHERENT_HEAP=OFF \
    $src_path
 cmake --build . --parallel 8
 cmake --install .
@@ -21,7 +21,7 @@ cmake \
    -DUSE_IPC=OFF \
    -DUSE_THREADS=ON \
    -DUSE_WF_COAL=OFF \
-    -DUSE_COHERENT_HEAP=ON \
+    -DUSE_COHERENT_HEAP=OFF \
    $src_path
 cmake --build . --parallel 8
 cmake --install .
@@ -0,0 +1,141 @@
+/******************************************************************************
+ * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *****************************************************************************/
+
+#ifndef LIBRARY_SRC_ATOMIC_HPP
+#define LIBRARY_SRC_ATOMIC_HPP
+
+#include <hip/hip_runtime.h>
+
+namespace rocshmem {
+namespace detail {
+namespace atomic {
+
+typedef enum rocshmem_memory_scope {
+  memory_scope_thread = __HIP_MEMORY_SCOPE_SINGLETHREAD,
+  memory_scope_wavefront = __HIP_MEMORY_SCOPE_WAVEFRONT,
+  memory_scope_workgroup = __HIP_MEMORY_SCOPE_WORKGROUP,
+  memory_scope_agent = __HIP_MEMORY_SCOPE_AGENT,
+  memory_scope_system = __HIP_MEMORY_SCOPE_SYSTEM,
+} rocshmem_memory_scope;
+
+typedef enum rocshmem_memory_order {
+  memory_order_relaxed = __ATOMIC_RELAXED,
+  memory_order_consume = __ATOMIC_CONSUME,
+  memory_order_acquire = __ATOMIC_ACQUIRE,
+  memory_order_release = __ATOMIC_RELEASE,
+  memory_order_acq_rel = __ATOMIC_ACQ_REL,
+  memory_order_seq_cst = __ATOMIC_SEQ_CST
+} rocshmem_memory_order;
+
+struct rocshmem_memory_orders {
+  rocshmem_memory_order load {memory_order_acquire};
+  rocshmem_memory_order store {memory_order_release};
+  rocshmem_memory_order atomic {memory_order_acq_rel};
+  rocshmem_memory_order weak_cas_success {memory_order_acq_rel};
+  rocshmem_memory_order weak_cas_failure {memory_order_acq_rel};
+  rocshmem_memory_order strong_cas_success {memory_order_acq_rel};
+  rocshmem_memory_order strong_cas_failure {memory_order_acq_rel};
+};
+
+template <typename T, rocshmem_memory_scope s>
+__host__ __device__
+T load(const T* address, rocshmem_memory_orders o) {
+  return __hip_atomic_load(address, o.load, s);
+}
+
+template <typename T, rocshmem_memory_scope s>
+__host__ __device__
+void store(T* address, const T value, rocshmem_memory_orders o) {
+  return __hip_atomic_store(address, value, o.store, s);
+}
+
+template <typename T, rocshmem_memory_scope s>
+__host__ __device__
+bool compare_exchange_weak(T& expected, T desired, rocshmem_memory_orders o) {
+  return __hip_atomic_compare_exchange_weak(expected, desired, o.weak_cas_success, o.weak_cas_failure, s);
+}
+
+template <typename T, rocshmem_memory_scope s>
+__host__ __device__
+bool compare_exchange_strong(T& expected, T desired, rocshmem_memory_orders o) {
+  return __hip_atomic_compare_exchange_strong(expected, desired, o.strong_cas_success, o.strong_cas_failure, s);
+}
+
+template <class T, class U, rocshmem_memory_scope s>
+__host__ __device__
+T fetch_add(T* obj, U arg, rocshmem_memory_orders o) {
+  return __hip_atomic_fetch_add(obj, arg, o.atomic, s);
+}
+
+template <class T, class U, rocshmem_memory_scope s>
+__host__ __device__
+T fetch_sub(T* obj, U arg, rocshmem_memory_orders o) {
+  return __hip_atomic_fetch_sub(obj, arg, o.atomic, s);
+}
+
+template <class T, class U, rocshmem_memory_scope s>
+__host__ __device__
+T fetch_and(T* obj, U arg, rocshmem_memory_orders o) {
+  return __hip_atomic_fetch_and(obj, arg, o.atomic, s);
+}
+
+template <class T, class U, rocshmem_memory_scope s>
+__host__ __device__
+T fetch_or(T* obj, U arg, rocshmem_memory_orders o) {
+  return __hip_atomic_fetch_or(obj, arg, o, s);
+}
+
+template <class T, class U, rocshmem_memory_scope s>
+__host__ __device__
+T fetch_xor(T* obj, U arg, rocshmem_memory_orders o) {
+  return __hip_atomic_fetch_xor(obj, arg, o.atomic, s);
+}
+
+template <class T, class U, rocshmem_memory_scope s>
+__host__ __device__
+T fetch_max(T* obj, U arg, rocshmem_memory_orders o) {
+  return __hip_atomic_fetch_max(obj, arg, o.atomic, s);
+}
+
+template <class T, class U, rocshmem_memory_scope s>
+__host__ __device__
+T fetch_min(T* obj, U arg, rocshmem_memory_orders o) {
+  return __hip_atomic_fetch_min(obj, arg, o.atomic, s);
+}
+
+template <rocshmem_memory_scope s>
+__device__
+void threadfence() {
+  if constexpr (s == memory_scope_system) {
+    __threadfence_system();
+  } else if constexpr (s == memory_scope_agent) {
+    __threadfence();
+  } else if constexpr (s == memory_scope_workgroup) {
+    __threadfence_block();
+  }
+}
+
+} // namespace atomic
+} // namespace detail
+} // namespace rocshmem
+
+#endif  // LIBRARY_SRC_ATOMIC_HPP_
@@ -64,38 +64,38 @@ class IpcOnImpl {

  __device__ void ipcCopy_wave(void *dst, void *src, size_t size);

-  __device__ void ipcFence() { __threadfence(); }
+  __device__ void ipcFence() { __threadfence_system(); }

  template <typename T>
  __device__ T ipcAMOFetchAdd(T *val, T value) {
-    return __hip_atomic_fetch_add(val, value, __ATOMIC_RELAXED,
-                                  __HIP_MEMORY_SCOPE_AGENT);
+    return __hip_atomic_fetch_add(val, value, __ATOMIC_SEQ_CST,
+                                  __HIP_MEMORY_SCOPE_SYSTEM);
  }

  template <typename T>
  __device__ T ipcAMOFetchCas(T *val, T cond, T value) {
-    __hip_atomic_compare_exchange_strong(val, &cond, value, __ATOMIC_RELAXED,
-                                         __ATOMIC_RELAXED,
-                                         __HIP_MEMORY_SCOPE_AGENT);
+    __hip_atomic_compare_exchange_strong(val, &cond, value, __ATOMIC_SEQ_CST,
+                                         __ATOMIC_SEQ_CST,
+                                         __HIP_MEMORY_SCOPE_SYSTEM);
    return cond;
  }

  template <typename T>
  __device__ void ipcAMOAdd(T *val, T value) {
-    __hip_atomic_fetch_add(val, value, __ATOMIC_RELAXED,
-                           __HIP_MEMORY_SCOPE_AGENT);
+    __hip_atomic_fetch_add(val, value, __ATOMIC_SEQ_CST,
+                           __HIP_MEMORY_SCOPE_SYSTEM);
  }

  template <typename T>
  __device__ void ipcAMOCas(T *val, T cond, T value) {
-    __hip_atomic_compare_exchange_strong(val, &cond, value, __ATOMIC_RELAXED,
-                                         __ATOMIC_RELAXED,
-                                         __HIP_MEMORY_SCOPE_AGENT);
+    __hip_atomic_compare_exchange_strong(val, &cond, value, __ATOMIC_SEQ_CST,
+                                         __ATOMIC_SEQ_CST,
+                                         __HIP_MEMORY_SCOPE_SYSTEM);
  }

  template <typename T>
  __device__ void ipcAMOSet(T *val, T value) {
-    __hip_atomic_store(val, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+    __hip_atomic_store(val, value, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_SYSTEM);
  }

  __device__ void zero_byte_read(int pe) {
@@ -20,59 +20,96 @@
 * IN THE SOFTWARE.
 *****************************************************************************/

-/**
- * @file notifier.hpp
- *
- * @brief Contains the notification memory space for threads to communicate
- * results with one another.
- *
- * Assume one thread does work on behalf of other threads (as a leader) and
- * that work needs to be communicated to the other threads as a result.
- * To expose the result, the threads need to share a memory space where the
- * result can be written. The leader thread writes the result out to this
- * memory space and all threads synchronize on it.
- *
- * This class allows the leader thread to notify other threads of the update.
- */
-
 #ifndef LIBRARY_SRC_MEMORY_NOTIFIER_HPP_
 #define LIBRARY_SRC_MEMORY_NOTIFIER_HPP_

 #include "../device_proxy.hpp"
 #include "../util.hpp"
+#include "../atomic.hpp"

 namespace rocshmem {

+template<detail::atomic::rocshmem_memory_scope scope>
 class Notifier {
- public:
-  __device__ uint64_t read() { return value_; }

-  __device__ void write(uint64_t val) {
-    if (is_thread_zero_in_block()) {
-      value_ = val;
-    }
-    publish();
+ public:
+  __device__ uint64_t load() {
+    return detail::atomic::load<uint64_t, scope>(&value_, orders_);
  }

-  __device__ void done() { __syncthreads(); }
+  __device__ void store(uint64_t val) {
+    detail::atomic::store<uint64_t, scope>(&value_, val, orders_);
+  }

- private:
-  __device__ void publish() {
-    if (is_thread_zero_in_block()) {
-      __threadfence();
+  __device__ void fence() {
+    detail::atomic::threadfence<scope>();
+  }
+
+  __device__ void sync() {
+    if constexpr (scope == detail::atomic::memory_scope_thread ||
+                  scope == detail::atomic::memory_scope_wavefront) {
+      return;
+    }
+    if constexpr (scope == detail::atomic::memory_scope_workgroup) {
+      __syncthreads();
+      return;
+    }
+    if constexpr (scope == detail::atomic::memory_scope_system) {
+      assert(false);
+      return;
+    }
+
+    uint32_t done {signal_ + 1};
+    __syncthreads();
+
+    uint32_t retval {0};
+    bool executor {!threadIdx.x && !threadIdx.y && !threadIdx.z};
+    if (executor) {
+      retval = detail::atomic::fetch_add<uint32_t, uint32_t, scope>(&count_, 1, orders_);
+      fence();
+    }
+    __syncthreads();
+
+    if (retval == ((gridDim.x * gridDim.y * gridDim.z) - 1)) {
+      if (executor) {
+        detail::atomic::store<uint32_t, scope>(&count_, 0, orders_);
+        fence();
+        detail::atomic::fetch_add<uint32_t, uint32_t, scope>(&signal_, 1, orders_);
+      }
+    }
+
+    if (executor) {
+      while (detail::atomic::load<uint32_t, scope>(&signal_, orders_) != done) {
+        ;
+      }
    }
    __syncthreads();
  }

+ private:
+  detail::atomic::rocshmem_memory_orders orders_{};
+
  uint64_t value_{};
+
+  uint32_t signal_ {};
+
+  uint32_t count_ {};
 };

-template <typename ALLOCATOR>
+template <typename ALLOCATOR, detail::atomic::rocshmem_memory_scope scope>
 class NotifierProxy {
-  using ProxyT = DeviceProxy<ALLOCATOR, Notifier, 1>;
+  using ProxyT = DeviceProxy<ALLOCATOR, Notifier<scope>>;

 public:
-  __host__ __device__ Notifier* get() { return proxy_.get(); }
+  NotifierProxy() {
+    new (proxy_.get()) Notifier<scope>();
+  }
+
+  ~NotifierProxy() {
+    proxy_.get()->~Notifier<scope>();
+  }
+
+  __host__ __device__ Notifier<scope>* get() { return proxy_.get(); }

 private:
  ProxyT proxy_{};
@@ -75,9 +75,12 @@ __device__ void SlabHeap::malloc(void** ptr, size_t size) {
   * Notify other threads in block about the allocation result.
   */
  auto notifier{notifier_.get()};
-  notifier->write(ptr_deref_u64);
-  uint64_t notification_u64{notifier->read()};
-  notifier->done();
+  if (!threadIdx.x) {
+    notifier->store(ptr_deref_u64);
+    notifier->fence();
+  }
+  __syncthreads();
+  uint64_t notification_u64{notifier->load()};

  /*
   * Write to the ptr parameter (to return it back up the call stack).
@@ -48,7 +48,7 @@ class SlabHeap {
  /**
   * @brief Helper type for notifier
   */
-  using NOTIFIER_PROXY_T = NotifierProxy<HIPAllocator>;
+  using NOTIFIER_PROXY_T = NotifierProxy<HIPAllocator, detail::atomic::memory_scope_workgroup>;

  /**
   * @brief Helper type for notifier
@@ -93,6 +93,7 @@ __device__ __forceinline__ bool is_thread_zero_in_block() {
 __device__ __forceinline__ bool is_block_zero_in_grid() {
  return hipBlockIdx_x == 0 && hipBlockIdx_y == 0 && hipBlockIdx_z == 0;
 }
+
 /*
 * Returns the number of threads in the caller's flattened thread block.
 */
@@ -100,6 +101,13 @@ __device__ __forceinline__ int get_flat_block_size() {
  return hipBlockDim_x * hipBlockDim_y * hipBlockDim_z;
 }

+/*
+ * Returns the number of threads in the caller's flattened grid.
+ */
+__device__ __forceinline__ int get_flat_grid_size() {
+  return get_flat_block_size() * hipGridDim_x * hipGridDim_y * hipGridDim_z;
+}
+
 /*
 * Returns the flattened thread index of the calling thread within its
 * thread block.
@@ -90,6 +90,7 @@ target_sources(
    free_list_gtest.cpp
    #context_ipc_gtest.cpp
    ipc_impl_simple_coarse_gtest.cpp
+    ipc_impl_simple_fine_gtest.cpp
 )

 ###############################################################################
@@ -33,6 +33,7 @@ TEST_F(IPCImplSimpleCoarseTestFixture, MPI_num_pes) {
 }

 TEST_F(IPCImplSimpleCoarseTestFixture, IPC_bases) {
+  ASSERT_NE(ipc_impl_.ipc_bases, nullptr);
  for(int i{0}; i < mpi_.num_pes(); i++) {
    ASSERT_NE(ipc_impl_.ipc_bases[i], nullptr);
  }
@@ -218,8 +218,6 @@ class IPCImplSimpleCoarseTestFixture : public ::testing::Test {
  protected:
    std::vector<int> golden_;

-    std::vector<int> output_;
-
    HEAP_T heap_mem_ {};

    MPI_T mpi_ {heap_mem_.get_ptr(), heap_mem_.get_size()};
@@ -0,0 +1,344 @@
+/******************************************************************************
+ * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *****************************************************************************/
+
+#ifndef ROCSHMEM_IPC_IMPL_SIMPLE_FINE_GTEST_HPP
+#define ROCSHMEM_IPC_IMPL_SIMPLE_FINE_GTEST_HPP
+
+#include "gtest/gtest.h"
+
+#include <numeric>
+#include <mpi.h>
+
+#include "../src/atomic.hpp"
+#include "../src/ipc_policy.hpp"
+#include "../src/memory/notifier.hpp"
+#include "../src/memory/symmetric_heap.hpp"
+#include "../src/util.hpp"
+
+namespace rocshmem {
+
+enum TestType {
+    READ = 0,
+    WRITE = 1
+};
+
+const uint32_t SIGNAL_OFFSET {67108864};
+
+__device__
+void
+validator(bool *error, int *golden, int *dest, size_t bytes) {
+    size_t elements {bytes / sizeof(int)};
+    for (int i {get_flat_id()}; i < elements; i += get_flat_grid_size()) {
+        if (golden[i] != dest[i]) {
+            printf("golden[%d] %d != dest[%d] %d\n", i, golden[i], i, dest[i]);
+            *error = true;
+        }
+    }
+}
+
+template <typename NotifierT>
+__global__
+void
+kernel_put_with_signal_validator(bool *error, int *golden, int *dest, size_t bytes, NotifierT *notifier) {
+    detail::atomic::rocshmem_memory_orders orders{};
+    if (!get_flat_id()) {
+        while (detail::atomic::load<int, detail::atomic::memory_scope_system>(dest + SIGNAL_OFFSET, orders) == 0) {
+            ;
+        }
+    }
+    notifier->sync();
+    validator(error, golden, dest, bytes);
+}
+
+template <typename NotifierT>
+__global__
+void
+kernel_simple_fine_copy(IpcImpl *ipc_impl, bool *error, int *golden, int *src, int *dest, size_t bytes, TestType test, NotifierT *notifier) {
+    if (!get_flat_id()) {
+        ipc_impl->ipcCopy(dest, src, bytes);
+        ipc_impl->ipcFence();
+        if (test == WRITE) {
+            ipc_impl->ipcAMOFetchAdd(dest + SIGNAL_OFFSET, 1);
+        }
+    }
+    if (test == READ) {
+        notifier->sync();
+        validator(error, golden, dest, bytes);
+    }
+}
+
+template <typename NotifierT>
+__global__
+void
+kernel_simple_fine_copy_wg(IpcImpl *ipc_impl, bool *error, int *golden, int *src, int *dest, size_t bytes, TestType test, NotifierT *notifier) {
+    if (!blockIdx.x) {
+        ipc_impl->ipcCopy_wg(dest, src, bytes);
+        ipc_impl->ipcFence();
+        if (test == WRITE) {
+            if (!threadIdx.x) {
+                ipc_impl->ipcAMOFetchAdd(dest + SIGNAL_OFFSET, 1);
+            }
+        }
+    }
+    if (test == READ) {
+        notifier->sync();
+        validator(error, golden, dest, bytes);
+    }
+}
+
+template <typename NotifierT>
+__global__
+void
+kernel_simple_fine_copy_wave(IpcImpl *ipc_impl, bool *error, int *golden, int *src, int *dest, size_t bytes, TestType test, NotifierT *notifier) {
+    if (!blockIdx.x && threadIdx.x < 64) {
+        ipc_impl->ipcCopy_wave(dest, src, bytes);
+        ipc_impl->ipcFence();
+        if (test == WRITE) {
+            if (!threadIdx.x) {
+                ipc_impl->ipcAMOFetchAdd(dest + SIGNAL_OFFSET, 1);
+            }
+        }
+    }
+    __syncthreads();
+    if (test == READ) {
+        notifier->sync();
+        validator(error, golden, dest, bytes);
+    }
+}
+
+class IPCImplSimpleFineTestFixture : public ::testing::Test {
+    using HEAP_T = HeapMemory<HIPDefaultFinegrainedAllocator>;
+    using MPI_T = RemoteHeapInfo<CommunicatorMPI>;
+    using NotifierT = Notifier<detail::atomic::memory_scope_agent>;
+    using NotifierProxyT = NotifierProxy<HIPAllocator, detail::atomic::memory_scope_agent>;
+    using FN_T1 = void (*)(IpcImpl*, bool*, int*, int*, int*, size_t, TestType, NotifierT*);
+    using FN_T2 = void (*)(bool*, int*, int*, size_t, NotifierT*);
+
+  public:
+    IPCImplSimpleFineTestFixture() {
+        ipc_impl_.ipcHostInit(mpi_.my_pe(), mpi_.get_heap_bases() , MPI_COMM_WORLD);
+
+        assert(ipc_impl_dptr_ == nullptr);
+        hip_allocator_.allocate((void**)&ipc_impl_dptr_, sizeof(IpcImpl));
+        CHECK_HIP(hipMemcpy(ipc_impl_dptr_, &ipc_impl_, sizeof(IpcImpl), hipMemcpyHostToDevice));
+
+        assert(error_dptr_ == nullptr);
+        hip_allocator_.allocate((void**)&error_dptr_, sizeof(bool));
+        *error_dptr_ = false;
+    }
+
+    ~IPCImplSimpleFineTestFixture() {
+        if (ipc_impl_dptr_) {
+            hip_allocator_.deallocate(ipc_impl_dptr_);
+        }
+        if (error_dptr_) {
+            hip_allocator_.deallocate(error_dptr_);
+        }
+        if (golden_dptr_) {
+            hip_allocator_.deallocate(golden_dptr_);
+        }
+
+        ipc_impl_.ipcHostStop();
+    }
+
+    void launch(FN_T1 f, const dim3 grid, const dim3 block, int* src, int* dest, size_t bytes, TestType test) {
+        f<<<grid, block>>>(ipc_impl_dptr_, error_dptr_, golden_dptr_, src, dest, bytes, test, notifier_.get());
+        CHECK_HIP(hipStreamSynchronize(nullptr));
+    }
+
+    void launch(FN_T2 f, const dim3 grid, const dim3 block, int* dest, size_t bytes) {
+        f<<<grid, block>>>(error_dptr_, golden_dptr_, dest, bytes, notifier_.get());
+        CHECK_HIP(hipStreamSynchronize(nullptr));
+    }
+
+    void write(const dim3 grid, const dim3 block, size_t elems) {
+        iota_golden(elems);
+        initialize_signal(WRITE);
+        initialize_src_buffer(WRITE);
+        copy(WRITE, grid, block);
+        check_device_validation_errors(WRITE);
+    }
+
+    void write_wg(const dim3 grid, const dim3 block, size_t elems) {
+        iota_golden(elems);
+        initialize_signal(WRITE);
+        initialize_src_buffer(WRITE);
+        copy_wg(WRITE, grid, block);
+        check_device_validation_errors(WRITE);
+    }
+
+    void write_wave(const dim3 grid, const dim3 block, size_t elems) {
+        iota_golden(elems);
+        initialize_signal(WRITE);
+        initialize_src_buffer(WRITE);
+        copy_wave(WRITE, grid, block);
+        check_device_validation_errors(WRITE);
+    }
+
+    void read(const dim3 grid, const dim3 block, size_t elems) {
+        iota_golden(elems);
+        initialize_signal(READ);
+        initialize_src_buffer(READ);
+        copy(READ, grid, block);
+        check_device_validation_errors(READ);
+    }
+
+    void read_wg(const dim3 grid, const dim3 block, size_t elems) {
+        iota_golden(elems);
+        initialize_signal(READ);
+        initialize_src_buffer(READ);
+        copy_wg(READ, grid, block);
+        check_device_validation_errors(READ);
+    }
+
+    void read_wave(const dim3 grid, const dim3 block, size_t elems) {
+        iota_golden(elems);
+        initialize_signal(READ);
+        initialize_src_buffer(READ);
+        copy_wave(READ, grid, block);
+        check_device_validation_errors(READ);
+    }
+
+    void iota_golden(size_t elems) {
+        golden_.resize(elems);
+        std::iota(golden_.begin(), golden_.end(), 0);
+
+        assert(golden_dptr_ == nullptr);
+        size_t golden_dptr_bytes {golden_.size() * sizeof(int)};
+        hip_allocator_.allocate((void**)&golden_dptr_, golden_dptr_bytes);
+        CHECK_HIP(hipMemcpy(golden_dptr_, golden_.data(), golden_dptr_bytes, hipMemcpyHostToDevice));
+    }
+
+    void validate_golden(size_t elems) {
+        ASSERT_EQ(golden_.size(), elems);
+        for (int i{0}; i < golden_.size(); i++) {
+            ASSERT_EQ(golden_[i], i);
+        }
+    }
+
+    void initialize_signal(TestType test) {
+        bool is_write_test = test;
+        if (is_write_test && mpi_.my_pe() == 0) {
+            int *dest = reinterpret_cast<int*>(ipc_impl_.ipc_bases[1]);
+            *(dest + SIGNAL_OFFSET) = 0;
+        }
+    }
+
+    void initialize_src_buffer(TestType test) {
+        if (!pe_initializes_src_buffer(test)) {
+            return;
+        }
+        size_t bytes = golden_.size() * sizeof(int);
+        auto dev_src = reinterpret_cast<int*>(ipc_impl_.ipc_bases[mpi_.my_pe()]);
+        CHECK_HIP(hipMemcpy(dev_src, golden_.data(), bytes, hipMemcpyHostToDevice));
+    }
+
+    bool pe_initializes_src_buffer(TestType test) {
+        bool is_write_test = test;
+        bool is_read_test = !test;
+        return (is_write_test && mpi_.my_pe() == 0) ||
+               (is_read_test && mpi_.my_pe() == 1);
+    }
+
+    void execute(TestType test, FN_T1 fn, const dim3 grid, const dim3 block) {
+        size_t bytes = golden_.size() * sizeof(int);
+        if (mpi_.my_pe()) {
+            mpi_.barrier();
+            if (test == WRITE) {
+                int *dest = reinterpret_cast<int*>(ipc_impl_.ipc_bases[1]);
+                FN_T2 val_fn = kernel_put_with_signal_validator;
+                launch(val_fn, grid, block, dest, bytes);
+            }
+            mpi_.barrier();
+            return;
+        }
+        int *src{nullptr};
+        int *dest{nullptr};
+        if (test == WRITE) {
+            src = reinterpret_cast<int*>(ipc_impl_.ipc_bases[0]);
+            dest = reinterpret_cast<int*>(ipc_impl_.ipc_bases[1]);
+        } else {
+            src = reinterpret_cast<int*>(ipc_impl_.ipc_bases[1]);
+            dest = reinterpret_cast<int*>(ipc_impl_.ipc_bases[0]);
+        }
+        mpi_.barrier();
+        launch(fn, grid, block, src, dest, bytes, test);
+        mpi_.barrier();
+    }
+
+    void copy(TestType test, dim3 grid, dim3 block) {
+        execute(test, kernel_simple_fine_copy, grid, block);
+    }
+
+    void copy_wg(TestType test, dim3 grid, dim3 block) {
+        execute(test, kernel_simple_fine_copy_wg, grid, block);
+    }
+
+    void copy_wave(TestType test, dim3 grid, dim3 block) {
+        execute(test, kernel_simple_fine_copy_wave, grid, block);
+    }
+
+    void check_device_validation_errors(TestType test) {
+        if (!pe_validates_dest_buffer(test)) {
+            return;
+        }
+        ASSERT_EQ(*error_dptr_, false);
+    }
+
+    void validate_dest_buffer(TestType test) {
+        if (!pe_validates_dest_buffer(test)) {
+            return;
+        }
+
+        auto dev_dest = reinterpret_cast<int*>(ipc_impl_.ipc_bases[mpi_.my_pe()]);
+        for (int i{0}; i < golden_.size(); i++) {
+            ASSERT_EQ(golden_[i], dev_dest[i]);
+        }
+    }
+
+    bool pe_validates_dest_buffer(TestType test) {
+        return !pe_initializes_src_buffer(test);
+    }
+
+  protected:
+    HIPDefaultFinegrainedAllocator hip_allocator_ {};
+
+    NotifierProxyT notifier_ {};
+
+    HEAP_T heap_mem_ {};
+
+    MPI_T mpi_ {heap_mem_.get_ptr(), heap_mem_.get_size()};
+
+    std::vector<int> golden_;
+
+    int *golden_dptr_ {nullptr};
+
+    IpcImpl ipc_impl_ {};
+
+    IpcImpl *ipc_impl_dptr_ {nullptr};
+
+    bool *error_dptr_ {nullptr};
+};
+
+} // namespace rocshmem
+
+#endif  // ROCSHMEM_IPC_IMPL_SIMPLE_FINE_GTEST_HPP
@@ -28,30 +28,98 @@ using namespace rocshmem;
 ******************************* Fixture Tests *******************************
 *****************************************************************************/

-TEST_F(NotifierTestFixture, run_all_threads_once_1_1) {
+TEST_F(NotifierBlockTestFixture, run_all_threads_once_1_1) {
  run_all_threads_once(1, 1);
 }

-TEST_F(NotifierTestFixture, run_all_threads_once_2_1) {
+TEST_F(NotifierBlockTestFixture, run_all_threads_once_2_1) {
  run_all_threads_once(2, 1);
 }

-TEST_F(NotifierTestFixture, run_all_threads_once_64_1) {
+TEST_F(NotifierBlockTestFixture, run_all_threads_once_64_1) {
  run_all_threads_once(64, 1);
 }

-TEST_F(NotifierTestFixture, run_all_threads_once_128_1) {
+TEST_F(NotifierBlockTestFixture, run_all_threads_once_128_1) {
  run_all_threads_once(128, 1);
 }

-TEST_F(NotifierTestFixture, run_all_threads_once_256_1) {
+TEST_F(NotifierBlockTestFixture, run_all_threads_once_256_1) {
  run_all_threads_once(256, 1);
 }

-TEST_F(NotifierTestFixture, run_all_threads_once_512_1) {
+TEST_F(NotifierBlockTestFixture, run_all_threads_once_512_1) {
  run_all_threads_once(512, 1);
 }

-TEST_F(NotifierTestFixture, run_all_threads_once_1024_1) {
+TEST_F(NotifierBlockTestFixture, run_all_threads_once_1024_1) {
  run_all_threads_once(1024, 1);
 }
+
+TEST_F(NotifierAgentTestFixture, run_all_threads_once_1_1) {
+  run_all_threads_once(1, 1);
+}
+
+TEST_F(NotifierAgentTestFixture, run_all_threads_once_2_1) {
+  run_all_threads_once(2, 1);
+}
+
+TEST_F(NotifierAgentTestFixture, run_all_threads_once_64_1) {
+  run_all_threads_once(64, 1);
+}
+
+TEST_F(NotifierAgentTestFixture, run_all_threads_once_128_1) {
+  run_all_threads_once(128, 1);
+}
+
+TEST_F(NotifierAgentTestFixture, run_all_threads_once_256_1) {
+  run_all_threads_once(256, 1);
+}
+
+TEST_F(NotifierAgentTestFixture, run_all_threads_once_512_1) {
+  run_all_threads_once(512, 1);
+}
+
+TEST_F(NotifierAgentTestFixture, run_all_threads_once_1024_1) {
+  run_all_threads_once(1024, 1);
+}
+
+TEST_F(NotifierAgentTestFixture, run_all_threads_once_1_2) {
+  run_all_threads_once(1, 2);
+}
+
+TEST_F(NotifierAgentTestFixture, run_all_threads_once_1024_2) {
+  run_all_threads_once(1024, 2);
+}
+
+TEST_F(NotifierAgentTestFixture, run_all_threads_once_1_4) {
+  run_all_threads_once(1, 4);
+}
+
+TEST_F(NotifierAgentTestFixture, run_all_threads_once_1024_4) {
+  run_all_threads_once(1024, 4);
+}
+
+TEST_F(NotifierAgentTestFixture, run_all_threads_once_1_8) {
+  run_all_threads_once(1, 8);
+}
+
+TEST_F(NotifierAgentTestFixture, run_all_threads_once_1024_8) {
+  run_all_threads_once(1024, 8);
+}
+
+TEST_F(NotifierAgentTestFixture, run_all_threads_once_1_32) {
+  run_all_threads_once(1, 32);
+}
+
+TEST_F(NotifierAgentTestFixture, run_all_threads_once_1024_32) {
+  run_all_threads_once(1024, 32);
+}
+
+TEST_F(NotifierAgentTestFixture, run_all_threads_once_1_38) {
+  run_all_threads_once(1, 38);  // MI300 CPX
+}
+
+TEST_F(NotifierAgentTestFixture, run_all_threads_once_1024_38) {
+  run_all_threads_once(1024, 38);  // MI300 CPX
+}
@@ -44,66 +44,45 @@ static const uint64_t NOTIFIER_OFFSET {0x100B00};
 inline __device__
 void
 write_to_memory(uint8_t* raw_memory) {
-    auto thread_idx {get_flat_block_id()};
+    auto thread_idx {get_flat_id()};
    raw_memory[thread_idx] = THREAD_VALUE;
    __threadfence();
 }

+template <typename NotifierT>
 __global__
 void
 all_threads_once(uint8_t* raw_memory,
-                 Notifier* notifier) {
-    notifier->write(NOTIFIER_OFFSET);
-    uint64_t offset_u64 {notifier->read()};
-    notifier->done();
-
+                 NotifierT * notifier) {
+    if (!get_flat_id()) {
+      notifier->store(NOTIFIER_OFFSET);
+      notifier->fence();
+    }
+    notifier->sync();
+    uint64_t offset_u64 {notifier->load()};
    uint64_t raw_memory_u64 {reinterpret_cast<uint64_t>(raw_memory)};
    uint64_t address_u64 {raw_memory_u64 + offset_u64};
    uint8_t* address {reinterpret_cast<uint8_t*>(address_u64)};
    write_to_memory(address);
-    __syncthreads();
 }

-class NotifierTestFixture : public ::testing::Test {
-    using NotifierProxyT = NotifierProxy<HIPAllocator>;
-
+class NotifierBase : public ::testing::Test {
  public:
-    NotifierTestFixture() {
+    NotifierBase() {
        assert(raw_memory_ == nullptr);
        hip_allocator_.allocate((void**)&raw_memory_, GIBIBYTE_);
        assert(raw_memory_);
    }

-    ~NotifierTestFixture() {
+    ~NotifierBase() {
        if (raw_memory_) {
            hip_allocator_.deallocate(raw_memory_);
        }
    }

    void
-    run_all_threads_once(uint32_t x_block_dim,
-                         uint32_t x_grid_dim) {
-        const dim3 hip_blocksize(x_block_dim, 1, 1);
-        const dim3 hip_gridsize(x_grid_dim, 1, 1);
-
-        hipLaunchKernelGGL(all_threads_once,
-                           hip_gridsize,
-                           hip_blocksize,
-                           0,
-                           nullptr,
-                           raw_memory_,
-                           notifier_.get());
-
-        hipError_t return_code = hipStreamSynchronize(nullptr);
-        if (return_code != hipSuccess) {
-            printf("Failed in stream synchronize\n");
-            assert(return_code == hipSuccess);
-        }
-
-        size_t number_threads {x_block_dim * x_grid_dim};
-
+    verify(size_t number_threads) {
        uint8_t* offset_addr {compute_offset_addr()};
-
        for (size_t i {0}; i < number_threads; i++) {
            ASSERT_EQ(offset_addr[i], THREAD_VALUE);
        }
@@ -136,12 +115,51 @@ class NotifierTestFixture : public ::testing::Test {
     */
    uint8_t *raw_memory_ {nullptr};

+};
+
+class NotifierBlockTestFixture : public NotifierBase {
+    using NotifierT = Notifier<detail::atomic::memory_scope_workgroup>;
+    using NotifierProxyT = NotifierProxy<HIPAllocator, detail::atomic::memory_scope_workgroup>;
+
+  public:
+    void
+    run_all_threads_once(uint32_t x_block_dim,
+                         uint32_t x_grid_dim) {
+        new (notifier_.get()) NotifierT();
+        const dim3 block(x_block_dim, 1, 1);
+        const dim3 grid(x_grid_dim, 1, 1);
+        all_threads_once<NotifierT><<<grid, block>>>(raw_memory_, notifier_.get());
+        CHECK_HIP(hipStreamSynchronize(nullptr));
+        verify(x_block_dim * x_grid_dim);
+    }
+
    /**
     * @brief Used to broadcast base offset for writing.
     */
    NotifierProxyT notifier_ {};
 };

+class NotifierAgentTestFixture : public NotifierBase {
+    using NotifierT = Notifier<detail::atomic::memory_scope_agent>;
+    using NotifierProxyT = NotifierProxy<HIPAllocator, detail::atomic::memory_scope_agent>;
+
+  public:
+    void
+    run_all_threads_once(uint32_t x_block_dim,
+                         uint32_t x_grid_dim) {
+        new (notifier_.get()) NotifierT();
+        const dim3 block(x_block_dim, 1, 1);
+        const dim3 grid(x_grid_dim, 1, 1);
+        all_threads_once<NotifierT><<<grid, block>>>(raw_memory_, notifier_.get());
+        CHECK_HIP(hipStreamSynchronize(nullptr));
+        verify(x_block_dim * x_grid_dim);
+    }
+
+    /**
+     * @brief Used to broadcast base offset for writing.
+     */
+    NotifierProxyT notifier_ {};
+};

 } // namespace rocshmem