Merge pull request #31 from BKP/ipc_bringup_fine_unit_09-26-24
Add IPC Simple Buffer Fine-grained Unit Tests
Этот коммит содержится в:
@@ -21,7 +21,7 @@ cmake \
|
||||
-DUSE_IPC=ON \
|
||||
-DUSE_THREADS=ON \
|
||||
-DUSE_WF_COAL=OFF \
|
||||
-DUSE_COHERENT_HEAP=ON \
|
||||
-DUSE_COHERENT_HEAP=OFF \
|
||||
$src_path
|
||||
cmake --build . --parallel 8
|
||||
cmake --install .
|
||||
|
||||
@@ -21,7 +21,7 @@ cmake \
|
||||
-DUSE_IPC=OFF \
|
||||
-DUSE_THREADS=ON \
|
||||
-DUSE_WF_COAL=OFF \
|
||||
-DUSE_COHERENT_HEAP=ON \
|
||||
-DUSE_COHERENT_HEAP=OFF \
|
||||
$src_path
|
||||
cmake --build . --parallel 8
|
||||
cmake --install .
|
||||
|
||||
@@ -0,0 +1,141 @@
|
||||
/******************************************************************************
|
||||
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal in the Software without restriction, including without limitation the
|
||||
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
||||
* sell copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*****************************************************************************/
|
||||
|
||||
#ifndef LIBRARY_SRC_ATOMIC_HPP
|
||||
#define LIBRARY_SRC_ATOMIC_HPP
|
||||
|
||||
#include <hip/hip_runtime.h>
|
||||
|
||||
namespace rocshmem {
|
||||
namespace detail {
|
||||
namespace atomic {
|
||||
|
||||
typedef enum rocshmem_memory_scope {
|
||||
memory_scope_thread = __HIP_MEMORY_SCOPE_SINGLETHREAD,
|
||||
memory_scope_wavefront = __HIP_MEMORY_SCOPE_WAVEFRONT,
|
||||
memory_scope_workgroup = __HIP_MEMORY_SCOPE_WORKGROUP,
|
||||
memory_scope_agent = __HIP_MEMORY_SCOPE_AGENT,
|
||||
memory_scope_system = __HIP_MEMORY_SCOPE_SYSTEM,
|
||||
} rocshmem_memory_scope;
|
||||
|
||||
typedef enum rocshmem_memory_order {
|
||||
memory_order_relaxed = __ATOMIC_RELAXED,
|
||||
memory_order_consume = __ATOMIC_CONSUME,
|
||||
memory_order_acquire = __ATOMIC_ACQUIRE,
|
||||
memory_order_release = __ATOMIC_RELEASE,
|
||||
memory_order_acq_rel = __ATOMIC_ACQ_REL,
|
||||
memory_order_seq_cst = __ATOMIC_SEQ_CST
|
||||
} rocshmem_memory_order;
|
||||
|
||||
struct rocshmem_memory_orders {
|
||||
rocshmem_memory_order load {memory_order_acquire};
|
||||
rocshmem_memory_order store {memory_order_release};
|
||||
rocshmem_memory_order atomic {memory_order_acq_rel};
|
||||
rocshmem_memory_order weak_cas_success {memory_order_acq_rel};
|
||||
rocshmem_memory_order weak_cas_failure {memory_order_acq_rel};
|
||||
rocshmem_memory_order strong_cas_success {memory_order_acq_rel};
|
||||
rocshmem_memory_order strong_cas_failure {memory_order_acq_rel};
|
||||
};
|
||||
|
||||
template <typename T, rocshmem_memory_scope s>
|
||||
__host__ __device__
|
||||
T load(const T* address, rocshmem_memory_orders o) {
|
||||
return __hip_atomic_load(address, o.load, s);
|
||||
}
|
||||
|
||||
template <typename T, rocshmem_memory_scope s>
|
||||
__host__ __device__
|
||||
void store(T* address, const T value, rocshmem_memory_orders o) {
|
||||
return __hip_atomic_store(address, value, o.store, s);
|
||||
}
|
||||
|
||||
template <typename T, rocshmem_memory_scope s>
|
||||
__host__ __device__
|
||||
bool compare_exchange_weak(T& expected, T desired, rocshmem_memory_orders o) {
|
||||
return __hip_atomic_compare_exchange_weak(expected, desired, o.weak_cas_success, o.weak_cas_failure, s);
|
||||
}
|
||||
|
||||
template <typename T, rocshmem_memory_scope s>
|
||||
__host__ __device__
|
||||
bool compare_exchange_strong(T& expected, T desired, rocshmem_memory_orders o) {
|
||||
return __hip_atomic_compare_exchange_strong(expected, desired, o.strong_cas_success, o.strong_cas_failure, s);
|
||||
}
|
||||
|
||||
template <class T, class U, rocshmem_memory_scope s>
|
||||
__host__ __device__
|
||||
T fetch_add(T* obj, U arg, rocshmem_memory_orders o) {
|
||||
return __hip_atomic_fetch_add(obj, arg, o.atomic, s);
|
||||
}
|
||||
|
||||
template <class T, class U, rocshmem_memory_scope s>
|
||||
__host__ __device__
|
||||
T fetch_sub(T* obj, U arg, rocshmem_memory_orders o) {
|
||||
return __hip_atomic_fetch_sub(obj, arg, o.atomic, s);
|
||||
}
|
||||
|
||||
template <class T, class U, rocshmem_memory_scope s>
|
||||
__host__ __device__
|
||||
T fetch_and(T* obj, U arg, rocshmem_memory_orders o) {
|
||||
return __hip_atomic_fetch_and(obj, arg, o.atomic, s);
|
||||
}
|
||||
|
||||
template <class T, class U, rocshmem_memory_scope s>
|
||||
__host__ __device__
|
||||
T fetch_or(T* obj, U arg, rocshmem_memory_orders o) {
|
||||
return __hip_atomic_fetch_or(obj, arg, o, s);
|
||||
}
|
||||
|
||||
template <class T, class U, rocshmem_memory_scope s>
|
||||
__host__ __device__
|
||||
T fetch_xor(T* obj, U arg, rocshmem_memory_orders o) {
|
||||
return __hip_atomic_fetch_xor(obj, arg, o.atomic, s);
|
||||
}
|
||||
|
||||
template <class T, class U, rocshmem_memory_scope s>
|
||||
__host__ __device__
|
||||
T fetch_max(T* obj, U arg, rocshmem_memory_orders o) {
|
||||
return __hip_atomic_fetch_max(obj, arg, o.atomic, s);
|
||||
}
|
||||
|
||||
template <class T, class U, rocshmem_memory_scope s>
|
||||
__host__ __device__
|
||||
T fetch_min(T* obj, U arg, rocshmem_memory_orders o) {
|
||||
return __hip_atomic_fetch_min(obj, arg, o.atomic, s);
|
||||
}
|
||||
|
||||
template <rocshmem_memory_scope s>
|
||||
__device__
|
||||
void threadfence() {
|
||||
if constexpr (s == memory_scope_system) {
|
||||
__threadfence_system();
|
||||
} else if constexpr (s == memory_scope_agent) {
|
||||
__threadfence();
|
||||
} else if constexpr (s == memory_scope_workgroup) {
|
||||
__threadfence_block();
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace atomic
|
||||
} // namespace detail
|
||||
} // namespace rocshmem
|
||||
|
||||
#endif // LIBRARY_SRC_ATOMIC_HPP_
|
||||
+12
-12
@@ -64,38 +64,38 @@ class IpcOnImpl {
|
||||
|
||||
__device__ void ipcCopy_wave(void *dst, void *src, size_t size);
|
||||
|
||||
__device__ void ipcFence() { __threadfence(); }
|
||||
__device__ void ipcFence() { __threadfence_system(); }
|
||||
|
||||
template <typename T>
|
||||
__device__ T ipcAMOFetchAdd(T *val, T value) {
|
||||
return __hip_atomic_fetch_add(val, value, __ATOMIC_RELAXED,
|
||||
__HIP_MEMORY_SCOPE_AGENT);
|
||||
return __hip_atomic_fetch_add(val, value, __ATOMIC_SEQ_CST,
|
||||
__HIP_MEMORY_SCOPE_SYSTEM);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ T ipcAMOFetchCas(T *val, T cond, T value) {
|
||||
__hip_atomic_compare_exchange_strong(val, &cond, value, __ATOMIC_RELAXED,
|
||||
__ATOMIC_RELAXED,
|
||||
__HIP_MEMORY_SCOPE_AGENT);
|
||||
__hip_atomic_compare_exchange_strong(val, &cond, value, __ATOMIC_SEQ_CST,
|
||||
__ATOMIC_SEQ_CST,
|
||||
__HIP_MEMORY_SCOPE_SYSTEM);
|
||||
return cond;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ void ipcAMOAdd(T *val, T value) {
|
||||
__hip_atomic_fetch_add(val, value, __ATOMIC_RELAXED,
|
||||
__HIP_MEMORY_SCOPE_AGENT);
|
||||
__hip_atomic_fetch_add(val, value, __ATOMIC_SEQ_CST,
|
||||
__HIP_MEMORY_SCOPE_SYSTEM);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ void ipcAMOCas(T *val, T cond, T value) {
|
||||
__hip_atomic_compare_exchange_strong(val, &cond, value, __ATOMIC_RELAXED,
|
||||
__ATOMIC_RELAXED,
|
||||
__HIP_MEMORY_SCOPE_AGENT);
|
||||
__hip_atomic_compare_exchange_strong(val, &cond, value, __ATOMIC_SEQ_CST,
|
||||
__ATOMIC_SEQ_CST,
|
||||
__HIP_MEMORY_SCOPE_SYSTEM);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ void ipcAMOSet(T *val, T value) {
|
||||
__hip_atomic_store(val, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
|
||||
__hip_atomic_store(val, value, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_SYSTEM);
|
||||
}
|
||||
|
||||
__device__ void zero_byte_read(int pe) {
|
||||
|
||||
@@ -20,59 +20,96 @@
|
||||
* IN THE SOFTWARE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**
|
||||
* @file notifier.hpp
|
||||
*
|
||||
* @brief Contains the notification memory space for threads to communicate
|
||||
* results with one another.
|
||||
*
|
||||
* Assume one thread does work on behalf of other threads (as a leader) and
|
||||
* that work needs to be communicated to the other threads as a result.
|
||||
* To expose the result, the threads need to share a memory space where the
|
||||
* result can be written. The leader thread writes the result out to this
|
||||
* memory space and all threads synchronize on it.
|
||||
*
|
||||
* This class allows the leader thread to notify other threads of the update.
|
||||
*/
|
||||
|
||||
#ifndef LIBRARY_SRC_MEMORY_NOTIFIER_HPP_
|
||||
#define LIBRARY_SRC_MEMORY_NOTIFIER_HPP_
|
||||
|
||||
#include "../device_proxy.hpp"
|
||||
#include "../util.hpp"
|
||||
#include "../atomic.hpp"
|
||||
|
||||
namespace rocshmem {
|
||||
|
||||
template<detail::atomic::rocshmem_memory_scope scope>
|
||||
class Notifier {
|
||||
public:
|
||||
__device__ uint64_t read() { return value_; }
|
||||
|
||||
__device__ void write(uint64_t val) {
|
||||
if (is_thread_zero_in_block()) {
|
||||
value_ = val;
|
||||
}
|
||||
publish();
|
||||
public:
|
||||
__device__ uint64_t load() {
|
||||
return detail::atomic::load<uint64_t, scope>(&value_, orders_);
|
||||
}
|
||||
|
||||
__device__ void done() { __syncthreads(); }
|
||||
__device__ void store(uint64_t val) {
|
||||
detail::atomic::store<uint64_t, scope>(&value_, val, orders_);
|
||||
}
|
||||
|
||||
private:
|
||||
__device__ void publish() {
|
||||
if (is_thread_zero_in_block()) {
|
||||
__threadfence();
|
||||
__device__ void fence() {
|
||||
detail::atomic::threadfence<scope>();
|
||||
}
|
||||
|
||||
__device__ void sync() {
|
||||
if constexpr (scope == detail::atomic::memory_scope_thread ||
|
||||
scope == detail::atomic::memory_scope_wavefront) {
|
||||
return;
|
||||
}
|
||||
if constexpr (scope == detail::atomic::memory_scope_workgroup) {
|
||||
__syncthreads();
|
||||
return;
|
||||
}
|
||||
if constexpr (scope == detail::atomic::memory_scope_system) {
|
||||
assert(false);
|
||||
return;
|
||||
}
|
||||
|
||||
uint32_t done {signal_ + 1};
|
||||
__syncthreads();
|
||||
|
||||
uint32_t retval {0};
|
||||
bool executor {!threadIdx.x && !threadIdx.y && !threadIdx.z};
|
||||
if (executor) {
|
||||
retval = detail::atomic::fetch_add<uint32_t, uint32_t, scope>(&count_, 1, orders_);
|
||||
fence();
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
if (retval == ((gridDim.x * gridDim.y * gridDim.z) - 1)) {
|
||||
if (executor) {
|
||||
detail::atomic::store<uint32_t, scope>(&count_, 0, orders_);
|
||||
fence();
|
||||
detail::atomic::fetch_add<uint32_t, uint32_t, scope>(&signal_, 1, orders_);
|
||||
}
|
||||
}
|
||||
|
||||
if (executor) {
|
||||
while (detail::atomic::load<uint32_t, scope>(&signal_, orders_) != done) {
|
||||
;
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
private:
|
||||
detail::atomic::rocshmem_memory_orders orders_{};
|
||||
|
||||
uint64_t value_{};
|
||||
|
||||
uint32_t signal_ {};
|
||||
|
||||
uint32_t count_ {};
|
||||
};
|
||||
|
||||
template <typename ALLOCATOR>
|
||||
template <typename ALLOCATOR, detail::atomic::rocshmem_memory_scope scope>
|
||||
class NotifierProxy {
|
||||
using ProxyT = DeviceProxy<ALLOCATOR, Notifier, 1>;
|
||||
using ProxyT = DeviceProxy<ALLOCATOR, Notifier<scope>>;
|
||||
|
||||
public:
|
||||
__host__ __device__ Notifier* get() { return proxy_.get(); }
|
||||
NotifierProxy() {
|
||||
new (proxy_.get()) Notifier<scope>();
|
||||
}
|
||||
|
||||
~NotifierProxy() {
|
||||
proxy_.get()->~Notifier<scope>();
|
||||
}
|
||||
|
||||
__host__ __device__ Notifier<scope>* get() { return proxy_.get(); }
|
||||
|
||||
private:
|
||||
ProxyT proxy_{};
|
||||
|
||||
@@ -75,9 +75,12 @@ __device__ void SlabHeap::malloc(void** ptr, size_t size) {
|
||||
* Notify other threads in block about the allocation result.
|
||||
*/
|
||||
auto notifier{notifier_.get()};
|
||||
notifier->write(ptr_deref_u64);
|
||||
uint64_t notification_u64{notifier->read()};
|
||||
notifier->done();
|
||||
if (!threadIdx.x) {
|
||||
notifier->store(ptr_deref_u64);
|
||||
notifier->fence();
|
||||
}
|
||||
__syncthreads();
|
||||
uint64_t notification_u64{notifier->load()};
|
||||
|
||||
/*
|
||||
* Write to the ptr parameter (to return it back up the call stack).
|
||||
|
||||
@@ -48,7 +48,7 @@ class SlabHeap {
|
||||
/**
|
||||
* @brief Helper type for notifier
|
||||
*/
|
||||
using NOTIFIER_PROXY_T = NotifierProxy<HIPAllocator>;
|
||||
using NOTIFIER_PROXY_T = NotifierProxy<HIPAllocator, detail::atomic::memory_scope_workgroup>;
|
||||
|
||||
/**
|
||||
* @brief Helper type for notifier
|
||||
|
||||
@@ -93,6 +93,7 @@ __device__ __forceinline__ bool is_thread_zero_in_block() {
|
||||
__device__ __forceinline__ bool is_block_zero_in_grid() {
|
||||
return hipBlockIdx_x == 0 && hipBlockIdx_y == 0 && hipBlockIdx_z == 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Returns the number of threads in the caller's flattened thread block.
|
||||
*/
|
||||
@@ -100,6 +101,13 @@ __device__ __forceinline__ int get_flat_block_size() {
|
||||
return hipBlockDim_x * hipBlockDim_y * hipBlockDim_z;
|
||||
}
|
||||
|
||||
/*
|
||||
* Returns the number of threads in the caller's flattened grid.
|
||||
*/
|
||||
__device__ __forceinline__ int get_flat_grid_size() {
|
||||
return get_flat_block_size() * hipGridDim_x * hipGridDim_y * hipGridDim_z;
|
||||
}
|
||||
|
||||
/*
|
||||
* Returns the flattened thread index of the calling thread within its
|
||||
* thread block.
|
||||
|
||||
@@ -90,6 +90,7 @@ target_sources(
|
||||
free_list_gtest.cpp
|
||||
#context_ipc_gtest.cpp
|
||||
ipc_impl_simple_coarse_gtest.cpp
|
||||
ipc_impl_simple_fine_gtest.cpp
|
||||
)
|
||||
|
||||
###############################################################################
|
||||
|
||||
@@ -33,6 +33,7 @@ TEST_F(IPCImplSimpleCoarseTestFixture, MPI_num_pes) {
|
||||
}
|
||||
|
||||
TEST_F(IPCImplSimpleCoarseTestFixture, IPC_bases) {
|
||||
ASSERT_NE(ipc_impl_.ipc_bases, nullptr);
|
||||
for(int i{0}; i < mpi_.num_pes(); i++) {
|
||||
ASSERT_NE(ipc_impl_.ipc_bases[i], nullptr);
|
||||
}
|
||||
|
||||
@@ -218,8 +218,6 @@ class IPCImplSimpleCoarseTestFixture : public ::testing::Test {
|
||||
protected:
|
||||
std::vector<int> golden_;
|
||||
|
||||
std::vector<int> output_;
|
||||
|
||||
HEAP_T heap_mem_ {};
|
||||
|
||||
MPI_T mpi_ {heap_mem_.get_ptr(), heap_mem_.get_size()};
|
||||
|
||||
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
@@ -0,0 +1,344 @@
|
||||
/******************************************************************************
|
||||
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal in the Software without restriction, including without limitation the
|
||||
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
||||
* sell copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*****************************************************************************/
|
||||
|
||||
#ifndef ROCSHMEM_IPC_IMPL_SIMPLE_FINE_GTEST_HPP
|
||||
#define ROCSHMEM_IPC_IMPL_SIMPLE_FINE_GTEST_HPP
|
||||
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
#include <numeric>
|
||||
#include <mpi.h>
|
||||
|
||||
#include "../src/atomic.hpp"
|
||||
#include "../src/ipc_policy.hpp"
|
||||
#include "../src/memory/notifier.hpp"
|
||||
#include "../src/memory/symmetric_heap.hpp"
|
||||
#include "../src/util.hpp"
|
||||
|
||||
namespace rocshmem {
|
||||
|
||||
enum TestType {
|
||||
READ = 0,
|
||||
WRITE = 1
|
||||
};
|
||||
|
||||
const uint32_t SIGNAL_OFFSET {67108864};
|
||||
|
||||
__device__
|
||||
void
|
||||
validator(bool *error, int *golden, int *dest, size_t bytes) {
|
||||
size_t elements {bytes / sizeof(int)};
|
||||
for (int i {get_flat_id()}; i < elements; i += get_flat_grid_size()) {
|
||||
if (golden[i] != dest[i]) {
|
||||
printf("golden[%d] %d != dest[%d] %d\n", i, golden[i], i, dest[i]);
|
||||
*error = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename NotifierT>
|
||||
__global__
|
||||
void
|
||||
kernel_put_with_signal_validator(bool *error, int *golden, int *dest, size_t bytes, NotifierT *notifier) {
|
||||
detail::atomic::rocshmem_memory_orders orders{};
|
||||
if (!get_flat_id()) {
|
||||
while (detail::atomic::load<int, detail::atomic::memory_scope_system>(dest + SIGNAL_OFFSET, orders) == 0) {
|
||||
;
|
||||
}
|
||||
}
|
||||
notifier->sync();
|
||||
validator(error, golden, dest, bytes);
|
||||
}
|
||||
|
||||
template <typename NotifierT>
|
||||
__global__
|
||||
void
|
||||
kernel_simple_fine_copy(IpcImpl *ipc_impl, bool *error, int *golden, int *src, int *dest, size_t bytes, TestType test, NotifierT *notifier) {
|
||||
if (!get_flat_id()) {
|
||||
ipc_impl->ipcCopy(dest, src, bytes);
|
||||
ipc_impl->ipcFence();
|
||||
if (test == WRITE) {
|
||||
ipc_impl->ipcAMOFetchAdd(dest + SIGNAL_OFFSET, 1);
|
||||
}
|
||||
}
|
||||
if (test == READ) {
|
||||
notifier->sync();
|
||||
validator(error, golden, dest, bytes);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename NotifierT>
|
||||
__global__
|
||||
void
|
||||
kernel_simple_fine_copy_wg(IpcImpl *ipc_impl, bool *error, int *golden, int *src, int *dest, size_t bytes, TestType test, NotifierT *notifier) {
|
||||
if (!blockIdx.x) {
|
||||
ipc_impl->ipcCopy_wg(dest, src, bytes);
|
||||
ipc_impl->ipcFence();
|
||||
if (test == WRITE) {
|
||||
if (!threadIdx.x) {
|
||||
ipc_impl->ipcAMOFetchAdd(dest + SIGNAL_OFFSET, 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (test == READ) {
|
||||
notifier->sync();
|
||||
validator(error, golden, dest, bytes);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename NotifierT>
|
||||
__global__
|
||||
void
|
||||
kernel_simple_fine_copy_wave(IpcImpl *ipc_impl, bool *error, int *golden, int *src, int *dest, size_t bytes, TestType test, NotifierT *notifier) {
|
||||
if (!blockIdx.x && threadIdx.x < 64) {
|
||||
ipc_impl->ipcCopy_wave(dest, src, bytes);
|
||||
ipc_impl->ipcFence();
|
||||
if (test == WRITE) {
|
||||
if (!threadIdx.x) {
|
||||
ipc_impl->ipcAMOFetchAdd(dest + SIGNAL_OFFSET, 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
if (test == READ) {
|
||||
notifier->sync();
|
||||
validator(error, golden, dest, bytes);
|
||||
}
|
||||
}
|
||||
|
||||
class IPCImplSimpleFineTestFixture : public ::testing::Test {
|
||||
using HEAP_T = HeapMemory<HIPDefaultFinegrainedAllocator>;
|
||||
using MPI_T = RemoteHeapInfo<CommunicatorMPI>;
|
||||
using NotifierT = Notifier<detail::atomic::memory_scope_agent>;
|
||||
using NotifierProxyT = NotifierProxy<HIPAllocator, detail::atomic::memory_scope_agent>;
|
||||
using FN_T1 = void (*)(IpcImpl*, bool*, int*, int*, int*, size_t, TestType, NotifierT*);
|
||||
using FN_T2 = void (*)(bool*, int*, int*, size_t, NotifierT*);
|
||||
|
||||
public:
|
||||
IPCImplSimpleFineTestFixture() {
|
||||
ipc_impl_.ipcHostInit(mpi_.my_pe(), mpi_.get_heap_bases() , MPI_COMM_WORLD);
|
||||
|
||||
assert(ipc_impl_dptr_ == nullptr);
|
||||
hip_allocator_.allocate((void**)&ipc_impl_dptr_, sizeof(IpcImpl));
|
||||
CHECK_HIP(hipMemcpy(ipc_impl_dptr_, &ipc_impl_, sizeof(IpcImpl), hipMemcpyHostToDevice));
|
||||
|
||||
assert(error_dptr_ == nullptr);
|
||||
hip_allocator_.allocate((void**)&error_dptr_, sizeof(bool));
|
||||
*error_dptr_ = false;
|
||||
}
|
||||
|
||||
~IPCImplSimpleFineTestFixture() {
|
||||
if (ipc_impl_dptr_) {
|
||||
hip_allocator_.deallocate(ipc_impl_dptr_);
|
||||
}
|
||||
if (error_dptr_) {
|
||||
hip_allocator_.deallocate(error_dptr_);
|
||||
}
|
||||
if (golden_dptr_) {
|
||||
hip_allocator_.deallocate(golden_dptr_);
|
||||
}
|
||||
|
||||
ipc_impl_.ipcHostStop();
|
||||
}
|
||||
|
||||
void launch(FN_T1 f, const dim3 grid, const dim3 block, int* src, int* dest, size_t bytes, TestType test) {
|
||||
f<<<grid, block>>>(ipc_impl_dptr_, error_dptr_, golden_dptr_, src, dest, bytes, test, notifier_.get());
|
||||
CHECK_HIP(hipStreamSynchronize(nullptr));
|
||||
}
|
||||
|
||||
void launch(FN_T2 f, const dim3 grid, const dim3 block, int* dest, size_t bytes) {
|
||||
f<<<grid, block>>>(error_dptr_, golden_dptr_, dest, bytes, notifier_.get());
|
||||
CHECK_HIP(hipStreamSynchronize(nullptr));
|
||||
}
|
||||
|
||||
void write(const dim3 grid, const dim3 block, size_t elems) {
|
||||
iota_golden(elems);
|
||||
initialize_signal(WRITE);
|
||||
initialize_src_buffer(WRITE);
|
||||
copy(WRITE, grid, block);
|
||||
check_device_validation_errors(WRITE);
|
||||
}
|
||||
|
||||
void write_wg(const dim3 grid, const dim3 block, size_t elems) {
|
||||
iota_golden(elems);
|
||||
initialize_signal(WRITE);
|
||||
initialize_src_buffer(WRITE);
|
||||
copy_wg(WRITE, grid, block);
|
||||
check_device_validation_errors(WRITE);
|
||||
}
|
||||
|
||||
void write_wave(const dim3 grid, const dim3 block, size_t elems) {
|
||||
iota_golden(elems);
|
||||
initialize_signal(WRITE);
|
||||
initialize_src_buffer(WRITE);
|
||||
copy_wave(WRITE, grid, block);
|
||||
check_device_validation_errors(WRITE);
|
||||
}
|
||||
|
||||
void read(const dim3 grid, const dim3 block, size_t elems) {
|
||||
iota_golden(elems);
|
||||
initialize_signal(READ);
|
||||
initialize_src_buffer(READ);
|
||||
copy(READ, grid, block);
|
||||
check_device_validation_errors(READ);
|
||||
}
|
||||
|
||||
void read_wg(const dim3 grid, const dim3 block, size_t elems) {
|
||||
iota_golden(elems);
|
||||
initialize_signal(READ);
|
||||
initialize_src_buffer(READ);
|
||||
copy_wg(READ, grid, block);
|
||||
check_device_validation_errors(READ);
|
||||
}
|
||||
|
||||
void read_wave(const dim3 grid, const dim3 block, size_t elems) {
|
||||
iota_golden(elems);
|
||||
initialize_signal(READ);
|
||||
initialize_src_buffer(READ);
|
||||
copy_wave(READ, grid, block);
|
||||
check_device_validation_errors(READ);
|
||||
}
|
||||
|
||||
void iota_golden(size_t elems) {
|
||||
golden_.resize(elems);
|
||||
std::iota(golden_.begin(), golden_.end(), 0);
|
||||
|
||||
assert(golden_dptr_ == nullptr);
|
||||
size_t golden_dptr_bytes {golden_.size() * sizeof(int)};
|
||||
hip_allocator_.allocate((void**)&golden_dptr_, golden_dptr_bytes);
|
||||
CHECK_HIP(hipMemcpy(golden_dptr_, golden_.data(), golden_dptr_bytes, hipMemcpyHostToDevice));
|
||||
}
|
||||
|
||||
void validate_golden(size_t elems) {
|
||||
ASSERT_EQ(golden_.size(), elems);
|
||||
for (int i{0}; i < golden_.size(); i++) {
|
||||
ASSERT_EQ(golden_[i], i);
|
||||
}
|
||||
}
|
||||
|
||||
void initialize_signal(TestType test) {
|
||||
bool is_write_test = test;
|
||||
if (is_write_test && mpi_.my_pe() == 0) {
|
||||
int *dest = reinterpret_cast<int*>(ipc_impl_.ipc_bases[1]);
|
||||
*(dest + SIGNAL_OFFSET) = 0;
|
||||
}
|
||||
}
|
||||
|
||||
void initialize_src_buffer(TestType test) {
|
||||
if (!pe_initializes_src_buffer(test)) {
|
||||
return;
|
||||
}
|
||||
size_t bytes = golden_.size() * sizeof(int);
|
||||
auto dev_src = reinterpret_cast<int*>(ipc_impl_.ipc_bases[mpi_.my_pe()]);
|
||||
CHECK_HIP(hipMemcpy(dev_src, golden_.data(), bytes, hipMemcpyHostToDevice));
|
||||
}
|
||||
|
||||
bool pe_initializes_src_buffer(TestType test) {
|
||||
bool is_write_test = test;
|
||||
bool is_read_test = !test;
|
||||
return (is_write_test && mpi_.my_pe() == 0) ||
|
||||
(is_read_test && mpi_.my_pe() == 1);
|
||||
}
|
||||
|
||||
void execute(TestType test, FN_T1 fn, const dim3 grid, const dim3 block) {
|
||||
size_t bytes = golden_.size() * sizeof(int);
|
||||
if (mpi_.my_pe()) {
|
||||
mpi_.barrier();
|
||||
if (test == WRITE) {
|
||||
int *dest = reinterpret_cast<int*>(ipc_impl_.ipc_bases[1]);
|
||||
FN_T2 val_fn = kernel_put_with_signal_validator;
|
||||
launch(val_fn, grid, block, dest, bytes);
|
||||
}
|
||||
mpi_.barrier();
|
||||
return;
|
||||
}
|
||||
int *src{nullptr};
|
||||
int *dest{nullptr};
|
||||
if (test == WRITE) {
|
||||
src = reinterpret_cast<int*>(ipc_impl_.ipc_bases[0]);
|
||||
dest = reinterpret_cast<int*>(ipc_impl_.ipc_bases[1]);
|
||||
} else {
|
||||
src = reinterpret_cast<int*>(ipc_impl_.ipc_bases[1]);
|
||||
dest = reinterpret_cast<int*>(ipc_impl_.ipc_bases[0]);
|
||||
}
|
||||
mpi_.barrier();
|
||||
launch(fn, grid, block, src, dest, bytes, test);
|
||||
mpi_.barrier();
|
||||
}
|
||||
|
||||
void copy(TestType test, dim3 grid, dim3 block) {
|
||||
execute(test, kernel_simple_fine_copy, grid, block);
|
||||
}
|
||||
|
||||
void copy_wg(TestType test, dim3 grid, dim3 block) {
|
||||
execute(test, kernel_simple_fine_copy_wg, grid, block);
|
||||
}
|
||||
|
||||
void copy_wave(TestType test, dim3 grid, dim3 block) {
|
||||
execute(test, kernel_simple_fine_copy_wave, grid, block);
|
||||
}
|
||||
|
||||
void check_device_validation_errors(TestType test) {
|
||||
if (!pe_validates_dest_buffer(test)) {
|
||||
return;
|
||||
}
|
||||
ASSERT_EQ(*error_dptr_, false);
|
||||
}
|
||||
|
||||
void validate_dest_buffer(TestType test) {
|
||||
if (!pe_validates_dest_buffer(test)) {
|
||||
return;
|
||||
}
|
||||
|
||||
auto dev_dest = reinterpret_cast<int*>(ipc_impl_.ipc_bases[mpi_.my_pe()]);
|
||||
for (int i{0}; i < golden_.size(); i++) {
|
||||
ASSERT_EQ(golden_[i], dev_dest[i]);
|
||||
}
|
||||
}
|
||||
|
||||
bool pe_validates_dest_buffer(TestType test) {
|
||||
return !pe_initializes_src_buffer(test);
|
||||
}
|
||||
|
||||
protected:
|
||||
HIPDefaultFinegrainedAllocator hip_allocator_ {};
|
||||
|
||||
NotifierProxyT notifier_ {};
|
||||
|
||||
HEAP_T heap_mem_ {};
|
||||
|
||||
MPI_T mpi_ {heap_mem_.get_ptr(), heap_mem_.get_size()};
|
||||
|
||||
std::vector<int> golden_;
|
||||
|
||||
int *golden_dptr_ {nullptr};
|
||||
|
||||
IpcImpl ipc_impl_ {};
|
||||
|
||||
IpcImpl *ipc_impl_dptr_ {nullptr};
|
||||
|
||||
bool *error_dptr_ {nullptr};
|
||||
};
|
||||
|
||||
} // namespace rocshmem
|
||||
|
||||
#endif // ROCSHMEM_IPC_IMPL_SIMPLE_FINE_GTEST_HPP
|
||||
@@ -28,30 +28,98 @@ using namespace rocshmem;
|
||||
******************************* Fixture Tests *******************************
|
||||
*****************************************************************************/
|
||||
|
||||
TEST_F(NotifierTestFixture, run_all_threads_once_1_1) {
|
||||
TEST_F(NotifierBlockTestFixture, run_all_threads_once_1_1) {
|
||||
run_all_threads_once(1, 1);
|
||||
}
|
||||
|
||||
TEST_F(NotifierTestFixture, run_all_threads_once_2_1) {
|
||||
TEST_F(NotifierBlockTestFixture, run_all_threads_once_2_1) {
|
||||
run_all_threads_once(2, 1);
|
||||
}
|
||||
|
||||
TEST_F(NotifierTestFixture, run_all_threads_once_64_1) {
|
||||
TEST_F(NotifierBlockTestFixture, run_all_threads_once_64_1) {
|
||||
run_all_threads_once(64, 1);
|
||||
}
|
||||
|
||||
TEST_F(NotifierTestFixture, run_all_threads_once_128_1) {
|
||||
TEST_F(NotifierBlockTestFixture, run_all_threads_once_128_1) {
|
||||
run_all_threads_once(128, 1);
|
||||
}
|
||||
|
||||
TEST_F(NotifierTestFixture, run_all_threads_once_256_1) {
|
||||
TEST_F(NotifierBlockTestFixture, run_all_threads_once_256_1) {
|
||||
run_all_threads_once(256, 1);
|
||||
}
|
||||
|
||||
TEST_F(NotifierTestFixture, run_all_threads_once_512_1) {
|
||||
TEST_F(NotifierBlockTestFixture, run_all_threads_once_512_1) {
|
||||
run_all_threads_once(512, 1);
|
||||
}
|
||||
|
||||
TEST_F(NotifierTestFixture, run_all_threads_once_1024_1) {
|
||||
TEST_F(NotifierBlockTestFixture, run_all_threads_once_1024_1) {
|
||||
run_all_threads_once(1024, 1);
|
||||
}
|
||||
|
||||
TEST_F(NotifierAgentTestFixture, run_all_threads_once_1_1) {
|
||||
run_all_threads_once(1, 1);
|
||||
}
|
||||
|
||||
TEST_F(NotifierAgentTestFixture, run_all_threads_once_2_1) {
|
||||
run_all_threads_once(2, 1);
|
||||
}
|
||||
|
||||
TEST_F(NotifierAgentTestFixture, run_all_threads_once_64_1) {
|
||||
run_all_threads_once(64, 1);
|
||||
}
|
||||
|
||||
TEST_F(NotifierAgentTestFixture, run_all_threads_once_128_1) {
|
||||
run_all_threads_once(128, 1);
|
||||
}
|
||||
|
||||
TEST_F(NotifierAgentTestFixture, run_all_threads_once_256_1) {
|
||||
run_all_threads_once(256, 1);
|
||||
}
|
||||
|
||||
TEST_F(NotifierAgentTestFixture, run_all_threads_once_512_1) {
|
||||
run_all_threads_once(512, 1);
|
||||
}
|
||||
|
||||
TEST_F(NotifierAgentTestFixture, run_all_threads_once_1024_1) {
|
||||
run_all_threads_once(1024, 1);
|
||||
}
|
||||
|
||||
TEST_F(NotifierAgentTestFixture, run_all_threads_once_1_2) {
|
||||
run_all_threads_once(1, 2);
|
||||
}
|
||||
|
||||
TEST_F(NotifierAgentTestFixture, run_all_threads_once_1024_2) {
|
||||
run_all_threads_once(1024, 2);
|
||||
}
|
||||
|
||||
TEST_F(NotifierAgentTestFixture, run_all_threads_once_1_4) {
|
||||
run_all_threads_once(1, 4);
|
||||
}
|
||||
|
||||
TEST_F(NotifierAgentTestFixture, run_all_threads_once_1024_4) {
|
||||
run_all_threads_once(1024, 4);
|
||||
}
|
||||
|
||||
TEST_F(NotifierAgentTestFixture, run_all_threads_once_1_8) {
|
||||
run_all_threads_once(1, 8);
|
||||
}
|
||||
|
||||
TEST_F(NotifierAgentTestFixture, run_all_threads_once_1024_8) {
|
||||
run_all_threads_once(1024, 8);
|
||||
}
|
||||
|
||||
TEST_F(NotifierAgentTestFixture, run_all_threads_once_1_32) {
|
||||
run_all_threads_once(1, 32);
|
||||
}
|
||||
|
||||
TEST_F(NotifierAgentTestFixture, run_all_threads_once_1024_32) {
|
||||
run_all_threads_once(1024, 32);
|
||||
}
|
||||
|
||||
TEST_F(NotifierAgentTestFixture, run_all_threads_once_1_38) {
|
||||
run_all_threads_once(1, 38); // MI300 CPX
|
||||
}
|
||||
|
||||
TEST_F(NotifierAgentTestFixture, run_all_threads_once_1024_38) {
|
||||
run_all_threads_once(1024, 38); // MI300 CPX
|
||||
}
|
||||
|
||||
@@ -44,66 +44,45 @@ static const uint64_t NOTIFIER_OFFSET {0x100B00};
|
||||
inline __device__
|
||||
void
|
||||
write_to_memory(uint8_t* raw_memory) {
|
||||
auto thread_idx {get_flat_block_id()};
|
||||
auto thread_idx {get_flat_id()};
|
||||
raw_memory[thread_idx] = THREAD_VALUE;
|
||||
__threadfence();
|
||||
}
|
||||
|
||||
template <typename NotifierT>
|
||||
__global__
|
||||
void
|
||||
all_threads_once(uint8_t* raw_memory,
|
||||
Notifier* notifier) {
|
||||
notifier->write(NOTIFIER_OFFSET);
|
||||
uint64_t offset_u64 {notifier->read()};
|
||||
notifier->done();
|
||||
|
||||
NotifierT * notifier) {
|
||||
if (!get_flat_id()) {
|
||||
notifier->store(NOTIFIER_OFFSET);
|
||||
notifier->fence();
|
||||
}
|
||||
notifier->sync();
|
||||
uint64_t offset_u64 {notifier->load()};
|
||||
uint64_t raw_memory_u64 {reinterpret_cast<uint64_t>(raw_memory)};
|
||||
uint64_t address_u64 {raw_memory_u64 + offset_u64};
|
||||
uint8_t* address {reinterpret_cast<uint8_t*>(address_u64)};
|
||||
write_to_memory(address);
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
class NotifierTestFixture : public ::testing::Test {
|
||||
using NotifierProxyT = NotifierProxy<HIPAllocator>;
|
||||
|
||||
class NotifierBase : public ::testing::Test {
|
||||
public:
|
||||
NotifierTestFixture() {
|
||||
NotifierBase() {
|
||||
assert(raw_memory_ == nullptr);
|
||||
hip_allocator_.allocate((void**)&raw_memory_, GIBIBYTE_);
|
||||
assert(raw_memory_);
|
||||
}
|
||||
|
||||
~NotifierTestFixture() {
|
||||
~NotifierBase() {
|
||||
if (raw_memory_) {
|
||||
hip_allocator_.deallocate(raw_memory_);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
run_all_threads_once(uint32_t x_block_dim,
|
||||
uint32_t x_grid_dim) {
|
||||
const dim3 hip_blocksize(x_block_dim, 1, 1);
|
||||
const dim3 hip_gridsize(x_grid_dim, 1, 1);
|
||||
|
||||
hipLaunchKernelGGL(all_threads_once,
|
||||
hip_gridsize,
|
||||
hip_blocksize,
|
||||
0,
|
||||
nullptr,
|
||||
raw_memory_,
|
||||
notifier_.get());
|
||||
|
||||
hipError_t return_code = hipStreamSynchronize(nullptr);
|
||||
if (return_code != hipSuccess) {
|
||||
printf("Failed in stream synchronize\n");
|
||||
assert(return_code == hipSuccess);
|
||||
}
|
||||
|
||||
size_t number_threads {x_block_dim * x_grid_dim};
|
||||
|
||||
verify(size_t number_threads) {
|
||||
uint8_t* offset_addr {compute_offset_addr()};
|
||||
|
||||
for (size_t i {0}; i < number_threads; i++) {
|
||||
ASSERT_EQ(offset_addr[i], THREAD_VALUE);
|
||||
}
|
||||
@@ -136,12 +115,51 @@ class NotifierTestFixture : public ::testing::Test {
|
||||
*/
|
||||
uint8_t *raw_memory_ {nullptr};
|
||||
|
||||
};
|
||||
|
||||
class NotifierBlockTestFixture : public NotifierBase {
|
||||
using NotifierT = Notifier<detail::atomic::memory_scope_workgroup>;
|
||||
using NotifierProxyT = NotifierProxy<HIPAllocator, detail::atomic::memory_scope_workgroup>;
|
||||
|
||||
public:
|
||||
void
|
||||
run_all_threads_once(uint32_t x_block_dim,
|
||||
uint32_t x_grid_dim) {
|
||||
new (notifier_.get()) NotifierT();
|
||||
const dim3 block(x_block_dim, 1, 1);
|
||||
const dim3 grid(x_grid_dim, 1, 1);
|
||||
all_threads_once<NotifierT><<<grid, block>>>(raw_memory_, notifier_.get());
|
||||
CHECK_HIP(hipStreamSynchronize(nullptr));
|
||||
verify(x_block_dim * x_grid_dim);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Used to broadcast base offset for writing.
|
||||
*/
|
||||
NotifierProxyT notifier_ {};
|
||||
};
|
||||
|
||||
class NotifierAgentTestFixture : public NotifierBase {
|
||||
using NotifierT = Notifier<detail::atomic::memory_scope_agent>;
|
||||
using NotifierProxyT = NotifierProxy<HIPAllocator, detail::atomic::memory_scope_agent>;
|
||||
|
||||
public:
|
||||
void
|
||||
run_all_threads_once(uint32_t x_block_dim,
|
||||
uint32_t x_grid_dim) {
|
||||
new (notifier_.get()) NotifierT();
|
||||
const dim3 block(x_block_dim, 1, 1);
|
||||
const dim3 grid(x_grid_dim, 1, 1);
|
||||
all_threads_once<NotifierT><<<grid, block>>>(raw_memory_, notifier_.get());
|
||||
CHECK_HIP(hipStreamSynchronize(nullptr));
|
||||
verify(x_block_dim * x_grid_dim);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Used to broadcast base offset for writing.
|
||||
*/
|
||||
NotifierProxyT notifier_ {};
|
||||
};
|
||||
|
||||
} // namespace rocshmem
|
||||
|
||||
|
||||
Ссылка в новой задаче
Block a user