Merge pull request #31 from BKP/ipc_bringup_fine_unit_09-26-24

Add IPC Simple Buffer Fine-grained Unit Tests
Этот коммит содержится в:
Brandon Potter
2024-10-01 15:12:30 -05:00
коммит произвёл GitHub
родитель 64bc3c350a 24b928a007
Коммит 787cf0ff3f
15 изменённых файлов: 1747 добавлений и 91 удалений
+1 -1
Просмотреть файл
@@ -21,7 +21,7 @@ cmake \
-DUSE_IPC=ON \
-DUSE_THREADS=ON \
-DUSE_WF_COAL=OFF \
-DUSE_COHERENT_HEAP=ON \
-DUSE_COHERENT_HEAP=OFF \
$src_path
cmake --build . --parallel 8
cmake --install .
+1 -1
Просмотреть файл
@@ -21,7 +21,7 @@ cmake \
-DUSE_IPC=OFF \
-DUSE_THREADS=ON \
-DUSE_WF_COAL=OFF \
-DUSE_COHERENT_HEAP=ON \
-DUSE_COHERENT_HEAP=OFF \
$src_path
cmake --build . --parallel 8
cmake --install .
+141
Просмотреть файл
@@ -0,0 +1,141 @@
/******************************************************************************
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#ifndef LIBRARY_SRC_ATOMIC_HPP
#define LIBRARY_SRC_ATOMIC_HPP
#include <hip/hip_runtime.h>
namespace rocshmem {
namespace detail {
namespace atomic {
typedef enum rocshmem_memory_scope {
memory_scope_thread = __HIP_MEMORY_SCOPE_SINGLETHREAD,
memory_scope_wavefront = __HIP_MEMORY_SCOPE_WAVEFRONT,
memory_scope_workgroup = __HIP_MEMORY_SCOPE_WORKGROUP,
memory_scope_agent = __HIP_MEMORY_SCOPE_AGENT,
memory_scope_system = __HIP_MEMORY_SCOPE_SYSTEM,
} rocshmem_memory_scope;
typedef enum rocshmem_memory_order {
memory_order_relaxed = __ATOMIC_RELAXED,
memory_order_consume = __ATOMIC_CONSUME,
memory_order_acquire = __ATOMIC_ACQUIRE,
memory_order_release = __ATOMIC_RELEASE,
memory_order_acq_rel = __ATOMIC_ACQ_REL,
memory_order_seq_cst = __ATOMIC_SEQ_CST
} rocshmem_memory_order;
struct rocshmem_memory_orders {
rocshmem_memory_order load {memory_order_acquire};
rocshmem_memory_order store {memory_order_release};
rocshmem_memory_order atomic {memory_order_acq_rel};
rocshmem_memory_order weak_cas_success {memory_order_acq_rel};
rocshmem_memory_order weak_cas_failure {memory_order_acq_rel};
rocshmem_memory_order strong_cas_success {memory_order_acq_rel};
rocshmem_memory_order strong_cas_failure {memory_order_acq_rel};
};
template <typename T, rocshmem_memory_scope s>
__host__ __device__
T load(const T* address, rocshmem_memory_orders o) {
return __hip_atomic_load(address, o.load, s);
}
template <typename T, rocshmem_memory_scope s>
__host__ __device__
void store(T* address, const T value, rocshmem_memory_orders o) {
return __hip_atomic_store(address, value, o.store, s);
}
template <typename T, rocshmem_memory_scope s>
__host__ __device__
bool compare_exchange_weak(T& expected, T desired, rocshmem_memory_orders o) {
return __hip_atomic_compare_exchange_weak(expected, desired, o.weak_cas_success, o.weak_cas_failure, s);
}
template <typename T, rocshmem_memory_scope s>
__host__ __device__
bool compare_exchange_strong(T& expected, T desired, rocshmem_memory_orders o) {
return __hip_atomic_compare_exchange_strong(expected, desired, o.strong_cas_success, o.strong_cas_failure, s);
}
template <class T, class U, rocshmem_memory_scope s>
__host__ __device__
T fetch_add(T* obj, U arg, rocshmem_memory_orders o) {
return __hip_atomic_fetch_add(obj, arg, o.atomic, s);
}
template <class T, class U, rocshmem_memory_scope s>
__host__ __device__
T fetch_sub(T* obj, U arg, rocshmem_memory_orders o) {
return __hip_atomic_fetch_sub(obj, arg, o.atomic, s);
}
template <class T, class U, rocshmem_memory_scope s>
__host__ __device__
T fetch_and(T* obj, U arg, rocshmem_memory_orders o) {
return __hip_atomic_fetch_and(obj, arg, o.atomic, s);
}
template <class T, class U, rocshmem_memory_scope s>
__host__ __device__
T fetch_or(T* obj, U arg, rocshmem_memory_orders o) {
return __hip_atomic_fetch_or(obj, arg, o, s);
}
template <class T, class U, rocshmem_memory_scope s>
__host__ __device__
T fetch_xor(T* obj, U arg, rocshmem_memory_orders o) {
return __hip_atomic_fetch_xor(obj, arg, o.atomic, s);
}
template <class T, class U, rocshmem_memory_scope s>
__host__ __device__
T fetch_max(T* obj, U arg, rocshmem_memory_orders o) {
return __hip_atomic_fetch_max(obj, arg, o.atomic, s);
}
template <class T, class U, rocshmem_memory_scope s>
__host__ __device__
T fetch_min(T* obj, U arg, rocshmem_memory_orders o) {
return __hip_atomic_fetch_min(obj, arg, o.atomic, s);
}
template <rocshmem_memory_scope s>
__device__
void threadfence() {
if constexpr (s == memory_scope_system) {
__threadfence_system();
} else if constexpr (s == memory_scope_agent) {
__threadfence();
} else if constexpr (s == memory_scope_workgroup) {
__threadfence_block();
}
}
} // namespace atomic
} // namespace detail
} // namespace rocshmem
#endif // LIBRARY_SRC_ATOMIC_HPP_
+12 -12
Просмотреть файл
@@ -64,38 +64,38 @@ class IpcOnImpl {
__device__ void ipcCopy_wave(void *dst, void *src, size_t size);
__device__ void ipcFence() { __threadfence(); }
__device__ void ipcFence() { __threadfence_system(); }
template <typename T>
__device__ T ipcAMOFetchAdd(T *val, T value) {
return __hip_atomic_fetch_add(val, value, __ATOMIC_RELAXED,
__HIP_MEMORY_SCOPE_AGENT);
return __hip_atomic_fetch_add(val, value, __ATOMIC_SEQ_CST,
__HIP_MEMORY_SCOPE_SYSTEM);
}
template <typename T>
__device__ T ipcAMOFetchCas(T *val, T cond, T value) {
__hip_atomic_compare_exchange_strong(val, &cond, value, __ATOMIC_RELAXED,
__ATOMIC_RELAXED,
__HIP_MEMORY_SCOPE_AGENT);
__hip_atomic_compare_exchange_strong(val, &cond, value, __ATOMIC_SEQ_CST,
__ATOMIC_SEQ_CST,
__HIP_MEMORY_SCOPE_SYSTEM);
return cond;
}
template <typename T>
__device__ void ipcAMOAdd(T *val, T value) {
__hip_atomic_fetch_add(val, value, __ATOMIC_RELAXED,
__HIP_MEMORY_SCOPE_AGENT);
__hip_atomic_fetch_add(val, value, __ATOMIC_SEQ_CST,
__HIP_MEMORY_SCOPE_SYSTEM);
}
template <typename T>
__device__ void ipcAMOCas(T *val, T cond, T value) {
__hip_atomic_compare_exchange_strong(val, &cond, value, __ATOMIC_RELAXED,
__ATOMIC_RELAXED,
__HIP_MEMORY_SCOPE_AGENT);
__hip_atomic_compare_exchange_strong(val, &cond, value, __ATOMIC_SEQ_CST,
__ATOMIC_SEQ_CST,
__HIP_MEMORY_SCOPE_SYSTEM);
}
template <typename T>
__device__ void ipcAMOSet(T *val, T value) {
__hip_atomic_store(val, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
__hip_atomic_store(val, value, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_SYSTEM);
}
__device__ void zero_byte_read(int pe) {
+67 -30
Просмотреть файл
@@ -20,59 +20,96 @@
* IN THE SOFTWARE.
*****************************************************************************/
/**
* @file notifier.hpp
*
* @brief Contains the notification memory space for threads to communicate
* results with one another.
*
* Assume one thread does work on behalf of other threads (as a leader) and
* that work needs to be communicated to the other threads as a result.
* To expose the result, the threads need to share a memory space where the
* result can be written. The leader thread writes the result out to this
* memory space and all threads synchronize on it.
*
* This class allows the leader thread to notify other threads of the update.
*/
#ifndef LIBRARY_SRC_MEMORY_NOTIFIER_HPP_
#define LIBRARY_SRC_MEMORY_NOTIFIER_HPP_
#include "../device_proxy.hpp"
#include "../util.hpp"
#include "../atomic.hpp"
namespace rocshmem {
template<detail::atomic::rocshmem_memory_scope scope>
class Notifier {
public:
__device__ uint64_t read() { return value_; }
__device__ void write(uint64_t val) {
if (is_thread_zero_in_block()) {
value_ = val;
}
publish();
public:
__device__ uint64_t load() {
return detail::atomic::load<uint64_t, scope>(&value_, orders_);
}
__device__ void done() { __syncthreads(); }
__device__ void store(uint64_t val) {
detail::atomic::store<uint64_t, scope>(&value_, val, orders_);
}
private:
__device__ void publish() {
if (is_thread_zero_in_block()) {
__threadfence();
__device__ void fence() {
detail::atomic::threadfence<scope>();
}
__device__ void sync() {
if constexpr (scope == detail::atomic::memory_scope_thread ||
scope == detail::atomic::memory_scope_wavefront) {
return;
}
if constexpr (scope == detail::atomic::memory_scope_workgroup) {
__syncthreads();
return;
}
if constexpr (scope == detail::atomic::memory_scope_system) {
assert(false);
return;
}
uint32_t done {signal_ + 1};
__syncthreads();
uint32_t retval {0};
bool executor {!threadIdx.x && !threadIdx.y && !threadIdx.z};
if (executor) {
retval = detail::atomic::fetch_add<uint32_t, uint32_t, scope>(&count_, 1, orders_);
fence();
}
__syncthreads();
if (retval == ((gridDim.x * gridDim.y * gridDim.z) - 1)) {
if (executor) {
detail::atomic::store<uint32_t, scope>(&count_, 0, orders_);
fence();
detail::atomic::fetch_add<uint32_t, uint32_t, scope>(&signal_, 1, orders_);
}
}
if (executor) {
while (detail::atomic::load<uint32_t, scope>(&signal_, orders_) != done) {
;
}
}
__syncthreads();
}
private:
detail::atomic::rocshmem_memory_orders orders_{};
uint64_t value_{};
uint32_t signal_ {};
uint32_t count_ {};
};
template <typename ALLOCATOR>
template <typename ALLOCATOR, detail::atomic::rocshmem_memory_scope scope>
class NotifierProxy {
using ProxyT = DeviceProxy<ALLOCATOR, Notifier, 1>;
using ProxyT = DeviceProxy<ALLOCATOR, Notifier<scope>>;
public:
__host__ __device__ Notifier* get() { return proxy_.get(); }
NotifierProxy() {
new (proxy_.get()) Notifier<scope>();
}
~NotifierProxy() {
proxy_.get()->~Notifier<scope>();
}
__host__ __device__ Notifier<scope>* get() { return proxy_.get(); }
private:
ProxyT proxy_{};
+6 -3
Просмотреть файл
@@ -75,9 +75,12 @@ __device__ void SlabHeap::malloc(void** ptr, size_t size) {
* Notify other threads in block about the allocation result.
*/
auto notifier{notifier_.get()};
notifier->write(ptr_deref_u64);
uint64_t notification_u64{notifier->read()};
notifier->done();
if (!threadIdx.x) {
notifier->store(ptr_deref_u64);
notifier->fence();
}
__syncthreads();
uint64_t notification_u64{notifier->load()};
/*
* Write to the ptr parameter (to return it back up the call stack).
+1 -1
Просмотреть файл
@@ -48,7 +48,7 @@ class SlabHeap {
/**
* @brief Helper type for notifier
*/
using NOTIFIER_PROXY_T = NotifierProxy<HIPAllocator>;
using NOTIFIER_PROXY_T = NotifierProxy<HIPAllocator, detail::atomic::memory_scope_workgroup>;
/**
* @brief Helper type for notifier
+8
Просмотреть файл
@@ -93,6 +93,7 @@ __device__ __forceinline__ bool is_thread_zero_in_block() {
__device__ __forceinline__ bool is_block_zero_in_grid() {
return hipBlockIdx_x == 0 && hipBlockIdx_y == 0 && hipBlockIdx_z == 0;
}
/*
* Returns the number of threads in the caller's flattened thread block.
*/
@@ -100,6 +101,13 @@ __device__ __forceinline__ int get_flat_block_size() {
return hipBlockDim_x * hipBlockDim_y * hipBlockDim_z;
}
/*
* Returns the number of threads in the caller's flattened grid.
*/
__device__ __forceinline__ int get_flat_grid_size() {
return get_flat_block_size() * hipGridDim_x * hipGridDim_y * hipGridDim_z;
}
/*
* Returns the flattened thread index of the calling thread within its
* thread block.
+1
Просмотреть файл
@@ -90,6 +90,7 @@ target_sources(
free_list_gtest.cpp
#context_ipc_gtest.cpp
ipc_impl_simple_coarse_gtest.cpp
ipc_impl_simple_fine_gtest.cpp
)
###############################################################################
+1
Просмотреть файл
@@ -33,6 +33,7 @@ TEST_F(IPCImplSimpleCoarseTestFixture, MPI_num_pes) {
}
TEST_F(IPCImplSimpleCoarseTestFixture, IPC_bases) {
ASSERT_NE(ipc_impl_.ipc_bases, nullptr);
for(int i{0}; i < mpi_.num_pes(); i++) {
ASSERT_NE(ipc_impl_.ipc_bases[i], nullptr);
}
-2
Просмотреть файл
@@ -218,8 +218,6 @@ class IPCImplSimpleCoarseTestFixture : public ::testing::Test {
protected:
std::vector<int> golden_;
std::vector<int> output_;
HEAP_T heap_mem_ {};
MPI_T mpi_ {heap_mem_.get_ptr(), heap_mem_.get_size()};
Разница между файлами не показана из-за своего большого размера Загрузить разницу
+344
Просмотреть файл
@@ -0,0 +1,344 @@
/******************************************************************************
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#ifndef ROCSHMEM_IPC_IMPL_SIMPLE_FINE_GTEST_HPP
#define ROCSHMEM_IPC_IMPL_SIMPLE_FINE_GTEST_HPP
#include "gtest/gtest.h"
#include <numeric>
#include <mpi.h>
#include "../src/atomic.hpp"
#include "../src/ipc_policy.hpp"
#include "../src/memory/notifier.hpp"
#include "../src/memory/symmetric_heap.hpp"
#include "../src/util.hpp"
namespace rocshmem {
enum TestType {
READ = 0,
WRITE = 1
};
const uint32_t SIGNAL_OFFSET {67108864};
__device__
void
validator(bool *error, int *golden, int *dest, size_t bytes) {
size_t elements {bytes / sizeof(int)};
for (int i {get_flat_id()}; i < elements; i += get_flat_grid_size()) {
if (golden[i] != dest[i]) {
printf("golden[%d] %d != dest[%d] %d\n", i, golden[i], i, dest[i]);
*error = true;
}
}
}
template <typename NotifierT>
__global__
void
kernel_put_with_signal_validator(bool *error, int *golden, int *dest, size_t bytes, NotifierT *notifier) {
detail::atomic::rocshmem_memory_orders orders{};
if (!get_flat_id()) {
while (detail::atomic::load<int, detail::atomic::memory_scope_system>(dest + SIGNAL_OFFSET, orders) == 0) {
;
}
}
notifier->sync();
validator(error, golden, dest, bytes);
}
template <typename NotifierT>
__global__
void
kernel_simple_fine_copy(IpcImpl *ipc_impl, bool *error, int *golden, int *src, int *dest, size_t bytes, TestType test, NotifierT *notifier) {
if (!get_flat_id()) {
ipc_impl->ipcCopy(dest, src, bytes);
ipc_impl->ipcFence();
if (test == WRITE) {
ipc_impl->ipcAMOFetchAdd(dest + SIGNAL_OFFSET, 1);
}
}
if (test == READ) {
notifier->sync();
validator(error, golden, dest, bytes);
}
}
template <typename NotifierT>
__global__
void
kernel_simple_fine_copy_wg(IpcImpl *ipc_impl, bool *error, int *golden, int *src, int *dest, size_t bytes, TestType test, NotifierT *notifier) {
if (!blockIdx.x) {
ipc_impl->ipcCopy_wg(dest, src, bytes);
ipc_impl->ipcFence();
if (test == WRITE) {
if (!threadIdx.x) {
ipc_impl->ipcAMOFetchAdd(dest + SIGNAL_OFFSET, 1);
}
}
}
if (test == READ) {
notifier->sync();
validator(error, golden, dest, bytes);
}
}
template <typename NotifierT>
__global__
void
kernel_simple_fine_copy_wave(IpcImpl *ipc_impl, bool *error, int *golden, int *src, int *dest, size_t bytes, TestType test, NotifierT *notifier) {
if (!blockIdx.x && threadIdx.x < 64) {
ipc_impl->ipcCopy_wave(dest, src, bytes);
ipc_impl->ipcFence();
if (test == WRITE) {
if (!threadIdx.x) {
ipc_impl->ipcAMOFetchAdd(dest + SIGNAL_OFFSET, 1);
}
}
}
__syncthreads();
if (test == READ) {
notifier->sync();
validator(error, golden, dest, bytes);
}
}
class IPCImplSimpleFineTestFixture : public ::testing::Test {
using HEAP_T = HeapMemory<HIPDefaultFinegrainedAllocator>;
using MPI_T = RemoteHeapInfo<CommunicatorMPI>;
using NotifierT = Notifier<detail::atomic::memory_scope_agent>;
using NotifierProxyT = NotifierProxy<HIPAllocator, detail::atomic::memory_scope_agent>;
using FN_T1 = void (*)(IpcImpl*, bool*, int*, int*, int*, size_t, TestType, NotifierT*);
using FN_T2 = void (*)(bool*, int*, int*, size_t, NotifierT*);
public:
IPCImplSimpleFineTestFixture() {
ipc_impl_.ipcHostInit(mpi_.my_pe(), mpi_.get_heap_bases() , MPI_COMM_WORLD);
assert(ipc_impl_dptr_ == nullptr);
hip_allocator_.allocate((void**)&ipc_impl_dptr_, sizeof(IpcImpl));
CHECK_HIP(hipMemcpy(ipc_impl_dptr_, &ipc_impl_, sizeof(IpcImpl), hipMemcpyHostToDevice));
assert(error_dptr_ == nullptr);
hip_allocator_.allocate((void**)&error_dptr_, sizeof(bool));
*error_dptr_ = false;
}
~IPCImplSimpleFineTestFixture() {
if (ipc_impl_dptr_) {
hip_allocator_.deallocate(ipc_impl_dptr_);
}
if (error_dptr_) {
hip_allocator_.deallocate(error_dptr_);
}
if (golden_dptr_) {
hip_allocator_.deallocate(golden_dptr_);
}
ipc_impl_.ipcHostStop();
}
void launch(FN_T1 f, const dim3 grid, const dim3 block, int* src, int* dest, size_t bytes, TestType test) {
f<<<grid, block>>>(ipc_impl_dptr_, error_dptr_, golden_dptr_, src, dest, bytes, test, notifier_.get());
CHECK_HIP(hipStreamSynchronize(nullptr));
}
void launch(FN_T2 f, const dim3 grid, const dim3 block, int* dest, size_t bytes) {
f<<<grid, block>>>(error_dptr_, golden_dptr_, dest, bytes, notifier_.get());
CHECK_HIP(hipStreamSynchronize(nullptr));
}
void write(const dim3 grid, const dim3 block, size_t elems) {
iota_golden(elems);
initialize_signal(WRITE);
initialize_src_buffer(WRITE);
copy(WRITE, grid, block);
check_device_validation_errors(WRITE);
}
void write_wg(const dim3 grid, const dim3 block, size_t elems) {
iota_golden(elems);
initialize_signal(WRITE);
initialize_src_buffer(WRITE);
copy_wg(WRITE, grid, block);
check_device_validation_errors(WRITE);
}
void write_wave(const dim3 grid, const dim3 block, size_t elems) {
iota_golden(elems);
initialize_signal(WRITE);
initialize_src_buffer(WRITE);
copy_wave(WRITE, grid, block);
check_device_validation_errors(WRITE);
}
void read(const dim3 grid, const dim3 block, size_t elems) {
iota_golden(elems);
initialize_signal(READ);
initialize_src_buffer(READ);
copy(READ, grid, block);
check_device_validation_errors(READ);
}
void read_wg(const dim3 grid, const dim3 block, size_t elems) {
iota_golden(elems);
initialize_signal(READ);
initialize_src_buffer(READ);
copy_wg(READ, grid, block);
check_device_validation_errors(READ);
}
void read_wave(const dim3 grid, const dim3 block, size_t elems) {
iota_golden(elems);
initialize_signal(READ);
initialize_src_buffer(READ);
copy_wave(READ, grid, block);
check_device_validation_errors(READ);
}
void iota_golden(size_t elems) {
golden_.resize(elems);
std::iota(golden_.begin(), golden_.end(), 0);
assert(golden_dptr_ == nullptr);
size_t golden_dptr_bytes {golden_.size() * sizeof(int)};
hip_allocator_.allocate((void**)&golden_dptr_, golden_dptr_bytes);
CHECK_HIP(hipMemcpy(golden_dptr_, golden_.data(), golden_dptr_bytes, hipMemcpyHostToDevice));
}
void validate_golden(size_t elems) {
ASSERT_EQ(golden_.size(), elems);
for (int i{0}; i < golden_.size(); i++) {
ASSERT_EQ(golden_[i], i);
}
}
void initialize_signal(TestType test) {
bool is_write_test = test;
if (is_write_test && mpi_.my_pe() == 0) {
int *dest = reinterpret_cast<int*>(ipc_impl_.ipc_bases[1]);
*(dest + SIGNAL_OFFSET) = 0;
}
}
void initialize_src_buffer(TestType test) {
if (!pe_initializes_src_buffer(test)) {
return;
}
size_t bytes = golden_.size() * sizeof(int);
auto dev_src = reinterpret_cast<int*>(ipc_impl_.ipc_bases[mpi_.my_pe()]);
CHECK_HIP(hipMemcpy(dev_src, golden_.data(), bytes, hipMemcpyHostToDevice));
}
bool pe_initializes_src_buffer(TestType test) {
bool is_write_test = test;
bool is_read_test = !test;
return (is_write_test && mpi_.my_pe() == 0) ||
(is_read_test && mpi_.my_pe() == 1);
}
void execute(TestType test, FN_T1 fn, const dim3 grid, const dim3 block) {
size_t bytes = golden_.size() * sizeof(int);
if (mpi_.my_pe()) {
mpi_.barrier();
if (test == WRITE) {
int *dest = reinterpret_cast<int*>(ipc_impl_.ipc_bases[1]);
FN_T2 val_fn = kernel_put_with_signal_validator;
launch(val_fn, grid, block, dest, bytes);
}
mpi_.barrier();
return;
}
int *src{nullptr};
int *dest{nullptr};
if (test == WRITE) {
src = reinterpret_cast<int*>(ipc_impl_.ipc_bases[0]);
dest = reinterpret_cast<int*>(ipc_impl_.ipc_bases[1]);
} else {
src = reinterpret_cast<int*>(ipc_impl_.ipc_bases[1]);
dest = reinterpret_cast<int*>(ipc_impl_.ipc_bases[0]);
}
mpi_.barrier();
launch(fn, grid, block, src, dest, bytes, test);
mpi_.barrier();
}
void copy(TestType test, dim3 grid, dim3 block) {
execute(test, kernel_simple_fine_copy, grid, block);
}
void copy_wg(TestType test, dim3 grid, dim3 block) {
execute(test, kernel_simple_fine_copy_wg, grid, block);
}
void copy_wave(TestType test, dim3 grid, dim3 block) {
execute(test, kernel_simple_fine_copy_wave, grid, block);
}
void check_device_validation_errors(TestType test) {
if (!pe_validates_dest_buffer(test)) {
return;
}
ASSERT_EQ(*error_dptr_, false);
}
void validate_dest_buffer(TestType test) {
if (!pe_validates_dest_buffer(test)) {
return;
}
auto dev_dest = reinterpret_cast<int*>(ipc_impl_.ipc_bases[mpi_.my_pe()]);
for (int i{0}; i < golden_.size(); i++) {
ASSERT_EQ(golden_[i], dev_dest[i]);
}
}
bool pe_validates_dest_buffer(TestType test) {
return !pe_initializes_src_buffer(test);
}
protected:
HIPDefaultFinegrainedAllocator hip_allocator_ {};
NotifierProxyT notifier_ {};
HEAP_T heap_mem_ {};
MPI_T mpi_ {heap_mem_.get_ptr(), heap_mem_.get_size()};
std::vector<int> golden_;
int *golden_dptr_ {nullptr};
IpcImpl ipc_impl_ {};
IpcImpl *ipc_impl_dptr_ {nullptr};
bool *error_dptr_ {nullptr};
};
} // namespace rocshmem
#endif // ROCSHMEM_IPC_IMPL_SIMPLE_FINE_GTEST_HPP
+75 -7
Просмотреть файл
@@ -28,30 +28,98 @@ using namespace rocshmem;
******************************* Fixture Tests *******************************
*****************************************************************************/
TEST_F(NotifierTestFixture, run_all_threads_once_1_1) {
TEST_F(NotifierBlockTestFixture, run_all_threads_once_1_1) {
run_all_threads_once(1, 1);
}
TEST_F(NotifierTestFixture, run_all_threads_once_2_1) {
TEST_F(NotifierBlockTestFixture, run_all_threads_once_2_1) {
run_all_threads_once(2, 1);
}
TEST_F(NotifierTestFixture, run_all_threads_once_64_1) {
TEST_F(NotifierBlockTestFixture, run_all_threads_once_64_1) {
run_all_threads_once(64, 1);
}
TEST_F(NotifierTestFixture, run_all_threads_once_128_1) {
TEST_F(NotifierBlockTestFixture, run_all_threads_once_128_1) {
run_all_threads_once(128, 1);
}
TEST_F(NotifierTestFixture, run_all_threads_once_256_1) {
TEST_F(NotifierBlockTestFixture, run_all_threads_once_256_1) {
run_all_threads_once(256, 1);
}
TEST_F(NotifierTestFixture, run_all_threads_once_512_1) {
TEST_F(NotifierBlockTestFixture, run_all_threads_once_512_1) {
run_all_threads_once(512, 1);
}
TEST_F(NotifierTestFixture, run_all_threads_once_1024_1) {
TEST_F(NotifierBlockTestFixture, run_all_threads_once_1024_1) {
run_all_threads_once(1024, 1);
}
TEST_F(NotifierAgentTestFixture, run_all_threads_once_1_1) {
run_all_threads_once(1, 1);
}
TEST_F(NotifierAgentTestFixture, run_all_threads_once_2_1) {
run_all_threads_once(2, 1);
}
TEST_F(NotifierAgentTestFixture, run_all_threads_once_64_1) {
run_all_threads_once(64, 1);
}
TEST_F(NotifierAgentTestFixture, run_all_threads_once_128_1) {
run_all_threads_once(128, 1);
}
TEST_F(NotifierAgentTestFixture, run_all_threads_once_256_1) {
run_all_threads_once(256, 1);
}
TEST_F(NotifierAgentTestFixture, run_all_threads_once_512_1) {
run_all_threads_once(512, 1);
}
TEST_F(NotifierAgentTestFixture, run_all_threads_once_1024_1) {
run_all_threads_once(1024, 1);
}
TEST_F(NotifierAgentTestFixture, run_all_threads_once_1_2) {
run_all_threads_once(1, 2);
}
TEST_F(NotifierAgentTestFixture, run_all_threads_once_1024_2) {
run_all_threads_once(1024, 2);
}
TEST_F(NotifierAgentTestFixture, run_all_threads_once_1_4) {
run_all_threads_once(1, 4);
}
TEST_F(NotifierAgentTestFixture, run_all_threads_once_1024_4) {
run_all_threads_once(1024, 4);
}
TEST_F(NotifierAgentTestFixture, run_all_threads_once_1_8) {
run_all_threads_once(1, 8);
}
TEST_F(NotifierAgentTestFixture, run_all_threads_once_1024_8) {
run_all_threads_once(1024, 8);
}
TEST_F(NotifierAgentTestFixture, run_all_threads_once_1_32) {
run_all_threads_once(1, 32);
}
TEST_F(NotifierAgentTestFixture, run_all_threads_once_1024_32) {
run_all_threads_once(1024, 32);
}
TEST_F(NotifierAgentTestFixture, run_all_threads_once_1_38) {
run_all_threads_once(1, 38); // MI300 CPX
}
TEST_F(NotifierAgentTestFixture, run_all_threads_once_1024_38) {
run_all_threads_once(1024, 38); // MI300 CPX
}
+52 -34
Просмотреть файл
@@ -44,66 +44,45 @@ static const uint64_t NOTIFIER_OFFSET {0x100B00};
inline __device__
void
write_to_memory(uint8_t* raw_memory) {
auto thread_idx {get_flat_block_id()};
auto thread_idx {get_flat_id()};
raw_memory[thread_idx] = THREAD_VALUE;
__threadfence();
}
template <typename NotifierT>
__global__
void
all_threads_once(uint8_t* raw_memory,
Notifier* notifier) {
notifier->write(NOTIFIER_OFFSET);
uint64_t offset_u64 {notifier->read()};
notifier->done();
NotifierT * notifier) {
if (!get_flat_id()) {
notifier->store(NOTIFIER_OFFSET);
notifier->fence();
}
notifier->sync();
uint64_t offset_u64 {notifier->load()};
uint64_t raw_memory_u64 {reinterpret_cast<uint64_t>(raw_memory)};
uint64_t address_u64 {raw_memory_u64 + offset_u64};
uint8_t* address {reinterpret_cast<uint8_t*>(address_u64)};
write_to_memory(address);
__syncthreads();
}
class NotifierTestFixture : public ::testing::Test {
using NotifierProxyT = NotifierProxy<HIPAllocator>;
class NotifierBase : public ::testing::Test {
public:
NotifierTestFixture() {
NotifierBase() {
assert(raw_memory_ == nullptr);
hip_allocator_.allocate((void**)&raw_memory_, GIBIBYTE_);
assert(raw_memory_);
}
~NotifierTestFixture() {
~NotifierBase() {
if (raw_memory_) {
hip_allocator_.deallocate(raw_memory_);
}
}
void
run_all_threads_once(uint32_t x_block_dim,
uint32_t x_grid_dim) {
const dim3 hip_blocksize(x_block_dim, 1, 1);
const dim3 hip_gridsize(x_grid_dim, 1, 1);
hipLaunchKernelGGL(all_threads_once,
hip_gridsize,
hip_blocksize,
0,
nullptr,
raw_memory_,
notifier_.get());
hipError_t return_code = hipStreamSynchronize(nullptr);
if (return_code != hipSuccess) {
printf("Failed in stream synchronize\n");
assert(return_code == hipSuccess);
}
size_t number_threads {x_block_dim * x_grid_dim};
verify(size_t number_threads) {
uint8_t* offset_addr {compute_offset_addr()};
for (size_t i {0}; i < number_threads; i++) {
ASSERT_EQ(offset_addr[i], THREAD_VALUE);
}
@@ -136,12 +115,51 @@ class NotifierTestFixture : public ::testing::Test {
*/
uint8_t *raw_memory_ {nullptr};
};
class NotifierBlockTestFixture : public NotifierBase {
using NotifierT = Notifier<detail::atomic::memory_scope_workgroup>;
using NotifierProxyT = NotifierProxy<HIPAllocator, detail::atomic::memory_scope_workgroup>;
public:
void
run_all_threads_once(uint32_t x_block_dim,
uint32_t x_grid_dim) {
new (notifier_.get()) NotifierT();
const dim3 block(x_block_dim, 1, 1);
const dim3 grid(x_grid_dim, 1, 1);
all_threads_once<NotifierT><<<grid, block>>>(raw_memory_, notifier_.get());
CHECK_HIP(hipStreamSynchronize(nullptr));
verify(x_block_dim * x_grid_dim);
}
/**
* @brief Used to broadcast base offset for writing.
*/
NotifierProxyT notifier_ {};
};
class NotifierAgentTestFixture : public NotifierBase {
using NotifierT = Notifier<detail::atomic::memory_scope_agent>;
using NotifierProxyT = NotifierProxy<HIPAllocator, detail::atomic::memory_scope_agent>;
public:
void
run_all_threads_once(uint32_t x_block_dim,
uint32_t x_grid_dim) {
new (notifier_.get()) NotifierT();
const dim3 block(x_block_dim, 1, 1);
const dim3 grid(x_grid_dim, 1, 1);
all_threads_once<NotifierT><<<grid, block>>>(raw_memory_, notifier_.get());
CHECK_HIP(hipStreamSynchronize(nullptr));
verify(x_block_dim * x_grid_dim);
}
/**
* @brief Used to broadcast base offset for writing.
*/
NotifierProxyT notifier_ {};
};
} // namespace rocshmem