diff --git a/examples/rocshmem_allreduce_test.cc b/examples/rocshmem_allreduce_test.cc index 57e8571aa5..46ec91e215 100644 --- a/examples/rocshmem_allreduce_test.cc +++ b/examples/rocshmem_allreduce_test.cc @@ -25,8 +25,8 @@ using namespace rocshmem; -__global__ void allreduce_test(int *source, int *dest, int* pWork, long *pSync, size_t nelem) -{ +__global__ void allreduce_test(int *source, int *dest, size_t nelem, + roc_shmem_team_t team) { __shared__ roc_shmem_ctx_t ctx; int64_t ctx_type = 0; @@ -34,7 +34,7 @@ __global__ void allreduce_test(int *source, int *dest, int* pWork, long *pSync, roc_shmem_wg_ctx_create(ctx_type, &ctx); int num_pes = roc_shmem_ctx_n_pes(ctx); - roc_shmem_ctx_int_sum_wg_to_all(ctx, dest, source, nelem, 0, 0, num_pes, pWork, pSync); + roc_shmem_ctx_int_sum_wg_to_all(ctx, team, dest, source, nelem); roc_shmem_ctx_quiet(ctx); __syncthreads(); @@ -43,24 +43,24 @@ __global__ void allreduce_test(int *source, int *dest, int* pWork, long *pSync, roc_shmem_wg_finalize(); } -static void init_sendbuf (int *sendbuf, int count, int mynode) +static void init_sendbuf (int *source, int nelem, int my_pe) { - for (int i = 0; i < count; i++) { - sendbuf[i] = mynode + i%9; + for (int i = 0; i < nelem; i++) { + source[i] = my_pe + i%9; } } -static bool check_recvbuf(int *recvbuf, int nprocs, int rank, int count) +static bool check_recvbuf(int *dest, int nelem, int my_pe, int npes) { bool res=true; - int expected = nprocs * (nprocs -1) / 2; + int expected = npes * (npes -1) / 2; - for (int i=0; i 1) { nelem = atoi(argv[1]); } - roc_shmem_init(); + int my_pe = roc_shmem_my_pe(); int npes = roc_shmem_n_pes(); + + int ndevices, my_device = 0; + CHECK_HIP(hipGetDeviceCount(&ndevices)); + my_device = my_pe % ndevices; + CHECK_HIP(hipSetDevice(my_device)); + + roc_shmem_init(); + int *source = (int *)roc_shmem_malloc(nelem * sizeof(int)); - int *result = (int *)roc_shmem_malloc(nelem * sizeof(int)); - if (NULL == source || NULL == result) { + int *dest = (int *)roc_shmem_malloc(nelem * sizeof(int)); + if (NULL == source || NULL == dest) { std::cout << "Error allocating memory from symmetric heap" << std::endl; roc_shmem_global_exit(1); } - init_sendbuf(source, nelem, rank); + init_sendbuf(source, nelem, my_pe); for (int i=0; i>>(source, result, pWrk, pSync, nelem); + allreduce_test<<>>(source, dest, + nelem, team_reduce_world_dup); CHECK_HIP(hipDeviceSynchronize()); - bool pass = check_recvbuf(result, npes, rank, nelem); - + bool pass = check_recvbuf(dest, nelem, my_pe, npes); printf("Test %s \t nelem %d %s\n", argv[0], nelem, pass ? "[PASS]" : "[FAIL]"); roc_shmem_free(source); - roc_shmem_free(result); - roc_shmem_free(pWrk); - roc_shmem_free(pSync); + roc_shmem_free(dest); roc_shmem_finalize(); return 0; diff --git a/examples/rocshmem_alltoall_test.cc b/examples/rocshmem_alltoall_test.cc new file mode 100644 index 0000000000..e1ee85e60f --- /dev/null +++ b/examples/rocshmem_alltoall_test.cc @@ -0,0 +1,129 @@ +/* +** hipcc -c -fgpu-rdc -x hip rocshmem_alltoall_test.cc -I/opt/rocm/include +** -I$ROCHSMEM_INSTALL_DIR/include -I$OPENMPI_UCX_INSTALL_DIR/include/ +** hipcc -fgpu-rdc --hip-link rocshmem_alltoall_test.o -o rocshmem_alltoall_test +** $ROCHSMEM_INSTALL_DIR/lib/librocshmem.a $OPENMPI_UCX_INSTALL_DIR/lib/libmpi.so +** -L/opt/rocm/lib -lamdhip64 -lhsa-runtime64 +** +** ROC_SHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 8 ./rocshmem_allreduce_test +*/ + +#include + +#define __HIP_PLATFORM_AMD__ +#include +#include +#include + +#define CHECK_HIP(condition) { \ + hipError_t error = condition; \ + if(error != hipSuccess){ \ + fprintf(stderr,"HIP error: %d line: %d\n", error, __LINE__); \ + MPI_Abort(MPI_COMM_WORLD, error); \ + } \ + } + +using namespace rocshmem; + +__global__ void alltoall_test(int *source, int *dest, size_t nelem, + roc_shmem_team_t team) { + __shared__ roc_shmem_ctx_t ctx; + int64_t ctx_type = 0; + + roc_shmem_wg_init(); + roc_shmem_wg_ctx_create(ctx_type, &ctx); + int num_pes = roc_shmem_ctx_n_pes(ctx); + + roc_shmem_ctx_int_wg_alltoall(ctx, team, dest, source, nelem); + + roc_shmem_ctx_quiet(ctx); + __syncthreads(); + + roc_shmem_wg_ctx_destroy(&ctx); + roc_shmem_wg_finalize(); +} + +static void init_sendbuf (int *source, int nelem, int my_pe, int npes) +{ + for (int pe = 0; pe < npes; pe++) { + for (int i = 0; i < nelem; i++) { + int idx = (pe * nelem) + i; + source[idx] = my_pe + pe; + } + } +} + +static bool check_recvbuf(int *dest, int nelem, int my_pe, int npes) +{ + bool res=true; + + for(int pe = 0; pe < npes; pe++) { + for(int i = 0; i < nelem; i++) { + int idx = (pe * nelem) + i; + int result = my_pe + pe; + if (dest[idx] != result) { + res = false; +#ifdef VERBOSE + printf("recvbuf[%d] = %d expected %d \n", i, dest[i], result); +#endif + } + } + } + + return res; +} + +#define MAX_ELEM 256 + +int main (int argc, char **argv) +{ + int nelem = MAX_ELEM; + + if (argc > 1) { + nelem = atoi(argv[1]); + } + + int my_pe = roc_shmem_my_pe(); + int npes = roc_shmem_n_pes(); + + int ndevices, my_device = 0; + CHECK_HIP(hipGetDeviceCount(&ndevices)); + my_device = my_pe % ndevices; + CHECK_HIP(hipSetDevice(my_device)); + + roc_shmem_init(); + + int *source = (int *)roc_shmem_malloc(nelem * npes * sizeof(int)); + int *dest = (int *)roc_shmem_malloc(nelem * npes * sizeof(int)); + if (NULL == source || NULL == dest) { + std::cout << "Error allocating memory from symmetric heap" << std::endl; + roc_shmem_global_exit(1); + } + + init_sendbuf(source, nelem, my_pe, npes); + for (int i = 0; i < nelem * npes; i++) { + dest[i] = -1; + } + + roc_shmem_team_t team_reduce_world_dup; + team_reduce_world_dup = ROC_SHMEM_TEAM_INVALID; + roc_shmem_team_split_strided(ROC_SHMEM_TEAM_WORLD, 0, 1, npes, nullptr, 0, + &team_reduce_world_dup); + + CHECK_HIP(hipDeviceSynchronize()); + + int threadsPerBlock=256; + alltoall_test<<>>(source, dest, + nelem, team_reduce_world_dup); + CHECK_HIP(hipDeviceSynchronize()); + + bool pass = check_recvbuf(dest, nelem, my_pe, npes); + + printf("Test %s \t nelem %d %s\n", argv[0], nelem, pass ? "[PASS]" : "[FAIL]"); + + roc_shmem_free(source); + roc_shmem_free(dest); + + roc_shmem_finalize(); + return 0; +} diff --git a/examples/rocshmem_broadcast_test.cc b/examples/rocshmem_broadcast_test.cc new file mode 100644 index 0000000000..48deea1393 --- /dev/null +++ b/examples/rocshmem_broadcast_test.cc @@ -0,0 +1,124 @@ +/* +** hipcc -c -fgpu-rdc -x hip rocshmem_broadcast_test.cc -I/opt/rocm/include +** -I$ROCHSMEM_INSTALL_DIR/include -I$OPENMPI_UCX_INSTALL_DIR/include/ +** hipcc -fgpu-rdc --hip-link rocshmem_broadcast_test.o -o rocshmem_broadcast_test +** $ROCHSMEM_INSTALL_DIR/lib/librocshmem.a $OPENMPI_UCX_INSTALL_DIR/lib/libmpi.so +** -L/opt/rocm/lib -lamdhip64 -lhsa-runtime64 +** +** ROC_SHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 8 ./rocshmem_broadcast_test +*/ + +#include + +#define __HIP_PLATFORM_AMD__ +#include +#include +#include + +#define CHECK_HIP(condition) { \ + hipError_t error = condition; \ + if(error != hipSuccess){ \ + fprintf(stderr,"HIP error: %d line: %d\n", error, __LINE__); \ + MPI_Abort(MPI_COMM_WORLD, error); \ + } \ + } + +using namespace rocshmem; + +__global__ void broadcast_test(int *source, int *dest, size_t nelem, + int root, roc_shmem_team_t team) { + __shared__ roc_shmem_ctx_t ctx; + int64_t ctx_type = 0; + + roc_shmem_wg_init(); + roc_shmem_wg_ctx_create(ctx_type, &ctx); + int num_pes = roc_shmem_ctx_n_pes(ctx); + + roc_shmem_ctx_int_wg_broadcast(ctx, team, dest, source, nelem, root); + + roc_shmem_ctx_quiet(ctx); + __syncthreads(); + + roc_shmem_wg_ctx_destroy(&ctx); + roc_shmem_wg_finalize(); +} + +static void init_sendbuf(int *source, int nelem, int my_pe) +{ + for (int i = 0; i < nelem; i++) { + source[i] = i; + } +} + +static bool check_recvbuf(int *dest, int nelem, int my_pe, int npes) +{ + bool res=true; + + for (int i = 0; i < npes; i++) { + if (dest[i] != i) { + res = false; +#ifdef VERBOSE + printf("PE: %d, dest[%d] = %d, expected %d \n", my_pe, i, dest[i], i); +#endif + } + } + + return res; +} + +#define MAX_ELEM 256 + +int main(int argc, char **argv) +{ + int nelem = MAX_ELEM; + + if (argc > 1) { + nelem = atoi(argv[1]); + } + + int my_pe = roc_shmem_my_pe(); + int npes = roc_shmem_n_pes(); + + int ndevices, my_device = 0; + CHECK_HIP(hipGetDeviceCount(&ndevices)); + my_device = my_pe % ndevices; + CHECK_HIP(hipSetDevice(my_device)); + + roc_shmem_init(); + + int *source = (int *)roc_shmem_malloc(nelem * sizeof(int)); + int *dest = (int *)roc_shmem_malloc(nelem * sizeof(int)); + if (NULL == source || NULL == dest) { + std::cout << "Error allocating memory from symmetric heap" << std::endl; + roc_shmem_global_exit(1); + } + + init_sendbuf(source, nelem, my_pe); + for (int i=0; i>>(source, dest, + nelem, root, team_reduce_world_dup); + CHECK_HIP(hipDeviceSynchronize()); + + if(my_pe != root) { + bool pass = check_recvbuf(dest, nelem, my_pe, npes); + printf("Test %s \t nelem %d %s\n", argv[0], nelem, pass ? "[PASS]" : "[FAIL]"); + } + + roc_shmem_free(source); + roc_shmem_free(dest); + + roc_shmem_finalize(); + return 0; +} diff --git a/include/roc_shmem/roc_shmem.hpp b/include/roc_shmem/roc_shmem.hpp index c0484e8bd7..faef8a7502 100644 --- a/include/roc_shmem/roc_shmem.hpp +++ b/include/roc_shmem/roc_shmem.hpp @@ -837,10 +837,6 @@ __device__ ATTR_NO_INLINE void roc_shmem_threadfence_system(); * MACRO DECLARE SHMEM_REDUCTION APIs */ #define REDUCTION_API_GEN(T, TNAME, Op_API) \ - __device__ ATTR_NO_INLINE void roc_shmem_ctx_##TNAME##_##Op_API##_wg_to_all( \ - roc_shmem_ctx_t ctx, T *dest, const T *source, int nreduce, \ - int PE_start, int logPE_stride, int PE_size, T *pWrk, \ - long *pSync); /* NOLINT */ \ __device__ ATTR_NO_INLINE void roc_shmem_ctx_##TNAME##_##Op_API##_wg_to_all( \ roc_shmem_ctx_t ctx, roc_shmem_team_t team, T *dest, const T *source, \ int nreduce); \ @@ -874,16 +870,12 @@ __device__ ATTR_NO_INLINE void roc_shmem_threadfence_system(); */ #define BROADCAST_API_GEN(T, TNAME) \ __device__ ATTR_NO_INLINE void roc_shmem_ctx_##TNAME##_wg_broadcast( \ - roc_shmem_ctx_t ctx, T *dest, const T *source, int nelem, int pe_root, \ - int pe_start, int log_pe_stride, int pe_size, \ - long *p_sync); /* NOLINT */ \ + roc_shmem_ctx_t ctx, roc_shmem_team_t team, T *dest, const T *source, \ + int nelem, int pe_root); /* NOLINT */ \ __host__ void roc_shmem_ctx_##TNAME##_broadcast( \ roc_shmem_ctx_t ctx, T *dest, const T *source, int nelem, int pe_root, \ int pe_start, int log_pe_stride, int pe_size, \ long *p_sync); /* NOLINT */ \ - __device__ ATTR_NO_INLINE void roc_shmem_ctx_##TNAME##_wg_broadcast( \ - roc_shmem_ctx_t ctx, roc_shmem_team_t team, T *dest, const T *source, \ - int nelem, int pe_root); /* NOLINT */ \ __host__ void roc_shmem_ctx_##TNAME##_broadcast( \ roc_shmem_ctx_t ctx, roc_shmem_team_t team, T *dest, const T *source, \ int nelem, int pe_root); /* NOLINT */ diff --git a/src/ipc/context_ipc_device.hpp b/src/ipc/context_ipc_device.hpp index fe8f9ef323..c2a196b16a 100644 --- a/src/ipc/context_ipc_device.hpp +++ b/src/ipc/context_ipc_device.hpp @@ -121,11 +121,6 @@ class IPCContext : public Context { __device__ T amo_fetch_cas(void *dst, T value, T cond, int pe); // Collectives - template - __device__ void to_all(T *dest, const T *source, int nreduce, int PE_start, - int logPE_stride, int PE_size, T *pWrk, - long *pSync); // NOLINT(runtime/int) - template __device__ void to_all(roc_shmem_team_t team, T *dest, const T *source, int nreduce); @@ -135,10 +130,6 @@ class IPCContext : public Context { int nelems, int pe_root); template - __device__ void broadcast(T *dest, const T *source, int nelems, int pe_root, - int pe_start, int log_pe_stride, int pe_size, - long *p_sync); // NOLINT(runtime/int) - template __device__ void alltoall(roc_shmem_team_t team, T *dest, const T *source, int nelems); template @@ -206,7 +197,17 @@ class IPCContext : public Context { char* g_ret; //internal functions used by collective operations - template + template + __device__ void internal_to_all(T *dest, const T *source, int nreduce, int PE_start, + int stride, int PE_size, T *pWrk, + long *pSync); // NOLINT(runtime/int) + + template + __device__ void internal_broadcast(T *dest, const T *source, int nelems, int pe_root, + int pe_start, int stride, int pe_size, + long *p_sync); // NOLINT(runtime/int) + + template __device__ void internal_put_broadcast(T *dst, const T *src, int nelems, int pe_root, int PE_start, int logPE_stride, int PE_size); // NOLINT(runtime/int) diff --git a/src/ipc/context_ipc_tmpl_device.hpp b/src/ipc/context_ipc_tmpl_device.hpp index 406367c72e..11891c9830 100644 --- a/src/ipc/context_ipc_tmpl_device.hpp +++ b/src/ipc/context_ipc_tmpl_device.hpp @@ -164,11 +164,10 @@ __device__ void compute_reduce(T *src, T *dst, int size, int wg_id, template __device__ void IPCContext::internal_direct_allreduce( - T *dst, const T *src, int nelems, int PE_start, int logPE_stride, + T *dst, const T *src, int nelems, int PE_start, int stride, int PE_size, T *pWrk, long *pSync) { // NOLINT(runtime/int) - int stride = 1 << logPE_stride; int finish = PE_start + stride * PE_size; int pe = my_pe; @@ -183,12 +182,12 @@ __device__ void IPCContext::internal_direct_allreduce( for (int i = PE_start; i < finish; i += stride) { if (i != pe) { - putmem_nbi_wg(&pWrk[pe * nelems], reinterpret_cast(src), + internal_putmem_wg(&pWrk[pe * nelems], reinterpret_cast(src), nelems * sizeof(T), i); if (is_thread_zero_in_block()) { fence(); - put_nbi(&pSync[pe], &flag_val, 1, i); + internal_putmem(&pSync[pe], &flag_val, sizeof(*pSync), i); } } } @@ -278,7 +277,7 @@ __device__ void IPCContext::internal_direct_allreduce( template __device__ void IPCContext::internal_ring_allreduce( T *dst, const T *src, int nelems, [[maybe_unused]] int PE_start, - [[maybe_unused]] int logPE_stride, [[maybe_unused]] int PE_size, T *pWrk, + [[maybe_unused]] int stride, [[maybe_unused]] int PE_size, T *pWrk, long *pSync, // NOLINT(runtime/int) int n_seg, int seg_size, int chunk_size) { int off_seg, off_send, off_recv; @@ -300,7 +299,7 @@ __device__ void IPCContext::internal_ring_allreduce( off_send = (((my_pe + 1 - iter + 2 * num_pes) % num_pes) * chunk_size); off_recv = (((my_pe - iter + 2 * num_pes) % num_pes) * chunk_size); - putmem_nbi_wg(reinterpret_cast(&pWrk[off_send]), + internal_putmem_wg(reinterpret_cast(&pWrk[off_send]), reinterpret_cast(&dst[off_send + off_seg]), chunk_size * sizeof(T), send_pe); @@ -308,7 +307,7 @@ __device__ void IPCContext::internal_ring_allreduce( fence(); wait_val = seg + 100; - put_nbi(&pSync[iter], &wait_val, 1, send_pe); + internal_putmem(&pSync[iter], &wait_val, sizeof(*pSync), send_pe); #if defined(__gfx90a__) __threadfence_system(); #endif /* __gfx90a__ */ @@ -329,7 +328,7 @@ __device__ void IPCContext::internal_ring_allreduce( if (is_thread_zero_in_block()) { fence(); wait_val = seg + 100; - put_nbi(&pSync[iter], &wait_val, 1, send_pe); + internal_putmem(&pSync[iter], &wait_val, sizeof(*pSync), send_pe); #if defined(__gfx90a__) __threadfence_system(); #endif /* __gfx90a__ */ @@ -354,19 +353,19 @@ __device__ void IPCContext::to_all(roc_shmem_team_t team, T *dest, /** * Ensure that the stride is a multiple of 2 for GPU_IB. */ - int log_pe_stride = static_cast(team_obj->tinfo_wrt_world->log_stride); + int stride = team_obj->tinfo_wrt_world->stride; int pe_start = team_obj->tinfo_wrt_world->pe_start; int pe_size = team_obj->tinfo_wrt_world->size; long *p_sync = team_obj->barrier_pSync; T *pWrk = reinterpret_cast(team_obj->pWrk); - to_all(dest, source, nreduce, pe_start, log_pe_stride, pe_size, pWrk, + internal_to_all(dest, source, nreduce, pe_start, stride, pe_size, pWrk, p_sync); } template -__device__ void IPCContext::to_all(T *dest, const T *source, int nreduce, - int PE_start, int logPE_stride, +__device__ void IPCContext::internal_to_all(T *dest, const T *source, int nreduce, + int PE_start, int stride, int PE_size, T *pWrk, long *pSync) { // NOLINT(runtime/int) size_t direct_pWrk = num_pes * nreduce; @@ -376,7 +375,7 @@ __device__ void IPCContext::to_all(T *dest, const T *source, int nreduce, size_t provided_pSync = ROC_SHMEM_REDUCE_SYNC_SIZE; if (provided_pWrk >= direct_pWrk && provided_pSync >= direct_pSync) { - internal_direct_allreduce(dest, source, nreduce, PE_start, logPE_stride, + internal_direct_allreduce(dest, source, nreduce, PE_start, stride, PE_size, pWrk, pSync); } else { if (ring_pSync <= ROC_SHMEM_REDUCE_SYNC_SIZE) { @@ -395,7 +394,7 @@ __device__ void IPCContext::to_all(T *dest, const T *source, int nreduce, n_seg = 1; } internal_ring_allreduce(dest, source, nreduce, PE_start, - logPE_stride, PE_size, pWrk, pSync, n_seg, + stride, PE_size, pWrk, pSync, n_seg, seg_size, chunk_size); if (n_seg_up > n_seg) { T *p_dst = (dest + (n_seg * seg_size)); @@ -403,7 +402,7 @@ __device__ void IPCContext::to_all(T *dest, const T *source, int nreduce, int p_count = nreduce - (n_seg * seg_size); int p_chunk = p_count / num_pes; - internal_ring_allreduce(p_dst, p_src, p_count, PE_start, logPE_stride, + internal_ring_allreduce(p_dst, p_src, p_count, PE_start, stride, PE_size, pWrk, pSync, 1, (p_chunk * num_pes), p_chunk); if ((p_chunk * num_pes) < p_count) { @@ -412,7 +411,7 @@ __device__ void IPCContext::to_all(T *dest, const T *source, int nreduce, p_dst += (p_chunk * num_pes); const T *p_src2 = p_src + (p_chunk * num_pes); - internal_direct_allreduce(p_dst, p_src2, p_count, PE_start, logPE_stride, + internal_direct_allreduce(p_dst, p_src2, p_count, PE_start, stride, PE_size, pWrk, pSync); } } @@ -425,9 +424,8 @@ __device__ void IPCContext::to_all(T *dest, const T *source, int nreduce, template __device__ void IPCContext::internal_put_broadcast( T *dst, const T *src, int nelems, int pe_root, int pe_start, - int log_pe_stride, int pe_size) { // NOLINT(runtime/int) + int stride, int pe_size) { // NOLINT(runtime/int) if (my_pe == pe_root) { - int stride = 1 << log_pe_stride; int finish = pe_start + stride * pe_size; for (int i = pe_start; i < finish; i += stride) { if (i != my_pe) { @@ -453,31 +451,31 @@ __device__ void IPCContext::broadcast(roc_shmem_team_t team, T *dst, /** * Ensure that the stride is a multiple of 2 . */ - int log_pe_stride = static_cast(team_obj->tinfo_wrt_world->log_stride); + int stride = team_obj->tinfo_wrt_world->stride; int pe_start = team_obj->tinfo_wrt_world->pe_start; int pe_size = team_obj->tinfo_wrt_world->size; long *p_sync = team_obj->bcast_pSync; // Passed pe_root is relative to team, convert to world root int pe_root_world = team_obj->get_pe_in_world(pe_root); - broadcast(dst, src, nelems, pe_root_world, pe_start, log_pe_stride, + internal_broadcast(dst, src, nelems, pe_root_world, pe_start, stride, pe_size, p_sync); } template -__device__ void IPCContext::broadcast(T *dst, const T *src, int nelems, +__device__ void IPCContext::internal_broadcast(T *dst, const T *src, int nelems, int pe_root, int pe_start, - int log_pe_stride, int pe_size, + int stride, int pe_size, long *p_sync) { // NOLINT(runtime/int) if (num_pes < 4) { - internal_put_broadcast(dst, src, nelems, pe_root, pe_start, log_pe_stride, + internal_put_broadcast(dst, src, nelems, pe_root, pe_start, stride, pe_size); } else { internal_get_broadcast(dst, src, nelems, pe_root); } // Synchronize on completion of broadcast - internal_sync(my_pe, pe_start, (1 << log_pe_stride), pe_size, p_sync); + internal_sync(my_pe, pe_start, stride, pe_size, p_sync); } template diff --git a/src/roc_shmem_gpu.cpp b/src/roc_shmem_gpu.cpp index 582249eec9..1f1867724b 100644 --- a/src/roc_shmem_gpu.cpp +++ b/src/roc_shmem_gpu.cpp @@ -429,17 +429,6 @@ __device__ void *roc_shmem_ptr(const void *dest, int pe) { return get_internal_ctx(ROC_SHMEM_CTX_DEFAULT)->shmem_ptr(dest, pe); } -template -__device__ void roc_shmem_wg_to_all(roc_shmem_ctx_t ctx, T *dest, - const T *source, int nreduce, int PE_start, - int logPE_stride, int PE_size, T *pWrk, - long *pSync) { - GPU_DPRINTF("Function: roc_shmem_to_all\n"); - - get_internal_ctx(ctx)->to_all(dest, source, nreduce, PE_start, - logPE_stride, PE_size, pWrk, pSync); -} - template __device__ void roc_shmem_wg_to_all(roc_shmem_ctx_t ctx, roc_shmem_team_t team, T *dest, const T *source, int nreduce) { @@ -448,17 +437,6 @@ __device__ void roc_shmem_wg_to_all(roc_shmem_ctx_t ctx, roc_shmem_team_t team, get_internal_ctx(ctx)->to_all(team, dest, source, nreduce); } -template -__device__ void roc_shmem_wg_broadcast(roc_shmem_ctx_t ctx, T *dest, - const T *source, int nelem, int pe_root, - int pe_start, int log_pe_stride, - int pe_size, long *p_sync) { - GPU_DPRINTF("Function: roc_shmem_broadcast\n"); - - get_internal_ctx(ctx)->broadcast(dest, source, nelem, pe_root, pe_start, - log_pe_stride, pe_size, p_sync); -} - template __device__ void roc_shmem_wg_broadcast(roc_shmem_ctx_t ctx, roc_shmem_team_t team, T *dest, @@ -886,9 +864,6 @@ __device__ int roc_shmem_team_translate_pe(roc_shmem_team_t src_team, * Template generator for reductions */ #define REDUCTION_GEN(T, Op) \ - template __device__ void roc_shmem_wg_to_all( \ - roc_shmem_ctx_t ctx, T * dest, const T *source, int nreduce, \ - int PE_start, int logPE_stride, int PE_size, T *pWrk, long *pSync); \ template __device__ void roc_shmem_wg_to_all( \ roc_shmem_ctx_t ctx, roc_shmem_team_t team, T * dest, const T *source, \ int nreduce); @@ -919,9 +894,6 @@ __device__ int roc_shmem_team_translate_pe(roc_shmem_team_t src_team, template __device__ void roc_shmem_get_nbi(T * dest, const T *source, \ size_t nelems, int pe); \ template __device__ T roc_shmem_g(const T *source, int pe); \ - template __device__ void roc_shmem_wg_broadcast( \ - roc_shmem_ctx_t ctx, T * dest, const T *source, int nelem, int pe_root, \ - int pe_start, int log_pe_stride, int pe_size, long *p_sync); \ template __device__ void roc_shmem_wg_broadcast( \ roc_shmem_ctx_t ctx, roc_shmem_team_t team, T * dest, const T *source, \ int nelem, int pe_root); \ @@ -1100,12 +1072,6 @@ __device__ int roc_shmem_team_translate_pe(roc_shmem_team_t src_team, **/ #define REDUCTION_DEF_GEN(T, TNAME, Op_API, Op) \ - __device__ void roc_shmem_ctx_##TNAME##_##Op_API##_wg_to_all( \ - roc_shmem_ctx_t ctx, T *dest, const T *source, int nreduce, \ - int PE_start, int logPE_stride, int PE_size, T *pWrk, long *pSync) { \ - roc_shmem_wg_to_all(ctx, dest, source, nreduce, PE_start, \ - logPE_stride, PE_size, pWrk, pSync); \ - } \ __device__ void roc_shmem_ctx_##TNAME##_##Op_API##_wg_to_all( \ roc_shmem_ctx_t ctx, roc_shmem_team_t team, T *dest, const T *source, \ int nreduce) { \ @@ -1240,12 +1206,6 @@ __device__ int roc_shmem_team_translate_pe(roc_shmem_team_t src_team, size_t nelems, int pe) { \ roc_shmemx_get_nbi_wg(dest, source, nelems, pe); \ } \ - __device__ void roc_shmem_ctx_##TNAME##_wg_broadcast( \ - roc_shmem_ctx_t ctx, T *dest, const T *source, int nelem, int pe_root, \ - int pe_start, int log_pe_stride, int pe_size, long *p_sync) { \ - roc_shmem_wg_broadcast(ctx, dest, source, nelem, pe_root, pe_start, \ - log_pe_stride, pe_size, p_sync); \ - } \ __device__ void roc_shmem_ctx_##TNAME##_wg_broadcast( \ roc_shmem_ctx_t ctx, roc_shmem_team_t team, T *dest, const T *source, \ int nelem, int pe_root) { \ diff --git a/tests/functional_tests/broadcast_tester.cpp b/tests/functional_tests/broadcast_tester.cpp deleted file mode 100644 index f5c936c2ed..0000000000 --- a/tests/functional_tests/broadcast_tester.cpp +++ /dev/null @@ -1,159 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - *****************************************************************************/ - -using namespace rocshmem; - -/* Declare the template with a generic implementation */ -template -__device__ void wg_broadcast(roc_shmem_ctx_t ctx, T *dest, const T *source, - int nelem, int pe_root, int pe_start, - int log_pe_stride, int pe_size, long *p_sync) { - return; -} - -/* Define templates to call ROC_SHMEM */ -#define BROADCAST_DEF_GEN(T, TNAME) \ - template <> \ - __device__ void wg_broadcast( \ - roc_shmem_ctx_t ctx, T * dest, const T *source, int nelem, int pe_root, \ - int pe_start, int log_pe_stride, int pe_size, long *p_sync) { \ - roc_shmem_ctx_##TNAME##_wg_broadcast(ctx, dest, source, nelem, pe_root, \ - pe_start, log_pe_stride, pe_size, \ - p_sync); \ - } - -BROADCAST_DEF_GEN(float, float) -BROADCAST_DEF_GEN(double, double) -BROADCAST_DEF_GEN(char, char) -// BROADCAST_DEF_GEN(long double, longdouble) -BROADCAST_DEF_GEN(signed char, schar) -BROADCAST_DEF_GEN(short, short) -BROADCAST_DEF_GEN(int, int) -BROADCAST_DEF_GEN(long, long) -BROADCAST_DEF_GEN(long long, longlong) -BROADCAST_DEF_GEN(unsigned char, uchar) -BROADCAST_DEF_GEN(unsigned short, ushort) -BROADCAST_DEF_GEN(unsigned int, uint) -BROADCAST_DEF_GEN(unsigned long, ulong) -BROADCAST_DEF_GEN(unsigned long long, ulonglong) - -/****************************************************************************** - * DEVICE TEST KERNEL - *****************************************************************************/ -template -__global__ void BroadcastTest(int loop, int skip, uint64_t *timer, - T1 *source_buf, T1 *dest_buf, long *pSync, - int size, ShmemContextType ctx_type) { - __shared__ roc_shmem_ctx_t ctx; - - roc_shmem_wg_init(); - roc_shmem_wg_ctx_create(ctx_type, &ctx); - - int n_pes = roc_shmem_ctx_n_pes(ctx); - - __syncthreads(); - - uint64_t start; - for (int i = 0; i < loop; i++) { - if (i == skip && hipThreadIdx_x == 0) { - start = roc_shmem_timer(); - } - - wg_broadcast(ctx, - dest_buf, // T* dest - source_buf, // const T* source - size, // int nelement - 0, // int PE_root - 0, // int PE_start - 0, // int logPE_stride - n_pes, // int PE_size - pSync); // long *pSync - roc_shmem_ctx_wg_barrier_all(ctx); - } - - __syncthreads(); - - if (hipThreadIdx_x == 0) { - timer[hipBlockIdx_x] = roc_shmem_timer() - start; - } - - roc_shmem_wg_ctx_destroy(&ctx); - roc_shmem_wg_finalize(); -} - -/****************************************************************************** - * HOST TESTER CLASS METHODS - *****************************************************************************/ -template -BroadcastTester::BroadcastTester( - TesterArguments args, std::function f1, - std::function(const T1 &)> f2) - : Tester(args), init_buf{f1}, verify_buf{f2} { - source_buf = (T1 *)roc_shmem_malloc(args.max_msg_size * sizeof(T1)); - dest_buf = (T1 *)roc_shmem_malloc(args.max_msg_size * sizeof(T1)); - - size_t p_sync_size = ROC_SHMEM_BCAST_SYNC_SIZE; - pSync = (long *)roc_shmem_malloc(p_sync_size * sizeof(long)); - - for (int i = 0; i < p_sync_size; i++) { - pSync[i] = ROC_SHMEM_SYNC_VALUE; - } -} - -template -BroadcastTester::~BroadcastTester() { - roc_shmem_free(source_buf); - roc_shmem_free(dest_buf); - roc_shmem_free(pSync); -} - -template -void BroadcastTester::launchKernel(dim3 gridSize, dim3 blockSize, int loop, - uint64_t size) { - size_t shared_bytes = 0; - - hipLaunchKernelGGL(BroadcastTest, gridSize, blockSize, shared_bytes, - stream, loop, args.skip, timer, source_buf, dest_buf, - pSync, size, _shmem_context); - - num_msgs = loop + args.skip; - num_timed_msgs = loop; -} - -template -void BroadcastTester::resetBuffers(uint64_t size) { - for (int i = 0; i < args.max_msg_size; i++) { - init_buf(source_buf[i], dest_buf[i]); - } -} - -template -void BroadcastTester::verifyResults(uint64_t size) { - for (int i = 0; i < size; i++) { - auto r = verify_buf(dest_buf[i]); - if (r.first == false) { - fprintf(stderr, "Data validation error at idx %d\n", i); - fprintf(stderr, "%s.\n", r.second.c_str()); - exit(-1); - } - } -} diff --git a/tests/functional_tests/broadcast_tester.hpp b/tests/functional_tests/broadcast_tester.hpp deleted file mode 100644 index b436756aa4..0000000000 --- a/tests/functional_tests/broadcast_tester.hpp +++ /dev/null @@ -1,61 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - *****************************************************************************/ - -#ifndef _BROADCAST_TESTER_HPP_ -#define _BROADCAST_TESTER_HPP_ - -#include -#include - -#include "tester.hpp" - -/****************************************************************************** - * HOST TESTER CLASS - *****************************************************************************/ -template -class BroadcastTester : public Tester { - public: - explicit BroadcastTester( - TesterArguments args, std::function f1, - std::function(const T1 &)> f2); - virtual ~BroadcastTester(); - - protected: - virtual void resetBuffers(uint64_t size) override; - - virtual void launchKernel(dim3 gridSize, dim3 blockSize, int loop, - uint64_t size) override; - - virtual void verifyResults(uint64_t size) override; - - T1 *source_buf; - T1 *dest_buf; - long *pSync; - - private: - std::function init_buf; - std::function(const T1 &)> verify_buf; -}; - -#include "broadcast_tester.cpp" - -#endif diff --git a/tests/functional_tests/reduction_tester.cpp b/tests/functional_tests/reduction_tester.cpp deleted file mode 100644 index 4c1cbf66ba..0000000000 --- a/tests/functional_tests/reduction_tester.cpp +++ /dev/null @@ -1,168 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - *****************************************************************************/ - -using namespace rocshmem; - -/* Declare the template with a generic implementation */ -template -__device__ void wg_to_all(roc_shmem_ctx_t ctx, T *dest, const T *source, - int nreduce, int PE_start, int logPE_stride, - int PE_size, T *pWrk, long *pSync) { - return; -} - -/* Define templates to call ROC_SHMEM */ -#define REDUCTION_DEF_GEN(T, TNAME, Op_API, Op) \ - template <> \ - __device__ void wg_to_all( \ - roc_shmem_ctx_t ctx, T * dest, const T *source, int nreduce, \ - int PE_start, int logPE_stride, int PE_size, T *pWrk, long *pSync) { \ - roc_shmem_ctx_##TNAME##_##Op_API##_wg_to_all(ctx, dest, source, nreduce, \ - PE_start, logPE_stride, \ - PE_size, pWrk, pSync); \ - } - -#define ARITH_REDUCTION_DEF_GEN(T, TNAME) \ - REDUCTION_DEF_GEN(T, TNAME, sum, ROC_SHMEM_SUM) \ - REDUCTION_DEF_GEN(T, TNAME, min, ROC_SHMEM_MIN) \ - REDUCTION_DEF_GEN(T, TNAME, max, ROC_SHMEM_MAX) \ - REDUCTION_DEF_GEN(T, TNAME, prod, ROC_SHMEM_PROD) - -#define BITWISE_REDUCTION_DEF_GEN(T, TNAME) \ - REDUCTION_DEF_GEN(T, TNAME, or, ROC_SHMEM_OR) \ - REDUCTION_DEF_GEN(T, TNAME, and, ROC_SHMEM_AND) \ - REDUCTION_DEF_GEN(T, TNAME, xor, ROC_SHMEM_XOR) - -#define INT_REDUCTION_DEF_GEN(T, TNAME) \ - ARITH_REDUCTION_DEF_GEN(T, TNAME) \ - BITWISE_REDUCTION_DEF_GEN(T, TNAME) - -#define FLOAT_REDUCTION_DEF_GEN(T, TNAME) ARITH_REDUCTION_DEF_GEN(T, TNAME) - -INT_REDUCTION_DEF_GEN(int, int) -INT_REDUCTION_DEF_GEN(short, short) -INT_REDUCTION_DEF_GEN(long, long) -INT_REDUCTION_DEF_GEN(long long, longlong) -FLOAT_REDUCTION_DEF_GEN(float, float) -FLOAT_REDUCTION_DEF_GEN(double, double) -// long double reduction fails. hipcc/device may not support long double. -// so disable it for now. -// FLOAT_REDUCTION_DEF_GEN(long double, longdouble) - -/****************************************************************************** - * DEVICE TEST KERNEL - *****************************************************************************/ -template -__global__ void ReductionTest(int loop, int skip, uint64_t *timer, T1 *s_buf, - T1 *r_buf, T1 *pWrk, long *pSync, int size, - TestType type, ShmemContextType ctx_type) { - __shared__ roc_shmem_ctx_t ctx; - - roc_shmem_wg_init(); - roc_shmem_wg_ctx_create(ctx_type, &ctx); - - int n_pes = roc_shmem_ctx_n_pes(ctx); - - __syncthreads(); - - uint64_t start; - for (int i = 0; i < loop + skip; i++) { - if (i == skip && hipThreadIdx_x == 0) { - start = roc_shmem_timer(); - } - wg_to_all(ctx, r_buf, s_buf, size, 0, 0, n_pes, pWrk, pSync); - roc_shmem_ctx_wg_barrier_all(ctx); - } - - __syncthreads(); - - if (hipThreadIdx_x == 0) { - timer[hipBlockIdx_x] = roc_shmem_timer() - start; - } - - roc_shmem_wg_ctx_destroy(&ctx); - roc_shmem_wg_finalize(); -} - -/****************************************************************************** - * HOST TESTER CLASS METHODS - *****************************************************************************/ -template -ReductionTester::ReductionTester( - TesterArguments args, std::function f1, - std::function(const T1 &, const T1 &)> f2) - : Tester(args), init_buf{f1}, verify_buf{f2} { - s_buf = (T1 *)roc_shmem_malloc(args.max_msg_size * sizeof(T1)); - r_buf = (T1 *)roc_shmem_malloc(args.max_msg_size * sizeof(T1)); - - size_t p_wrk_size = - std::max(args.max_msg_size / 2 + 1, ROC_SHMEM_REDUCE_MIN_WRKDATA_SIZE); - pWrk = (T1 *)roc_shmem_malloc(p_wrk_size * sizeof(T1)); - - size_t p_sync_size = ROC_SHMEM_REDUCE_SYNC_SIZE; - pSync = (long *)roc_shmem_malloc(p_sync_size * sizeof(long)); - - for (int i = 0; i < p_sync_size; i++) { - pSync[i] = ROC_SHMEM_SYNC_VALUE; - } -} - -template -ReductionTester::~ReductionTester() { - roc_shmem_free(s_buf); - roc_shmem_free(r_buf); - roc_shmem_free(pWrk); - roc_shmem_free(pSync); -} - -template -void ReductionTester::launchKernel(dim3 gridSize, dim3 blockSize, - int loop, uint64_t size) { - size_t shared_bytes = 0; - - hipLaunchKernelGGL(HIP_KERNEL_NAME(ReductionTest), gridSize, - blockSize, shared_bytes, stream, loop, args.skip, timer, - s_buf, r_buf, pWrk, pSync, size, _type, _shmem_context); - - num_msgs = loop + args.skip; - num_timed_msgs = loop; -} - -template -void ReductionTester::resetBuffers(uint64_t size) { - for (int i = 0; i < args.max_msg_size; i++) { - init_buf(s_buf[i], r_buf[i]); - } -} - -template -void ReductionTester::verifyResults(uint64_t size) { - int n_pes = roc_shmem_n_pes(); - for (int i = 0; i < size; i++) { - auto r = verify_buf(r_buf[i], (T1)n_pes); - if (r.first == false) { - fprintf(stderr, "Data validation error at idx %d\n", i); - fprintf(stderr, "%s.\n", r.second.c_str()); - exit(-1); - } - } -} diff --git a/tests/functional_tests/reduction_tester.hpp b/tests/functional_tests/reduction_tester.hpp deleted file mode 100644 index 7342fec9ff..0000000000 --- a/tests/functional_tests/reduction_tester.hpp +++ /dev/null @@ -1,63 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - *****************************************************************************/ - -#ifndef _REDUCTION_TESTER_HPP_ -#define _REDUCTION_TESTER_HPP_ - -#include -#include - -#include "tester.hpp" - -/****************************************************************************** - * HOST TESTER CLASS - *****************************************************************************/ -template -class ReductionTester : public Tester { - public: - explicit ReductionTester( - TesterArguments args, std::function f1, - std::function(const T1 &, const T1 &)> f2); - virtual ~ReductionTester(); - - protected: - virtual void resetBuffers(uint64_t size) override; - - virtual void launchKernel(dim3 gridSize, dim3 blockSize, int loop, - uint64_t size) override; - - virtual void verifyResults(uint64_t size) override; - - T1 *s_buf; - T1 *r_buf; - T1 *pWrk; - long *pSync; - - private: - std::function init_buf; - std::function(const T1 &, const T1 &)> - verify_buf; -}; - -#include "reduction_tester.cpp" - -#endif diff --git a/tests/functional_tests/tester.cpp b/tests/functional_tests/tester.cpp index 8d081d976b..b73ae67b6c 100644 --- a/tests/functional_tests/tester.cpp +++ b/tests/functional_tests/tester.cpp @@ -35,7 +35,6 @@ #include "amo_extended_tester.hpp" #include "amo_standard_tester.hpp" #include "barrier_all_tester.hpp" -#include "broadcast_tester.hpp" #include "empty_tester.hpp" #include "extended_primitives.hpp" #include "fcollect_tester.hpp" @@ -44,7 +43,6 @@ #include "primitive_mr_tester.hpp" #include "primitive_tester.hpp" #include "random_access_tester.hpp" -#include "reduction_tester.hpp" #include "shmem_ptr_tester.hpp" #include "swarm_tester.hpp" #include "sync_tester.hpp" @@ -156,130 +154,6 @@ std::vector Tester::create(TesterArguments args) { std::to_string(n_pes)); })); return testers; - case ReductionTestType: - if (rank == 0) std::cout << "All-to-All Reduction ###" << std::endl; - - testers.push_back(new ReductionTester( - args, - [](float& f1, float& f2) { - f1 = 1; - f2 = 1; - }, - [](float v, float n_pes) { - return (v == n_pes) - ? std::make_pair(true, "") - : std::make_pair(false, "Got " + std::to_string(v) + - ", Expect " + - std::to_string(n_pes)); - })); - -#if 0 - testers.push_back( - new ReductionTester( - args, - [](double& f1, double& f2) - { - f1=1; - f2=1; - }, - [](double v, double n_pes) - { - return (v == n_pes) ? - std::make_pair(true, "") : - std::make_pair(false, - "Got "+ std::to_string(v) + ", Expect " + std::to_string(n_pes)); - } - ) - ); - - testers.push_back( new ReductionTester(args, - [](long double& f1,long double& f2){f1=1; f2=1;}, - [](long double v){ return (v==2.0) ? std::make_pair(true, "") : - std::make_pair(false, "Got "+ std::to_string(v) +", Expect 2.0. [r3]."); })); - testers.push_back( new ReductionTester(args, - [](short& f1, short& f2){f1=1; f2=2;}, - [](short v){ return (v==3) ? std::make_pair(true, "") : - std::make_pair(false, "Got "+ std::to_string(v) +", Expect 3. [r4]."); })); - testers.push_back( new ReductionTester(args, - [](int& f1, int& f2){f1=1; f2=2;}, - [](int v){ return (v==3) ? std::make_pair(true, "") : - std::make_pair(false, "Got "+ std::to_string(v) +", Expect 3. [r5]."); })); - testers.push_back( new ReductionTester(args, - [](long long& f1, long long& f2){f1=1; f2=2;}, - [](long long v){ return (v==3) ? std::make_pair(true, "") : - std::make_pair(false, "Got "+ std::to_string(v) +", Expect 3. [r6]."); })); - testers.push_back( new ReductionTester(args, - [](long long& f1, long long& f2){f1=1; f2=2;}, - [](long long v){ return (v==3) ? std::make_pair(true, "") : - std::make_pair(false, "Got "+ std::to_string(v) +", Expect 3. [r7]."); })); - // seems like deadlock or soemthing, this test hang forever - testers.push_back( new ReductionTester(args, - [](short& f1, short& f2){f1=1; f2=2;}, - [](short v){ return (v==1) ? std::make_pair(true, "") : - std::make_pair(false, "Got "+ std::to_string(v) +", Expect 1. [r8]."); })); - testers.push_back( new ReductionTester(args, - [](int& f1, int& f2){f1=1; f2=2;}, - [](int v){ return (v==1) ? std::make_pair(true, "") : - std::make_pair(false, "Got "+ std::to_string(v) +", Expect 1. [r9]."); })); - testers.push_back( new ReductionTester(args, - [](long long& f1, long long& f2){f1=1; f2=2;}, - [](long long v){ return (v==1) ? std::make_pair(true, "") : - std::make_pair(false, "Got "+ std::to_string(v) +", Expect 1. [r10]."); })); - testers.push_back( new ReductionTester(args, - [](long long& f1, long long& f2){f1=1; f2=2;}, - [](long long v){ return (v==1) ? std::make_pair(true, "") : - std::make_pair(false, "Got "+ std::to_string(v) +", Expect 1. [r11]."); })); - testers.push_back( new ReductionTester(args, - [](int& f1, int& f2){f1=1; f2=2;}, - [](int v){ return (v==2) ? std::make_pair(true, "") : - std::make_pair(false, "Got "+ std::to_string(v) +", Expect 2. [r12]."); })); - testers.push_back( new ReductionTester(args, - [](long long& f1, long long& f2){f1=1; f2=2;}, - [](long long v){ return (v==2) ? std::make_pair(true, "") : - std::make_pair(false, "Got "+ std::to_string(v) +", Expect 2. [r13]."); })); - testers.push_back( new ReductionTester(args, - [](long long& f1, long long& f2){f1=1; f2=2;}, - [](long long v){ return (v==2) ? std::make_pair(true, "") : - std::make_pair(false, "Got "+ std::to_string(v) +", Expect 2. [r14]."); })); -#endif - return testers; - case BroadcastTestType: - if (rank == 0) { - std::cout << "Broadcast Test ###" << std::endl; - } - testers.push_back(new BroadcastTester( - args, - [](long& f1, long& f2) { - f1 = 1; - f2 = 2; - }, - [rank](long v) { - long expected_val; - /** - * The verification routine here requires that the - * PE_root value is 0 which denotes that the - * sending processing element is rank 0. - * - * The difference in expected values arises from - * the specification for broadcast where the - * PE_root processing element does not copy the - * contents from its own source to dest during - * the broadcast. - */ - if (rank == 0) { - expected_val = 2; - } else { - expected_val = 1; - } - - return (v == expected_val) - ? std::make_pair(true, "") - : std::make_pair( - false, "Rank " + std::to_string(rank) + ", Got " + - std::to_string(v) + ", Expect " + - std::to_string(expected_val)); - })); - return testers; case TeamBroadcastTestType: if (rank == 0) { std::cout << "Team Broadcast Test ###" << std::endl; @@ -658,9 +532,7 @@ bool Tester::peLaunchesKernel() { /** * Some test types are active on both sides. */ - is_launcher = is_launcher || (_type == ReductionTestType) || - (_type == TeamReductionTestType) || - (_type == BroadcastTestType) || + is_launcher = is_launcher || (_type == TeamReductionTestType) || (_type == TeamBroadcastTestType) || (_type == AllToAllTestType) || (_type == FCollectTestType) || (_type == PingPongTestType) || (_type == BarrierAllTestType) || diff --git a/tests/functional_tests/tester.hpp b/tests/functional_tests/tester.hpp index ad639a63ae..d5da3a6bc7 100644 --- a/tests/functional_tests/tester.hpp +++ b/tests/functional_tests/tester.hpp @@ -37,7 +37,7 @@ enum TestType { PutTestType = 2, PutNBITestType = 3, GetSwarmTestType = 4, - ReductionTestType = 5, + // ReductionTestType = 5, AMO_FAddTestType = 6, AMO_FIncTestType = 7, AMO_FetchTestType = 8, @@ -52,7 +52,7 @@ enum TestType { BarrierAllTestType = 17, SyncAllTestType = 18, SyncTestType = 19, - BroadcastTestType = 20, + // BroadcastTestType = 20, CollectTestType = 21, FCollectTestType = 22, AllToAllTestType = 23, diff --git a/tests/functional_tests/tester_arguments.cpp b/tests/functional_tests/tester_arguments.cpp index 1167fafaeb..a8c5067492 100644 --- a/tests/functional_tests/tester_arguments.cpp +++ b/tests/functional_tests/tester_arguments.cpp @@ -136,9 +136,8 @@ void TesterArguments::get_rocshmem_arguments() { myid = roc_shmem_my_pe(); TestType type = (TestType)algorithm; - if ((type != ReductionTestType) && (type != BarrierAllTestType) && - (type != SyncAllTestType) && (type != SyncTestType) && - (type != BroadcastTestType) && (type != AllToAllTestType) && + if ((type != BarrierAllTestType) && (type != SyncAllTestType) && + (type != SyncTestType) && (type != AllToAllTestType) && (type != FCollectTestType) && (type != TeamReductionTestType) && (type != TeamBroadcastTestType) && (type != PingAllTestType)) { if (numprocs != 2) {