diff --git a/projects/rocshmem/examples/rocshmem_allreduce_test.cc b/projects/rocshmem/examples/rocshmem_allreduce_test.cc index d9a2a581ff..fd63f07c3c 100644 --- a/projects/rocshmem/examples/rocshmem_allreduce_test.cc +++ b/projects/rocshmem/examples/rocshmem_allreduce_test.cc @@ -43,21 +43,21 @@ __global__ void allreduce_test(int *source, int *dest, size_t nelem, roc_shmem_wg_finalize(); } -static void init_sendbuf (int *sendbuf, int count, int mynode) +static void init_sendbuf (int *source, int nelem, int my_pe) { - for (int i = 0; i < count; i++) { - sendbuf[i] = mynode + i%9; + for (int i = 0; i < nelem; i++) { + source[i] = my_pe + i%9; } } -static bool check_recvbuf(int *recvbuf, int nprocs, int rank, int count) +static bool check_recvbuf(int *dest, int nelem, int my_pe, int npes) { bool res=true; - int expected = nprocs * (nprocs -1) / 2; + int expected = npes * (npes -1) / 2; - for (int i=0; i 1) { nelem = atoi(argv[1]); } - roc_shmem_init(); + int my_pe = roc_shmem_my_pe(); int npes = roc_shmem_n_pes(); + + int ndevices, my_device = 0; + CHECK_HIP(hipGetDeviceCount(&ndevices)); + my_device = my_pe % ndevices; + CHECK_HIP(hipSetDevice(my_device)); + + roc_shmem_init(); + int *source = (int *)roc_shmem_malloc(nelem * sizeof(int)); - int *result = (int *)roc_shmem_malloc(nelem * sizeof(int)); - if (NULL == source || NULL == result) { + int *dest = (int *)roc_shmem_malloc(nelem * sizeof(int)); + if (NULL == source || NULL == dest) { std::cout << "Error allocating memory from symmetric heap" << std::endl; roc_shmem_global_exit(1); } - init_sendbuf(source, nelem, rank); + init_sendbuf(source, nelem, my_pe); for (int i=0; i>>(source, result, + allreduce_test<<>>(source, dest, nelem, team_reduce_world_dup); CHECK_HIP(hipDeviceSynchronize()); - bool pass = check_recvbuf(result, npes, rank, nelem); - + bool pass = check_recvbuf(dest, nelem, my_pe, npes); printf("Test %s \t nelem %d %s\n", argv[0], nelem, pass ? "[PASS]" : "[FAIL]"); roc_shmem_free(source); - roc_shmem_free(result); + roc_shmem_free(dest); roc_shmem_finalize(); return 0; diff --git a/projects/rocshmem/examples/rocshmem_alltoall_test.cc b/projects/rocshmem/examples/rocshmem_alltoall_test.cc new file mode 100644 index 0000000000..858276dbf1 --- /dev/null +++ b/projects/rocshmem/examples/rocshmem_alltoall_test.cc @@ -0,0 +1,129 @@ +/* +** hipcc -c -fgpu-rdc -x hip rocshmem_alltoall_test.cc -I/opt/rocm/include +** -I$ROCHSMEM_INSTALL_DIR/include -I$OPENMPI_UCX_INSTALL_DIR/include/ +** hipcc -fgpu-rdc --hip-link rocshmem_alltoall_test.o -o rocshmem_alltoall_test +** $ROCHSMEM_INSTALL_DIR/lib/librocshmem.a $OPENMPI_UCX_INSTALL_DIR/lib/libmpi.so +** -L/opt/rocm/lib -lamdhip64 -lhsa-runtime64 +** +** ROC_SHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 8 ./rocshmem_allreduce_test +*/ + +#include + +#define __HIP_PLATFORM_AMD__ +#include +#include +#include + +#define CHECK_HIP(condition) { \ + hipError_t error = condition; \ + if(error != hipSuccess){ \ + fprintf(stderr,"HIP error: %d line: %d\n", error, __LINE__); \ + MPI_Abort(MPI_COMM_WORLD, error); \ + } \ + } + +using namespace rocshmem; + +__global__ void alltoall_test(int *source, int *dest, size_t nelem, + roc_shmem_team_t team) { + __shared__ roc_shmem_ctx_t ctx; + int64_t ctx_type = 0; + + roc_shmem_wg_init(); + roc_shmem_wg_ctx_create(ctx_type, &ctx); + int num_pes = roc_shmem_ctx_n_pes(ctx); + + roc_shmem_ctx_int_wg_alltoall(ctx, team, dest, source, nelem); + + roc_shmem_ctx_quiet(ctx); + __syncthreads(); + + roc_shmem_wg_ctx_destroy(&ctx); + roc_shmem_wg_finalize(); +} + +static void init_sendbuf (int *source, int nelem, int my_pe, int npes) +{ + for (int pe = 0; pe < npes; pe++) { + for (int i = 0; i < nelem; i++) { + int idx = (pe * nelem) + i; + source[idx] = my_pe + pe; + } + } +} + +static bool check_recvbuf(int *dest, int nelem, int my_pe, int npes) +{ + bool res=true; + + for(int pe = 0; pe < npes; pe++) { + for(int i = 0; i < nelem; i++) { + int idx = (pe * nelem) + i; + int result = my_pe + pe; + if (dest[idx] != result) { + res = false; +#ifdef VERBOSE + printf("recvbuf[%d] = %d expected %d \n", i, recvbuf[i], result); +#endif + } + } + } + + return res; +} + +#define MAX_ELEM 256 + +int main (int argc, char **argv) +{ + int nelem = MAX_ELEM; + + if (argc > 1) { + nelem = atoi(argv[1]); + } + + int my_pe = roc_shmem_my_pe(); + int npes = roc_shmem_n_pes(); + + int ndevices, my_device = 0; + CHECK_HIP(hipGetDeviceCount(&ndevices)); + my_device = my_pe % ndevices; + CHECK_HIP(hipSetDevice(my_device)); + + roc_shmem_init(); + + int *source = (int *)roc_shmem_malloc(nelem * npes * sizeof(int)); + int *dest = (int *)roc_shmem_malloc(nelem * npes * sizeof(int)); + if (NULL == source || NULL == dest) { + std::cout << "Error allocating memory from symmetric heap" << std::endl; + roc_shmem_global_exit(1); + } + + init_sendbuf(source, nelem, my_pe, npes); + for (int i = 0; i < nelem * npes; i++) { + dest[i] = -1; + } + + roc_shmem_team_t team_reduce_world_dup; + team_reduce_world_dup = ROC_SHMEM_TEAM_INVALID; + roc_shmem_team_split_strided(ROC_SHMEM_TEAM_WORLD, 0, 1, npes, nullptr, 0, + &team_reduce_world_dup); + + CHECK_HIP(hipDeviceSynchronize()); + + int threadsPerBlock=256; + alltoall_test<<>>(source, dest, + nelem, team_reduce_world_dup); + CHECK_HIP(hipDeviceSynchronize()); + + bool pass = check_recvbuf(dest, nelem, my_pe, npes); + + printf("Test %s \t nelem %d %s\n", argv[0], nelem, pass ? "[PASS]" : "[FAIL]"); + + roc_shmem_free(source); + roc_shmem_free(dest); + + roc_shmem_finalize(); + return 0; +} diff --git a/projects/rocshmem/examples/rocshmem_broadcast_test.cc b/projects/rocshmem/examples/rocshmem_broadcast_test.cc new file mode 100644 index 0000000000..48deea1393 --- /dev/null +++ b/projects/rocshmem/examples/rocshmem_broadcast_test.cc @@ -0,0 +1,124 @@ +/* +** hipcc -c -fgpu-rdc -x hip rocshmem_broadcast_test.cc -I/opt/rocm/include +** -I$ROCHSMEM_INSTALL_DIR/include -I$OPENMPI_UCX_INSTALL_DIR/include/ +** hipcc -fgpu-rdc --hip-link rocshmem_broadcast_test.o -o rocshmem_broadcast_test +** $ROCHSMEM_INSTALL_DIR/lib/librocshmem.a $OPENMPI_UCX_INSTALL_DIR/lib/libmpi.so +** -L/opt/rocm/lib -lamdhip64 -lhsa-runtime64 +** +** ROC_SHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 8 ./rocshmem_broadcast_test +*/ + +#include + +#define __HIP_PLATFORM_AMD__ +#include +#include +#include + +#define CHECK_HIP(condition) { \ + hipError_t error = condition; \ + if(error != hipSuccess){ \ + fprintf(stderr,"HIP error: %d line: %d\n", error, __LINE__); \ + MPI_Abort(MPI_COMM_WORLD, error); \ + } \ + } + +using namespace rocshmem; + +__global__ void broadcast_test(int *source, int *dest, size_t nelem, + int root, roc_shmem_team_t team) { + __shared__ roc_shmem_ctx_t ctx; + int64_t ctx_type = 0; + + roc_shmem_wg_init(); + roc_shmem_wg_ctx_create(ctx_type, &ctx); + int num_pes = roc_shmem_ctx_n_pes(ctx); + + roc_shmem_ctx_int_wg_broadcast(ctx, team, dest, source, nelem, root); + + roc_shmem_ctx_quiet(ctx); + __syncthreads(); + + roc_shmem_wg_ctx_destroy(&ctx); + roc_shmem_wg_finalize(); +} + +static void init_sendbuf(int *source, int nelem, int my_pe) +{ + for (int i = 0; i < nelem; i++) { + source[i] = i; + } +} + +static bool check_recvbuf(int *dest, int nelem, int my_pe, int npes) +{ + bool res=true; + + for (int i = 0; i < npes; i++) { + if (dest[i] != i) { + res = false; +#ifdef VERBOSE + printf("PE: %d, dest[%d] = %d, expected %d \n", my_pe, i, dest[i], i); +#endif + } + } + + return res; +} + +#define MAX_ELEM 256 + +int main(int argc, char **argv) +{ + int nelem = MAX_ELEM; + + if (argc > 1) { + nelem = atoi(argv[1]); + } + + int my_pe = roc_shmem_my_pe(); + int npes = roc_shmem_n_pes(); + + int ndevices, my_device = 0; + CHECK_HIP(hipGetDeviceCount(&ndevices)); + my_device = my_pe % ndevices; + CHECK_HIP(hipSetDevice(my_device)); + + roc_shmem_init(); + + int *source = (int *)roc_shmem_malloc(nelem * sizeof(int)); + int *dest = (int *)roc_shmem_malloc(nelem * sizeof(int)); + if (NULL == source || NULL == dest) { + std::cout << "Error allocating memory from symmetric heap" << std::endl; + roc_shmem_global_exit(1); + } + + init_sendbuf(source, nelem, my_pe); + for (int i=0; i>>(source, dest, + nelem, root, team_reduce_world_dup); + CHECK_HIP(hipDeviceSynchronize()); + + if(my_pe != root) { + bool pass = check_recvbuf(dest, nelem, my_pe, npes); + printf("Test %s \t nelem %d %s\n", argv[0], nelem, pass ? "[PASS]" : "[FAIL]"); + } + + roc_shmem_free(source); + roc_shmem_free(dest); + + roc_shmem_finalize(); + return 0; +}