From 7f3dd331442753bf547a93d54af2bce7749c005d Mon Sep 17 00:00:00 2001 From: Yiltan Hassan Temucin Date: Tue, 29 Oct 2024 06:32:13 -0700 Subject: [PATCH 1/3] modified team based to_all -> reduce [ROCm/rocshmem commit: 997eb69b5adce857d1aca3b72d3b5476be49323e] --- .../rocshmem/include/roc_shmem/roc_shmem.hpp | 10 ++++++++-- .../rocshmem/scripts/build_configs/ipc_single | 2 +- projects/rocshmem/src/context.hpp | 6 ++---- projects/rocshmem/src/context_tmpl_device.hpp | 6 +++--- projects/rocshmem/src/context_tmpl_host.hpp | 8 ++++---- projects/rocshmem/src/host/host.hpp | 3 +-- projects/rocshmem/src/host/host_templates.hpp | 6 +++--- projects/rocshmem/src/ipc/context_ipc_device.hpp | 3 +-- projects/rocshmem/src/ipc/context_ipc_host.hpp | 3 +-- .../rocshmem/src/ipc/context_ipc_tmpl_device.hpp | 7 ++++--- .../rocshmem/src/ipc/context_ipc_tmpl_host.hpp | 4 ++-- projects/rocshmem/src/roc_shmem.cpp | 14 +++++++------- projects/rocshmem/src/roc_shmem_gpu.cpp | 14 +++++++------- .../functional_tests/team_reduction_tester.cpp | 16 ++++++++-------- 14 files changed, 52 insertions(+), 50 deletions(-) diff --git a/projects/rocshmem/include/roc_shmem/roc_shmem.hpp b/projects/rocshmem/include/roc_shmem/roc_shmem.hpp index faef8a7502..cdcf74f467 100644 --- a/projects/rocshmem/include/roc_shmem/roc_shmem.hpp +++ b/projects/rocshmem/include/roc_shmem/roc_shmem.hpp @@ -49,6 +49,12 @@ namespace rocshmem { #define ATTR_NO_INLINE #endif + +enum ROC_SHMEM_STATUS { + ROC_SHMEM_SUCCESS = 0, + ROC_SHMEM_ERROR = 1, +}; + enum ROC_SHMEM_OP { ROC_SHMEM_SUM, ROC_SHMEM_MAX, @@ -837,14 +843,14 @@ __device__ ATTR_NO_INLINE void roc_shmem_threadfence_system(); * MACRO DECLARE SHMEM_REDUCTION APIs */ #define REDUCTION_API_GEN(T, TNAME, Op_API) \ - __device__ ATTR_NO_INLINE void roc_shmem_ctx_##TNAME##_##Op_API##_wg_to_all( \ + __device__ ATTR_NO_INLINE int roc_shmem_ctx_##TNAME##_##Op_API##_wg_reduce( \ roc_shmem_ctx_t ctx, roc_shmem_team_t team, T *dest, const T *source, \ int nreduce); \ __host__ void roc_shmem_ctx_##TNAME##_##Op_API##_to_all( \ roc_shmem_ctx_t ctx, T *dest, const T *source, int nreduce, \ int PE_start, int logPE_stride, int PE_size, T *pWrk, \ long *pSync); /* NOLINT */ \ - __host__ void roc_shmem_ctx_##TNAME##_##Op_API##_to_all( \ + __host__ int roc_shmem_ctx_##TNAME##_##Op_API##_reduce( \ roc_shmem_ctx_t ctx, roc_shmem_team_t team, T *dest, const T *source, \ int nreduce); diff --git a/projects/rocshmem/scripts/build_configs/ipc_single b/projects/rocshmem/scripts/build_configs/ipc_single index 2013b2b67c..d0c43b9231 100755 --- a/projects/rocshmem/scripts/build_configs/ipc_single +++ b/projects/rocshmem/scripts/build_configs/ipc_single @@ -27,4 +27,4 @@ cmake \ -DUSE_HOST_SIDE_HDP_FLUSH=OFF\ $src_path cmake --build . --parallel 8 -cmake --install . \ No newline at end of file +cmake --install . diff --git a/projects/rocshmem/src/context.hpp b/projects/rocshmem/src/context.hpp index 0db4e99b23..4516a5d0a1 100644 --- a/projects/rocshmem/src/context.hpp +++ b/projects/rocshmem/src/context.hpp @@ -192,8 +192,7 @@ class Context { long* pSync); // NOLINT(runtime/int) template - __device__ void to_all(roc_shmem_team_t team, T* dest, const T* source, - int nreduce); + __device__ int reduce(roc_shmem_team_t team, T* dest, const T* source, int nreduce); template __device__ void put(T* dest, const T* source, size_t nelems, int pe); @@ -361,8 +360,7 @@ class Context { long* pSync); // NOLINT(runtime/int) template - __host__ void to_all(roc_shmem_team_t team, T* dest, const T* source, - int nreduce); + __host__ int reduce(roc_shmem_team_t team, T* dest, const T* source, int nreduce); template __host__ void wait_until(T *ivars, int cmp, T val); diff --git a/projects/rocshmem/src/context_tmpl_device.hpp b/projects/rocshmem/src/context_tmpl_device.hpp index d0310d476a..b504fce229 100644 --- a/projects/rocshmem/src/context_tmpl_device.hpp +++ b/projects/rocshmem/src/context_tmpl_device.hpp @@ -80,17 +80,17 @@ __device__ void Context::to_all(T *dest, const T *source, int nreduce, } template -__device__ void Context::to_all(roc_shmem_team_t team, T *dest, const T *source, +__device__ int Context::reduce(roc_shmem_team_t team, T *dest, const T *source, int nreduce) { if (nreduce == 0) { - return; + return ROC_SHMEM_SUCCESS; } if (is_thread_zero_in_block()) { ctxStats.incStat(NUM_TO_ALL); } - DISPATCH(to_all(team, dest, source, nreduce)); + DISPATCH_RET(reduce(team, dest, source, nreduce)); } template diff --git a/projects/rocshmem/src/context_tmpl_host.hpp b/projects/rocshmem/src/context_tmpl_host.hpp index a4c7929501..e8cd2f23b9 100644 --- a/projects/rocshmem/src/context_tmpl_host.hpp +++ b/projects/rocshmem/src/context_tmpl_host.hpp @@ -222,15 +222,15 @@ __host__ void Context::to_all(T *dest, const T *source, int nreduce, } template -__host__ void Context::to_all(roc_shmem_team_t team, T *dest, const T *source, - int nreduce) { // NOLINT(runtime/int) +__host__ int Context::reduce(roc_shmem_team_t team, T *dest, const T *source, + int nreduce) { // NOLINT(runtime/int) if (nreduce == 0) { - return; + return ROC_SHMEM_SUCCESS; } ctxHostStats.incStat(NUM_HOST_TO_ALL); - HOST_DISPATCH(to_all(team, dest, source, nreduce)); + HOST_DISPATCH_RET(reduce(team, dest, source, nreduce)); } template diff --git a/projects/rocshmem/src/host/host.hpp b/projects/rocshmem/src/host/host.hpp index 4cd2b2e393..79d7a2eec5 100644 --- a/projects/rocshmem/src/host/host.hpp +++ b/projects/rocshmem/src/host/host.hpp @@ -207,8 +207,7 @@ class HostInterface { long* p_sync); // NOLINT(runtime/int) template - __host__ void to_all(roc_shmem_team_t team, T* dest, const T* source, - int nreduce); + __host__ int reduce(roc_shmem_team_t team, T* dest, const T* source, int nreduce); template __host__ void wait_until(T *ivars, int cmp, T val, diff --git a/projects/rocshmem/src/host/host_templates.hpp b/projects/rocshmem/src/host/host_templates.hpp index 41ff7cb094..a7cce76ea9 100644 --- a/projects/rocshmem/src/host/host_templates.hpp +++ b/projects/rocshmem/src/host/host_templates.hpp @@ -376,9 +376,9 @@ __host__ void HostInterface::to_all(T* dest, const T* source, int nreduce, } template -__host__ void HostInterface::to_all(roc_shmem_team_t team, T* dest, +__host__ int HostInterface::reduce(roc_shmem_team_t team, T* dest, const T* source, int nreduce) { - DPRINTF("Function: Team-based host_to_all\n"); + DPRINTF("Function: Team-based host_reduce\n"); /* * Get the MPI communicator of this team @@ -388,7 +388,7 @@ __host__ void HostInterface::to_all(roc_shmem_team_t team, T* dest, to_all_internal(mpi_comm, dest, source, nreduce); - return; + return ROC_SHMEM_SUCCESS; } template diff --git a/projects/rocshmem/src/ipc/context_ipc_device.hpp b/projects/rocshmem/src/ipc/context_ipc_device.hpp index c2a196b16a..9ff8a688a6 100644 --- a/projects/rocshmem/src/ipc/context_ipc_device.hpp +++ b/projects/rocshmem/src/ipc/context_ipc_device.hpp @@ -122,8 +122,7 @@ class IPCContext : public Context { // Collectives template - __device__ void to_all(roc_shmem_team_t team, T *dest, const T *source, - int nreduce); + __device__ int reduce(roc_shmem_team_t team, T *dest, const T *source, int nreduce); template __device__ void broadcast(roc_shmem_team_t team, T *dest, const T *source, diff --git a/projects/rocshmem/src/ipc/context_ipc_host.hpp b/projects/rocshmem/src/ipc/context_ipc_host.hpp index 4c2e626d75..8e2c9f48d2 100644 --- a/projects/rocshmem/src/ipc/context_ipc_host.hpp +++ b/projects/rocshmem/src/ipc/context_ipc_host.hpp @@ -95,8 +95,7 @@ class IPCHostContext : public Context { long *p_sync); template - __host__ void to_all(roc_shmem_team_t team, T *dest, const T *source, - int nreduce); + __host__ int reduce(roc_shmem_team_t team, T *dest, const T *source, int nreduce); template __host__ void wait_until(T *ivars, int cmp, T val); diff --git a/projects/rocshmem/src/ipc/context_ipc_tmpl_device.hpp b/projects/rocshmem/src/ipc/context_ipc_tmpl_device.hpp index 11891c9830..0e1504a6e7 100644 --- a/projects/rocshmem/src/ipc/context_ipc_tmpl_device.hpp +++ b/projects/rocshmem/src/ipc/context_ipc_tmpl_device.hpp @@ -151,7 +151,7 @@ __device__ T IPCContext::amo_fetch_cas(void *dest, T value, T cond, int pe) { reinterpret_cast(ipcImpl_.ipc_bases[pe] + L_offset), cond, value); } - + // Collectives template __device__ void compute_reduce(T *src, T *dst, int size, int wg_id, @@ -346,8 +346,8 @@ __device__ void IPCContext::internal_ring_allreduce( } template -__device__ void IPCContext::to_all(roc_shmem_team_t team, T *dest, - const T *source, int nreduce) { +__device__ int IPCContext::reduce(roc_shmem_team_t team, T *dest, + const T *source, int nreduce) { IPCTeam *team_obj = reinterpret_cast(team); /** @@ -361,6 +361,7 @@ __device__ void IPCContext::to_all(roc_shmem_team_t team, T *dest, internal_to_all(dest, source, nreduce, pe_start, stride, pe_size, pWrk, p_sync); + return ROC_SHMEM_SUCCESS; } template diff --git a/projects/rocshmem/src/ipc/context_ipc_tmpl_host.hpp b/projects/rocshmem/src/ipc/context_ipc_tmpl_host.hpp index d78593b888..e559ca7068 100644 --- a/projects/rocshmem/src/ipc/context_ipc_tmpl_host.hpp +++ b/projects/rocshmem/src/ipc/context_ipc_tmpl_host.hpp @@ -109,9 +109,9 @@ __host__ void IPCHostContext::to_all(T *dest, const T *source, int nreduce, } template -__host__ void IPCHostContext::to_all(roc_shmem_team_t team, T *dest, +__host__ int IPCHostContext::reduce(roc_shmem_team_t team, T *dest, const T *source, int nreduce) { - host_interface->to_all(team, dest, source, nreduce); + return host_interface->reduce(team, dest, source, nreduce); } template diff --git a/projects/rocshmem/src/roc_shmem.cpp b/projects/rocshmem/src/roc_shmem.cpp index cdf76fa9d3..e778bbeab5 100644 --- a/projects/rocshmem/src/roc_shmem.cpp +++ b/projects/rocshmem/src/roc_shmem.cpp @@ -734,13 +734,13 @@ __host__ void roc_shmem_to_all([[maybe_unused]] roc_shmem_ctx_t ctx, T *dest, } template -__host__ void roc_shmem_to_all([[maybe_unused]] roc_shmem_ctx_t ctx, +__host__ int roc_shmem_reduce([[maybe_unused]] roc_shmem_ctx_t ctx, roc_shmem_team_t team, T *dest, const T *source, int nreduce) { - DPRINTF("Host function: Team-based roc_shmem_to_all\n"); + DPRINTF("Host function: Team-based roc_shmem_reduce\n"); - get_internal_ctx(ROC_SHMEM_HOST_CTX_DEFAULT) - ->to_all(team, dest, source, nreduce); + return get_internal_ctx(ROC_SHMEM_HOST_CTX_DEFAULT) + ->reduce(team, dest, source, nreduce); } template @@ -821,7 +821,7 @@ __host__ int roc_shmem_test(T *ivars, int cmp, T val) { template __host__ void roc_shmem_to_all( \ roc_shmem_ctx_t ctx, T * dest, const T *source, int nreduce, \ int PE_start, int logPE_stride, int PE_size, T *pWrk, long *pSync); \ - template __host__ void roc_shmem_to_all( \ + template __host__ int roc_shmem_reduce( \ roc_shmem_ctx_t ctx, roc_shmem_team_t team, T * dest, const T *source, \ int nreduce); @@ -977,10 +977,10 @@ __host__ int roc_shmem_test(T *ivars, int cmp, T val) { roc_shmem_to_all(ctx, dest, source, nreduce, PE_start, \ logPE_stride, PE_size, pWrk, pSync); \ } \ - __host__ void roc_shmem_ctx_##TNAME##_##Op_API##_to_all( \ + __host__ int roc_shmem_ctx_##TNAME##_##Op_API##_reduce( \ roc_shmem_ctx_t ctx, roc_shmem_team_t team, T *dest, const T *source, \ int nreduce) { \ - roc_shmem_to_all(ctx, team, dest, source, nreduce); \ + return roc_shmem_reduce(ctx, team, dest, source, nreduce); \ } #define ARITH_REDUCTION_DEF_GEN(T, TNAME) \ diff --git a/projects/rocshmem/src/roc_shmem_gpu.cpp b/projects/rocshmem/src/roc_shmem_gpu.cpp index 1f1867724b..186823ac61 100644 --- a/projects/rocshmem/src/roc_shmem_gpu.cpp +++ b/projects/rocshmem/src/roc_shmem_gpu.cpp @@ -430,11 +430,11 @@ __device__ void *roc_shmem_ptr(const void *dest, int pe) { } template -__device__ void roc_shmem_wg_to_all(roc_shmem_ctx_t ctx, roc_shmem_team_t team, - T *dest, const T *source, int nreduce) { - GPU_DPRINTF("Function: roc_shmem_to_all\n"); +__device__ int roc_shmem_wg_reduce(roc_shmem_ctx_t ctx, roc_shmem_team_t team, + T *dest, const T *source, int nreduce) { + GPU_DPRINTF("Function: roc_shmem_reduce\n"); - get_internal_ctx(ctx)->to_all(team, dest, source, nreduce); + return get_internal_ctx(ctx)->reduce(team, dest, source, nreduce); } template @@ -864,7 +864,7 @@ __device__ int roc_shmem_team_translate_pe(roc_shmem_team_t src_team, * Template generator for reductions */ #define REDUCTION_GEN(T, Op) \ - template __device__ void roc_shmem_wg_to_all( \ + template __device__ int roc_shmem_wg_reduce( \ roc_shmem_ctx_t ctx, roc_shmem_team_t team, T * dest, const T *source, \ int nreduce); @@ -1072,10 +1072,10 @@ __device__ int roc_shmem_team_translate_pe(roc_shmem_team_t src_team, **/ #define REDUCTION_DEF_GEN(T, TNAME, Op_API, Op) \ - __device__ void roc_shmem_ctx_##TNAME##_##Op_API##_wg_to_all( \ + __device__ int roc_shmem_ctx_##TNAME##_##Op_API##_wg_reduce( \ roc_shmem_ctx_t ctx, roc_shmem_team_t team, T *dest, const T *source, \ int nreduce) { \ - roc_shmem_wg_to_all(ctx, team, dest, source, nreduce); \ + return roc_shmem_wg_reduce(ctx, team, dest, source, nreduce); \ } #define ARITH_REDUCTION_DEF_GEN(T, TNAME) \ diff --git a/projects/rocshmem/tests/functional_tests/team_reduction_tester.cpp b/projects/rocshmem/tests/functional_tests/team_reduction_tester.cpp index 159ffd8e5c..6b9d56d225 100644 --- a/projects/rocshmem/tests/functional_tests/team_reduction_tester.cpp +++ b/projects/rocshmem/tests/functional_tests/team_reduction_tester.cpp @@ -24,19 +24,19 @@ using namespace rocshmem; /* Declare the template with a generic implementation */ template -__device__ void wg_team_to_all(roc_shmem_ctx_t ctx, roc_shmem_team_t, T *dest, +__device__ int wg_team_reduce(roc_shmem_ctx_t ctx, roc_shmem_team_t, T *dest, const T *source, int nreduce) { - return; + return ROC_SHMEM_SUCCESS; } /* Define templates to call ROC_SHMEM */ #define TEAM_REDUCTION_DEF_GEN(T, TNAME, Op_API, Op) \ template <> \ - __device__ void wg_team_to_all(roc_shmem_ctx_t ctx, \ - roc_shmem_team_t team, T * dest, \ - const T *source, int nreduce) { \ - roc_shmem_ctx_##TNAME##_##Op_API##_wg_to_all(ctx, team, dest, source, \ - nreduce); \ + __device__ int wg_team_reduce(roc_shmem_ctx_t ctx, \ + roc_shmem_team_t team, T * dest, \ + const T *source, int nreduce) { \ + return roc_shmem_ctx_##TNAME##_##Op_API##_wg_reduce(ctx, team, dest, \ + source, nreduce); \ } #define TEAM_ARITH_REDUCTION_DEF_GEN(T, TNAME) \ @@ -91,7 +91,7 @@ __global__ void TeamReductionTest(int loop, int skip, uint64_t *timer, if (i == skip && hipThreadIdx_x == 0) { start = roc_shmem_timer(); } - wg_team_to_all(ctx, team, r_buf, s_buf, size); + wg_team_reduce(ctx, team, r_buf, s_buf, size); roc_shmem_ctx_wg_barrier_all(ctx); } From f887e9b886a343b1a8dce6402bb3a9d28d943393 Mon Sep 17 00:00:00 2001 From: Yiltan Hassan Temucin Date: Tue, 29 Oct 2024 17:58:45 +0000 Subject: [PATCH 2/3] removed external access to non-team based reduce [ROCm/rocshmem commit: 9aa9aea7e60805ad0550c31d1aaf3c50be9353eb] --- projects/rocshmem/include/roc_shmem/roc_shmem.hpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/projects/rocshmem/include/roc_shmem/roc_shmem.hpp b/projects/rocshmem/include/roc_shmem/roc_shmem.hpp index cdcf74f467..16bd0a6892 100644 --- a/projects/rocshmem/include/roc_shmem/roc_shmem.hpp +++ b/projects/rocshmem/include/roc_shmem/roc_shmem.hpp @@ -846,10 +846,6 @@ __device__ ATTR_NO_INLINE void roc_shmem_threadfence_system(); __device__ ATTR_NO_INLINE int roc_shmem_ctx_##TNAME##_##Op_API##_wg_reduce( \ roc_shmem_ctx_t ctx, roc_shmem_team_t team, T *dest, const T *source, \ int nreduce); \ - __host__ void roc_shmem_ctx_##TNAME##_##Op_API##_to_all( \ - roc_shmem_ctx_t ctx, T *dest, const T *source, int nreduce, \ - int PE_start, int logPE_stride, int PE_size, T *pWrk, \ - long *pSync); /* NOLINT */ \ __host__ int roc_shmem_ctx_##TNAME##_##Op_API##_reduce( \ roc_shmem_ctx_t ctx, roc_shmem_team_t team, T *dest, const T *source, \ int nreduce); From 8df27a93be314596123a007bca9d439f0fb81ce0 Mon Sep 17 00:00:00 2001 From: Yiltan Temucin Date: Wed, 6 Nov 2024 09:49:06 -0600 Subject: [PATCH 3/3] updated examples to use new APIs [ROCm/rocshmem commit: 799d9d5ed7af0cddb9f91611040d8c5181563dcb] --- projects/rocshmem/examples/rocshmem_allreduce_test.cc | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/projects/rocshmem/examples/rocshmem_allreduce_test.cc b/projects/rocshmem/examples/rocshmem_allreduce_test.cc index 46ec91e215..e925a51b05 100644 --- a/projects/rocshmem/examples/rocshmem_allreduce_test.cc +++ b/projects/rocshmem/examples/rocshmem_allreduce_test.cc @@ -1,8 +1,8 @@ /* -** hipcc -c -fgpu-rdc -x hip rocshmem_allreduce_test.cc -I/opt/rocm/include +** hipcc -c -fgpu-rdc -x hip rocshmem_allreduce_test.cc -I/opt/rocm/include ** -I$ROCHSMEM_INSTALL_DIR/include -I$OPENMPI_UCX_INSTALL_DIR/include/ -** hipcc -fgpu-rdc --hip-link rocshmem_allreduce_test.o -o rocshmem_allreduce_test -** $ROCHSMEM_INSTALL_DIR/lib/librocshmem.a $OPENMPI_UCX_INSTALL_DIR/lib/libmpi.so +** hipcc -fgpu-rdc --hip-link rocshmem_allreduce_test.o -o rocshmem_allreduce_test +** $ROCHSMEM_INSTALL_DIR/lib/librocshmem.a $OPENMPI_UCX_INSTALL_DIR/lib/libmpi.so ** -L/opt/rocm/lib -lamdhip64 -lhsa-runtime64 ** ** ROC_SHMEM_MAX_NUM_CONTEXTS=2 mpirun -np 8 ./rocshmem_allreduce_test @@ -34,7 +34,7 @@ __global__ void allreduce_test(int *source, int *dest, size_t nelem, roc_shmem_wg_ctx_create(ctx_type, &ctx); int num_pes = roc_shmem_ctx_n_pes(ctx); - roc_shmem_ctx_int_sum_wg_to_all(ctx, team, dest, source, nelem); + roc_shmem_ctx_int_sum_wg_reduce(ctx, team, dest, source, nelem); roc_shmem_ctx_quiet(ctx); __syncthreads(); @@ -114,7 +114,7 @@ int main (int argc, char **argv) bool pass = check_recvbuf(dest, nelem, my_pe, npes); printf("Test %s \t nelem %d %s\n", argv[0], nelem, pass ? "[PASS]" : "[FAIL]"); - + roc_shmem_free(source); roc_shmem_free(dest);