diff --git a/projects/rocshmem/examples/rocshmem_allreduce_test.cc b/projects/rocshmem/examples/rocshmem_allreduce_test.cc index c6f4d05dc9..1e6f73d69e 100644 --- a/projects/rocshmem/examples/rocshmem_allreduce_test.cc +++ b/projects/rocshmem/examples/rocshmem_allreduce_test.cc @@ -37,7 +37,7 @@ __global__ void allreduce_test(int *source, int *dest, size_t nelem, rocshmem_wg_ctx_create(ctx_type, &ctx); int num_pes = rocshmem_ctx_n_pes(ctx); - rocshmem_ctx_int_sum_wg_reduce(ctx, team, dest, source, nelem); + rocshmem_ctx_int_sum_reduce_wg(ctx, team, dest, source, nelem); rocshmem_ctx_quiet(ctx); __syncthreads(); diff --git a/projects/rocshmem/examples/rocshmem_alltoall_test.cc b/projects/rocshmem/examples/rocshmem_alltoall_test.cc index 775580fba2..6b9d2325bc 100644 --- a/projects/rocshmem/examples/rocshmem_alltoall_test.cc +++ b/projects/rocshmem/examples/rocshmem_alltoall_test.cc @@ -37,7 +37,7 @@ __global__ void alltoall_test(int *source, int *dest, size_t nelem, rocshmem_wg_ctx_create(ctx_type, &ctx); int num_pes = rocshmem_ctx_n_pes(ctx); - rocshmem_ctx_int_wg_alltoall(ctx, team, dest, source, nelem); + rocshmem_ctx_int_alltoall_wg(ctx, team, dest, source, nelem); rocshmem_ctx_quiet(ctx); __syncthreads(); diff --git a/projects/rocshmem/examples/rocshmem_broadcast_test.cc b/projects/rocshmem/examples/rocshmem_broadcast_test.cc index 4a630c75db..9bb204720e 100644 --- a/projects/rocshmem/examples/rocshmem_broadcast_test.cc +++ b/projects/rocshmem/examples/rocshmem_broadcast_test.cc @@ -37,7 +37,7 @@ __global__ void broadcast_test(int *source, int *dest, size_t nelem, rocshmem_wg_ctx_create(ctx_type, &ctx); int num_pes = rocshmem_ctx_n_pes(ctx); - rocshmem_ctx_int_wg_broadcast(ctx, team, dest, source, nelem, root); + rocshmem_ctx_int_broadcast_wg(ctx, team, dest, source, nelem, root); rocshmem_ctx_quiet(ctx); __syncthreads(); diff --git a/projects/rocshmem/include/rocshmem/rocshmem.hpp b/projects/rocshmem/include/rocshmem/rocshmem.hpp index a9a2251017..1fa1d21e65 100644 --- a/projects/rocshmem/include/rocshmem/rocshmem.hpp +++ b/projects/rocshmem/include/rocshmem/rocshmem.hpp @@ -490,210 +490,6 @@ __device__ int rocshmem_team_translate_pe(rocshmem_team_t src_team, int src_pe, rocshmem_team_t dest_team); -/** - * @brief perform a collective barrier between all PEs in the system. - * The caller is blocked until the barrier is resolved. - * - * This function must be invoked by a single thread within the PE. - * - * @param[in] handle GPU side handle. - * - * @return void - */ -__device__ ATTR_NO_INLINE void rocshmem_ctx_barrier_all( - rocshmem_ctx_t ctx); - -/** - * @brief perform a collective barrier between all PEs in the system. - * The caller is blocked until the barrier is resolved. - * - * This function must be called as a wave-front collective. - * - * @param[in] handle GPU side handle. - * - * @return void - */ -__device__ ATTR_NO_INLINE void rocshmem_ctx_wave_barrier_all( - rocshmem_ctx_t ctx); - -/** - * @brief perform a collective barrier between all PEs in the system. - * The caller is blocked until the barrier is resolved. - * - * This function must be called as a work-group collective. - * - * @param[in] handle GPU side handle. - * - * @return void - */ -__device__ ATTR_NO_INLINE void rocshmem_ctx_wg_barrier_all( - rocshmem_ctx_t ctx); - -/** - * @brief perform a collective barrier between all PEs in the team. - * The caller is blocked until the barrier is resolved. - * - * This function must be invoked by a single thread within the PE. - * - * @param[in] handle GPU side handle. - * - * @param[in] team The team on which to perform barrier synchronization - * - * @return void - */ -__device__ void rocshmem_ctx_barrier(rocshmem_ctx_t ctx, rocshmem_team_t team); - -/** - * @brief perform a collective barrier between all PEs in the team. - * The caller is blocked until the barrier is resolved. - * - * This function must be called as a wave-front collective. - * - * @param[in] handle GPU side handle. - * - * @param[in] team The team on which to perform barrier synchronization - * - * @return void - */ -__device__ void rocshmem_ctx_wave_barrier(rocshmem_ctx_t ctx, rocshmem_team_t team); - -/** - * @brief perform a collective barrier between all PEs in the team. - * The caller is blocked until the barrier is resolved. - * - * This function must be called as a work-group collective. - * - * @param[in] handle GPU side handle. - * - * @param[in] team The team on which to perform barrier synchronization - * - * @return void - */ -__device__ void rocshmem_ctx_wg_barrier(rocshmem_ctx_t ctx, rocshmem_team_t team); - -/** - * @brief registers the arrival of a PE at a barrier. - * The caller is blocked until the synchronization is resolved. - * - * In contrast with the shmem_barrier_all routine, shmem_sync_all only ensures - * completion and visibility of previously issued memory stores and does not - * ensure completion of remote memory updates issued via OpenSHMEM routines. - * - * This function must be invoked by a single thread within the PE. - * - * @param[in] handle GPU side handle. - * - * @return void - */ -__device__ ATTR_NO_INLINE void rocshmem_ctx_sync_all(rocshmem_ctx_t ctx); - -/** - * @brief registers the arrival of a PE at a barrier. - * The caller is blocked until the synchronization is resolved. - * - * In contrast with the shmem_barrier_all routine, shmem_sync_all only ensures - * completion and visibility of previously issued memory stores and does not - * ensure completion of remote memory updates issued via OpenSHMEM routines. - * - * This function must be called as a wave-front collective. - * - * @param[in] handle GPU side handle. - * - * @return void - */ -__device__ ATTR_NO_INLINE void rocshmem_ctx_wave_sync_all(rocshmem_ctx_t ctx); - -/** - * @brief registers the arrival of a PE at a barrier. - * The caller is blocked until the synchronization is resolved. - * - * In contrast with the shmem_barrier_all routine, shmem_sync_all only ensures - * completion and visibility of previously issued memory stores and does not - * ensure completion of remote memory updates issued via OpenSHMEM routines. - * - * This function must be called as a work-group collective. - * - * @param[in] handle GPU side handle. - * - * @return void - */ -__device__ ATTR_NO_INLINE void rocshmem_ctx_wg_sync_all(rocshmem_ctx_t ctx); - -/** - * @brief registers the arrival of a PE at a barrier. - * The caller is blocked until the synchronization is resolved. - * - * In contrast with the shmem_barrier_all routine, shmem_team_sync only ensures - * completion and visibility of previously issued memory stores and does not - * ensure completion of remote memory updates issued via OpenSHMEM routines. - * - * This function must be invoked by a single thread within the PE. - * - * @param[in] handle GPU side handle. - * @param[in] team Handle of the team being synchronized - * - * @return void - */ -__device__ ATTR_NO_INLINE void rocshmem_ctx_team_sync( - rocshmem_ctx_t ctx, rocshmem_team_t team); - -/** - * @brief registers the arrival of a PE at a barrier. - * The caller is blocked until the synchronization is resolved. - * - * In contrast with the shmem_barrier_all routine, shmem_team_sync only ensures - * completion and visibility of previously issued memory stores and does not - * ensure completion of remote memory updates issued via OpenSHMEM routines. - * - * This function must be called as a wave-front collective. - * - * @param[in] handle GPU side handle. - * @param[in] team Handle of the team being synchronized - * - * @return void - */ -__device__ ATTR_NO_INLINE void rocshmem_ctx_wave_team_sync( - rocshmem_ctx_t ctx, rocshmem_team_t team); - -/** - * @brief registers the arrival of a PE at a barrier. - * The caller is blocked until the synchronization is resolved. - * - * In contrast with the shmem_barrier_all routine, shmem_team_sync only ensures - * completion and visibility of previously issued memory stores and does not - * ensure completion of remote memory updates issued via OpenSHMEM routines. - * - * This function must be called as a work-group collective. - * - * @param[in] handle GPU side handle. - * @param[in] team Handle of the team being synchronized - * - * @return void - */ -__device__ ATTR_NO_INLINE void rocshmem_ctx_wg_team_sync( - rocshmem_ctx_t ctx, rocshmem_team_t team); - -/** - * @brief Query a local pointer to a symmetric data object on the - * specified \pe . Returns an address that may be used to directly reference - * dest on the specified \pe. This address can be accesses with LD/ST ops. - * - * Can be called per thread with no performance penalty. - */ -__device__ ATTR_NO_INLINE void *rocshmem_ptr(const void *dest, int pe); - -/** - * @brief Make all uncacheable GPU data visible to other agents in the sytem. - * - * This only works for data that was explicitly allocated uncacheable on the - * GPU! - * - * Can be called per thread with no performance penalty. - * - * @param[in] GPU-side handle. - * - * @return void - */ __device__ ATTR_NO_INLINE void rocshmem_ctx_threadfence_system( rocshmem_ctx_t ctx); diff --git a/projects/rocshmem/include/rocshmem/rocshmem_COLL.hpp b/projects/rocshmem/include/rocshmem/rocshmem_COLL.hpp index c15498705d..191cec25a1 100644 --- a/projects/rocshmem/include/rocshmem/rocshmem_COLL.hpp +++ b/projects/rocshmem/include/rocshmem/rocshmem_COLL.hpp @@ -41,55 +41,55 @@ namespace rocshmem { * * @return void */ -__device__ ATTR_NO_INLINE void rocshmem_ctx_float_wg_alltoall( +__device__ ATTR_NO_INLINE void rocshmem_ctx_float_alltoall_wg( rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source, int nelems); -__device__ ATTR_NO_INLINE void rocshmem_ctx_double_wg_alltoall( +__device__ ATTR_NO_INLINE void rocshmem_ctx_double_alltoall_wg( rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source, int nelems); -__device__ ATTR_NO_INLINE void rocshmem_ctx_char_wg_alltoall( +__device__ ATTR_NO_INLINE void rocshmem_ctx_char_alltoall_wg( rocshmem_ctx_t ctx, rocshmem_team_t team, char *dest, const char *source, int nelems); -__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_wg_alltoall( +__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_alltoall_wg( rocshmem_ctx_t ctx, rocshmem_team_t team, signed char *dest, const signed char *source, int nelems); -__device__ ATTR_NO_INLINE void rocshmem_ctx_short_wg_alltoall( +__device__ ATTR_NO_INLINE void rocshmem_ctx_short_alltoall_wg( rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source, int nelems); -__device__ ATTR_NO_INLINE void rocshmem_ctx_int_wg_alltoall( +__device__ ATTR_NO_INLINE void rocshmem_ctx_int_alltoall_wg( rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source, int nelems); -__device__ ATTR_NO_INLINE void rocshmem_ctx_long_wg_alltoall( +__device__ ATTR_NO_INLINE void rocshmem_ctx_long_alltoall_wg( rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source, int nelems); -__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_wg_alltoall( +__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_alltoall_wg( rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source, int nelems); -__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_wg_alltoall( +__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_alltoall_wg( rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned char *dest, const unsigned char *source, int nelems); -__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_wg_alltoall( +__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_alltoall_wg( rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned short *dest, const unsigned short *source, int nelems); -__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_wg_alltoall( +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_alltoall_wg( rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned int *dest, const unsigned int *source, int nelems); -__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_wg_alltoall( +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_alltoall_wg( rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long *dest, const unsigned long *source, int nelems); -__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_wg_alltoall( +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_alltoall_wg( rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long long *dest, const unsigned long long *source, int nelems); @@ -116,7 +116,7 @@ __device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_wg_alltoall( * * @return void */ -__device__ ATTR_NO_INLINE void rocshmem_ctx_float_wg_broadcast( +__device__ ATTR_NO_INLINE void rocshmem_ctx_float_broadcast_wg( rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source, int nelems, int pe_root); __host__ void rocshmem_ctx_float_broadcast( @@ -127,7 +127,7 @@ __host__ void rocshmem_ctx_float_broadcast( rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source, int nelems, int pe_root); -__device__ ATTR_NO_INLINE void rocshmem_ctx_double_wg_broadcast( +__device__ ATTR_NO_INLINE void rocshmem_ctx_double_broadcast_wg( rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source, int nelems, int pe_root); __host__ void rocshmem_ctx_double_broadcast( @@ -138,7 +138,7 @@ __host__ void rocshmem_ctx_double_broadcast( rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source, int nelems, int pe_root); -__device__ ATTR_NO_INLINE void rocshmem_ctx_char_wg_broadcast( +__device__ ATTR_NO_INLINE void rocshmem_ctx_char_broadcast_wg( rocshmem_ctx_t ctx, rocshmem_team_t team, char *dest, const char *source, int nelems, int pe_root); __host__ void rocshmem_ctx_char_broadcast( @@ -149,7 +149,7 @@ __host__ void rocshmem_ctx_char_broadcast( rocshmem_ctx_t ctx, rocshmem_team_t team, char *dest, const char *source, int nelems, int pe_root); -__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_wg_broadcast( +__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_broadcast_wg( rocshmem_ctx_t ctx, rocshmem_team_t team, signed char *dest, const signed char *source, int nelems, int pe_root); __host__ void rocshmem_ctx_schar_broadcast( @@ -160,7 +160,7 @@ __host__ void rocshmem_ctx_schar_broadcast( rocshmem_ctx_t ctx, rocshmem_team_t team, signed char *dest, const signed char *source, int nelems, int pe_root); -__device__ ATTR_NO_INLINE void rocshmem_ctx_short_wg_broadcast( +__device__ ATTR_NO_INLINE void rocshmem_ctx_short_broadcast_wg( rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source, int nelems, int pe_root); __host__ void rocshmem_ctx_short_broadcast( @@ -171,7 +171,7 @@ __host__ void rocshmem_ctx_short_broadcast( rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source, int nelems, int pe_root); -__device__ ATTR_NO_INLINE void rocshmem_ctx_int_wg_broadcast( +__device__ ATTR_NO_INLINE void rocshmem_ctx_int_broadcast_wg( rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source, int nelems, int pe_root); __host__ void rocshmem_ctx_int_broadcast( @@ -182,7 +182,7 @@ __host__ void rocshmem_ctx_int_broadcast( rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source, int nelems, int pe_root); -__device__ ATTR_NO_INLINE void rocshmem_ctx_long_wg_broadcast( +__device__ ATTR_NO_INLINE void rocshmem_ctx_long_broadcast_wg( rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source, int nelems, int pe_root); __host__ void rocshmem_ctx_long_broadcast( @@ -193,7 +193,7 @@ __host__ void rocshmem_ctx_long_broadcast( rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source, int nelems, int pe_root); -__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_wg_broadcast( +__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_broadcast_wg( rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source, int nelems, int pe_root); __host__ void rocshmem_ctx_longlong_broadcast( @@ -204,7 +204,7 @@ __host__ void rocshmem_ctx_longlong_broadcast( rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source, int nelems, int pe_root); -__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_wg_broadcast( +__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_broadcast_wg( rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned char *dest, const unsigned char *source, int nelems, int pe_root); __host__ void rocshmem_ctx_uchar_broadcast( @@ -215,7 +215,7 @@ __host__ void rocshmem_ctx_uchar_broadcast( rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned char *dest, const unsigned char *source, int nelems, int pe_root); -__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_wg_broadcast( +__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_broadcast_wg( rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned short *dest, const unsigned short *source, int nelems, int pe_root); __host__ void rocshmem_ctx_ushort_broadcast( @@ -226,7 +226,7 @@ __host__ void rocshmem_ctx_ushort_broadcast( rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned short *dest, const unsigned short *source, int nelems, int pe_root); -__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_wg_broadcast( +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_broadcast_wg( rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned int *dest, const unsigned int *source, int nelems, int pe_root); __host__ void rocshmem_ctx_uint_broadcast( @@ -237,7 +237,7 @@ __host__ void rocshmem_ctx_uint_broadcast( rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned int *dest, const unsigned int *source, int nelems, int pe_root); -__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_wg_broadcast( +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_broadcast_wg( rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long *dest, const unsigned long *source, int nelems, int pe_root); __host__ void rocshmem_ctx_ulong_broadcast( @@ -248,7 +248,7 @@ __host__ void rocshmem_ctx_ulong_broadcast( rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long *dest, const unsigned long *source, int nelems, int pe_root); -__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_wg_broadcast( +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_broadcast_wg( rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long long *dest, const unsigned long long *source, int nelems, int pe_root); __host__ void rocshmem_ctx_ulonglong_broadcast( @@ -276,55 +276,55 @@ __host__ void rocshmem_ctx_ulonglong_broadcast( * * @return void */ -__device__ ATTR_NO_INLINE void rocshmem_ctx_float_wg_fcollect( +__device__ ATTR_NO_INLINE void rocshmem_ctx_float_fcollect_wg( rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source, int nelems); -__device__ ATTR_NO_INLINE void rocshmem_ctx_double_wg_fcollect( +__device__ ATTR_NO_INLINE void rocshmem_ctx_double_fcollect_wg( rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source, int nelems); -__device__ ATTR_NO_INLINE void rocshmem_ctx_char_wg_fcollect( +__device__ ATTR_NO_INLINE void rocshmem_ctx_char_fcollect_wg( rocshmem_ctx_t ctx, rocshmem_team_t team, char *dest, const char *source, int nelems); -__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_wg_fcollect( +__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_fcollect_wg( rocshmem_ctx_t ctx, rocshmem_team_t team, signed char *dest, const signed char *source, int nelems); -__device__ ATTR_NO_INLINE void rocshmem_ctx_short_wg_fcollect( +__device__ ATTR_NO_INLINE void rocshmem_ctx_short_fcollect_wg( rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source, int nelems); -__device__ ATTR_NO_INLINE void rocshmem_ctx_int_wg_fcollect( +__device__ ATTR_NO_INLINE void rocshmem_ctx_int_fcollect_wg( rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source, int nelems); -__device__ ATTR_NO_INLINE void rocshmem_ctx_long_wg_fcollect( +__device__ ATTR_NO_INLINE void rocshmem_ctx_long_fcollect_wg( rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source, int nelems); -__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_wg_fcollect( +__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_fcollect_wg( rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source, int nelems); -__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_wg_fcollect( +__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_fcollect_wg( rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned char *dest, const unsigned char *source, int nelems); -__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_wg_fcollect( +__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_fcollect_wg( rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned short *dest, const unsigned short *source, int nelems); -__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_wg_fcollect( +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_fcollect_wg( rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned int *dest, const unsigned int *source, int nelems); -__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_wg_fcollect( +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_fcollect_wg( rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long *dest, const unsigned long *source, int nelems); -__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_wg_fcollect( +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_fcollect_wg( rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long long *dest, const unsigned long long *source, int nelems); @@ -345,258 +345,462 @@ __device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_wg_fcollect( * * @return int (Zero on successful local completion. Nonzero otherwise.) */ -__device__ ATTR_NO_INLINE int rocshmem_ctx_short_sum_wg_reduce( +__device__ ATTR_NO_INLINE int rocshmem_ctx_short_sum_reduce_wg( rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source, int nreduce); __host__ int rocshmem_ctx_short_sum_reduce( rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source, int nreduce); -__device__ ATTR_NO_INLINE int rocshmem_ctx_short_min_wg_reduce( +__device__ ATTR_NO_INLINE int rocshmem_ctx_short_min_reduce_wg( rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source, int nreduce); __host__ int rocshmem_ctx_short_min_reduce( rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source, int nreduce); -__device__ ATTR_NO_INLINE int rocshmem_ctx_short_max_wg_reduce( +__device__ ATTR_NO_INLINE int rocshmem_ctx_short_max_reduce_wg( rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source, int nreduce); __host__ int rocshmem_ctx_short_max_reduce( rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source, int nreduce); -__device__ ATTR_NO_INLINE int rocshmem_ctx_short_prod_wg_reduce( +__device__ ATTR_NO_INLINE int rocshmem_ctx_short_prod_reduce_wg( rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source, int nreduce); __host__ int rocshmem_ctx_short_prod_reduce( rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source, int nreduce); -__device__ ATTR_NO_INLINE int rocshmem_ctx_short_or_wg_reduce( +__device__ ATTR_NO_INLINE int rocshmem_ctx_short_or_reduce_wg( rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source, int nreduce); __host__ int rocshmem_ctx_short_or_reduce( rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source, int nreduce); -__device__ ATTR_NO_INLINE int rocshmem_ctx_short_and_wg_reduce( +__device__ ATTR_NO_INLINE int rocshmem_ctx_short_and_reduce_wg( rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source, int nreduce); __host__ int rocshmem_ctx_short_and_reduce( rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source, int nreduce); -__device__ ATTR_NO_INLINE int rocshmem_ctx_short_xor_wg_reduce( +__device__ ATTR_NO_INLINE int rocshmem_ctx_short_xor_reduce_wg( rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source, int nreduce); __host__ int rocshmem_ctx_short_xor_reduce( rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source, int nreduce); -__device__ ATTR_NO_INLINE int rocshmem_ctx_int_sum_wg_reduce( +__device__ ATTR_NO_INLINE int rocshmem_ctx_int_sum_reduce_wg( rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source, int nreduce); __host__ int rocshmem_ctx_int_sum_reduce( rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source, int nreduce); -__device__ ATTR_NO_INLINE int rocshmem_ctx_int_min_wg_reduce( +__device__ ATTR_NO_INLINE int rocshmem_ctx_int_min_reduce_wg( rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source, int nreduce); __host__ int rocshmem_ctx_int_min_reduce( rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source, int nreduce); -__device__ ATTR_NO_INLINE int rocshmem_ctx_int_max_wg_reduce( +__device__ ATTR_NO_INLINE int rocshmem_ctx_int_max_reduce_wg( rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source, int nreduce); __host__ int rocshmem_ctx_int_max_reduce( rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source, int nreduce); -__device__ ATTR_NO_INLINE int rocshmem_ctx_int_prod_wg_reduce( +__device__ ATTR_NO_INLINE int rocshmem_ctx_int_prod_reduce_wg( rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source, int nreduce); __host__ int rocshmem_ctx_int_prod_reduce( rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source, int nreduce); -__device__ ATTR_NO_INLINE int rocshmem_ctx_int_or_wg_reduce( +__device__ ATTR_NO_INLINE int rocshmem_ctx_int_or_reduce_wg( rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source, int nreduce); __host__ int rocshmem_ctx_int_or_reduce( rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source, int nreduce); -__device__ ATTR_NO_INLINE int rocshmem_ctx_int_and_wg_reduce( +__device__ ATTR_NO_INLINE int rocshmem_ctx_int_and_reduce_wg( rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source, int nreduce); __host__ int rocshmem_ctx_int_and_reduce( rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source, int nreduce); -__device__ ATTR_NO_INLINE int rocshmem_ctx_int_xor_wg_reduce( +__device__ ATTR_NO_INLINE int rocshmem_ctx_int_xor_reduce_wg( rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source, int nreduce); __host__ int rocshmem_ctx_int_xor_reduce( rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source, int nreduce); -__device__ ATTR_NO_INLINE int rocshmem_ctx_long_sum_wg_reduce( +__device__ ATTR_NO_INLINE int rocshmem_ctx_long_sum_reduce_wg( rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source, int nreduce); __host__ int rocshmem_ctx_long_sum_reduce( rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source, int nreduce); -__device__ ATTR_NO_INLINE int rocshmem_ctx_long_min_wg_reduce( +__device__ ATTR_NO_INLINE int rocshmem_ctx_long_min_reduce_wg( rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source, int nreduce); __host__ int rocshmem_ctx_long_min_reduce( rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source, int nreduce); -__device__ ATTR_NO_INLINE int rocshmem_ctx_long_max_wg_reduce( +__device__ ATTR_NO_INLINE int rocshmem_ctx_long_max_reduce_wg( rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source, int nreduce); __host__ int rocshmem_ctx_long_max_reduce( rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source, int nreduce); -__device__ ATTR_NO_INLINE int rocshmem_ctx_long_prod_wg_reduce( +__device__ ATTR_NO_INLINE int rocshmem_ctx_long_prod_reduce_wg( rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source, int nreduce); __host__ int rocshmem_ctx_long_prod_reduce( rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source, int nreduce); -__device__ ATTR_NO_INLINE int rocshmem_ctx_long_or_wg_reduce( +__device__ ATTR_NO_INLINE int rocshmem_ctx_long_or_reduce_wg( rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source, int nreduce); __host__ int rocshmem_ctx_long_or_reduce( rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source, int nreduce); -__device__ ATTR_NO_INLINE int rocshmem_ctx_long_and_wg_reduce( +__device__ ATTR_NO_INLINE int rocshmem_ctx_long_and_reduce_wg( rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source, int nreduce); __host__ int rocshmem_ctx_long_and_reduce( rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source, int nreduce); -__device__ ATTR_NO_INLINE int rocshmem_ctx_long_xor_wg_reduce( +__device__ ATTR_NO_INLINE int rocshmem_ctx_long_xor_reduce_wg( rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source, int nreduce); __host__ int rocshmem_ctx_long_xor_reduce( rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source, int nreduce); -__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_sum_wg_reduce( +__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_sum_reduce_wg( rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source, int nreduce); __host__ int rocshmem_ctx_longlong_sum_reduce( rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source, int nreduce); -__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_min_wg_reduce( +__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_min_reduce_wg( rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source, int nreduce); __host__ int rocshmem_ctx_longlong_min_reduce( rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source, int nreduce); -__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_max_wg_reduce( +__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_max_reduce_wg( rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source, int nreduce); __host__ int rocshmem_ctx_longlong_max_reduce( rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source, int nreduce); -__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_prod_wg_reduce( +__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_prod_reduce_wg( rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source, int nreduce); __host__ int rocshmem_ctx_longlong_prod_reduce( rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source, int nreduce); -__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_or_wg_reduce( +__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_or_reduce_wg( rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source, int nreduce); __host__ int rocshmem_ctx_longlong_or_reduce( rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source, int nreduce); -__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_and_wg_reduce( +__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_and_reduce_wg( rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source, int nreduce); __host__ int rocshmem_ctx_longlong_and_reduce( rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source, int nreduce); -__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_xor_wg_reduce( +__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_xor_reduce_wg( rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source, int nreduce); __host__ int rocshmem_ctx_longlong_xor_reduce( rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source, int nreduce); -__device__ ATTR_NO_INLINE int rocshmem_ctx_float_sum_wg_reduce( +__device__ ATTR_NO_INLINE int rocshmem_ctx_float_sum_reduce_wg( rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source, int nreduce); __host__ int rocshmem_ctx_float_sum_reduce( rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source, int nreduce); -__device__ ATTR_NO_INLINE int rocshmem_ctx_float_min_wg_reduce( +__device__ ATTR_NO_INLINE int rocshmem_ctx_float_min_reduce_wg( rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source, int nreduce); __host__ int rocshmem_ctx_float_min_reduce( rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source, int nreduce); -__device__ ATTR_NO_INLINE int rocshmem_ctx_float_max_wg_reduce( +__device__ ATTR_NO_INLINE int rocshmem_ctx_float_max_reduce_wg( rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source, int nreduce); __host__ int rocshmem_ctx_float_max_reduce( rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source, int nreduce); -__device__ ATTR_NO_INLINE int rocshmem_ctx_float_prod_wg_reduce( +__device__ ATTR_NO_INLINE int rocshmem_ctx_float_prod_reduce_wg( rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source, int nreduce); __host__ int rocshmem_ctx_float_prod_reduce( rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source, int nreduce); -__device__ ATTR_NO_INLINE int rocshmem_ctx_double_sum_wg_reduce( +__device__ ATTR_NO_INLINE int rocshmem_ctx_double_sum_reduce_wg( rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source, int nreduce); __host__ int rocshmem_ctx_double_sum_reduce( rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source, int nreduce); -__device__ ATTR_NO_INLINE int rocshmem_ctx_double_min_wg_reduce( +__device__ ATTR_NO_INLINE int rocshmem_ctx_double_min_reduce_wg( rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source, int nreduce); __host__ int rocshmem_ctx_double_min_reduce( rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source, int nreduce); -__device__ ATTR_NO_INLINE int rocshmem_ctx_double_max_wg_reduce( +__device__ ATTR_NO_INLINE int rocshmem_ctx_double_max_reduce_wg( rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source, int nreduce); __host__ int rocshmem_ctx_double_max_reduce( rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source, int nreduce); -__device__ ATTR_NO_INLINE int rocshmem_ctx_double_prod_wg_reduce( +__device__ ATTR_NO_INLINE int rocshmem_ctx_double_prod_reduce_wg( rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source, int nreduce); __host__ int rocshmem_ctx_double_prod_reduce( rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source, int nreduce); +/** + * @brief perform a collective barrier between all PEs in the system. + * The caller is blocked until the barrier is resolved. + * + * This function must be invoked by a single thread within the PE. + * + * @param[in] handle GPU side handle. + * + * @return void + */ +__device__ ATTR_NO_INLINE void rocshmem_ctx_barrier_all( + rocshmem_ctx_t ctx); + +/** + * @brief perform a collective barrier between all PEs in the system. + * The caller is blocked until the barrier is resolved. + * + * This function must be called as a wave-front collective. + * + * @param[in] handle GPU side handle. + * + * @return void + */ +__device__ ATTR_NO_INLINE void rocshmem_ctx_barrier_all_wave( + rocshmem_ctx_t ctx); + +/** + * @brief perform a collective barrier between all PEs in the system. + * The caller is blocked until the barrier is resolved. + * + * This function must be called as a work-group collective. + * + * @param[in] handle GPU side handle. + * + * @return void + */ +__device__ ATTR_NO_INLINE void rocshmem_ctx_barrier_all_wg( + rocshmem_ctx_t ctx); + +/** + * @brief perform a collective barrier between all PEs in the team. + * The caller is blocked until the barrier is resolved. + * + * This function must be invoked by a single thread within the PE. + * + * @param[in] handle GPU side handle. + * + * @param[in] team The team on which to perform barrier synchronization + * + * @return void + */ +__device__ void rocshmem_ctx_barrier(rocshmem_ctx_t ctx, rocshmem_team_t team); + +/** + * @brief perform a collective barrier between all PEs in the team. + * The caller is blocked until the barrier is resolved. + * + * This function must be called as a wave-front collective. + * + * @param[in] handle GPU side handle. + * + * @param[in] team The team on which to perform barrier synchronization + * + * @return void + */ +__device__ void rocshmem_ctx_barrier_wave(rocshmem_ctx_t ctx, rocshmem_team_t team); + +/** + * @brief perform a collective barrier between all PEs in the team. + * The caller is blocked until the barrier is resolved. + * + * This function must be called as a work-group collective. + * + * @param[in] handle GPU side handle. + * + * @param[in] team The team on which to perform barrier synchronization + * + * @return void + */ +__device__ void rocshmem_ctx_barrier_wg(rocshmem_ctx_t ctx, rocshmem_team_t team); + +/** + * @brief registers the arrival of a PE at a barrier. + * The caller is blocked until the synchronization is resolved. + * + * In contrast with the shmem_barrier_all routine, shmem_sync_all only ensures + * completion and visibility of previously issued memory stores and does not + * ensure completion of remote memory updates issued via OpenSHMEM routines. + * + * This function must be invoked by a single thread within the PE. + * + * @param[in] handle GPU side handle. + * + * @return void + */ +__device__ ATTR_NO_INLINE void rocshmem_ctx_sync_all(rocshmem_ctx_t ctx); + +/** + * @brief registers the arrival of a PE at a barrier. + * The caller is blocked until the synchronization is resolved. + * + * In contrast with the shmem_barrier_all routine, shmem_sync_all only ensures + * completion and visibility of previously issued memory stores and does not + * ensure completion of remote memory updates issued via OpenSHMEM routines. + * + * This function must be called as a wave-front collective. + * + * @param[in] handle GPU side handle. + * + * @return void + */ +__device__ ATTR_NO_INLINE void rocshmem_ctx_sync_all_wave(rocshmem_ctx_t ctx); + +/** + * @brief registers the arrival of a PE at a barrier. + * The caller is blocked until the synchronization is resolved. + * + * In contrast with the shmem_barrier_all routine, shmem_sync_all only ensures + * completion and visibility of previously issued memory stores and does not + * ensure completion of remote memory updates issued via OpenSHMEM routines. + * + * This function must be called as a work-group collective. + * + * @param[in] handle GPU side handle. + * + * @return void + */ +__device__ ATTR_NO_INLINE void rocshmem_ctx_sync_all_wg(rocshmem_ctx_t ctx); + +/** + * @brief registers the arrival of a PE at a barrier. + * The caller is blocked until the synchronization is resolved. + * + * In contrast with the shmem_barrier_all routine, shmem_team_sync only ensures + * completion and visibility of previously issued memory stores and does not + * ensure completion of remote memory updates issued via OpenSHMEM routines. + * + * This function must be invoked by a single thread within the PE. + * + * @param[in] handle GPU side handle. + * @param[in] team Handle of the team being synchronized + * + * @return void + */ +__device__ ATTR_NO_INLINE void rocshmem_ctx_sync( + rocshmem_ctx_t ctx, rocshmem_team_t team); + +/** + * @brief registers the arrival of a PE at a barrier. + * The caller is blocked until the synchronization is resolved. + * + * In contrast with the shmem_barrier_all routine, shmem_team_sync only ensures + * completion and visibility of previously issued memory stores and does not + * ensure completion of remote memory updates issued via OpenSHMEM routines. + * + * This function must be called as a wave-front collective. + * + * @param[in] handle GPU side handle. + * @param[in] team Handle of the team being synchronized + * + * @return void + */ +__device__ ATTR_NO_INLINE void rocshmem_ctx_sync_wave( + rocshmem_ctx_t ctx, rocshmem_team_t team); + +/** + * @brief registers the arrival of a PE at a barrier. + * The caller is blocked until the synchronization is resolved. + * + * In contrast with the shmem_barrier_all routine, shmem_team_sync only ensures + * completion and visibility of previously issued memory stores and does not + * ensure completion of remote memory updates issued via OpenSHMEM routines. + * + * This function must be called as a work-group collective. + * + * @param[in] handle GPU side handle. + * @param[in] team Handle of the team being synchronized + * + * @return void + */ +__device__ ATTR_NO_INLINE void rocshmem_ctx_sync_wg( + rocshmem_ctx_t ctx, rocshmem_team_t team); + +/** + * @brief Query a local pointer to a symmetric data object on the + * specified \pe . Returns an address that may be used to directly reference + * dest on the specified \pe. This address can be accesses with LD/ST ops. + * + * Can be called per thread with no performance penalty. + */ +__device__ ATTR_NO_INLINE void *rocshmem_ptr(const void *dest, int pe); + +/** + * @brief Make all uncacheable GPU data visible to other agents in the sytem. + * + * This only works for data that was explicitly allocated uncacheable on the + * GPU! + * + * Can be called per thread with no performance penalty. + * + * @param[in] GPU-side handle. + * + * @return void + */ } // namespace rocshmem diff --git a/projects/rocshmem/src/rocshmem_gpu.cpp b/projects/rocshmem/src/rocshmem_gpu.cpp index 5c9e5279e8..b73173abc1 100644 --- a/projects/rocshmem/src/rocshmem_gpu.cpp +++ b/projects/rocshmem/src/rocshmem_gpu.cpp @@ -453,7 +453,7 @@ __device__ void *rocshmem_ptr(const void *dest, int pe) { } template -__device__ int rocshmem_wg_reduce(rocshmem_ctx_t ctx, rocshmem_team_t team, +__device__ int rocshmem_reduce_wg(rocshmem_ctx_t ctx, rocshmem_team_t team, T *dest, const T *source, int nreduce) { GPU_DPRINTF("Function: rocshmem_reduce\n"); @@ -461,7 +461,7 @@ __device__ int rocshmem_wg_reduce(rocshmem_ctx_t ctx, rocshmem_team_t team, } template -__device__ void rocshmem_wg_broadcast(rocshmem_ctx_t ctx, +__device__ void rocshmem_broadcast_wg(rocshmem_ctx_t ctx, rocshmem_team_t team, T *dest, const T *source, int nelem, int pe_root) { @@ -471,7 +471,7 @@ __device__ void rocshmem_wg_broadcast(rocshmem_ctx_t ctx, } template -__device__ void rocshmem_wg_alltoall(rocshmem_ctx_t ctx, +__device__ void rocshmem_alltoall_wg(rocshmem_ctx_t ctx, rocshmem_team_t team, T *dest, const T *source, int nelem) { GPU_DPRINTF("Function: rocshmem_alltoall\n"); @@ -480,7 +480,7 @@ __device__ void rocshmem_wg_alltoall(rocshmem_ctx_t ctx, } template -__device__ void rocshmem_wg_fcollect(rocshmem_ctx_t ctx, +__device__ void rocshmem_fcollect_wg(rocshmem_ctx_t ctx, rocshmem_team_t team, T *dest, const T *source, int nelem) { GPU_DPRINTF("Function: rocshmem_fcollect\n"); @@ -576,14 +576,14 @@ __device__ void rocshmem_ctx_barrier_all(rocshmem_ctx_t ctx) { get_internal_ctx(ctx)->barrier_all(); } -__device__ void rocshmem_ctx_wave_barrier_all(rocshmem_ctx_t ctx) { - GPU_DPRINTF("Function: rocshmem_ctx_wave_barrier_all\n"); +__device__ void rocshmem_ctx_barrier_all_wave(rocshmem_ctx_t ctx) { + GPU_DPRINTF("Function: rocshmem_ctx_barrier_all_wave\n"); get_internal_ctx(ctx)->barrier_all_wave(); } -__device__ void rocshmem_ctx_wg_barrier_all(rocshmem_ctx_t ctx) { - GPU_DPRINTF("Function: rocshmem_ctx_wg_barrier_all\n"); +__device__ void rocshmem_ctx_barrier_all_wg(rocshmem_ctx_t ctx) { + GPU_DPRINTF("Function: rocshmem_ctx_barrier_all_wg\n"); get_internal_ctx(ctx)->barrier_all_wg(); } @@ -594,14 +594,14 @@ __device__ void rocshmem_ctx_barrier(rocshmem_ctx_t ctx, rocshmem_team_t team) { get_internal_ctx(ctx)->barrier(team); } -__device__ void rocshmem_ctx_wave_barrier(rocshmem_ctx_t ctx, rocshmem_team_t team) { - GPU_DPRINTF("Function: rocshmem_wave_barrier\n"); +__device__ void rocshmem_ctx_barrier_wave(rocshmem_ctx_t ctx, rocshmem_team_t team) { + GPU_DPRINTF("Function: rocshmem_barrier_wave\n"); get_internal_ctx(ctx)->barrier_wave(team); } -__device__ void rocshmem_ctx_wg_barrier(rocshmem_ctx_t ctx, rocshmem_team_t team) { - GPU_DPRINTF("Function: rocshmem_wg_barrier\n"); +__device__ void rocshmem_ctx_barrier_wg(rocshmem_ctx_t ctx, rocshmem_team_t team) { + GPU_DPRINTF("Function: rocshmem_barrier_wg\n"); get_internal_ctx(ctx)->barrier_wg(team); } @@ -612,35 +612,35 @@ __device__ void rocshmem_ctx_sync_all(rocshmem_ctx_t ctx) { get_internal_ctx(ctx)->sync_all(); } -__device__ void rocshmem_ctx_wave_sync_all(rocshmem_ctx_t ctx) { - GPU_DPRINTF("Function: rocshmem_ctx_wave_sync_all\n"); +__device__ void rocshmem_ctx_sync_all_wave(rocshmem_ctx_t ctx) { + GPU_DPRINTF("Function: rocshmem_ctx_sync_all_wave\n"); get_internal_ctx(ctx)->sync_all_wave(); } -__device__ void rocshmem_ctx_wg_sync_all(rocshmem_ctx_t ctx) { - GPU_DPRINTF("Function: rocshmem_ctx_wg_sync_all\n"); +__device__ void rocshmem_ctx_sync_all_wg(rocshmem_ctx_t ctx) { + GPU_DPRINTF("Function: rocshmem_ctx_sync_all_wg\n"); get_internal_ctx(ctx)->sync_all_wg(); } -__device__ void rocshmem_ctx_team_sync(rocshmem_ctx_t ctx, +__device__ void rocshmem_ctx_sync(rocshmem_ctx_t ctx, rocshmem_team_t team) { - GPU_DPRINTF("Function: rocshmem_ctx_sync_all\n"); + GPU_DPRINTF("Function: rocshmem_ctx_sync\n"); get_internal_ctx(ctx)->sync_wg(team); } -__device__ void rocshmem_ctx_wave_team_sync(rocshmem_ctx_t ctx, +__device__ void rocshmem_ctx_sync_wave(rocshmem_ctx_t ctx, rocshmem_team_t team) { -GPU_DPRINTF("Function: rocshmem_ctx_wave_sync_all\n"); +GPU_DPRINTF("Function: rocshmem_ctx_sync_wave\n"); get_internal_ctx(ctx)->sync_wg(team); } -__device__ void rocshmem_ctx_wg_team_sync(rocshmem_ctx_t ctx, +__device__ void rocshmem_ctx_sync_wg(rocshmem_ctx_t ctx, rocshmem_team_t team) { -GPU_DPRINTF("Function: rocshmem_ctx_wg_sync_all\n"); +GPU_DPRINTF("Function: rocshmem_ctx_sync_wg\n"); get_internal_ctx(ctx)->sync_wg(team); } @@ -981,7 +981,7 @@ __device__ int rocshmem_team_translate_pe(rocshmem_team_t src_team, * Template generator for reductions */ #define REDUCTION_GEN(T, Op) \ - template __device__ int rocshmem_wg_reduce( \ + template __device__ int rocshmem_reduce_wg( \ rocshmem_ctx_t ctx, rocshmem_team_t team, T * dest, const T *source, \ int nreduce); @@ -1011,13 +1011,13 @@ __device__ int rocshmem_team_translate_pe(rocshmem_team_t src_team, template __device__ void rocshmem_get_nbi(T * dest, const T *source, \ size_t nelems, int pe); \ template __device__ T rocshmem_g(const T *source, int pe); \ - template __device__ void rocshmem_wg_broadcast( \ + template __device__ void rocshmem_broadcast_wg( \ rocshmem_ctx_t ctx, rocshmem_team_t team, T * dest, const T *source, \ int nelem, int pe_root); \ - template __device__ void rocshmem_wg_alltoall( \ + template __device__ void rocshmem_alltoall_wg( \ rocshmem_ctx_t ctx, rocshmem_team_t team, T * dest, const T *source, \ int nelem); \ - template __device__ void rocshmem_wg_fcollect( \ + template __device__ void rocshmem_fcollect_wg( \ rocshmem_ctx_t ctx, rocshmem_team_t team, T * dest, const T *source, \ int nelem); \ template __device__ void rocshmem_put_wave( \ @@ -1189,10 +1189,10 @@ __device__ int rocshmem_team_translate_pe(rocshmem_team_t src_team, **/ #define REDUCTION_DEF_GEN(T, TNAME, Op_API, Op) \ - __device__ int rocshmem_ctx_##TNAME##_##Op_API##_wg_reduce( \ + __device__ int rocshmem_ctx_##TNAME##_##Op_API##_reduce_wg( \ rocshmem_ctx_t ctx, rocshmem_team_t team, T *dest, const T *source, \ int nreduce) { \ - return rocshmem_wg_reduce(ctx, team, dest, source, nreduce); \ + return rocshmem_reduce_wg(ctx, team, dest, source, nreduce); \ } #define ARITH_REDUCTION_DEF_GEN(T, TNAME) \ @@ -1323,20 +1323,20 @@ __device__ int rocshmem_team_translate_pe(rocshmem_team_t src_team, size_t nelems, int pe) { \ rocshmem_get_nbi_wg(dest, source, nelems, pe); \ } \ - __device__ void rocshmem_ctx_##TNAME##_wg_broadcast( \ + __device__ void rocshmem_ctx_##TNAME##_broadcast_wg( \ rocshmem_ctx_t ctx, rocshmem_team_t team, T *dest, const T *source, \ int nelem, int pe_root) { \ - rocshmem_wg_broadcast(ctx, team, dest, source, nelem, pe_root); \ + rocshmem_broadcast_wg(ctx, team, dest, source, nelem, pe_root); \ } \ - __device__ void rocshmem_ctx_##TNAME##_wg_alltoall( \ + __device__ void rocshmem_ctx_##TNAME##_alltoall_wg( \ rocshmem_ctx_t ctx, rocshmem_team_t team, T *dest, const T *source, \ int nelem) { \ - rocshmem_wg_alltoall(ctx, team, dest, source, nelem); \ + rocshmem_alltoall_wg(ctx, team, dest, source, nelem); \ } \ - __device__ void rocshmem_ctx_##TNAME##_wg_fcollect( \ + __device__ void rocshmem_ctx_##TNAME##_fcollect_wg( \ rocshmem_ctx_t ctx, rocshmem_team_t team, T *dest, const T *source, \ int nelem) { \ - rocshmem_wg_fcollect(ctx, team, dest, source, nelem); \ + rocshmem_fcollect_wg(ctx, team, dest, source, nelem); \ } #define AMO_STANDARD_DEF_GEN(T, TNAME) \ diff --git a/projects/rocshmem/tests/functional_tests/barrier_all_tester.cpp b/projects/rocshmem/tests/functional_tests/barrier_all_tester.cpp index dfd5811726..4f8325125a 100644 --- a/projects/rocshmem/tests/functional_tests/barrier_all_tester.cpp +++ b/projects/rocshmem/tests/functional_tests/barrier_all_tester.cpp @@ -53,11 +53,11 @@ __global__ void BarrierAllTest(int loop, int skip, long long int *start_time, break; case WAVEBarrierAllTestType: if(wf_id == 0) { - rocshmem_ctx_wave_barrier_all(ctx); + rocshmem_ctx_barrier_all_wave(ctx); } break; case WGBarrierAllTestType: - rocshmem_ctx_wg_barrier_all(ctx); + rocshmem_ctx_barrier_all_wg(ctx); break; default: break; diff --git a/projects/rocshmem/tests/functional_tests/sync_all_tester.cpp b/projects/rocshmem/tests/functional_tests/sync_all_tester.cpp index 8a02db4d54..16dc738c41 100644 --- a/projects/rocshmem/tests/functional_tests/sync_all_tester.cpp +++ b/projects/rocshmem/tests/functional_tests/sync_all_tester.cpp @@ -53,11 +53,11 @@ __global__ void SyncAllTest(int loop, int skip, long long int *start_time, break; case WAVESyncAllTestType: if(wf_id == 0) { - rocshmem_ctx_wave_sync_all(ctx); + rocshmem_ctx_sync_all_wave(ctx); } break; case WGSyncAllTestType: - rocshmem_ctx_wg_sync_all(ctx); + rocshmem_ctx_sync_all_wg(ctx); break; default: break; diff --git a/projects/rocshmem/tests/functional_tests/sync_tester.cpp b/projects/rocshmem/tests/functional_tests/sync_tester.cpp index b130e6fdb7..8ffa4c91ff 100644 --- a/projects/rocshmem/tests/functional_tests/sync_tester.cpp +++ b/projects/rocshmem/tests/functional_tests/sync_tester.cpp @@ -45,16 +45,16 @@ __global__ void SyncTest(int loop, int skip, long long int *start_time, switch (type) { case SyncTestType: if(t_id == 0) { - rocshmem_ctx_team_sync(ctx, teams[wg_id]); + rocshmem_ctx_sync(ctx, teams[wg_id]); } break; case WAVESyncTestType: if(wf_id == 0) { - rocshmem_ctx_wave_team_sync(ctx, teams[wg_id]); + rocshmem_ctx_sync_wave(ctx, teams[wg_id]); } break; case WGSyncTestType: - rocshmem_ctx_wg_team_sync(ctx, teams[wg_id]); + rocshmem_ctx_sync_wg(ctx, teams[wg_id]); break; default: break; diff --git a/projects/rocshmem/tests/functional_tests/team_alltoall_tester.cpp b/projects/rocshmem/tests/functional_tests/team_alltoall_tester.cpp index fd3cc28e00..10f8f5b60f 100644 --- a/projects/rocshmem/tests/functional_tests/team_alltoall_tester.cpp +++ b/projects/rocshmem/tests/functional_tests/team_alltoall_tester.cpp @@ -32,7 +32,7 @@ __device__ void wg_team_alltoall(rocshmem_ctx_t ctx, rocshmem_team_t team, template <> \ __device__ void wg_team_alltoall(rocshmem_ctx_t ctx, rocshmem_team_t team,\ T * dest, const T *source, int nelem) { \ - rocshmem_ctx_##TNAME##_wg_alltoall(ctx, team, dest, source, nelem); \ + rocshmem_ctx_##TNAME##_alltoall_wg(ctx, team, dest, source, nelem); \ } TEAM_ALLTOALL_DEF_GEN(float, float) diff --git a/projects/rocshmem/tests/functional_tests/team_barrier_tester.cpp b/projects/rocshmem/tests/functional_tests/team_barrier_tester.cpp index ff81154aed..73e1db00f1 100644 --- a/projects/rocshmem/tests/functional_tests/team_barrier_tester.cpp +++ b/projects/rocshmem/tests/functional_tests/team_barrier_tester.cpp @@ -51,11 +51,11 @@ __global__ void TeamBarrierTest(int loop, int skip, long long int *start_time, break; case TeamWAVEBarrierTestType: if(wf_id == 0) { - rocshmem_ctx_wave_barrier(ctx, teams[wg_id]); + rocshmem_ctx_barrier_wave(ctx, teams[wg_id]); } break; case TeamWGBarrierTestType: - rocshmem_ctx_wg_barrier(ctx, teams[wg_id]); + rocshmem_ctx_barrier_wg(ctx, teams[wg_id]); break; default: break; diff --git a/projects/rocshmem/tests/functional_tests/team_broadcast_tester.cpp b/projects/rocshmem/tests/functional_tests/team_broadcast_tester.cpp index 43d5a7ac4c..11326542fa 100644 --- a/projects/rocshmem/tests/functional_tests/team_broadcast_tester.cpp +++ b/projects/rocshmem/tests/functional_tests/team_broadcast_tester.cpp @@ -34,7 +34,7 @@ __device__ void wg_team_broadcast(rocshmem_ctx_t ctx, rocshmem_team_t team, __device__ void wg_team_broadcast( \ rocshmem_ctx_t ctx, rocshmem_team_t team, T * dest, const T *source, \ int nelem, int pe_root) { \ - rocshmem_ctx_##TNAME##_wg_broadcast(ctx, team, dest, source, nelem, \ + rocshmem_ctx_##TNAME##_broadcast_wg(ctx, team, dest, source, nelem, \ pe_root); \ } diff --git a/projects/rocshmem/tests/functional_tests/team_fcollect_tester.cpp b/projects/rocshmem/tests/functional_tests/team_fcollect_tester.cpp index 4f8b85e693..5f1fc46264 100644 --- a/projects/rocshmem/tests/functional_tests/team_fcollect_tester.cpp +++ b/projects/rocshmem/tests/functional_tests/team_fcollect_tester.cpp @@ -32,7 +32,7 @@ __device__ void wg_team_fcollect(rocshmem_ctx_t ctx, rocshmem_team_t team, template <> \ __device__ void wg_team_fcollect(rocshmem_ctx_t ctx, rocshmem_team_t team,\ T * dest, const T *source, int nelem) { \ - rocshmem_ctx_##TNAME##_wg_fcollect(ctx, team, dest, source, nelem); \ + rocshmem_ctx_##TNAME##_fcollect_wg(ctx, team, dest, source, nelem); \ } TEAM_FCOLLECT_DEF_GEN(float, float) diff --git a/projects/rocshmem/tests/functional_tests/team_reduction_tester.cpp b/projects/rocshmem/tests/functional_tests/team_reduction_tester.cpp index 17c14ae91d..4229888bfc 100644 --- a/projects/rocshmem/tests/functional_tests/team_reduction_tester.cpp +++ b/projects/rocshmem/tests/functional_tests/team_reduction_tester.cpp @@ -35,7 +35,7 @@ __device__ int wg_team_reduce(rocshmem_ctx_t ctx, rocshmem_team_t, T *dest, __device__ int wg_team_reduce(rocshmem_ctx_t ctx, \ rocshmem_team_t team, T * dest, \ const T *source, int nreduce) { \ - return rocshmem_ctx_##TNAME##_##Op_API##_wg_reduce(ctx, team, dest, \ + return rocshmem_ctx_##TNAME##_##Op_API##_reduce_wg(ctx, team, dest, \ source, nreduce); \ } @@ -93,7 +93,7 @@ __global__ void TeamReductionTest(int loop, int skip, long long int *start_time, start_time[wg_id] = wall_clock64(); } wg_team_reduce(ctx, team, r_buf, s_buf, size); - rocshmem_ctx_wg_barrier_all(ctx); + rocshmem_ctx_barrier_all_wg(ctx); } __syncthreads(); diff --git a/projects/rocshmem/utils/header_files_gen/COLL.py b/projects/rocshmem/utils/header_files_gen/COLL.py index 2755f8b26f..2bf899699a 100644 --- a/projects/rocshmem/utils/header_files_gen/COLL.py +++ b/projects/rocshmem/utils/header_files_gen/COLL.py @@ -43,7 +43,7 @@ types = [ def alltoall_api(T, TNAME): return ( - f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_wg_alltoall(\n" + f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_alltoall_wg(\n" f" rocshmem_ctx_t ctx, rocshmem_team_t team, {T} *dest,\n" f" const {T} *source, int nelems);\n\n" ) @@ -75,7 +75,7 @@ def generate_alltoall_api(): def broadcast_api(T, TNAME): return ( - f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_wg_broadcast(\n" + f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_broadcast_wg(\n" f" rocshmem_ctx_t ctx, rocshmem_team_t team, {T} *dest,\n" f" const {T} *source, int nelems, int pe_root);\n" f"__host__ void rocshmem_ctx_{TNAME}_broadcast(\n" @@ -120,7 +120,7 @@ def generate_broadcast_api(): def fcollect_api(T, TNAME): return ( - f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_wg_fcollect(\n" + f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_fcollect_wg(\n" f" rocshmem_ctx_t ctx, rocshmem_team_t team, {T} *dest,\n" f" const {T} *source, int nelems);\n\n" ) @@ -152,7 +152,7 @@ def generate_fcollect_api(): def reduction_api(T, TNAME, Op_API): return ( - f"__device__ ATTR_NO_INLINE int rocshmem_ctx_{TNAME}_{Op_API}_wg_reduce(\n" + f"__device__ ATTR_NO_INLINE int rocshmem_ctx_{TNAME}_{Op_API}_reduce_wg(\n" f" rocshmem_ctx_t ctx, rocshmem_team_t team, {T} *dest, const {T} *source,\n" f" int nreduce);\n" f"__host__ int rocshmem_ctx_{TNAME}_{Op_API}_reduce(\n"