Update collective APIs naming (#77)

* Update the naming convention for collective APIs to ensure consistency across the interface. * Move all collective API declarations to rocshmem_COLL.hpp * The following APIs were updated as part of this change: - `barrier` - `barrier_all` - `sync` - `sync_all` - `all_to_all` - `broadcast` - `fcollect` - `all_reduce` * Update header file generation code for collective APIs [ROCm/rocshmem commit: 68421895d6]
2025-04-10 12:14:47 -05:00
parent 5b22ddd1ff
commit 41d5d739e2
15 changed files with 334 additions and 334 deletions
@@ -37,7 +37,7 @@ __global__ void allreduce_test(int *source, int *dest, size_t nelem,
    rocshmem_wg_ctx_create(ctx_type, &ctx);
    int num_pes = rocshmem_ctx_n_pes(ctx);

-    rocshmem_ctx_int_sum_wg_reduce(ctx, team, dest, source, nelem);
+    rocshmem_ctx_int_sum_reduce_wg(ctx, team, dest, source, nelem);

    rocshmem_ctx_quiet(ctx);
    __syncthreads();
@@ -37,7 +37,7 @@ __global__ void alltoall_test(int *source, int *dest, size_t nelem,
    rocshmem_wg_ctx_create(ctx_type, &ctx);
    int num_pes = rocshmem_ctx_n_pes(ctx);

-    rocshmem_ctx_int_wg_alltoall(ctx, team, dest, source, nelem);
+    rocshmem_ctx_int_alltoall_wg(ctx, team, dest, source, nelem);

    rocshmem_ctx_quiet(ctx);
    __syncthreads();
@@ -37,7 +37,7 @@ __global__ void broadcast_test(int *source, int *dest, size_t nelem,
    rocshmem_wg_ctx_create(ctx_type, &ctx);
    int num_pes = rocshmem_ctx_n_pes(ctx);

-    rocshmem_ctx_int_wg_broadcast(ctx, team, dest, source, nelem, root);
+    rocshmem_ctx_int_broadcast_wg(ctx, team, dest, source, nelem, root);

    rocshmem_ctx_quiet(ctx);
    __syncthreads();
@@ -490,210 +490,6 @@ __device__ int rocshmem_team_translate_pe(rocshmem_team_t src_team,
                                           int src_pe,
                                           rocshmem_team_t dest_team);

-/**
- * @brief perform a collective barrier between all PEs in the system.
- * The caller is blocked until the barrier is resolved.
- *
- * This function must be invoked by a single thread within the PE.
- *
- * @param[in] handle GPU side handle.
- *
- * @return void
- */
-__device__ ATTR_NO_INLINE void rocshmem_ctx_barrier_all(
-    rocshmem_ctx_t ctx);
-
-/**
- * @brief perform a collective barrier between all PEs in the system.
- * The caller is blocked until the barrier is resolved.
- *
- * This function must be called as a wave-front collective.
- *
- * @param[in] handle GPU side handle.
- *
- * @return void
- */
-__device__ ATTR_NO_INLINE void rocshmem_ctx_wave_barrier_all(
-    rocshmem_ctx_t ctx);
-
-/**
- * @brief perform a collective barrier between all PEs in the system.
- * The caller is blocked until the barrier is resolved.
- *
- * This function must be called as a work-group collective.
- *
- * @param[in] handle GPU side handle.
- *
- * @return void
- */
-__device__ ATTR_NO_INLINE void rocshmem_ctx_wg_barrier_all(
-    rocshmem_ctx_t ctx);
-
-/**
- * @brief perform a collective barrier between all PEs in the team.
- * The caller is blocked until the barrier is resolved.
- *
- * This function must be invoked by a single thread within the PE.
- *
- * @param[in] handle GPU side handle.
- *
- * @param[in] team The team on which to perform barrier synchronization
- *
- * @return void
- */
-__device__ void rocshmem_ctx_barrier(rocshmem_ctx_t ctx, rocshmem_team_t team);
-
-/**
- * @brief perform a collective barrier between all PEs in the team.
- * The caller is blocked until the barrier is resolved.
- *
- * This function must be called as a wave-front collective.
- *
- * @param[in] handle GPU side handle.
- *
- * @param[in] team The team on which to perform barrier synchronization
- *
- * @return void
- */
-__device__ void rocshmem_ctx_wave_barrier(rocshmem_ctx_t ctx, rocshmem_team_t team);
-
-/**
- * @brief perform a collective barrier between all PEs in the team.
- * The caller is blocked until the barrier is resolved.
- *
- * This function must be called as a work-group collective.
- *
- * @param[in] handle GPU side handle.
- *
- * @param[in] team The team on which to perform barrier synchronization
- *
- * @return void
- */
-__device__ void rocshmem_ctx_wg_barrier(rocshmem_ctx_t ctx, rocshmem_team_t team);
-
-/**
- * @brief registers the arrival of a PE at a barrier.
- * The caller is blocked until the synchronization is resolved.
- *
- * In contrast with the shmem_barrier_all routine, shmem_sync_all only ensures
- * completion and visibility of previously issued memory stores and does not
- * ensure completion of remote memory updates issued via OpenSHMEM routines.
- *
- * This function must be invoked by a single thread within the PE.
- *
- * @param[in] handle GPU side handle.
- *
- * @return void
- */
-__device__ ATTR_NO_INLINE void rocshmem_ctx_sync_all(rocshmem_ctx_t ctx);
-
-/**
- * @brief registers the arrival of a PE at a barrier.
- * The caller is blocked until the synchronization is resolved.
- *
- * In contrast with the shmem_barrier_all routine, shmem_sync_all only ensures
- * completion and visibility of previously issued memory stores and does not
- * ensure completion of remote memory updates issued via OpenSHMEM routines.
- *
- * This function must be called as a wave-front collective.
- *
- * @param[in] handle GPU side handle.
- *
- * @return void
- */
-__device__ ATTR_NO_INLINE void rocshmem_ctx_wave_sync_all(rocshmem_ctx_t ctx);
-
-/**
- * @brief registers the arrival of a PE at a barrier.
- * The caller is blocked until the synchronization is resolved.
- *
- * In contrast with the shmem_barrier_all routine, shmem_sync_all only ensures
- * completion and visibility of previously issued memory stores and does not
- * ensure completion of remote memory updates issued via OpenSHMEM routines.
- *
- * This function must be called as a work-group collective.
- *
- * @param[in] handle GPU side handle.
- *
- * @return void
- */
-__device__ ATTR_NO_INLINE void rocshmem_ctx_wg_sync_all(rocshmem_ctx_t ctx);
-
-/**
- * @brief registers the arrival of a PE at a barrier.
- * The caller is blocked until the synchronization is resolved.
- *
- * In contrast with the shmem_barrier_all routine, shmem_team_sync only ensures
- * completion and visibility of previously issued memory stores and does not
- * ensure completion of remote memory updates issued via OpenSHMEM routines.
- *
- * This function must be invoked by a single thread within the PE.
- *
- * @param[in] handle GPU side handle.
- * @param[in] team  Handle of the team being synchronized
- *
- * @return void
- */
-__device__ ATTR_NO_INLINE void rocshmem_ctx_team_sync(
-    rocshmem_ctx_t ctx, rocshmem_team_t team);
-
-/**
- * @brief registers the arrival of a PE at a barrier.
- * The caller is blocked until the synchronization is resolved.
- *
- * In contrast with the shmem_barrier_all routine, shmem_team_sync only ensures
- * completion and visibility of previously issued memory stores and does not
- * ensure completion of remote memory updates issued via OpenSHMEM routines.
- *
- * This function must be called as a wave-front collective.
- *
- * @param[in] handle GPU side handle.
- * @param[in] team  Handle of the team being synchronized
- *
- * @return void
- */
-__device__ ATTR_NO_INLINE void rocshmem_ctx_wave_team_sync(
-    rocshmem_ctx_t ctx, rocshmem_team_t team);
-
-/**
- * @brief registers the arrival of a PE at a barrier.
- * The caller is blocked until the synchronization is resolved.
- *
- * In contrast with the shmem_barrier_all routine, shmem_team_sync only ensures
- * completion and visibility of previously issued memory stores and does not
- * ensure completion of remote memory updates issued via OpenSHMEM routines.
- *
- * This function must be called as a work-group collective.
- *
- * @param[in] handle GPU side handle.
- * @param[in] team  Handle of the team being synchronized
- *
- * @return void
- */
-__device__ ATTR_NO_INLINE void rocshmem_ctx_wg_team_sync(
-    rocshmem_ctx_t ctx, rocshmem_team_t team);
-
-/**
- * @brief Query a local pointer to a symmetric data object on the
- * specified \pe . Returns an address that may be used to directly reference
- * dest on the specified \pe. This address can be accesses with LD/ST ops.
- *
- * Can be called per thread with no performance penalty.
- */
-__device__ ATTR_NO_INLINE void *rocshmem_ptr(const void *dest, int pe);
-
-/**
- * @brief Make all uncacheable GPU data visible to other agents in the sytem.
- *
- * This only works for data that was explicitly allocated uncacheable on the
- * GPU!
- *
- * Can be called per thread with no performance penalty.
- *
- * @param[in] GPU-side handle.
- *
- * @return void
- */
 __device__ ATTR_NO_INLINE void rocshmem_ctx_threadfence_system(
    rocshmem_ctx_t ctx);

@@ -41,55 +41,55 @@ namespace rocshmem {
 *
 * @return void
 */
-__device__ ATTR_NO_INLINE void rocshmem_ctx_float_wg_alltoall(
+__device__ ATTR_NO_INLINE void rocshmem_ctx_float_alltoall_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest,
    const float *source, int nelems);

-__device__ ATTR_NO_INLINE void rocshmem_ctx_double_wg_alltoall(
+__device__ ATTR_NO_INLINE void rocshmem_ctx_double_alltoall_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest,
    const double *source, int nelems);

-__device__ ATTR_NO_INLINE void rocshmem_ctx_char_wg_alltoall(
+__device__ ATTR_NO_INLINE void rocshmem_ctx_char_alltoall_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team, char *dest,
    const char *source, int nelems);

-__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_wg_alltoall(
+__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_alltoall_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team, signed char *dest,
    const signed char *source, int nelems);

-__device__ ATTR_NO_INLINE void rocshmem_ctx_short_wg_alltoall(
+__device__ ATTR_NO_INLINE void rocshmem_ctx_short_alltoall_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest,
    const short *source, int nelems);

-__device__ ATTR_NO_INLINE void rocshmem_ctx_int_wg_alltoall(
+__device__ ATTR_NO_INLINE void rocshmem_ctx_int_alltoall_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest,
    const int *source, int nelems);

-__device__ ATTR_NO_INLINE void rocshmem_ctx_long_wg_alltoall(
+__device__ ATTR_NO_INLINE void rocshmem_ctx_long_alltoall_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest,
    const long *source, int nelems);

-__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_wg_alltoall(
+__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_alltoall_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest,
    const long long *source, int nelems);

-__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_wg_alltoall(
+__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_alltoall_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned char *dest,
    const unsigned char *source, int nelems);

-__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_wg_alltoall(
+__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_alltoall_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned short *dest,
    const unsigned short *source, int nelems);

-__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_wg_alltoall(
+__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_alltoall_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned int *dest,
    const unsigned int *source, int nelems);

-__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_wg_alltoall(
+__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_alltoall_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long *dest,
    const unsigned long *source, int nelems);

-__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_wg_alltoall(
+__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_alltoall_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long long *dest,
    const unsigned long long *source, int nelems);

@@ -116,7 +116,7 @@ __device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_wg_alltoall(
 *
 * @return void
 */
-__device__ ATTR_NO_INLINE void rocshmem_ctx_float_wg_broadcast(
+__device__ ATTR_NO_INLINE void rocshmem_ctx_float_broadcast_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest,
    const float *source, int nelems, int pe_root);
 __host__ void rocshmem_ctx_float_broadcast(
@@ -127,7 +127,7 @@ __host__ void rocshmem_ctx_float_broadcast(
    rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest,
    const float *source, int nelems, int pe_root);

-__device__ ATTR_NO_INLINE void rocshmem_ctx_double_wg_broadcast(
+__device__ ATTR_NO_INLINE void rocshmem_ctx_double_broadcast_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest,
    const double *source, int nelems, int pe_root);
 __host__ void rocshmem_ctx_double_broadcast(
@@ -138,7 +138,7 @@ __host__ void rocshmem_ctx_double_broadcast(
    rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest,
    const double *source, int nelems, int pe_root);

-__device__ ATTR_NO_INLINE void rocshmem_ctx_char_wg_broadcast(
+__device__ ATTR_NO_INLINE void rocshmem_ctx_char_broadcast_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team, char *dest,
    const char *source, int nelems, int pe_root);
 __host__ void rocshmem_ctx_char_broadcast(
@@ -149,7 +149,7 @@ __host__ void rocshmem_ctx_char_broadcast(
    rocshmem_ctx_t ctx, rocshmem_team_t team, char *dest,
    const char *source, int nelems, int pe_root);

-__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_wg_broadcast(
+__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_broadcast_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team, signed char *dest,
    const signed char *source, int nelems, int pe_root);
 __host__ void rocshmem_ctx_schar_broadcast(
@@ -160,7 +160,7 @@ __host__ void rocshmem_ctx_schar_broadcast(
    rocshmem_ctx_t ctx, rocshmem_team_t team, signed char *dest,
    const signed char *source, int nelems, int pe_root);

-__device__ ATTR_NO_INLINE void rocshmem_ctx_short_wg_broadcast(
+__device__ ATTR_NO_INLINE void rocshmem_ctx_short_broadcast_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest,
    const short *source, int nelems, int pe_root);
 __host__ void rocshmem_ctx_short_broadcast(
@@ -171,7 +171,7 @@ __host__ void rocshmem_ctx_short_broadcast(
    rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest,
    const short *source, int nelems, int pe_root);

-__device__ ATTR_NO_INLINE void rocshmem_ctx_int_wg_broadcast(
+__device__ ATTR_NO_INLINE void rocshmem_ctx_int_broadcast_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest,
    const int *source, int nelems, int pe_root);
 __host__ void rocshmem_ctx_int_broadcast(
@@ -182,7 +182,7 @@ __host__ void rocshmem_ctx_int_broadcast(
    rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest,
    const int *source, int nelems, int pe_root);

-__device__ ATTR_NO_INLINE void rocshmem_ctx_long_wg_broadcast(
+__device__ ATTR_NO_INLINE void rocshmem_ctx_long_broadcast_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest,
    const long *source, int nelems, int pe_root);
 __host__ void rocshmem_ctx_long_broadcast(
@@ -193,7 +193,7 @@ __host__ void rocshmem_ctx_long_broadcast(
    rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest,
    const long *source, int nelems, int pe_root);

-__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_wg_broadcast(
+__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_broadcast_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest,
    const long long *source, int nelems, int pe_root);
 __host__ void rocshmem_ctx_longlong_broadcast(
@@ -204,7 +204,7 @@ __host__ void rocshmem_ctx_longlong_broadcast(
    rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest,
    const long long *source, int nelems, int pe_root);

-__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_wg_broadcast(
+__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_broadcast_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned char *dest,
    const unsigned char *source, int nelems, int pe_root);
 __host__ void rocshmem_ctx_uchar_broadcast(
@@ -215,7 +215,7 @@ __host__ void rocshmem_ctx_uchar_broadcast(
    rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned char *dest,
    const unsigned char *source, int nelems, int pe_root);

-__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_wg_broadcast(
+__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_broadcast_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned short *dest,
    const unsigned short *source, int nelems, int pe_root);
 __host__ void rocshmem_ctx_ushort_broadcast(
@@ -226,7 +226,7 @@ __host__ void rocshmem_ctx_ushort_broadcast(
    rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned short *dest,
    const unsigned short *source, int nelems, int pe_root);

-__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_wg_broadcast(
+__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_broadcast_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned int *dest,
    const unsigned int *source, int nelems, int pe_root);
 __host__ void rocshmem_ctx_uint_broadcast(
@@ -237,7 +237,7 @@ __host__ void rocshmem_ctx_uint_broadcast(
    rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned int *dest,
    const unsigned int *source, int nelems, int pe_root);

-__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_wg_broadcast(
+__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_broadcast_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long *dest,
    const unsigned long *source, int nelems, int pe_root);
 __host__ void rocshmem_ctx_ulong_broadcast(
@@ -248,7 +248,7 @@ __host__ void rocshmem_ctx_ulong_broadcast(
    rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long *dest,
    const unsigned long *source, int nelems, int pe_root);

-__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_wg_broadcast(
+__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_broadcast_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long long *dest,
    const unsigned long long *source, int nelems, int pe_root);
 __host__ void rocshmem_ctx_ulonglong_broadcast(
@@ -276,55 +276,55 @@ __host__ void rocshmem_ctx_ulonglong_broadcast(
 *
 * @return void
 */
-__device__ ATTR_NO_INLINE void rocshmem_ctx_float_wg_fcollect(
+__device__ ATTR_NO_INLINE void rocshmem_ctx_float_fcollect_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest,
    const float *source, int nelems);

-__device__ ATTR_NO_INLINE void rocshmem_ctx_double_wg_fcollect(
+__device__ ATTR_NO_INLINE void rocshmem_ctx_double_fcollect_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest,
    const double *source, int nelems);

-__device__ ATTR_NO_INLINE void rocshmem_ctx_char_wg_fcollect(
+__device__ ATTR_NO_INLINE void rocshmem_ctx_char_fcollect_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team, char *dest,
    const char *source, int nelems);

-__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_wg_fcollect(
+__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_fcollect_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team, signed char *dest,
    const signed char *source, int nelems);

-__device__ ATTR_NO_INLINE void rocshmem_ctx_short_wg_fcollect(
+__device__ ATTR_NO_INLINE void rocshmem_ctx_short_fcollect_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest,
    const short *source, int nelems);

-__device__ ATTR_NO_INLINE void rocshmem_ctx_int_wg_fcollect(
+__device__ ATTR_NO_INLINE void rocshmem_ctx_int_fcollect_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest,
    const int *source, int nelems);

-__device__ ATTR_NO_INLINE void rocshmem_ctx_long_wg_fcollect(
+__device__ ATTR_NO_INLINE void rocshmem_ctx_long_fcollect_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest,
    const long *source, int nelems);

-__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_wg_fcollect(
+__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_fcollect_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest,
    const long long *source, int nelems);

-__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_wg_fcollect(
+__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_fcollect_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned char *dest,
    const unsigned char *source, int nelems);

-__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_wg_fcollect(
+__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_fcollect_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned short *dest,
    const unsigned short *source, int nelems);

-__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_wg_fcollect(
+__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_fcollect_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned int *dest,
    const unsigned int *source, int nelems);

-__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_wg_fcollect(
+__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_fcollect_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long *dest,
    const unsigned long *source, int nelems);

-__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_wg_fcollect(
+__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_fcollect_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long long *dest,
    const unsigned long long *source, int nelems);

@@ -345,258 +345,462 @@ __device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_wg_fcollect(
 *
 * @return int (Zero on successful local completion. Nonzero otherwise.)
 */
-__device__ ATTR_NO_INLINE int rocshmem_ctx_short_sum_wg_reduce(
+__device__ ATTR_NO_INLINE int rocshmem_ctx_short_sum_reduce_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
    int nreduce);
 __host__ int rocshmem_ctx_short_sum_reduce(
    rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
    int nreduce);

-__device__ ATTR_NO_INLINE int rocshmem_ctx_short_min_wg_reduce(
+__device__ ATTR_NO_INLINE int rocshmem_ctx_short_min_reduce_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
    int nreduce);
 __host__ int rocshmem_ctx_short_min_reduce(
    rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
    int nreduce);

-__device__ ATTR_NO_INLINE int rocshmem_ctx_short_max_wg_reduce(
+__device__ ATTR_NO_INLINE int rocshmem_ctx_short_max_reduce_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
    int nreduce);
 __host__ int rocshmem_ctx_short_max_reduce(
    rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
    int nreduce);

-__device__ ATTR_NO_INLINE int rocshmem_ctx_short_prod_wg_reduce(
+__device__ ATTR_NO_INLINE int rocshmem_ctx_short_prod_reduce_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
    int nreduce);
 __host__ int rocshmem_ctx_short_prod_reduce(
    rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
    int nreduce);

-__device__ ATTR_NO_INLINE int rocshmem_ctx_short_or_wg_reduce(
+__device__ ATTR_NO_INLINE int rocshmem_ctx_short_or_reduce_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
    int nreduce);
 __host__ int rocshmem_ctx_short_or_reduce(
    rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
    int nreduce);

-__device__ ATTR_NO_INLINE int rocshmem_ctx_short_and_wg_reduce(
+__device__ ATTR_NO_INLINE int rocshmem_ctx_short_and_reduce_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
    int nreduce);
 __host__ int rocshmem_ctx_short_and_reduce(
    rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
    int nreduce);

-__device__ ATTR_NO_INLINE int rocshmem_ctx_short_xor_wg_reduce(
+__device__ ATTR_NO_INLINE int rocshmem_ctx_short_xor_reduce_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
    int nreduce);
 __host__ int rocshmem_ctx_short_xor_reduce(
    rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
    int nreduce);

-__device__ ATTR_NO_INLINE int rocshmem_ctx_int_sum_wg_reduce(
+__device__ ATTR_NO_INLINE int rocshmem_ctx_int_sum_reduce_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
    int nreduce);
 __host__ int rocshmem_ctx_int_sum_reduce(
    rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
    int nreduce);

-__device__ ATTR_NO_INLINE int rocshmem_ctx_int_min_wg_reduce(
+__device__ ATTR_NO_INLINE int rocshmem_ctx_int_min_reduce_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
    int nreduce);
 __host__ int rocshmem_ctx_int_min_reduce(
    rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
    int nreduce);

-__device__ ATTR_NO_INLINE int rocshmem_ctx_int_max_wg_reduce(
+__device__ ATTR_NO_INLINE int rocshmem_ctx_int_max_reduce_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
    int nreduce);
 __host__ int rocshmem_ctx_int_max_reduce(
    rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
    int nreduce);

-__device__ ATTR_NO_INLINE int rocshmem_ctx_int_prod_wg_reduce(
+__device__ ATTR_NO_INLINE int rocshmem_ctx_int_prod_reduce_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
    int nreduce);
 __host__ int rocshmem_ctx_int_prod_reduce(
    rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
    int nreduce);

-__device__ ATTR_NO_INLINE int rocshmem_ctx_int_or_wg_reduce(
+__device__ ATTR_NO_INLINE int rocshmem_ctx_int_or_reduce_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
    int nreduce);
 __host__ int rocshmem_ctx_int_or_reduce(
    rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
    int nreduce);

-__device__ ATTR_NO_INLINE int rocshmem_ctx_int_and_wg_reduce(
+__device__ ATTR_NO_INLINE int rocshmem_ctx_int_and_reduce_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
    int nreduce);
 __host__ int rocshmem_ctx_int_and_reduce(
    rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
    int nreduce);

-__device__ ATTR_NO_INLINE int rocshmem_ctx_int_xor_wg_reduce(
+__device__ ATTR_NO_INLINE int rocshmem_ctx_int_xor_reduce_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
    int nreduce);
 __host__ int rocshmem_ctx_int_xor_reduce(
    rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
    int nreduce);

-__device__ ATTR_NO_INLINE int rocshmem_ctx_long_sum_wg_reduce(
+__device__ ATTR_NO_INLINE int rocshmem_ctx_long_sum_reduce_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
    int nreduce);
 __host__ int rocshmem_ctx_long_sum_reduce(
    rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
    int nreduce);

-__device__ ATTR_NO_INLINE int rocshmem_ctx_long_min_wg_reduce(
+__device__ ATTR_NO_INLINE int rocshmem_ctx_long_min_reduce_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
    int nreduce);
 __host__ int rocshmem_ctx_long_min_reduce(
    rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
    int nreduce);

-__device__ ATTR_NO_INLINE int rocshmem_ctx_long_max_wg_reduce(
+__device__ ATTR_NO_INLINE int rocshmem_ctx_long_max_reduce_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
    int nreduce);
 __host__ int rocshmem_ctx_long_max_reduce(
    rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
    int nreduce);

-__device__ ATTR_NO_INLINE int rocshmem_ctx_long_prod_wg_reduce(
+__device__ ATTR_NO_INLINE int rocshmem_ctx_long_prod_reduce_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
    int nreduce);
 __host__ int rocshmem_ctx_long_prod_reduce(
    rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
    int nreduce);

-__device__ ATTR_NO_INLINE int rocshmem_ctx_long_or_wg_reduce(
+__device__ ATTR_NO_INLINE int rocshmem_ctx_long_or_reduce_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
    int nreduce);
 __host__ int rocshmem_ctx_long_or_reduce(
    rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
    int nreduce);

-__device__ ATTR_NO_INLINE int rocshmem_ctx_long_and_wg_reduce(
+__device__ ATTR_NO_INLINE int rocshmem_ctx_long_and_reduce_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
    int nreduce);
 __host__ int rocshmem_ctx_long_and_reduce(
    rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
    int nreduce);

-__device__ ATTR_NO_INLINE int rocshmem_ctx_long_xor_wg_reduce(
+__device__ ATTR_NO_INLINE int rocshmem_ctx_long_xor_reduce_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
    int nreduce);
 __host__ int rocshmem_ctx_long_xor_reduce(
    rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
    int nreduce);

-__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_sum_wg_reduce(
+__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_sum_reduce_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
    int nreduce);
 __host__ int rocshmem_ctx_longlong_sum_reduce(
    rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
    int nreduce);

-__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_min_wg_reduce(
+__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_min_reduce_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
    int nreduce);
 __host__ int rocshmem_ctx_longlong_min_reduce(
    rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
    int nreduce);

-__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_max_wg_reduce(
+__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_max_reduce_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
    int nreduce);
 __host__ int rocshmem_ctx_longlong_max_reduce(
    rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
    int nreduce);

-__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_prod_wg_reduce(
+__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_prod_reduce_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
    int nreduce);
 __host__ int rocshmem_ctx_longlong_prod_reduce(
    rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
    int nreduce);

-__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_or_wg_reduce(
+__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_or_reduce_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
    int nreduce);
 __host__ int rocshmem_ctx_longlong_or_reduce(
    rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
    int nreduce);

-__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_and_wg_reduce(
+__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_and_reduce_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
    int nreduce);
 __host__ int rocshmem_ctx_longlong_and_reduce(
    rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
    int nreduce);

-__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_xor_wg_reduce(
+__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_xor_reduce_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
    int nreduce);
 __host__ int rocshmem_ctx_longlong_xor_reduce(
    rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
    int nreduce);

-__device__ ATTR_NO_INLINE int rocshmem_ctx_float_sum_wg_reduce(
+__device__ ATTR_NO_INLINE int rocshmem_ctx_float_sum_reduce_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source,
    int nreduce);
 __host__ int rocshmem_ctx_float_sum_reduce(
    rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source,
    int nreduce);

-__device__ ATTR_NO_INLINE int rocshmem_ctx_float_min_wg_reduce(
+__device__ ATTR_NO_INLINE int rocshmem_ctx_float_min_reduce_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source,
    int nreduce);
 __host__ int rocshmem_ctx_float_min_reduce(
    rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source,
    int nreduce);

-__device__ ATTR_NO_INLINE int rocshmem_ctx_float_max_wg_reduce(
+__device__ ATTR_NO_INLINE int rocshmem_ctx_float_max_reduce_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source,
    int nreduce);
 __host__ int rocshmem_ctx_float_max_reduce(
    rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source,
    int nreduce);

-__device__ ATTR_NO_INLINE int rocshmem_ctx_float_prod_wg_reduce(
+__device__ ATTR_NO_INLINE int rocshmem_ctx_float_prod_reduce_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source,
    int nreduce);
 __host__ int rocshmem_ctx_float_prod_reduce(
    rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source,
    int nreduce);

-__device__ ATTR_NO_INLINE int rocshmem_ctx_double_sum_wg_reduce(
+__device__ ATTR_NO_INLINE int rocshmem_ctx_double_sum_reduce_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source,
    int nreduce);
 __host__ int rocshmem_ctx_double_sum_reduce(
    rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source,
    int nreduce);

-__device__ ATTR_NO_INLINE int rocshmem_ctx_double_min_wg_reduce(
+__device__ ATTR_NO_INLINE int rocshmem_ctx_double_min_reduce_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source,
    int nreduce);
 __host__ int rocshmem_ctx_double_min_reduce(
    rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source,
    int nreduce);

-__device__ ATTR_NO_INLINE int rocshmem_ctx_double_max_wg_reduce(
+__device__ ATTR_NO_INLINE int rocshmem_ctx_double_max_reduce_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source,
    int nreduce);
 __host__ int rocshmem_ctx_double_max_reduce(
    rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source,
    int nreduce);

-__device__ ATTR_NO_INLINE int rocshmem_ctx_double_prod_wg_reduce(
+__device__ ATTR_NO_INLINE int rocshmem_ctx_double_prod_reduce_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source,
    int nreduce);
 __host__ int rocshmem_ctx_double_prod_reduce(
    rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source,
    int nreduce);

+/**
+ * @brief perform a collective barrier between all PEs in the system.
+ * The caller is blocked until the barrier is resolved.
+ *
+ * This function must be invoked by a single thread within the PE.
+ *
+ * @param[in] handle GPU side handle.
+ *
+ * @return void
+ */
+__device__ ATTR_NO_INLINE void rocshmem_ctx_barrier_all(
+    rocshmem_ctx_t ctx);
+
+/**
+ * @brief perform a collective barrier between all PEs in the system.
+ * The caller is blocked until the barrier is resolved.
+ *
+ * This function must be called as a wave-front collective.
+ *
+ * @param[in] handle GPU side handle.
+ *
+ * @return void
+ */
+__device__ ATTR_NO_INLINE void rocshmem_ctx_barrier_all_wave(
+    rocshmem_ctx_t ctx);
+
+/**
+ * @brief perform a collective barrier between all PEs in the system.
+ * The caller is blocked until the barrier is resolved.
+ *
+ * This function must be called as a work-group collective.
+ *
+ * @param[in] handle GPU side handle.
+ *
+ * @return void
+ */
+__device__ ATTR_NO_INLINE void rocshmem_ctx_barrier_all_wg(
+    rocshmem_ctx_t ctx);
+
+/**
+ * @brief perform a collective barrier between all PEs in the team.
+ * The caller is blocked until the barrier is resolved.
+ *
+ * This function must be invoked by a single thread within the PE.
+ *
+ * @param[in] handle GPU side handle.
+ *
+ * @param[in] team The team on which to perform barrier synchronization
+ *
+ * @return void
+ */
+__device__ void rocshmem_ctx_barrier(rocshmem_ctx_t ctx, rocshmem_team_t team);
+
+/**
+ * @brief perform a collective barrier between all PEs in the team.
+ * The caller is blocked until the barrier is resolved.
+ *
+ * This function must be called as a wave-front collective.
+ *
+ * @param[in] handle GPU side handle.
+ *
+ * @param[in] team The team on which to perform barrier synchronization
+ *
+ * @return void
+ */
+__device__ void rocshmem_ctx_barrier_wave(rocshmem_ctx_t ctx, rocshmem_team_t team);
+
+/**
+ * @brief perform a collective barrier between all PEs in the team.
+ * The caller is blocked until the barrier is resolved.
+ *
+ * This function must be called as a work-group collective.
+ *
+ * @param[in] handle GPU side handle.
+ *
+ * @param[in] team The team on which to perform barrier synchronization
+ *
+ * @return void
+ */
+__device__ void rocshmem_ctx_barrier_wg(rocshmem_ctx_t ctx, rocshmem_team_t team);
+
+/**
+ * @brief registers the arrival of a PE at a barrier.
+ * The caller is blocked until the synchronization is resolved.
+ *
+ * In contrast with the shmem_barrier_all routine, shmem_sync_all only ensures
+ * completion and visibility of previously issued memory stores and does not
+ * ensure completion of remote memory updates issued via OpenSHMEM routines.
+ *
+ * This function must be invoked by a single thread within the PE.
+ *
+ * @param[in] handle GPU side handle.
+ *
+ * @return void
+ */
+__device__ ATTR_NO_INLINE void rocshmem_ctx_sync_all(rocshmem_ctx_t ctx);
+
+/**
+ * @brief registers the arrival of a PE at a barrier.
+ * The caller is blocked until the synchronization is resolved.
+ *
+ * In contrast with the shmem_barrier_all routine, shmem_sync_all only ensures
+ * completion and visibility of previously issued memory stores and does not
+ * ensure completion of remote memory updates issued via OpenSHMEM routines.
+ *
+ * This function must be called as a wave-front collective.
+ *
+ * @param[in] handle GPU side handle.
+ *
+ * @return void
+ */
+__device__ ATTR_NO_INLINE void rocshmem_ctx_sync_all_wave(rocshmem_ctx_t ctx);
+
+/**
+ * @brief registers the arrival of a PE at a barrier.
+ * The caller is blocked until the synchronization is resolved.
+ *
+ * In contrast with the shmem_barrier_all routine, shmem_sync_all only ensures
+ * completion and visibility of previously issued memory stores and does not
+ * ensure completion of remote memory updates issued via OpenSHMEM routines.
+ *
+ * This function must be called as a work-group collective.
+ *
+ * @param[in] handle GPU side handle.
+ *
+ * @return void
+ */
+__device__ ATTR_NO_INLINE void rocshmem_ctx_sync_all_wg(rocshmem_ctx_t ctx);
+
+/**
+ * @brief registers the arrival of a PE at a barrier.
+ * The caller is blocked until the synchronization is resolved.
+ *
+ * In contrast with the shmem_barrier_all routine, shmem_team_sync only ensures
+ * completion and visibility of previously issued memory stores and does not
+ * ensure completion of remote memory updates issued via OpenSHMEM routines.
+ *
+ * This function must be invoked by a single thread within the PE.
+ *
+ * @param[in] handle GPU side handle.
+ * @param[in] team  Handle of the team being synchronized
+ *
+ * @return void
+ */
+__device__ ATTR_NO_INLINE void rocshmem_ctx_sync(
+    rocshmem_ctx_t ctx, rocshmem_team_t team);
+
+/**
+ * @brief registers the arrival of a PE at a barrier.
+ * The caller is blocked until the synchronization is resolved.
+ *
+ * In contrast with the shmem_barrier_all routine, shmem_team_sync only ensures
+ * completion and visibility of previously issued memory stores and does not
+ * ensure completion of remote memory updates issued via OpenSHMEM routines.
+ *
+ * This function must be called as a wave-front collective.
+ *
+ * @param[in] handle GPU side handle.
+ * @param[in] team  Handle of the team being synchronized
+ *
+ * @return void
+ */
+__device__ ATTR_NO_INLINE void rocshmem_ctx_sync_wave(
+    rocshmem_ctx_t ctx, rocshmem_team_t team);
+
+/**
+ * @brief registers the arrival of a PE at a barrier.
+ * The caller is blocked until the synchronization is resolved.
+ *
+ * In contrast with the shmem_barrier_all routine, shmem_team_sync only ensures
+ * completion and visibility of previously issued memory stores and does not
+ * ensure completion of remote memory updates issued via OpenSHMEM routines.
+ *
+ * This function must be called as a work-group collective.
+ *
+ * @param[in] handle GPU side handle.
+ * @param[in] team  Handle of the team being synchronized
+ *
+ * @return void
+ */
+__device__ ATTR_NO_INLINE void rocshmem_ctx_sync_wg(
+    rocshmem_ctx_t ctx, rocshmem_team_t team);
+
+/**
+ * @brief Query a local pointer to a symmetric data object on the
+ * specified \pe . Returns an address that may be used to directly reference
+ * dest on the specified \pe. This address can be accesses with LD/ST ops.
+ *
+ * Can be called per thread with no performance penalty.
+ */
+__device__ ATTR_NO_INLINE void *rocshmem_ptr(const void *dest, int pe);
+
+/**
+ * @brief Make all uncacheable GPU data visible to other agents in the sytem.
+ *
+ * This only works for data that was explicitly allocated uncacheable on the
+ * GPU!
+ *
+ * Can be called per thread with no performance penalty.
+ *
+ * @param[in] GPU-side handle.
+ *
+ * @return void
+ */

 }  // namespace rocshmem

@@ -453,7 +453,7 @@ __device__ void *rocshmem_ptr(const void *dest, int pe) {
 }

 template <typename T, ROCSHMEM_OP Op>
-__device__ int rocshmem_wg_reduce(rocshmem_ctx_t ctx, rocshmem_team_t team,
+__device__ int rocshmem_reduce_wg(rocshmem_ctx_t ctx, rocshmem_team_t team,
                                   T *dest, const T *source, int nreduce) {
  GPU_DPRINTF("Function: rocshmem_reduce\n");

@@ -461,7 +461,7 @@ __device__ int rocshmem_wg_reduce(rocshmem_ctx_t ctx, rocshmem_team_t team,
 }

 template <typename T>
-__device__ void rocshmem_wg_broadcast(rocshmem_ctx_t ctx,
+__device__ void rocshmem_broadcast_wg(rocshmem_ctx_t ctx,
                                       rocshmem_team_t team, T *dest,
                                       const T *source, int nelem,
                                       int pe_root) {
@@ -471,7 +471,7 @@ __device__ void rocshmem_wg_broadcast(rocshmem_ctx_t ctx,
 }

 template <typename T>
-__device__ void rocshmem_wg_alltoall(rocshmem_ctx_t ctx,
+__device__ void rocshmem_alltoall_wg(rocshmem_ctx_t ctx,
                                      rocshmem_team_t team, T *dest,
                                      const T *source, int nelem) {
  GPU_DPRINTF("Function: rocshmem_alltoall\n");
@@ -480,7 +480,7 @@ __device__ void rocshmem_wg_alltoall(rocshmem_ctx_t ctx,
 }

 template <typename T>
-__device__ void rocshmem_wg_fcollect(rocshmem_ctx_t ctx,
+__device__ void rocshmem_fcollect_wg(rocshmem_ctx_t ctx,
                                      rocshmem_team_t team, T *dest,
                                      const T *source, int nelem) {
  GPU_DPRINTF("Function: rocshmem_fcollect\n");
@@ -576,14 +576,14 @@ __device__ void rocshmem_ctx_barrier_all(rocshmem_ctx_t ctx) {
  get_internal_ctx(ctx)->barrier_all();
 }

-__device__ void rocshmem_ctx_wave_barrier_all(rocshmem_ctx_t ctx) {
-  GPU_DPRINTF("Function: rocshmem_ctx_wave_barrier_all\n");
+__device__ void rocshmem_ctx_barrier_all_wave(rocshmem_ctx_t ctx) {
+  GPU_DPRINTF("Function: rocshmem_ctx_barrier_all_wave\n");

  get_internal_ctx(ctx)->barrier_all_wave();
 }

-__device__ void rocshmem_ctx_wg_barrier_all(rocshmem_ctx_t ctx) {
-  GPU_DPRINTF("Function: rocshmem_ctx_wg_barrier_all\n");
+__device__ void rocshmem_ctx_barrier_all_wg(rocshmem_ctx_t ctx) {
+  GPU_DPRINTF("Function: rocshmem_ctx_barrier_all_wg\n");

  get_internal_ctx(ctx)->barrier_all_wg();
 }
@@ -594,14 +594,14 @@ __device__ void rocshmem_ctx_barrier(rocshmem_ctx_t ctx, rocshmem_team_t team) {
  get_internal_ctx(ctx)->barrier(team);
 }

-__device__ void rocshmem_ctx_wave_barrier(rocshmem_ctx_t ctx, rocshmem_team_t team) {
-  GPU_DPRINTF("Function: rocshmem_wave_barrier\n");
+__device__ void rocshmem_ctx_barrier_wave(rocshmem_ctx_t ctx, rocshmem_team_t team) {
+  GPU_DPRINTF("Function: rocshmem_barrier_wave\n");

  get_internal_ctx(ctx)->barrier_wave(team);
 }

-__device__ void rocshmem_ctx_wg_barrier(rocshmem_ctx_t ctx, rocshmem_team_t team) {
-  GPU_DPRINTF("Function: rocshmem_wg_barrier\n");
+__device__ void rocshmem_ctx_barrier_wg(rocshmem_ctx_t ctx, rocshmem_team_t team) {
+  GPU_DPRINTF("Function: rocshmem_barrier_wg\n");

  get_internal_ctx(ctx)->barrier_wg(team);
 }
@@ -612,35 +612,35 @@ __device__ void rocshmem_ctx_sync_all(rocshmem_ctx_t ctx) {
  get_internal_ctx(ctx)->sync_all();
 }

-__device__ void rocshmem_ctx_wave_sync_all(rocshmem_ctx_t ctx) {
-  GPU_DPRINTF("Function: rocshmem_ctx_wave_sync_all\n");
+__device__ void rocshmem_ctx_sync_all_wave(rocshmem_ctx_t ctx) {
+  GPU_DPRINTF("Function: rocshmem_ctx_sync_all_wave\n");

  get_internal_ctx(ctx)->sync_all_wave();
 }

-__device__ void rocshmem_ctx_wg_sync_all(rocshmem_ctx_t ctx) {
-  GPU_DPRINTF("Function: rocshmem_ctx_wg_sync_all\n");
+__device__ void rocshmem_ctx_sync_all_wg(rocshmem_ctx_t ctx) {
+  GPU_DPRINTF("Function: rocshmem_ctx_sync_all_wg\n");

  get_internal_ctx(ctx)->sync_all_wg();
 }

-__device__ void rocshmem_ctx_team_sync(rocshmem_ctx_t ctx,
+__device__ void rocshmem_ctx_sync(rocshmem_ctx_t ctx,
                                           rocshmem_team_t team) {
-  GPU_DPRINTF("Function: rocshmem_ctx_sync_all\n");
+  GPU_DPRINTF("Function: rocshmem_ctx_sync\n");

  get_internal_ctx(ctx)->sync_wg(team);
 }

-__device__ void rocshmem_ctx_wave_team_sync(rocshmem_ctx_t ctx,
+__device__ void rocshmem_ctx_sync_wave(rocshmem_ctx_t ctx,
  rocshmem_team_t team) {
-GPU_DPRINTF("Function: rocshmem_ctx_wave_sync_all\n");
+GPU_DPRINTF("Function: rocshmem_ctx_sync_wave\n");

 get_internal_ctx(ctx)->sync_wg(team);
 }

-__device__ void rocshmem_ctx_wg_team_sync(rocshmem_ctx_t ctx,
+__device__ void rocshmem_ctx_sync_wg(rocshmem_ctx_t ctx,
  rocshmem_team_t team) {
-GPU_DPRINTF("Function: rocshmem_ctx_wg_sync_all\n");
+GPU_DPRINTF("Function: rocshmem_ctx_sync_wg\n");

 get_internal_ctx(ctx)->sync_wg(team);
 }
@@ -981,7 +981,7 @@ __device__ int rocshmem_team_translate_pe(rocshmem_team_t src_team,
 * Template generator for reductions
 */
 #define REDUCTION_GEN(T, Op)                                                   \
-  template __device__ int rocshmem_wg_reduce<T, Op>(                           \
+  template __device__ int rocshmem_reduce_wg<T, Op>(                           \
      rocshmem_ctx_t ctx, rocshmem_team_t team, T * dest, const T *source,     \
      int nreduce);

@@ -1011,13 +1011,13 @@ __device__ int rocshmem_team_translate_pe(rocshmem_team_t src_team,
  template __device__ void rocshmem_get_nbi<T>(T * dest, const T *source,      \
                                                size_t nelems, int pe);        \
  template __device__ T rocshmem_g<T>(const T *source, int pe);                \
-  template __device__ void rocshmem_wg_broadcast<T>(                           \
+  template __device__ void rocshmem_broadcast_wg<T>(                           \
      rocshmem_ctx_t ctx, rocshmem_team_t team, T * dest, const T *source,     \
      int nelem, int pe_root);                                                 \
-  template __device__ void rocshmem_wg_alltoall<T>(                            \
+  template __device__ void rocshmem_alltoall_wg<T>(                            \
      rocshmem_ctx_t ctx, rocshmem_team_t team, T * dest, const T *source,     \
      int nelem);                                                              \
-  template __device__ void rocshmem_wg_fcollect<T>(                            \
+  template __device__ void rocshmem_fcollect_wg<T>(                            \
      rocshmem_ctx_t ctx, rocshmem_team_t team, T * dest, const T *source,     \
      int nelem);                                                              \
  template __device__ void rocshmem_put_wave<T>(                               \
@@ -1189,10 +1189,10 @@ __device__ int rocshmem_team_translate_pe(rocshmem_team_t src_team,
 **/

 #define REDUCTION_DEF_GEN(T, TNAME, Op_API, Op)                               \
-  __device__ int rocshmem_ctx_##TNAME##_##Op_API##_wg_reduce(                 \
+  __device__ int rocshmem_ctx_##TNAME##_##Op_API##_reduce_wg(                 \
      rocshmem_ctx_t ctx, rocshmem_team_t team, T *dest, const T *source,     \
      int nreduce) {                                                          \
-    return rocshmem_wg_reduce<T, Op>(ctx, team, dest, source, nreduce);       \
+    return rocshmem_reduce_wg<T, Op>(ctx, team, dest, source, nreduce);       \
  }

 #define ARITH_REDUCTION_DEF_GEN(T, TNAME)         \
@@ -1323,20 +1323,20 @@ __device__ int rocshmem_team_translate_pe(rocshmem_team_t src_team,
                                                 size_t nelems, int pe) {     \
    rocshmem_get_nbi_wg<T>(dest, source, nelems, pe);                         \
  }                                                                           \
-  __device__ void rocshmem_ctx_##TNAME##_wg_broadcast(                        \
+  __device__ void rocshmem_ctx_##TNAME##_broadcast_wg(                        \
      rocshmem_ctx_t ctx, rocshmem_team_t team, T *dest, const T *source,     \
      int nelem, int pe_root) {                                               \
-    rocshmem_wg_broadcast<T>(ctx, team, dest, source, nelem, pe_root);        \
+    rocshmem_broadcast_wg<T>(ctx, team, dest, source, nelem, pe_root);        \
  }                                                                           \
-  __device__ void rocshmem_ctx_##TNAME##_wg_alltoall(                         \
+  __device__ void rocshmem_ctx_##TNAME##_alltoall_wg(                         \
      rocshmem_ctx_t ctx, rocshmem_team_t team, T *dest, const T *source,     \
      int nelem) {                                                            \
-    rocshmem_wg_alltoall<T>(ctx, team, dest, source, nelem);                  \
+    rocshmem_alltoall_wg<T>(ctx, team, dest, source, nelem);                  \
  }                                                                           \
-  __device__ void rocshmem_ctx_##TNAME##_wg_fcollect(                         \
+  __device__ void rocshmem_ctx_##TNAME##_fcollect_wg(                         \
      rocshmem_ctx_t ctx, rocshmem_team_t team, T *dest, const T *source,     \
      int nelem) {                                                            \
-    rocshmem_wg_fcollect<T>(ctx, team, dest, source, nelem);                  \
+    rocshmem_fcollect_wg<T>(ctx, team, dest, source, nelem);                  \
  }

 #define AMO_STANDARD_DEF_GEN(T, TNAME)                                        \
@@ -53,11 +53,11 @@ __global__ void BarrierAllTest(int loop, int skip, long long int *start_time,
        break;
      case WAVEBarrierAllTestType:
        if(wf_id == 0) {
-          rocshmem_ctx_wave_barrier_all(ctx);
+          rocshmem_ctx_barrier_all_wave(ctx);
        }
        break;
      case WGBarrierAllTestType:
-        rocshmem_ctx_wg_barrier_all(ctx);
+        rocshmem_ctx_barrier_all_wg(ctx);
        break;
      default:
        break;
@@ -53,11 +53,11 @@ __global__ void SyncAllTest(int loop, int skip, long long int *start_time,
        break;
      case WAVESyncAllTestType:
        if(wf_id == 0) {
-          rocshmem_ctx_wave_sync_all(ctx);
+          rocshmem_ctx_sync_all_wave(ctx);
        }
        break;
      case WGSyncAllTestType:
-        rocshmem_ctx_wg_sync_all(ctx);
+        rocshmem_ctx_sync_all_wg(ctx);
        break;
      default:
        break;
@@ -45,16 +45,16 @@ __global__ void SyncTest(int loop, int skip, long long int *start_time,
    switch (type) {
      case SyncTestType:
        if(t_id == 0) {
-          rocshmem_ctx_team_sync(ctx, teams[wg_id]);
+          rocshmem_ctx_sync(ctx, teams[wg_id]);
        }
        break;
      case WAVESyncTestType:
        if(wf_id == 0) {
-          rocshmem_ctx_wave_team_sync(ctx, teams[wg_id]);
+          rocshmem_ctx_sync_wave(ctx, teams[wg_id]);
        }
        break;
      case WGSyncTestType:
-        rocshmem_ctx_wg_team_sync(ctx, teams[wg_id]);
+        rocshmem_ctx_sync_wg(ctx, teams[wg_id]);
        break;
      default:
        break;
@@ -32,7 +32,7 @@ __device__ void wg_team_alltoall(rocshmem_ctx_t ctx, rocshmem_team_t team,
  template <>                                                                  \
  __device__ void wg_team_alltoall<T>(rocshmem_ctx_t ctx, rocshmem_team_t team,\
                                 T * dest, const T *source, int nelem) {       \
-    rocshmem_ctx_##TNAME##_wg_alltoall(ctx, team, dest, source, nelem);        \
+    rocshmem_ctx_##TNAME##_alltoall_wg(ctx, team, dest, source, nelem);        \
  }

 TEAM_ALLTOALL_DEF_GEN(float, float)
@@ -51,11 +51,11 @@ __global__ void TeamBarrierTest(int loop, int skip, long long int *start_time,
        break;
      case TeamWAVEBarrierTestType:
        if(wf_id == 0) {
-          rocshmem_ctx_wave_barrier(ctx, teams[wg_id]);
+          rocshmem_ctx_barrier_wave(ctx, teams[wg_id]);
        }
        break;
      case TeamWGBarrierTestType:
-        rocshmem_ctx_wg_barrier(ctx, teams[wg_id]);
+        rocshmem_ctx_barrier_wg(ctx, teams[wg_id]);
        break;
      default:
        break;
@@ -34,7 +34,7 @@ __device__ void wg_team_broadcast(rocshmem_ctx_t ctx, rocshmem_team_t team,
  __device__ void wg_team_broadcast<T>(                                       \
      rocshmem_ctx_t ctx, rocshmem_team_t team, T * dest, const T *source,    \
      int nelem, int pe_root) {                                               \
-    rocshmem_ctx_##TNAME##_wg_broadcast(ctx, team, dest, source, nelem,       \
+    rocshmem_ctx_##TNAME##_broadcast_wg(ctx, team, dest, source, nelem,       \
                                         pe_root);                            \
  }

@@ -32,7 +32,7 @@ __device__ void wg_team_fcollect(rocshmem_ctx_t ctx, rocshmem_team_t team,
  template <>                                                                  \
  __device__ void wg_team_fcollect<T>(rocshmem_ctx_t ctx, rocshmem_team_t team,\
                                      T * dest, const T *source, int nelem) {  \
-    rocshmem_ctx_##TNAME##_wg_fcollect(ctx, team, dest, source, nelem);        \
+    rocshmem_ctx_##TNAME##_fcollect_wg(ctx, team, dest, source, nelem);        \
  }

 TEAM_FCOLLECT_DEF_GEN(float, float)
@@ -35,7 +35,7 @@ __device__ int wg_team_reduce(rocshmem_ctx_t ctx, rocshmem_team_t, T *dest,
  __device__ int wg_team_reduce<T, Op>(rocshmem_ctx_t ctx,               \
                                       rocshmem_team_t team, T * dest,   \
                                       const T *source, int nreduce) {    \
-    return rocshmem_ctx_##TNAME##_##Op_API##_wg_reduce(ctx, team, dest,  \
+    return rocshmem_ctx_##TNAME##_##Op_API##_reduce_wg(ctx, team, dest,  \
                                                        source, nreduce); \
  }

@@ -93,7 +93,7 @@ __global__ void TeamReductionTest(int loop, int skip, long long int *start_time,
      start_time[wg_id] = wall_clock64();
    }
    wg_team_reduce<T1, T2>(ctx, team, r_buf, s_buf, size);
-    rocshmem_ctx_wg_barrier_all(ctx);
+    rocshmem_ctx_barrier_all_wg(ctx);
  }

  __syncthreads();
@@ -43,7 +43,7 @@ types = [

 def alltoall_api(T, TNAME):
    return (
-        f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_wg_alltoall(\n"
+        f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_alltoall_wg(\n"
        f"    rocshmem_ctx_t ctx, rocshmem_team_t team, {T} *dest,\n"
        f"    const {T} *source, int nelems);\n\n"
    )
@@ -75,7 +75,7 @@ def generate_alltoall_api():

 def broadcast_api(T, TNAME):
    return (
-        f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_wg_broadcast(\n"
+        f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_broadcast_wg(\n"
        f"    rocshmem_ctx_t ctx, rocshmem_team_t team, {T} *dest,\n"
        f"    const {T} *source, int nelems, int pe_root);\n"
        f"__host__ void rocshmem_ctx_{TNAME}_broadcast(\n"
@@ -120,7 +120,7 @@ def generate_broadcast_api():

 def fcollect_api(T, TNAME):
    return (
-        f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_wg_fcollect(\n"
+        f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_fcollect_wg(\n"
        f"    rocshmem_ctx_t ctx, rocshmem_team_t team, {T} *dest,\n"
        f"    const {T} *source, int nelems);\n\n"
    )
@@ -152,7 +152,7 @@ def generate_fcollect_api():

 def reduction_api(T, TNAME, Op_API):
    return (
-        f"__device__ ATTR_NO_INLINE int rocshmem_ctx_{TNAME}_{Op_API}_wg_reduce(\n"
+        f"__device__ ATTR_NO_INLINE int rocshmem_ctx_{TNAME}_{Op_API}_reduce_wg(\n"
        f"    rocshmem_ctx_t ctx, rocshmem_team_t team, {T} *dest, const {T} *source,\n"
        f"    int nreduce);\n"
        f"__host__ int rocshmem_ctx_{TNAME}_{Op_API}_reduce(\n"