diff --git a/projects/rocshmem/examples/rocshmem_allreduce_test.cc b/projects/rocshmem/examples/rocshmem_allreduce_test.cc
index c6f4d05dc9..1e6f73d69e 100644
--- a/projects/rocshmem/examples/rocshmem_allreduce_test.cc
+++ b/projects/rocshmem/examples/rocshmem_allreduce_test.cc
@@ -37,7 +37,7 @@ __global__ void allreduce_test(int *source, int *dest, size_t nelem,
     rocshmem_wg_ctx_create(ctx_type, &ctx);
     int num_pes = rocshmem_ctx_n_pes(ctx);
 
-    rocshmem_ctx_int_sum_wg_reduce(ctx, team, dest, source, nelem);
+    rocshmem_ctx_int_sum_reduce_wg(ctx, team, dest, source, nelem);
 
     rocshmem_ctx_quiet(ctx);
     __syncthreads();
diff --git a/projects/rocshmem/examples/rocshmem_alltoall_test.cc b/projects/rocshmem/examples/rocshmem_alltoall_test.cc
index 775580fba2..6b9d2325bc 100644
--- a/projects/rocshmem/examples/rocshmem_alltoall_test.cc
+++ b/projects/rocshmem/examples/rocshmem_alltoall_test.cc
@@ -37,7 +37,7 @@ __global__ void alltoall_test(int *source, int *dest, size_t nelem,
     rocshmem_wg_ctx_create(ctx_type, &ctx);
     int num_pes = rocshmem_ctx_n_pes(ctx);
 
-    rocshmem_ctx_int_wg_alltoall(ctx, team, dest, source, nelem);
+    rocshmem_ctx_int_alltoall_wg(ctx, team, dest, source, nelem);
 
     rocshmem_ctx_quiet(ctx);
     __syncthreads();
diff --git a/projects/rocshmem/examples/rocshmem_broadcast_test.cc b/projects/rocshmem/examples/rocshmem_broadcast_test.cc
index 4a630c75db..9bb204720e 100644
--- a/projects/rocshmem/examples/rocshmem_broadcast_test.cc
+++ b/projects/rocshmem/examples/rocshmem_broadcast_test.cc
@@ -37,7 +37,7 @@ __global__ void broadcast_test(int *source, int *dest, size_t nelem,
     rocshmem_wg_ctx_create(ctx_type, &ctx);
     int num_pes = rocshmem_ctx_n_pes(ctx);
 
-    rocshmem_ctx_int_wg_broadcast(ctx, team, dest, source, nelem, root);
+    rocshmem_ctx_int_broadcast_wg(ctx, team, dest, source, nelem, root);
 
     rocshmem_ctx_quiet(ctx);
     __syncthreads();
diff --git a/projects/rocshmem/include/rocshmem/rocshmem.hpp b/projects/rocshmem/include/rocshmem/rocshmem.hpp
index a9a2251017..1fa1d21e65 100644
--- a/projects/rocshmem/include/rocshmem/rocshmem.hpp
+++ b/projects/rocshmem/include/rocshmem/rocshmem.hpp
@@ -490,210 +490,6 @@ __device__ int rocshmem_team_translate_pe(rocshmem_team_t src_team,
                                            int src_pe,
                                            rocshmem_team_t dest_team);
 
-/**
- * @brief perform a collective barrier between all PEs in the system.
- * The caller is blocked until the barrier is resolved.
- *
- * This function must be invoked by a single thread within the PE.
- *
- * @param[in] handle GPU side handle.
- *
- * @return void
- */
-__device__ ATTR_NO_INLINE void rocshmem_ctx_barrier_all(
-    rocshmem_ctx_t ctx);
-
-/**
- * @brief perform a collective barrier between all PEs in the system.
- * The caller is blocked until the barrier is resolved.
- *
- * This function must be called as a wave-front collective.
- *
- * @param[in] handle GPU side handle.
- *
- * @return void
- */
-__device__ ATTR_NO_INLINE void rocshmem_ctx_wave_barrier_all(
-    rocshmem_ctx_t ctx);
-
-/**
- * @brief perform a collective barrier between all PEs in the system.
- * The caller is blocked until the barrier is resolved.
- *
- * This function must be called as a work-group collective.
- *
- * @param[in] handle GPU side handle.
- *
- * @return void
- */
-__device__ ATTR_NO_INLINE void rocshmem_ctx_wg_barrier_all(
-    rocshmem_ctx_t ctx);
-
-/**
- * @brief perform a collective barrier between all PEs in the team.
- * The caller is blocked until the barrier is resolved.
- *
- * This function must be invoked by a single thread within the PE.
- *
- * @param[in] handle GPU side handle.
- *
- * @param[in] team The team on which to perform barrier synchronization
- *
- * @return void
- */
-__device__ void rocshmem_ctx_barrier(rocshmem_ctx_t ctx, rocshmem_team_t team);
-
-/**
- * @brief perform a collective barrier between all PEs in the team.
- * The caller is blocked until the barrier is resolved.
- *
- * This function must be called as a wave-front collective.
- *
- * @param[in] handle GPU side handle.
- *
- * @param[in] team The team on which to perform barrier synchronization
- *
- * @return void
- */
-__device__ void rocshmem_ctx_wave_barrier(rocshmem_ctx_t ctx, rocshmem_team_t team);
-
-/**
- * @brief perform a collective barrier between all PEs in the team.
- * The caller is blocked until the barrier is resolved.
- *
- * This function must be called as a work-group collective.
- *
- * @param[in] handle GPU side handle.
- *
- * @param[in] team The team on which to perform barrier synchronization
- *
- * @return void
- */
-__device__ void rocshmem_ctx_wg_barrier(rocshmem_ctx_t ctx, rocshmem_team_t team);
-
-/**
- * @brief registers the arrival of a PE at a barrier.
- * The caller is blocked until the synchronization is resolved.
- *
- * In contrast with the shmem_barrier_all routine, shmem_sync_all only ensures
- * completion and visibility of previously issued memory stores and does not
- * ensure completion of remote memory updates issued via OpenSHMEM routines.
- *
- * This function must be invoked by a single thread within the PE.
- *
- * @param[in] handle GPU side handle.
- *
- * @return void
- */
-__device__ ATTR_NO_INLINE void rocshmem_ctx_sync_all(rocshmem_ctx_t ctx);
-
-/**
- * @brief registers the arrival of a PE at a barrier.
- * The caller is blocked until the synchronization is resolved.
- *
- * In contrast with the shmem_barrier_all routine, shmem_sync_all only ensures
- * completion and visibility of previously issued memory stores and does not
- * ensure completion of remote memory updates issued via OpenSHMEM routines.
- *
- * This function must be called as a wave-front collective.
- *
- * @param[in] handle GPU side handle.
- *
- * @return void
- */
-__device__ ATTR_NO_INLINE void rocshmem_ctx_wave_sync_all(rocshmem_ctx_t ctx);
-
-/**
- * @brief registers the arrival of a PE at a barrier.
- * The caller is blocked until the synchronization is resolved.
- *
- * In contrast with the shmem_barrier_all routine, shmem_sync_all only ensures
- * completion and visibility of previously issued memory stores and does not
- * ensure completion of remote memory updates issued via OpenSHMEM routines.
- *
- * This function must be called as a work-group collective.
- *
- * @param[in] handle GPU side handle.
- *
- * @return void
- */
-__device__ ATTR_NO_INLINE void rocshmem_ctx_wg_sync_all(rocshmem_ctx_t ctx);
-
-/**
- * @brief registers the arrival of a PE at a barrier.
- * The caller is blocked until the synchronization is resolved.
- *
- * In contrast with the shmem_barrier_all routine, shmem_team_sync only ensures
- * completion and visibility of previously issued memory stores and does not
- * ensure completion of remote memory updates issued via OpenSHMEM routines.
- *
- * This function must be invoked by a single thread within the PE.
- *
- * @param[in] handle GPU side handle.
- * @param[in] team  Handle of the team being synchronized
- *
- * @return void
- */
-__device__ ATTR_NO_INLINE void rocshmem_ctx_team_sync(
-    rocshmem_ctx_t ctx, rocshmem_team_t team);
-
-/**
- * @brief registers the arrival of a PE at a barrier.
- * The caller is blocked until the synchronization is resolved.
- *
- * In contrast with the shmem_barrier_all routine, shmem_team_sync only ensures
- * completion and visibility of previously issued memory stores and does not
- * ensure completion of remote memory updates issued via OpenSHMEM routines.
- *
- * This function must be called as a wave-front collective.
- *
- * @param[in] handle GPU side handle.
- * @param[in] team  Handle of the team being synchronized
- *
- * @return void
- */
-__device__ ATTR_NO_INLINE void rocshmem_ctx_wave_team_sync(
-    rocshmem_ctx_t ctx, rocshmem_team_t team);
-
-/**
- * @brief registers the arrival of a PE at a barrier.
- * The caller is blocked until the synchronization is resolved.
- *
- * In contrast with the shmem_barrier_all routine, shmem_team_sync only ensures
- * completion and visibility of previously issued memory stores and does not
- * ensure completion of remote memory updates issued via OpenSHMEM routines.
- *
- * This function must be called as a work-group collective.
- *
- * @param[in] handle GPU side handle.
- * @param[in] team  Handle of the team being synchronized
- *
- * @return void
- */
-__device__ ATTR_NO_INLINE void rocshmem_ctx_wg_team_sync(
-    rocshmem_ctx_t ctx, rocshmem_team_t team);
-
-/**
- * @brief Query a local pointer to a symmetric data object on the
- * specified \pe . Returns an address that may be used to directly reference
- * dest on the specified \pe. This address can be accesses with LD/ST ops.
- *
- * Can be called per thread with no performance penalty.
- */
-__device__ ATTR_NO_INLINE void *rocshmem_ptr(const void *dest, int pe);
-
-/**
- * @brief Make all uncacheable GPU data visible to other agents in the sytem.
- *
- * This only works for data that was explicitly allocated uncacheable on the
- * GPU!
- *
- * Can be called per thread with no performance penalty.
- *
- * @param[in] GPU-side handle.
- *
- * @return void
- */
 __device__ ATTR_NO_INLINE void rocshmem_ctx_threadfence_system(
     rocshmem_ctx_t ctx);
 
diff --git a/projects/rocshmem/include/rocshmem/rocshmem_COLL.hpp b/projects/rocshmem/include/rocshmem/rocshmem_COLL.hpp
index c15498705d..191cec25a1 100644
--- a/projects/rocshmem/include/rocshmem/rocshmem_COLL.hpp
+++ b/projects/rocshmem/include/rocshmem/rocshmem_COLL.hpp
@@ -41,55 +41,55 @@ namespace rocshmem {
  *
  * @return void
  */
-__device__ ATTR_NO_INLINE void rocshmem_ctx_float_wg_alltoall(
+__device__ ATTR_NO_INLINE void rocshmem_ctx_float_alltoall_wg(
     rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest,
     const float *source, int nelems);
 
-__device__ ATTR_NO_INLINE void rocshmem_ctx_double_wg_alltoall(
+__device__ ATTR_NO_INLINE void rocshmem_ctx_double_alltoall_wg(
     rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest,
     const double *source, int nelems);
 
-__device__ ATTR_NO_INLINE void rocshmem_ctx_char_wg_alltoall(
+__device__ ATTR_NO_INLINE void rocshmem_ctx_char_alltoall_wg(
     rocshmem_ctx_t ctx, rocshmem_team_t team, char *dest,
     const char *source, int nelems);
 
-__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_wg_alltoall(
+__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_alltoall_wg(
     rocshmem_ctx_t ctx, rocshmem_team_t team, signed char *dest,
     const signed char *source, int nelems);
 
-__device__ ATTR_NO_INLINE void rocshmem_ctx_short_wg_alltoall(
+__device__ ATTR_NO_INLINE void rocshmem_ctx_short_alltoall_wg(
     rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest,
     const short *source, int nelems);
 
-__device__ ATTR_NO_INLINE void rocshmem_ctx_int_wg_alltoall(
+__device__ ATTR_NO_INLINE void rocshmem_ctx_int_alltoall_wg(
     rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest,
     const int *source, int nelems);
 
-__device__ ATTR_NO_INLINE void rocshmem_ctx_long_wg_alltoall(
+__device__ ATTR_NO_INLINE void rocshmem_ctx_long_alltoall_wg(
     rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest,
     const long *source, int nelems);
 
-__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_wg_alltoall(
+__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_alltoall_wg(
     rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest,
     const long long *source, int nelems);
 
-__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_wg_alltoall(
+__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_alltoall_wg(
     rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned char *dest,
     const unsigned char *source, int nelems);
 
-__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_wg_alltoall(
+__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_alltoall_wg(
     rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned short *dest,
     const unsigned short *source, int nelems);
 
-__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_wg_alltoall(
+__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_alltoall_wg(
     rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned int *dest,
     const unsigned int *source, int nelems);
 
-__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_wg_alltoall(
+__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_alltoall_wg(
     rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long *dest,
     const unsigned long *source, int nelems);
 
-__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_wg_alltoall(
+__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_alltoall_wg(
     rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long long *dest,
     const unsigned long long *source, int nelems);
 
@@ -116,7 +116,7 @@ __device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_wg_alltoall(
  *
  * @return void
  */
-__device__ ATTR_NO_INLINE void rocshmem_ctx_float_wg_broadcast(
+__device__ ATTR_NO_INLINE void rocshmem_ctx_float_broadcast_wg(
     rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest,
     const float *source, int nelems, int pe_root);
 __host__ void rocshmem_ctx_float_broadcast(
@@ -127,7 +127,7 @@ __host__ void rocshmem_ctx_float_broadcast(
     rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest,
     const float *source, int nelems, int pe_root);
 
-__device__ ATTR_NO_INLINE void rocshmem_ctx_double_wg_broadcast(
+__device__ ATTR_NO_INLINE void rocshmem_ctx_double_broadcast_wg(
     rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest,
     const double *source, int nelems, int pe_root);
 __host__ void rocshmem_ctx_double_broadcast(
@@ -138,7 +138,7 @@ __host__ void rocshmem_ctx_double_broadcast(
     rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest,
     const double *source, int nelems, int pe_root);
 
-__device__ ATTR_NO_INLINE void rocshmem_ctx_char_wg_broadcast(
+__device__ ATTR_NO_INLINE void rocshmem_ctx_char_broadcast_wg(
     rocshmem_ctx_t ctx, rocshmem_team_t team, char *dest,
     const char *source, int nelems, int pe_root);
 __host__ void rocshmem_ctx_char_broadcast(
@@ -149,7 +149,7 @@ __host__ void rocshmem_ctx_char_broadcast(
     rocshmem_ctx_t ctx, rocshmem_team_t team, char *dest,
     const char *source, int nelems, int pe_root);
 
-__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_wg_broadcast(
+__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_broadcast_wg(
     rocshmem_ctx_t ctx, rocshmem_team_t team, signed char *dest,
     const signed char *source, int nelems, int pe_root);
 __host__ void rocshmem_ctx_schar_broadcast(
@@ -160,7 +160,7 @@ __host__ void rocshmem_ctx_schar_broadcast(
     rocshmem_ctx_t ctx, rocshmem_team_t team, signed char *dest,
     const signed char *source, int nelems, int pe_root);
 
-__device__ ATTR_NO_INLINE void rocshmem_ctx_short_wg_broadcast(
+__device__ ATTR_NO_INLINE void rocshmem_ctx_short_broadcast_wg(
     rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest,
     const short *source, int nelems, int pe_root);
 __host__ void rocshmem_ctx_short_broadcast(
@@ -171,7 +171,7 @@ __host__ void rocshmem_ctx_short_broadcast(
     rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest,
     const short *source, int nelems, int pe_root);
 
-__device__ ATTR_NO_INLINE void rocshmem_ctx_int_wg_broadcast(
+__device__ ATTR_NO_INLINE void rocshmem_ctx_int_broadcast_wg(
     rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest,
     const int *source, int nelems, int pe_root);
 __host__ void rocshmem_ctx_int_broadcast(
@@ -182,7 +182,7 @@ __host__ void rocshmem_ctx_int_broadcast(
     rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest,
     const int *source, int nelems, int pe_root);
 
-__device__ ATTR_NO_INLINE void rocshmem_ctx_long_wg_broadcast(
+__device__ ATTR_NO_INLINE void rocshmem_ctx_long_broadcast_wg(
     rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest,
     const long *source, int nelems, int pe_root);
 __host__ void rocshmem_ctx_long_broadcast(
@@ -193,7 +193,7 @@ __host__ void rocshmem_ctx_long_broadcast(
     rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest,
     const long *source, int nelems, int pe_root);
 
-__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_wg_broadcast(
+__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_broadcast_wg(
     rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest,
     const long long *source, int nelems, int pe_root);
 __host__ void rocshmem_ctx_longlong_broadcast(
@@ -204,7 +204,7 @@ __host__ void rocshmem_ctx_longlong_broadcast(
     rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest,
     const long long *source, int nelems, int pe_root);
 
-__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_wg_broadcast(
+__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_broadcast_wg(
     rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned char *dest,
     const unsigned char *source, int nelems, int pe_root);
 __host__ void rocshmem_ctx_uchar_broadcast(
@@ -215,7 +215,7 @@ __host__ void rocshmem_ctx_uchar_broadcast(
     rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned char *dest,
     const unsigned char *source, int nelems, int pe_root);
 
-__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_wg_broadcast(
+__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_broadcast_wg(
     rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned short *dest,
     const unsigned short *source, int nelems, int pe_root);
 __host__ void rocshmem_ctx_ushort_broadcast(
@@ -226,7 +226,7 @@ __host__ void rocshmem_ctx_ushort_broadcast(
     rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned short *dest,
     const unsigned short *source, int nelems, int pe_root);
 
-__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_wg_broadcast(
+__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_broadcast_wg(
     rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned int *dest,
     const unsigned int *source, int nelems, int pe_root);
 __host__ void rocshmem_ctx_uint_broadcast(
@@ -237,7 +237,7 @@ __host__ void rocshmem_ctx_uint_broadcast(
     rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned int *dest,
     const unsigned int *source, int nelems, int pe_root);
 
-__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_wg_broadcast(
+__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_broadcast_wg(
     rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long *dest,
     const unsigned long *source, int nelems, int pe_root);
 __host__ void rocshmem_ctx_ulong_broadcast(
@@ -248,7 +248,7 @@ __host__ void rocshmem_ctx_ulong_broadcast(
     rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long *dest,
     const unsigned long *source, int nelems, int pe_root);
 
-__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_wg_broadcast(
+__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_broadcast_wg(
     rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long long *dest,
     const unsigned long long *source, int nelems, int pe_root);
 __host__ void rocshmem_ctx_ulonglong_broadcast(
@@ -276,55 +276,55 @@ __host__ void rocshmem_ctx_ulonglong_broadcast(
  *
  * @return void
  */
-__device__ ATTR_NO_INLINE void rocshmem_ctx_float_wg_fcollect(
+__device__ ATTR_NO_INLINE void rocshmem_ctx_float_fcollect_wg(
     rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest,
     const float *source, int nelems);
 
-__device__ ATTR_NO_INLINE void rocshmem_ctx_double_wg_fcollect(
+__device__ ATTR_NO_INLINE void rocshmem_ctx_double_fcollect_wg(
     rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest,
     const double *source, int nelems);
 
-__device__ ATTR_NO_INLINE void rocshmem_ctx_char_wg_fcollect(
+__device__ ATTR_NO_INLINE void rocshmem_ctx_char_fcollect_wg(
     rocshmem_ctx_t ctx, rocshmem_team_t team, char *dest,
     const char *source, int nelems);
 
-__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_wg_fcollect(
+__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_fcollect_wg(
     rocshmem_ctx_t ctx, rocshmem_team_t team, signed char *dest,
     const signed char *source, int nelems);
 
-__device__ ATTR_NO_INLINE void rocshmem_ctx_short_wg_fcollect(
+__device__ ATTR_NO_INLINE void rocshmem_ctx_short_fcollect_wg(
     rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest,
     const short *source, int nelems);
 
-__device__ ATTR_NO_INLINE void rocshmem_ctx_int_wg_fcollect(
+__device__ ATTR_NO_INLINE void rocshmem_ctx_int_fcollect_wg(
     rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest,
     const int *source, int nelems);
 
-__device__ ATTR_NO_INLINE void rocshmem_ctx_long_wg_fcollect(
+__device__ ATTR_NO_INLINE void rocshmem_ctx_long_fcollect_wg(
     rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest,
     const long *source, int nelems);
 
-__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_wg_fcollect(
+__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_fcollect_wg(
     rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest,
     const long long *source, int nelems);
 
-__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_wg_fcollect(
+__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_fcollect_wg(
     rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned char *dest,
     const unsigned char *source, int nelems);
 
-__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_wg_fcollect(
+__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_fcollect_wg(
     rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned short *dest,
     const unsigned short *source, int nelems);
 
-__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_wg_fcollect(
+__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_fcollect_wg(
     rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned int *dest,
     const unsigned int *source, int nelems);
 
-__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_wg_fcollect(
+__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_fcollect_wg(
     rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long *dest,
     const unsigned long *source, int nelems);
 
-__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_wg_fcollect(
+__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_fcollect_wg(
     rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long long *dest,
     const unsigned long long *source, int nelems);
 
@@ -345,258 +345,462 @@ __device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_wg_fcollect(
  *
  * @return int (Zero on successful local completion. Nonzero otherwise.)
  */
-__device__ ATTR_NO_INLINE int rocshmem_ctx_short_sum_wg_reduce(
+__device__ ATTR_NO_INLINE int rocshmem_ctx_short_sum_reduce_wg(
     rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
     int nreduce);
 __host__ int rocshmem_ctx_short_sum_reduce(
     rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
     int nreduce);
 
-__device__ ATTR_NO_INLINE int rocshmem_ctx_short_min_wg_reduce(
+__device__ ATTR_NO_INLINE int rocshmem_ctx_short_min_reduce_wg(
     rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
     int nreduce);
 __host__ int rocshmem_ctx_short_min_reduce(
     rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
     int nreduce);
 
-__device__ ATTR_NO_INLINE int rocshmem_ctx_short_max_wg_reduce(
+__device__ ATTR_NO_INLINE int rocshmem_ctx_short_max_reduce_wg(
     rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
     int nreduce);
 __host__ int rocshmem_ctx_short_max_reduce(
     rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
     int nreduce);
 
-__device__ ATTR_NO_INLINE int rocshmem_ctx_short_prod_wg_reduce(
+__device__ ATTR_NO_INLINE int rocshmem_ctx_short_prod_reduce_wg(
     rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
     int nreduce);
 __host__ int rocshmem_ctx_short_prod_reduce(
     rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
     int nreduce);
 
-__device__ ATTR_NO_INLINE int rocshmem_ctx_short_or_wg_reduce(
+__device__ ATTR_NO_INLINE int rocshmem_ctx_short_or_reduce_wg(
     rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
     int nreduce);
 __host__ int rocshmem_ctx_short_or_reduce(
     rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
     int nreduce);
 
-__device__ ATTR_NO_INLINE int rocshmem_ctx_short_and_wg_reduce(
+__device__ ATTR_NO_INLINE int rocshmem_ctx_short_and_reduce_wg(
     rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
     int nreduce);
 __host__ int rocshmem_ctx_short_and_reduce(
     rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
     int nreduce);
 
-__device__ ATTR_NO_INLINE int rocshmem_ctx_short_xor_wg_reduce(
+__device__ ATTR_NO_INLINE int rocshmem_ctx_short_xor_reduce_wg(
     rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
     int nreduce);
 __host__ int rocshmem_ctx_short_xor_reduce(
     rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
     int nreduce);
 
-__device__ ATTR_NO_INLINE int rocshmem_ctx_int_sum_wg_reduce(
+__device__ ATTR_NO_INLINE int rocshmem_ctx_int_sum_reduce_wg(
     rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
     int nreduce);
 __host__ int rocshmem_ctx_int_sum_reduce(
     rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
     int nreduce);
 
-__device__ ATTR_NO_INLINE int rocshmem_ctx_int_min_wg_reduce(
+__device__ ATTR_NO_INLINE int rocshmem_ctx_int_min_reduce_wg(
     rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
     int nreduce);
 __host__ int rocshmem_ctx_int_min_reduce(
     rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
     int nreduce);
 
-__device__ ATTR_NO_INLINE int rocshmem_ctx_int_max_wg_reduce(
+__device__ ATTR_NO_INLINE int rocshmem_ctx_int_max_reduce_wg(
     rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
     int nreduce);
 __host__ int rocshmem_ctx_int_max_reduce(
     rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
     int nreduce);
 
-__device__ ATTR_NO_INLINE int rocshmem_ctx_int_prod_wg_reduce(
+__device__ ATTR_NO_INLINE int rocshmem_ctx_int_prod_reduce_wg(
     rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
     int nreduce);
 __host__ int rocshmem_ctx_int_prod_reduce(
     rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
     int nreduce);
 
-__device__ ATTR_NO_INLINE int rocshmem_ctx_int_or_wg_reduce(
+__device__ ATTR_NO_INLINE int rocshmem_ctx_int_or_reduce_wg(
     rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
     int nreduce);
 __host__ int rocshmem_ctx_int_or_reduce(
     rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
     int nreduce);
 
-__device__ ATTR_NO_INLINE int rocshmem_ctx_int_and_wg_reduce(
+__device__ ATTR_NO_INLINE int rocshmem_ctx_int_and_reduce_wg(
     rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
     int nreduce);
 __host__ int rocshmem_ctx_int_and_reduce(
     rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
     int nreduce);
 
-__device__ ATTR_NO_INLINE int rocshmem_ctx_int_xor_wg_reduce(
+__device__ ATTR_NO_INLINE int rocshmem_ctx_int_xor_reduce_wg(
     rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
     int nreduce);
 __host__ int rocshmem_ctx_int_xor_reduce(
     rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
     int nreduce);
 
-__device__ ATTR_NO_INLINE int rocshmem_ctx_long_sum_wg_reduce(
+__device__ ATTR_NO_INLINE int rocshmem_ctx_long_sum_reduce_wg(
     rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
     int nreduce);
 __host__ int rocshmem_ctx_long_sum_reduce(
     rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
     int nreduce);
 
-__device__ ATTR_NO_INLINE int rocshmem_ctx_long_min_wg_reduce(
+__device__ ATTR_NO_INLINE int rocshmem_ctx_long_min_reduce_wg(
     rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
     int nreduce);
 __host__ int rocshmem_ctx_long_min_reduce(
     rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
     int nreduce);
 
-__device__ ATTR_NO_INLINE int rocshmem_ctx_long_max_wg_reduce(
+__device__ ATTR_NO_INLINE int rocshmem_ctx_long_max_reduce_wg(
     rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
     int nreduce);
 __host__ int rocshmem_ctx_long_max_reduce(
     rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
     int nreduce);
 
-__device__ ATTR_NO_INLINE int rocshmem_ctx_long_prod_wg_reduce(
+__device__ ATTR_NO_INLINE int rocshmem_ctx_long_prod_reduce_wg(
     rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
     int nreduce);
 __host__ int rocshmem_ctx_long_prod_reduce(
     rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
     int nreduce);
 
-__device__ ATTR_NO_INLINE int rocshmem_ctx_long_or_wg_reduce(
+__device__ ATTR_NO_INLINE int rocshmem_ctx_long_or_reduce_wg(
     rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
     int nreduce);
 __host__ int rocshmem_ctx_long_or_reduce(
     rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
     int nreduce);
 
-__device__ ATTR_NO_INLINE int rocshmem_ctx_long_and_wg_reduce(
+__device__ ATTR_NO_INLINE int rocshmem_ctx_long_and_reduce_wg(
     rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
     int nreduce);
 __host__ int rocshmem_ctx_long_and_reduce(
     rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
     int nreduce);
 
-__device__ ATTR_NO_INLINE int rocshmem_ctx_long_xor_wg_reduce(
+__device__ ATTR_NO_INLINE int rocshmem_ctx_long_xor_reduce_wg(
     rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
     int nreduce);
 __host__ int rocshmem_ctx_long_xor_reduce(
     rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
     int nreduce);
 
-__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_sum_wg_reduce(
+__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_sum_reduce_wg(
     rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
     int nreduce);
 __host__ int rocshmem_ctx_longlong_sum_reduce(
     rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
     int nreduce);
 
-__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_min_wg_reduce(
+__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_min_reduce_wg(
     rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
     int nreduce);
 __host__ int rocshmem_ctx_longlong_min_reduce(
     rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
     int nreduce);
 
-__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_max_wg_reduce(
+__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_max_reduce_wg(
     rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
     int nreduce);
 __host__ int rocshmem_ctx_longlong_max_reduce(
     rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
     int nreduce);
 
-__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_prod_wg_reduce(
+__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_prod_reduce_wg(
     rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
     int nreduce);
 __host__ int rocshmem_ctx_longlong_prod_reduce(
     rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
     int nreduce);
 
-__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_or_wg_reduce(
+__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_or_reduce_wg(
     rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
     int nreduce);
 __host__ int rocshmem_ctx_longlong_or_reduce(
     rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
     int nreduce);
 
-__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_and_wg_reduce(
+__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_and_reduce_wg(
     rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
     int nreduce);
 __host__ int rocshmem_ctx_longlong_and_reduce(
     rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
     int nreduce);
 
-__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_xor_wg_reduce(
+__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_xor_reduce_wg(
     rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
     int nreduce);
 __host__ int rocshmem_ctx_longlong_xor_reduce(
     rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
     int nreduce);
 
-__device__ ATTR_NO_INLINE int rocshmem_ctx_float_sum_wg_reduce(
+__device__ ATTR_NO_INLINE int rocshmem_ctx_float_sum_reduce_wg(
     rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source,
     int nreduce);
 __host__ int rocshmem_ctx_float_sum_reduce(
     rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source,
     int nreduce);
 
-__device__ ATTR_NO_INLINE int rocshmem_ctx_float_min_wg_reduce(
+__device__ ATTR_NO_INLINE int rocshmem_ctx_float_min_reduce_wg(
     rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source,
     int nreduce);
 __host__ int rocshmem_ctx_float_min_reduce(
     rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source,
     int nreduce);
 
-__device__ ATTR_NO_INLINE int rocshmem_ctx_float_max_wg_reduce(
+__device__ ATTR_NO_INLINE int rocshmem_ctx_float_max_reduce_wg(
     rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source,
     int nreduce);
 __host__ int rocshmem_ctx_float_max_reduce(
     rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source,
     int nreduce);
 
-__device__ ATTR_NO_INLINE int rocshmem_ctx_float_prod_wg_reduce(
+__device__ ATTR_NO_INLINE int rocshmem_ctx_float_prod_reduce_wg(
     rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source,
     int nreduce);
 __host__ int rocshmem_ctx_float_prod_reduce(
     rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source,
     int nreduce);
 
-__device__ ATTR_NO_INLINE int rocshmem_ctx_double_sum_wg_reduce(
+__device__ ATTR_NO_INLINE int rocshmem_ctx_double_sum_reduce_wg(
     rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source,
     int nreduce);
 __host__ int rocshmem_ctx_double_sum_reduce(
     rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source,
     int nreduce);
 
-__device__ ATTR_NO_INLINE int rocshmem_ctx_double_min_wg_reduce(
+__device__ ATTR_NO_INLINE int rocshmem_ctx_double_min_reduce_wg(
     rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source,
     int nreduce);
 __host__ int rocshmem_ctx_double_min_reduce(
     rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source,
     int nreduce);
 
-__device__ ATTR_NO_INLINE int rocshmem_ctx_double_max_wg_reduce(
+__device__ ATTR_NO_INLINE int rocshmem_ctx_double_max_reduce_wg(
     rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source,
     int nreduce);
 __host__ int rocshmem_ctx_double_max_reduce(
     rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source,
     int nreduce);
 
-__device__ ATTR_NO_INLINE int rocshmem_ctx_double_prod_wg_reduce(
+__device__ ATTR_NO_INLINE int rocshmem_ctx_double_prod_reduce_wg(
     rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source,
     int nreduce);
 __host__ int rocshmem_ctx_double_prod_reduce(
     rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source,
     int nreduce);
 
+/**
+ * @brief perform a collective barrier between all PEs in the system.
+ * The caller is blocked until the barrier is resolved.
+ *
+ * This function must be invoked by a single thread within the PE.
+ *
+ * @param[in] handle GPU side handle.
+ *
+ * @return void
+ */
+__device__ ATTR_NO_INLINE void rocshmem_ctx_barrier_all(
+    rocshmem_ctx_t ctx);
+
+/**
+ * @brief perform a collective barrier between all PEs in the system.
+ * The caller is blocked until the barrier is resolved.
+ *
+ * This function must be called as a wave-front collective.
+ *
+ * @param[in] handle GPU side handle.
+ *
+ * @return void
+ */
+__device__ ATTR_NO_INLINE void rocshmem_ctx_barrier_all_wave(
+    rocshmem_ctx_t ctx);
+
+/**
+ * @brief perform a collective barrier between all PEs in the system.
+ * The caller is blocked until the barrier is resolved.
+ *
+ * This function must be called as a work-group collective.
+ *
+ * @param[in] handle GPU side handle.
+ *
+ * @return void
+ */
+__device__ ATTR_NO_INLINE void rocshmem_ctx_barrier_all_wg(
+    rocshmem_ctx_t ctx);
+
+/**
+ * @brief perform a collective barrier between all PEs in the team.
+ * The caller is blocked until the barrier is resolved.
+ *
+ * This function must be invoked by a single thread within the PE.
+ *
+ * @param[in] handle GPU side handle.
+ *
+ * @param[in] team The team on which to perform barrier synchronization
+ *
+ * @return void
+ */
+__device__ void rocshmem_ctx_barrier(rocshmem_ctx_t ctx, rocshmem_team_t team);
+
+/**
+ * @brief perform a collective barrier between all PEs in the team.
+ * The caller is blocked until the barrier is resolved.
+ *
+ * This function must be called as a wave-front collective.
+ *
+ * @param[in] handle GPU side handle.
+ *
+ * @param[in] team The team on which to perform barrier synchronization
+ *
+ * @return void
+ */
+__device__ void rocshmem_ctx_barrier_wave(rocshmem_ctx_t ctx, rocshmem_team_t team);
+
+/**
+ * @brief perform a collective barrier between all PEs in the team.
+ * The caller is blocked until the barrier is resolved.
+ *
+ * This function must be called as a work-group collective.
+ *
+ * @param[in] handle GPU side handle.
+ *
+ * @param[in] team The team on which to perform barrier synchronization
+ *
+ * @return void
+ */
+__device__ void rocshmem_ctx_barrier_wg(rocshmem_ctx_t ctx, rocshmem_team_t team);
+
+/**
+ * @brief registers the arrival of a PE at a barrier.
+ * The caller is blocked until the synchronization is resolved.
+ *
+ * In contrast with the shmem_barrier_all routine, shmem_sync_all only ensures
+ * completion and visibility of previously issued memory stores and does not
+ * ensure completion of remote memory updates issued via OpenSHMEM routines.
+ *
+ * This function must be invoked by a single thread within the PE.
+ *
+ * @param[in] handle GPU side handle.
+ *
+ * @return void
+ */
+__device__ ATTR_NO_INLINE void rocshmem_ctx_sync_all(rocshmem_ctx_t ctx);
+
+/**
+ * @brief registers the arrival of a PE at a barrier.
+ * The caller is blocked until the synchronization is resolved.
+ *
+ * In contrast with the shmem_barrier_all routine, shmem_sync_all only ensures
+ * completion and visibility of previously issued memory stores and does not
+ * ensure completion of remote memory updates issued via OpenSHMEM routines.
+ *
+ * This function must be called as a wave-front collective.
+ *
+ * @param[in] handle GPU side handle.
+ *
+ * @return void
+ */
+__device__ ATTR_NO_INLINE void rocshmem_ctx_sync_all_wave(rocshmem_ctx_t ctx);
+
+/**
+ * @brief registers the arrival of a PE at a barrier.
+ * The caller is blocked until the synchronization is resolved.
+ *
+ * In contrast with the shmem_barrier_all routine, shmem_sync_all only ensures
+ * completion and visibility of previously issued memory stores and does not
+ * ensure completion of remote memory updates issued via OpenSHMEM routines.
+ *
+ * This function must be called as a work-group collective.
+ *
+ * @param[in] handle GPU side handle.
+ *
+ * @return void
+ */
+__device__ ATTR_NO_INLINE void rocshmem_ctx_sync_all_wg(rocshmem_ctx_t ctx);
+
+/**
+ * @brief registers the arrival of a PE at a barrier.
+ * The caller is blocked until the synchronization is resolved.
+ *
+ * In contrast with the shmem_barrier_all routine, shmem_team_sync only ensures
+ * completion and visibility of previously issued memory stores and does not
+ * ensure completion of remote memory updates issued via OpenSHMEM routines.
+ *
+ * This function must be invoked by a single thread within the PE.
+ *
+ * @param[in] handle GPU side handle.
+ * @param[in] team  Handle of the team being synchronized
+ *
+ * @return void
+ */
+__device__ ATTR_NO_INLINE void rocshmem_ctx_sync(
+    rocshmem_ctx_t ctx, rocshmem_team_t team);
+
+/**
+ * @brief registers the arrival of a PE at a barrier.
+ * The caller is blocked until the synchronization is resolved.
+ *
+ * In contrast with the shmem_barrier_all routine, shmem_team_sync only ensures
+ * completion and visibility of previously issued memory stores and does not
+ * ensure completion of remote memory updates issued via OpenSHMEM routines.
+ *
+ * This function must be called as a wave-front collective.
+ *
+ * @param[in] handle GPU side handle.
+ * @param[in] team  Handle of the team being synchronized
+ *
+ * @return void
+ */
+__device__ ATTR_NO_INLINE void rocshmem_ctx_sync_wave(
+    rocshmem_ctx_t ctx, rocshmem_team_t team);
+
+/**
+ * @brief registers the arrival of a PE at a barrier.
+ * The caller is blocked until the synchronization is resolved.
+ *
+ * In contrast with the shmem_barrier_all routine, shmem_team_sync only ensures
+ * completion and visibility of previously issued memory stores and does not
+ * ensure completion of remote memory updates issued via OpenSHMEM routines.
+ *
+ * This function must be called as a work-group collective.
+ *
+ * @param[in] handle GPU side handle.
+ * @param[in] team  Handle of the team being synchronized
+ *
+ * @return void
+ */
+__device__ ATTR_NO_INLINE void rocshmem_ctx_sync_wg(
+    rocshmem_ctx_t ctx, rocshmem_team_t team);
+
+/**
+ * @brief Query a local pointer to a symmetric data object on the
+ * specified \pe . Returns an address that may be used to directly reference
+ * dest on the specified \pe. This address can be accesses with LD/ST ops.
+ *
+ * Can be called per thread with no performance penalty.
+ */
+__device__ ATTR_NO_INLINE void *rocshmem_ptr(const void *dest, int pe);
+
+/**
+ * @brief Make all uncacheable GPU data visible to other agents in the sytem.
+ *
+ * This only works for data that was explicitly allocated uncacheable on the
+ * GPU!
+ *
+ * Can be called per thread with no performance penalty.
+ *
+ * @param[in] GPU-side handle.
+ *
+ * @return void
+ */
 
 }  // namespace rocshmem
 
diff --git a/projects/rocshmem/src/rocshmem_gpu.cpp b/projects/rocshmem/src/rocshmem_gpu.cpp
index 5c9e5279e8..b73173abc1 100644
--- a/projects/rocshmem/src/rocshmem_gpu.cpp
+++ b/projects/rocshmem/src/rocshmem_gpu.cpp
@@ -453,7 +453,7 @@ __device__ void *rocshmem_ptr(const void *dest, int pe) {
 }
 
 template <typename T, ROCSHMEM_OP Op>
-__device__ int rocshmem_wg_reduce(rocshmem_ctx_t ctx, rocshmem_team_t team,
+__device__ int rocshmem_reduce_wg(rocshmem_ctx_t ctx, rocshmem_team_t team,
                                    T *dest, const T *source, int nreduce) {
   GPU_DPRINTF("Function: rocshmem_reduce\n");
 
@@ -461,7 +461,7 @@ __device__ int rocshmem_wg_reduce(rocshmem_ctx_t ctx, rocshmem_team_t team,
 }
 
 template <typename T>
-__device__ void rocshmem_wg_broadcast(rocshmem_ctx_t ctx,
+__device__ void rocshmem_broadcast_wg(rocshmem_ctx_t ctx,
                                        rocshmem_team_t team, T *dest,
                                        const T *source, int nelem,
                                        int pe_root) {
@@ -471,7 +471,7 @@ __device__ void rocshmem_wg_broadcast(rocshmem_ctx_t ctx,
 }
 
 template <typename T>
-__device__ void rocshmem_wg_alltoall(rocshmem_ctx_t ctx,
+__device__ void rocshmem_alltoall_wg(rocshmem_ctx_t ctx,
                                       rocshmem_team_t team, T *dest,
                                       const T *source, int nelem) {
   GPU_DPRINTF("Function: rocshmem_alltoall\n");
@@ -480,7 +480,7 @@ __device__ void rocshmem_wg_alltoall(rocshmem_ctx_t ctx,
 }
 
 template <typename T>
-__device__ void rocshmem_wg_fcollect(rocshmem_ctx_t ctx,
+__device__ void rocshmem_fcollect_wg(rocshmem_ctx_t ctx,
                                       rocshmem_team_t team, T *dest,
                                       const T *source, int nelem) {
   GPU_DPRINTF("Function: rocshmem_fcollect\n");
@@ -576,14 +576,14 @@ __device__ void rocshmem_ctx_barrier_all(rocshmem_ctx_t ctx) {
   get_internal_ctx(ctx)->barrier_all();
 }
 
-__device__ void rocshmem_ctx_wave_barrier_all(rocshmem_ctx_t ctx) {
-  GPU_DPRINTF("Function: rocshmem_ctx_wave_barrier_all\n");
+__device__ void rocshmem_ctx_barrier_all_wave(rocshmem_ctx_t ctx) {
+  GPU_DPRINTF("Function: rocshmem_ctx_barrier_all_wave\n");
 
   get_internal_ctx(ctx)->barrier_all_wave();
 }
 
-__device__ void rocshmem_ctx_wg_barrier_all(rocshmem_ctx_t ctx) {
-  GPU_DPRINTF("Function: rocshmem_ctx_wg_barrier_all\n");
+__device__ void rocshmem_ctx_barrier_all_wg(rocshmem_ctx_t ctx) {
+  GPU_DPRINTF("Function: rocshmem_ctx_barrier_all_wg\n");
 
   get_internal_ctx(ctx)->barrier_all_wg();
 }
@@ -594,14 +594,14 @@ __device__ void rocshmem_ctx_barrier(rocshmem_ctx_t ctx, rocshmem_team_t team) {
   get_internal_ctx(ctx)->barrier(team);
 }
 
-__device__ void rocshmem_ctx_wave_barrier(rocshmem_ctx_t ctx, rocshmem_team_t team) {
-  GPU_DPRINTF("Function: rocshmem_wave_barrier\n");
+__device__ void rocshmem_ctx_barrier_wave(rocshmem_ctx_t ctx, rocshmem_team_t team) {
+  GPU_DPRINTF("Function: rocshmem_barrier_wave\n");
 
   get_internal_ctx(ctx)->barrier_wave(team);
 }
 
-__device__ void rocshmem_ctx_wg_barrier(rocshmem_ctx_t ctx, rocshmem_team_t team) {
-  GPU_DPRINTF("Function: rocshmem_wg_barrier\n");
+__device__ void rocshmem_ctx_barrier_wg(rocshmem_ctx_t ctx, rocshmem_team_t team) {
+  GPU_DPRINTF("Function: rocshmem_barrier_wg\n");
 
   get_internal_ctx(ctx)->barrier_wg(team);
 }
@@ -612,35 +612,35 @@ __device__ void rocshmem_ctx_sync_all(rocshmem_ctx_t ctx) {
   get_internal_ctx(ctx)->sync_all();
 }
 
-__device__ void rocshmem_ctx_wave_sync_all(rocshmem_ctx_t ctx) {
-  GPU_DPRINTF("Function: rocshmem_ctx_wave_sync_all\n");
+__device__ void rocshmem_ctx_sync_all_wave(rocshmem_ctx_t ctx) {
+  GPU_DPRINTF("Function: rocshmem_ctx_sync_all_wave\n");
 
   get_internal_ctx(ctx)->sync_all_wave();
 }
 
-__device__ void rocshmem_ctx_wg_sync_all(rocshmem_ctx_t ctx) {
-  GPU_DPRINTF("Function: rocshmem_ctx_wg_sync_all\n");
+__device__ void rocshmem_ctx_sync_all_wg(rocshmem_ctx_t ctx) {
+  GPU_DPRINTF("Function: rocshmem_ctx_sync_all_wg\n");
 
   get_internal_ctx(ctx)->sync_all_wg();
 }
 
-__device__ void rocshmem_ctx_team_sync(rocshmem_ctx_t ctx,
+__device__ void rocshmem_ctx_sync(rocshmem_ctx_t ctx,
                                            rocshmem_team_t team) {
-  GPU_DPRINTF("Function: rocshmem_ctx_sync_all\n");
+  GPU_DPRINTF("Function: rocshmem_ctx_sync\n");
 
   get_internal_ctx(ctx)->sync_wg(team);
 }
 
-__device__ void rocshmem_ctx_wave_team_sync(rocshmem_ctx_t ctx,
+__device__ void rocshmem_ctx_sync_wave(rocshmem_ctx_t ctx,
   rocshmem_team_t team) {
-GPU_DPRINTF("Function: rocshmem_ctx_wave_sync_all\n");
+GPU_DPRINTF("Function: rocshmem_ctx_sync_wave\n");
 
 get_internal_ctx(ctx)->sync_wg(team);
 }
 
-__device__ void rocshmem_ctx_wg_team_sync(rocshmem_ctx_t ctx,
+__device__ void rocshmem_ctx_sync_wg(rocshmem_ctx_t ctx,
   rocshmem_team_t team) {
-GPU_DPRINTF("Function: rocshmem_ctx_wg_sync_all\n");
+GPU_DPRINTF("Function: rocshmem_ctx_sync_wg\n");
 
 get_internal_ctx(ctx)->sync_wg(team);
 }
@@ -981,7 +981,7 @@ __device__ int rocshmem_team_translate_pe(rocshmem_team_t src_team,
  * Template generator for reductions
  */
 #define REDUCTION_GEN(T, Op)                                                   \
-  template __device__ int rocshmem_wg_reduce<T, Op>(                           \
+  template __device__ int rocshmem_reduce_wg<T, Op>(                           \
       rocshmem_ctx_t ctx, rocshmem_team_t team, T * dest, const T *source,     \
       int nreduce);
 
@@ -1011,13 +1011,13 @@ __device__ int rocshmem_team_translate_pe(rocshmem_team_t src_team,
   template __device__ void rocshmem_get_nbi<T>(T * dest, const T *source,      \
                                                 size_t nelems, int pe);        \
   template __device__ T rocshmem_g<T>(const T *source, int pe);                \
-  template __device__ void rocshmem_wg_broadcast<T>(                           \
+  template __device__ void rocshmem_broadcast_wg<T>(                           \
       rocshmem_ctx_t ctx, rocshmem_team_t team, T * dest, const T *source,     \
       int nelem, int pe_root);                                                 \
-  template __device__ void rocshmem_wg_alltoall<T>(                            \
+  template __device__ void rocshmem_alltoall_wg<T>(                            \
       rocshmem_ctx_t ctx, rocshmem_team_t team, T * dest, const T *source,     \
       int nelem);                                                              \
-  template __device__ void rocshmem_wg_fcollect<T>(                            \
+  template __device__ void rocshmem_fcollect_wg<T>(                            \
       rocshmem_ctx_t ctx, rocshmem_team_t team, T * dest, const T *source,     \
       int nelem);                                                              \
   template __device__ void rocshmem_put_wave<T>(                               \
@@ -1189,10 +1189,10 @@ __device__ int rocshmem_team_translate_pe(rocshmem_team_t src_team,
  **/
 
 #define REDUCTION_DEF_GEN(T, TNAME, Op_API, Op)                               \
-  __device__ int rocshmem_ctx_##TNAME##_##Op_API##_wg_reduce(                 \
+  __device__ int rocshmem_ctx_##TNAME##_##Op_API##_reduce_wg(                 \
       rocshmem_ctx_t ctx, rocshmem_team_t team, T *dest, const T *source,     \
       int nreduce) {                                                          \
-    return rocshmem_wg_reduce<T, Op>(ctx, team, dest, source, nreduce);       \
+    return rocshmem_reduce_wg<T, Op>(ctx, team, dest, source, nreduce);       \
   }
 
 #define ARITH_REDUCTION_DEF_GEN(T, TNAME)         \
@@ -1323,20 +1323,20 @@ __device__ int rocshmem_team_translate_pe(rocshmem_team_t src_team,
                                                  size_t nelems, int pe) {     \
     rocshmem_get_nbi_wg<T>(dest, source, nelems, pe);                         \
   }                                                                           \
-  __device__ void rocshmem_ctx_##TNAME##_wg_broadcast(                        \
+  __device__ void rocshmem_ctx_##TNAME##_broadcast_wg(                        \
       rocshmem_ctx_t ctx, rocshmem_team_t team, T *dest, const T *source,     \
       int nelem, int pe_root) {                                               \
-    rocshmem_wg_broadcast<T>(ctx, team, dest, source, nelem, pe_root);        \
+    rocshmem_broadcast_wg<T>(ctx, team, dest, source, nelem, pe_root);        \
   }                                                                           \
-  __device__ void rocshmem_ctx_##TNAME##_wg_alltoall(                         \
+  __device__ void rocshmem_ctx_##TNAME##_alltoall_wg(                         \
       rocshmem_ctx_t ctx, rocshmem_team_t team, T *dest, const T *source,     \
       int nelem) {                                                            \
-    rocshmem_wg_alltoall<T>(ctx, team, dest, source, nelem);                  \
+    rocshmem_alltoall_wg<T>(ctx, team, dest, source, nelem);                  \
   }                                                                           \
-  __device__ void rocshmem_ctx_##TNAME##_wg_fcollect(                         \
+  __device__ void rocshmem_ctx_##TNAME##_fcollect_wg(                         \
       rocshmem_ctx_t ctx, rocshmem_team_t team, T *dest, const T *source,     \
       int nelem) {                                                            \
-    rocshmem_wg_fcollect<T>(ctx, team, dest, source, nelem);                  \
+    rocshmem_fcollect_wg<T>(ctx, team, dest, source, nelem);                  \
   }
 
 #define AMO_STANDARD_DEF_GEN(T, TNAME)                                        \
diff --git a/projects/rocshmem/tests/functional_tests/barrier_all_tester.cpp b/projects/rocshmem/tests/functional_tests/barrier_all_tester.cpp
index dfd5811726..4f8325125a 100644
--- a/projects/rocshmem/tests/functional_tests/barrier_all_tester.cpp
+++ b/projects/rocshmem/tests/functional_tests/barrier_all_tester.cpp
@@ -53,11 +53,11 @@ __global__ void BarrierAllTest(int loop, int skip, long long int *start_time,
         break;
       case WAVEBarrierAllTestType:
         if(wf_id == 0) {
-          rocshmem_ctx_wave_barrier_all(ctx);
+          rocshmem_ctx_barrier_all_wave(ctx);
         }
         break;
       case WGBarrierAllTestType:
-        rocshmem_ctx_wg_barrier_all(ctx);
+        rocshmem_ctx_barrier_all_wg(ctx);
         break;
       default:
         break;
diff --git a/projects/rocshmem/tests/functional_tests/sync_all_tester.cpp b/projects/rocshmem/tests/functional_tests/sync_all_tester.cpp
index 8a02db4d54..16dc738c41 100644
--- a/projects/rocshmem/tests/functional_tests/sync_all_tester.cpp
+++ b/projects/rocshmem/tests/functional_tests/sync_all_tester.cpp
@@ -53,11 +53,11 @@ __global__ void SyncAllTest(int loop, int skip, long long int *start_time,
         break;
       case WAVESyncAllTestType:
         if(wf_id == 0) {
-          rocshmem_ctx_wave_sync_all(ctx);
+          rocshmem_ctx_sync_all_wave(ctx);
         }
         break;
       case WGSyncAllTestType:
-        rocshmem_ctx_wg_sync_all(ctx);
+        rocshmem_ctx_sync_all_wg(ctx);
         break;
       default:
         break;
diff --git a/projects/rocshmem/tests/functional_tests/sync_tester.cpp b/projects/rocshmem/tests/functional_tests/sync_tester.cpp
index b130e6fdb7..8ffa4c91ff 100644
--- a/projects/rocshmem/tests/functional_tests/sync_tester.cpp
+++ b/projects/rocshmem/tests/functional_tests/sync_tester.cpp
@@ -45,16 +45,16 @@ __global__ void SyncTest(int loop, int skip, long long int *start_time,
     switch (type) {
       case SyncTestType:
         if(t_id == 0) {
-          rocshmem_ctx_team_sync(ctx, teams[wg_id]);
+          rocshmem_ctx_sync(ctx, teams[wg_id]);
         }
         break;
       case WAVESyncTestType:
         if(wf_id == 0) {
-          rocshmem_ctx_wave_team_sync(ctx, teams[wg_id]);
+          rocshmem_ctx_sync_wave(ctx, teams[wg_id]);
         }
         break;
       case WGSyncTestType:
-        rocshmem_ctx_wg_team_sync(ctx, teams[wg_id]);
+        rocshmem_ctx_sync_wg(ctx, teams[wg_id]);
         break;
       default:
         break;
diff --git a/projects/rocshmem/tests/functional_tests/team_alltoall_tester.cpp b/projects/rocshmem/tests/functional_tests/team_alltoall_tester.cpp
index fd3cc28e00..10f8f5b60f 100644
--- a/projects/rocshmem/tests/functional_tests/team_alltoall_tester.cpp
+++ b/projects/rocshmem/tests/functional_tests/team_alltoall_tester.cpp
@@ -32,7 +32,7 @@ __device__ void wg_team_alltoall(rocshmem_ctx_t ctx, rocshmem_team_t team,
   template <>                                                                  \
   __device__ void wg_team_alltoall<T>(rocshmem_ctx_t ctx, rocshmem_team_t team,\
                                  T * dest, const T *source, int nelem) {       \
-    rocshmem_ctx_##TNAME##_wg_alltoall(ctx, team, dest, source, nelem);        \
+    rocshmem_ctx_##TNAME##_alltoall_wg(ctx, team, dest, source, nelem);        \
   }
 
 TEAM_ALLTOALL_DEF_GEN(float, float)
diff --git a/projects/rocshmem/tests/functional_tests/team_barrier_tester.cpp b/projects/rocshmem/tests/functional_tests/team_barrier_tester.cpp
index ff81154aed..73e1db00f1 100644
--- a/projects/rocshmem/tests/functional_tests/team_barrier_tester.cpp
+++ b/projects/rocshmem/tests/functional_tests/team_barrier_tester.cpp
@@ -51,11 +51,11 @@ __global__ void TeamBarrierTest(int loop, int skip, long long int *start_time,
         break;
       case TeamWAVEBarrierTestType:
         if(wf_id == 0) {
-          rocshmem_ctx_wave_barrier(ctx, teams[wg_id]);
+          rocshmem_ctx_barrier_wave(ctx, teams[wg_id]);
         }
         break;
       case TeamWGBarrierTestType:
-        rocshmem_ctx_wg_barrier(ctx, teams[wg_id]);
+        rocshmem_ctx_barrier_wg(ctx, teams[wg_id]);
         break;
       default:
         break;
diff --git a/projects/rocshmem/tests/functional_tests/team_broadcast_tester.cpp b/projects/rocshmem/tests/functional_tests/team_broadcast_tester.cpp
index 43d5a7ac4c..11326542fa 100644
--- a/projects/rocshmem/tests/functional_tests/team_broadcast_tester.cpp
+++ b/projects/rocshmem/tests/functional_tests/team_broadcast_tester.cpp
@@ -34,7 +34,7 @@ __device__ void wg_team_broadcast(rocshmem_ctx_t ctx, rocshmem_team_t team,
   __device__ void wg_team_broadcast<T>(                                       \
       rocshmem_ctx_t ctx, rocshmem_team_t team, T * dest, const T *source,    \
       int nelem, int pe_root) {                                               \
-    rocshmem_ctx_##TNAME##_wg_broadcast(ctx, team, dest, source, nelem,       \
+    rocshmem_ctx_##TNAME##_broadcast_wg(ctx, team, dest, source, nelem,       \
                                          pe_root);                            \
   }
 
diff --git a/projects/rocshmem/tests/functional_tests/team_fcollect_tester.cpp b/projects/rocshmem/tests/functional_tests/team_fcollect_tester.cpp
index 4f8b85e693..5f1fc46264 100644
--- a/projects/rocshmem/tests/functional_tests/team_fcollect_tester.cpp
+++ b/projects/rocshmem/tests/functional_tests/team_fcollect_tester.cpp
@@ -32,7 +32,7 @@ __device__ void wg_team_fcollect(rocshmem_ctx_t ctx, rocshmem_team_t team,
   template <>                                                                  \
   __device__ void wg_team_fcollect<T>(rocshmem_ctx_t ctx, rocshmem_team_t team,\
                                       T * dest, const T *source, int nelem) {  \
-    rocshmem_ctx_##TNAME##_wg_fcollect(ctx, team, dest, source, nelem);        \
+    rocshmem_ctx_##TNAME##_fcollect_wg(ctx, team, dest, source, nelem);        \
   }
 
 TEAM_FCOLLECT_DEF_GEN(float, float)
diff --git a/projects/rocshmem/tests/functional_tests/team_reduction_tester.cpp b/projects/rocshmem/tests/functional_tests/team_reduction_tester.cpp
index 17c14ae91d..4229888bfc 100644
--- a/projects/rocshmem/tests/functional_tests/team_reduction_tester.cpp
+++ b/projects/rocshmem/tests/functional_tests/team_reduction_tester.cpp
@@ -35,7 +35,7 @@ __device__ int wg_team_reduce(rocshmem_ctx_t ctx, rocshmem_team_t, T *dest,
   __device__ int wg_team_reduce<T, Op>(rocshmem_ctx_t ctx,               \
                                        rocshmem_team_t team, T * dest,   \
                                        const T *source, int nreduce) {    \
-    return rocshmem_ctx_##TNAME##_##Op_API##_wg_reduce(ctx, team, dest,  \
+    return rocshmem_ctx_##TNAME##_##Op_API##_reduce_wg(ctx, team, dest,  \
                                                         source, nreduce); \
   }
 
@@ -93,7 +93,7 @@ __global__ void TeamReductionTest(int loop, int skip, long long int *start_time,
       start_time[wg_id] = wall_clock64();
     }
     wg_team_reduce<T1, T2>(ctx, team, r_buf, s_buf, size);
-    rocshmem_ctx_wg_barrier_all(ctx);
+    rocshmem_ctx_barrier_all_wg(ctx);
   }
 
   __syncthreads();
diff --git a/projects/rocshmem/utils/header_files_gen/COLL.py b/projects/rocshmem/utils/header_files_gen/COLL.py
index 2755f8b26f..2bf899699a 100644
--- a/projects/rocshmem/utils/header_files_gen/COLL.py
+++ b/projects/rocshmem/utils/header_files_gen/COLL.py
@@ -43,7 +43,7 @@ types = [
 
 def alltoall_api(T, TNAME):
     return (
-        f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_wg_alltoall(\n"
+        f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_alltoall_wg(\n"
         f"    rocshmem_ctx_t ctx, rocshmem_team_t team, {T} *dest,\n"
         f"    const {T} *source, int nelems);\n\n"
     )
@@ -75,7 +75,7 @@ def generate_alltoall_api():
 
 def broadcast_api(T, TNAME):
     return (
-        f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_wg_broadcast(\n"
+        f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_broadcast_wg(\n"
         f"    rocshmem_ctx_t ctx, rocshmem_team_t team, {T} *dest,\n"
         f"    const {T} *source, int nelems, int pe_root);\n"
         f"__host__ void rocshmem_ctx_{TNAME}_broadcast(\n"
@@ -120,7 +120,7 @@ def generate_broadcast_api():
 
 def fcollect_api(T, TNAME):
     return (
-        f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_wg_fcollect(\n"
+        f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_fcollect_wg(\n"
         f"    rocshmem_ctx_t ctx, rocshmem_team_t team, {T} *dest,\n"
         f"    const {T} *source, int nelems);\n\n"
     )
@@ -152,7 +152,7 @@ def generate_fcollect_api():
 
 def reduction_api(T, TNAME, Op_API):
     return (
-        f"__device__ ATTR_NO_INLINE int rocshmem_ctx_{TNAME}_{Op_API}_wg_reduce(\n"
+        f"__device__ ATTR_NO_INLINE int rocshmem_ctx_{TNAME}_{Op_API}_reduce_wg(\n"
         f"    rocshmem_ctx_t ctx, rocshmem_team_t team, {T} *dest, const {T} *source,\n"
         f"    int nreduce);\n"
         f"__host__ int rocshmem_ctx_{TNAME}_{Op_API}_reduce(\n"