Update collective APIs naming (#77)

* Update the naming convention for collective APIs to ensure consistency across the interface.

* Move all collective API declarations to rocshmem_COLL.hpp

* The following APIs were updated as part of this change:
  - `barrier`
  - `barrier_all`
  - `sync`
  - `sync_all`
  - `all_to_all`
  - `broadcast`
  - `fcollect`
  - `all_reduce`

* Update header file generation code for collective APIs

[ROCm/rocshmem commit: 68421895d6]
This commit is contained in:
Avinash Kethineedi
2025-04-10 12:14:47 -05:00
committed by GitHub
parent 5b22ddd1ff
commit 41d5d739e2
15 changed files with 334 additions and 334 deletions
@@ -37,7 +37,7 @@ __global__ void allreduce_test(int *source, int *dest, size_t nelem,
rocshmem_wg_ctx_create(ctx_type, &ctx);
int num_pes = rocshmem_ctx_n_pes(ctx);
rocshmem_ctx_int_sum_wg_reduce(ctx, team, dest, source, nelem);
rocshmem_ctx_int_sum_reduce_wg(ctx, team, dest, source, nelem);
rocshmem_ctx_quiet(ctx);
__syncthreads();
@@ -37,7 +37,7 @@ __global__ void alltoall_test(int *source, int *dest, size_t nelem,
rocshmem_wg_ctx_create(ctx_type, &ctx);
int num_pes = rocshmem_ctx_n_pes(ctx);
rocshmem_ctx_int_wg_alltoall(ctx, team, dest, source, nelem);
rocshmem_ctx_int_alltoall_wg(ctx, team, dest, source, nelem);
rocshmem_ctx_quiet(ctx);
__syncthreads();
@@ -37,7 +37,7 @@ __global__ void broadcast_test(int *source, int *dest, size_t nelem,
rocshmem_wg_ctx_create(ctx_type, &ctx);
int num_pes = rocshmem_ctx_n_pes(ctx);
rocshmem_ctx_int_wg_broadcast(ctx, team, dest, source, nelem, root);
rocshmem_ctx_int_broadcast_wg(ctx, team, dest, source, nelem, root);
rocshmem_ctx_quiet(ctx);
__syncthreads();
@@ -490,210 +490,6 @@ __device__ int rocshmem_team_translate_pe(rocshmem_team_t src_team,
int src_pe,
rocshmem_team_t dest_team);
/**
* @brief perform a collective barrier between all PEs in the system.
* The caller is blocked until the barrier is resolved.
*
* This function must be invoked by a single thread within the PE.
*
* @param[in] handle GPU side handle.
*
* @return void
*/
__device__ ATTR_NO_INLINE void rocshmem_ctx_barrier_all(
rocshmem_ctx_t ctx);
/**
* @brief perform a collective barrier between all PEs in the system.
* The caller is blocked until the barrier is resolved.
*
* This function must be called as a wave-front collective.
*
* @param[in] handle GPU side handle.
*
* @return void
*/
__device__ ATTR_NO_INLINE void rocshmem_ctx_wave_barrier_all(
rocshmem_ctx_t ctx);
/**
* @brief perform a collective barrier between all PEs in the system.
* The caller is blocked until the barrier is resolved.
*
* This function must be called as a work-group collective.
*
* @param[in] handle GPU side handle.
*
* @return void
*/
__device__ ATTR_NO_INLINE void rocshmem_ctx_wg_barrier_all(
rocshmem_ctx_t ctx);
/**
* @brief perform a collective barrier between all PEs in the team.
* The caller is blocked until the barrier is resolved.
*
* This function must be invoked by a single thread within the PE.
*
* @param[in] handle GPU side handle.
*
* @param[in] team The team on which to perform barrier synchronization
*
* @return void
*/
__device__ void rocshmem_ctx_barrier(rocshmem_ctx_t ctx, rocshmem_team_t team);
/**
* @brief perform a collective barrier between all PEs in the team.
* The caller is blocked until the barrier is resolved.
*
* This function must be called as a wave-front collective.
*
* @param[in] handle GPU side handle.
*
* @param[in] team The team on which to perform barrier synchronization
*
* @return void
*/
__device__ void rocshmem_ctx_wave_barrier(rocshmem_ctx_t ctx, rocshmem_team_t team);
/**
* @brief perform a collective barrier between all PEs in the team.
* The caller is blocked until the barrier is resolved.
*
* This function must be called as a work-group collective.
*
* @param[in] handle GPU side handle.
*
* @param[in] team The team on which to perform barrier synchronization
*
* @return void
*/
__device__ void rocshmem_ctx_wg_barrier(rocshmem_ctx_t ctx, rocshmem_team_t team);
/**
* @brief registers the arrival of a PE at a barrier.
* The caller is blocked until the synchronization is resolved.
*
* In contrast with the shmem_barrier_all routine, shmem_sync_all only ensures
* completion and visibility of previously issued memory stores and does not
* ensure completion of remote memory updates issued via OpenSHMEM routines.
*
* This function must be invoked by a single thread within the PE.
*
* @param[in] handle GPU side handle.
*
* @return void
*/
__device__ ATTR_NO_INLINE void rocshmem_ctx_sync_all(rocshmem_ctx_t ctx);
/**
* @brief registers the arrival of a PE at a barrier.
* The caller is blocked until the synchronization is resolved.
*
* In contrast with the shmem_barrier_all routine, shmem_sync_all only ensures
* completion and visibility of previously issued memory stores and does not
* ensure completion of remote memory updates issued via OpenSHMEM routines.
*
* This function must be called as a wave-front collective.
*
* @param[in] handle GPU side handle.
*
* @return void
*/
__device__ ATTR_NO_INLINE void rocshmem_ctx_wave_sync_all(rocshmem_ctx_t ctx);
/**
* @brief registers the arrival of a PE at a barrier.
* The caller is blocked until the synchronization is resolved.
*
* In contrast with the shmem_barrier_all routine, shmem_sync_all only ensures
* completion and visibility of previously issued memory stores and does not
* ensure completion of remote memory updates issued via OpenSHMEM routines.
*
* This function must be called as a work-group collective.
*
* @param[in] handle GPU side handle.
*
* @return void
*/
__device__ ATTR_NO_INLINE void rocshmem_ctx_wg_sync_all(rocshmem_ctx_t ctx);
/**
* @brief registers the arrival of a PE at a barrier.
* The caller is blocked until the synchronization is resolved.
*
* In contrast with the shmem_barrier_all routine, shmem_team_sync only ensures
* completion and visibility of previously issued memory stores and does not
* ensure completion of remote memory updates issued via OpenSHMEM routines.
*
* This function must be invoked by a single thread within the PE.
*
* @param[in] handle GPU side handle.
* @param[in] team Handle of the team being synchronized
*
* @return void
*/
__device__ ATTR_NO_INLINE void rocshmem_ctx_team_sync(
rocshmem_ctx_t ctx, rocshmem_team_t team);
/**
* @brief registers the arrival of a PE at a barrier.
* The caller is blocked until the synchronization is resolved.
*
* In contrast with the shmem_barrier_all routine, shmem_team_sync only ensures
* completion and visibility of previously issued memory stores and does not
* ensure completion of remote memory updates issued via OpenSHMEM routines.
*
* This function must be called as a wave-front collective.
*
* @param[in] handle GPU side handle.
* @param[in] team Handle of the team being synchronized
*
* @return void
*/
__device__ ATTR_NO_INLINE void rocshmem_ctx_wave_team_sync(
rocshmem_ctx_t ctx, rocshmem_team_t team);
/**
* @brief registers the arrival of a PE at a barrier.
* The caller is blocked until the synchronization is resolved.
*
* In contrast with the shmem_barrier_all routine, shmem_team_sync only ensures
* completion and visibility of previously issued memory stores and does not
* ensure completion of remote memory updates issued via OpenSHMEM routines.
*
* This function must be called as a work-group collective.
*
* @param[in] handle GPU side handle.
* @param[in] team Handle of the team being synchronized
*
* @return void
*/
__device__ ATTR_NO_INLINE void rocshmem_ctx_wg_team_sync(
rocshmem_ctx_t ctx, rocshmem_team_t team);
/**
* @brief Query a local pointer to a symmetric data object on the
* specified \pe . Returns an address that may be used to directly reference
* dest on the specified \pe. This address can be accesses with LD/ST ops.
*
* Can be called per thread with no performance penalty.
*/
__device__ ATTR_NO_INLINE void *rocshmem_ptr(const void *dest, int pe);
/**
* @brief Make all uncacheable GPU data visible to other agents in the sytem.
*
* This only works for data that was explicitly allocated uncacheable on the
* GPU!
*
* Can be called per thread with no performance penalty.
*
* @param[in] GPU-side handle.
*
* @return void
*/
__device__ ATTR_NO_INLINE void rocshmem_ctx_threadfence_system(
rocshmem_ctx_t ctx);
@@ -41,55 +41,55 @@ namespace rocshmem {
*
* @return void
*/
__device__ ATTR_NO_INLINE void rocshmem_ctx_float_wg_alltoall(
__device__ ATTR_NO_INLINE void rocshmem_ctx_float_alltoall_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest,
const float *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_double_wg_alltoall(
__device__ ATTR_NO_INLINE void rocshmem_ctx_double_alltoall_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest,
const double *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_char_wg_alltoall(
__device__ ATTR_NO_INLINE void rocshmem_ctx_char_alltoall_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, char *dest,
const char *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_wg_alltoall(
__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_alltoall_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, signed char *dest,
const signed char *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_short_wg_alltoall(
__device__ ATTR_NO_INLINE void rocshmem_ctx_short_alltoall_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest,
const short *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_int_wg_alltoall(
__device__ ATTR_NO_INLINE void rocshmem_ctx_int_alltoall_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest,
const int *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_long_wg_alltoall(
__device__ ATTR_NO_INLINE void rocshmem_ctx_long_alltoall_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest,
const long *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_wg_alltoall(
__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_alltoall_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest,
const long long *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_wg_alltoall(
__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_alltoall_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned char *dest,
const unsigned char *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_wg_alltoall(
__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_alltoall_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned short *dest,
const unsigned short *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_wg_alltoall(
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_alltoall_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned int *dest,
const unsigned int *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_wg_alltoall(
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_alltoall_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long *dest,
const unsigned long *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_wg_alltoall(
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_alltoall_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long long *dest,
const unsigned long long *source, int nelems);
@@ -116,7 +116,7 @@ __device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_wg_alltoall(
*
* @return void
*/
__device__ ATTR_NO_INLINE void rocshmem_ctx_float_wg_broadcast(
__device__ ATTR_NO_INLINE void rocshmem_ctx_float_broadcast_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest,
const float *source, int nelems, int pe_root);
__host__ void rocshmem_ctx_float_broadcast(
@@ -127,7 +127,7 @@ __host__ void rocshmem_ctx_float_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest,
const float *source, int nelems, int pe_root);
__device__ ATTR_NO_INLINE void rocshmem_ctx_double_wg_broadcast(
__device__ ATTR_NO_INLINE void rocshmem_ctx_double_broadcast_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest,
const double *source, int nelems, int pe_root);
__host__ void rocshmem_ctx_double_broadcast(
@@ -138,7 +138,7 @@ __host__ void rocshmem_ctx_double_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest,
const double *source, int nelems, int pe_root);
__device__ ATTR_NO_INLINE void rocshmem_ctx_char_wg_broadcast(
__device__ ATTR_NO_INLINE void rocshmem_ctx_char_broadcast_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, char *dest,
const char *source, int nelems, int pe_root);
__host__ void rocshmem_ctx_char_broadcast(
@@ -149,7 +149,7 @@ __host__ void rocshmem_ctx_char_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, char *dest,
const char *source, int nelems, int pe_root);
__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_wg_broadcast(
__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_broadcast_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, signed char *dest,
const signed char *source, int nelems, int pe_root);
__host__ void rocshmem_ctx_schar_broadcast(
@@ -160,7 +160,7 @@ __host__ void rocshmem_ctx_schar_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, signed char *dest,
const signed char *source, int nelems, int pe_root);
__device__ ATTR_NO_INLINE void rocshmem_ctx_short_wg_broadcast(
__device__ ATTR_NO_INLINE void rocshmem_ctx_short_broadcast_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest,
const short *source, int nelems, int pe_root);
__host__ void rocshmem_ctx_short_broadcast(
@@ -171,7 +171,7 @@ __host__ void rocshmem_ctx_short_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest,
const short *source, int nelems, int pe_root);
__device__ ATTR_NO_INLINE void rocshmem_ctx_int_wg_broadcast(
__device__ ATTR_NO_INLINE void rocshmem_ctx_int_broadcast_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest,
const int *source, int nelems, int pe_root);
__host__ void rocshmem_ctx_int_broadcast(
@@ -182,7 +182,7 @@ __host__ void rocshmem_ctx_int_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest,
const int *source, int nelems, int pe_root);
__device__ ATTR_NO_INLINE void rocshmem_ctx_long_wg_broadcast(
__device__ ATTR_NO_INLINE void rocshmem_ctx_long_broadcast_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest,
const long *source, int nelems, int pe_root);
__host__ void rocshmem_ctx_long_broadcast(
@@ -193,7 +193,7 @@ __host__ void rocshmem_ctx_long_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest,
const long *source, int nelems, int pe_root);
__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_wg_broadcast(
__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_broadcast_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest,
const long long *source, int nelems, int pe_root);
__host__ void rocshmem_ctx_longlong_broadcast(
@@ -204,7 +204,7 @@ __host__ void rocshmem_ctx_longlong_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest,
const long long *source, int nelems, int pe_root);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_wg_broadcast(
__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_broadcast_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned char *dest,
const unsigned char *source, int nelems, int pe_root);
__host__ void rocshmem_ctx_uchar_broadcast(
@@ -215,7 +215,7 @@ __host__ void rocshmem_ctx_uchar_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned char *dest,
const unsigned char *source, int nelems, int pe_root);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_wg_broadcast(
__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_broadcast_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned short *dest,
const unsigned short *source, int nelems, int pe_root);
__host__ void rocshmem_ctx_ushort_broadcast(
@@ -226,7 +226,7 @@ __host__ void rocshmem_ctx_ushort_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned short *dest,
const unsigned short *source, int nelems, int pe_root);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_wg_broadcast(
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_broadcast_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned int *dest,
const unsigned int *source, int nelems, int pe_root);
__host__ void rocshmem_ctx_uint_broadcast(
@@ -237,7 +237,7 @@ __host__ void rocshmem_ctx_uint_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned int *dest,
const unsigned int *source, int nelems, int pe_root);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_wg_broadcast(
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_broadcast_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long *dest,
const unsigned long *source, int nelems, int pe_root);
__host__ void rocshmem_ctx_ulong_broadcast(
@@ -248,7 +248,7 @@ __host__ void rocshmem_ctx_ulong_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long *dest,
const unsigned long *source, int nelems, int pe_root);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_wg_broadcast(
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_broadcast_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long long *dest,
const unsigned long long *source, int nelems, int pe_root);
__host__ void rocshmem_ctx_ulonglong_broadcast(
@@ -276,55 +276,55 @@ __host__ void rocshmem_ctx_ulonglong_broadcast(
*
* @return void
*/
__device__ ATTR_NO_INLINE void rocshmem_ctx_float_wg_fcollect(
__device__ ATTR_NO_INLINE void rocshmem_ctx_float_fcollect_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest,
const float *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_double_wg_fcollect(
__device__ ATTR_NO_INLINE void rocshmem_ctx_double_fcollect_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest,
const double *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_char_wg_fcollect(
__device__ ATTR_NO_INLINE void rocshmem_ctx_char_fcollect_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, char *dest,
const char *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_wg_fcollect(
__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_fcollect_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, signed char *dest,
const signed char *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_short_wg_fcollect(
__device__ ATTR_NO_INLINE void rocshmem_ctx_short_fcollect_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest,
const short *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_int_wg_fcollect(
__device__ ATTR_NO_INLINE void rocshmem_ctx_int_fcollect_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest,
const int *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_long_wg_fcollect(
__device__ ATTR_NO_INLINE void rocshmem_ctx_long_fcollect_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest,
const long *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_wg_fcollect(
__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_fcollect_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest,
const long long *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_wg_fcollect(
__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_fcollect_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned char *dest,
const unsigned char *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_wg_fcollect(
__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_fcollect_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned short *dest,
const unsigned short *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_wg_fcollect(
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_fcollect_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned int *dest,
const unsigned int *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_wg_fcollect(
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_fcollect_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long *dest,
const unsigned long *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_wg_fcollect(
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_fcollect_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long long *dest,
const unsigned long long *source, int nelems);
@@ -345,258 +345,462 @@ __device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_wg_fcollect(
*
* @return int (Zero on successful local completion. Nonzero otherwise.)
*/
__device__ ATTR_NO_INLINE int rocshmem_ctx_short_sum_wg_reduce(
__device__ ATTR_NO_INLINE int rocshmem_ctx_short_sum_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
int nreduce);
__host__ int rocshmem_ctx_short_sum_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_short_min_wg_reduce(
__device__ ATTR_NO_INLINE int rocshmem_ctx_short_min_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
int nreduce);
__host__ int rocshmem_ctx_short_min_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_short_max_wg_reduce(
__device__ ATTR_NO_INLINE int rocshmem_ctx_short_max_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
int nreduce);
__host__ int rocshmem_ctx_short_max_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_short_prod_wg_reduce(
__device__ ATTR_NO_INLINE int rocshmem_ctx_short_prod_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
int nreduce);
__host__ int rocshmem_ctx_short_prod_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_short_or_wg_reduce(
__device__ ATTR_NO_INLINE int rocshmem_ctx_short_or_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
int nreduce);
__host__ int rocshmem_ctx_short_or_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_short_and_wg_reduce(
__device__ ATTR_NO_INLINE int rocshmem_ctx_short_and_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
int nreduce);
__host__ int rocshmem_ctx_short_and_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_short_xor_wg_reduce(
__device__ ATTR_NO_INLINE int rocshmem_ctx_short_xor_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
int nreduce);
__host__ int rocshmem_ctx_short_xor_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_sum_wg_reduce(
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_sum_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
int nreduce);
__host__ int rocshmem_ctx_int_sum_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_min_wg_reduce(
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_min_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
int nreduce);
__host__ int rocshmem_ctx_int_min_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_max_wg_reduce(
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_max_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
int nreduce);
__host__ int rocshmem_ctx_int_max_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_prod_wg_reduce(
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_prod_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
int nreduce);
__host__ int rocshmem_ctx_int_prod_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_or_wg_reduce(
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_or_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
int nreduce);
__host__ int rocshmem_ctx_int_or_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_and_wg_reduce(
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_and_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
int nreduce);
__host__ int rocshmem_ctx_int_and_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_xor_wg_reduce(
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_xor_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
int nreduce);
__host__ int rocshmem_ctx_int_xor_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_long_sum_wg_reduce(
__device__ ATTR_NO_INLINE int rocshmem_ctx_long_sum_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
int nreduce);
__host__ int rocshmem_ctx_long_sum_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_long_min_wg_reduce(
__device__ ATTR_NO_INLINE int rocshmem_ctx_long_min_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
int nreduce);
__host__ int rocshmem_ctx_long_min_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_long_max_wg_reduce(
__device__ ATTR_NO_INLINE int rocshmem_ctx_long_max_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
int nreduce);
__host__ int rocshmem_ctx_long_max_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_long_prod_wg_reduce(
__device__ ATTR_NO_INLINE int rocshmem_ctx_long_prod_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
int nreduce);
__host__ int rocshmem_ctx_long_prod_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_long_or_wg_reduce(
__device__ ATTR_NO_INLINE int rocshmem_ctx_long_or_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
int nreduce);
__host__ int rocshmem_ctx_long_or_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_long_and_wg_reduce(
__device__ ATTR_NO_INLINE int rocshmem_ctx_long_and_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
int nreduce);
__host__ int rocshmem_ctx_long_and_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_long_xor_wg_reduce(
__device__ ATTR_NO_INLINE int rocshmem_ctx_long_xor_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
int nreduce);
__host__ int rocshmem_ctx_long_xor_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_sum_wg_reduce(
__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_sum_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
int nreduce);
__host__ int rocshmem_ctx_longlong_sum_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_min_wg_reduce(
__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_min_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
int nreduce);
__host__ int rocshmem_ctx_longlong_min_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_max_wg_reduce(
__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_max_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
int nreduce);
__host__ int rocshmem_ctx_longlong_max_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_prod_wg_reduce(
__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_prod_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
int nreduce);
__host__ int rocshmem_ctx_longlong_prod_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_or_wg_reduce(
__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_or_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
int nreduce);
__host__ int rocshmem_ctx_longlong_or_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_and_wg_reduce(
__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_and_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
int nreduce);
__host__ int rocshmem_ctx_longlong_and_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_xor_wg_reduce(
__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_xor_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
int nreduce);
__host__ int rocshmem_ctx_longlong_xor_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_float_sum_wg_reduce(
__device__ ATTR_NO_INLINE int rocshmem_ctx_float_sum_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source,
int nreduce);
__host__ int rocshmem_ctx_float_sum_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_float_min_wg_reduce(
__device__ ATTR_NO_INLINE int rocshmem_ctx_float_min_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source,
int nreduce);
__host__ int rocshmem_ctx_float_min_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_float_max_wg_reduce(
__device__ ATTR_NO_INLINE int rocshmem_ctx_float_max_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source,
int nreduce);
__host__ int rocshmem_ctx_float_max_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_float_prod_wg_reduce(
__device__ ATTR_NO_INLINE int rocshmem_ctx_float_prod_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source,
int nreduce);
__host__ int rocshmem_ctx_float_prod_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_double_sum_wg_reduce(
__device__ ATTR_NO_INLINE int rocshmem_ctx_double_sum_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source,
int nreduce);
__host__ int rocshmem_ctx_double_sum_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_double_min_wg_reduce(
__device__ ATTR_NO_INLINE int rocshmem_ctx_double_min_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source,
int nreduce);
__host__ int rocshmem_ctx_double_min_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_double_max_wg_reduce(
__device__ ATTR_NO_INLINE int rocshmem_ctx_double_max_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source,
int nreduce);
__host__ int rocshmem_ctx_double_max_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_double_prod_wg_reduce(
__device__ ATTR_NO_INLINE int rocshmem_ctx_double_prod_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source,
int nreduce);
__host__ int rocshmem_ctx_double_prod_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source,
int nreduce);
/**
* @brief perform a collective barrier between all PEs in the system.
* The caller is blocked until the barrier is resolved.
*
* This function must be invoked by a single thread within the PE.
*
* @param[in] handle GPU side handle.
*
* @return void
*/
__device__ ATTR_NO_INLINE void rocshmem_ctx_barrier_all(
rocshmem_ctx_t ctx);
/**
* @brief perform a collective barrier between all PEs in the system.
* The caller is blocked until the barrier is resolved.
*
* This function must be called as a wave-front collective.
*
* @param[in] handle GPU side handle.
*
* @return void
*/
__device__ ATTR_NO_INLINE void rocshmem_ctx_barrier_all_wave(
rocshmem_ctx_t ctx);
/**
* @brief perform a collective barrier between all PEs in the system.
* The caller is blocked until the barrier is resolved.
*
* This function must be called as a work-group collective.
*
* @param[in] handle GPU side handle.
*
* @return void
*/
__device__ ATTR_NO_INLINE void rocshmem_ctx_barrier_all_wg(
rocshmem_ctx_t ctx);
/**
* @brief perform a collective barrier between all PEs in the team.
* The caller is blocked until the barrier is resolved.
*
* This function must be invoked by a single thread within the PE.
*
* @param[in] handle GPU side handle.
*
* @param[in] team The team on which to perform barrier synchronization
*
* @return void
*/
__device__ void rocshmem_ctx_barrier(rocshmem_ctx_t ctx, rocshmem_team_t team);
/**
* @brief perform a collective barrier between all PEs in the team.
* The caller is blocked until the barrier is resolved.
*
* This function must be called as a wave-front collective.
*
* @param[in] handle GPU side handle.
*
* @param[in] team The team on which to perform barrier synchronization
*
* @return void
*/
__device__ void rocshmem_ctx_barrier_wave(rocshmem_ctx_t ctx, rocshmem_team_t team);
/**
* @brief perform a collective barrier between all PEs in the team.
* The caller is blocked until the barrier is resolved.
*
* This function must be called as a work-group collective.
*
* @param[in] handle GPU side handle.
*
* @param[in] team The team on which to perform barrier synchronization
*
* @return void
*/
__device__ void rocshmem_ctx_barrier_wg(rocshmem_ctx_t ctx, rocshmem_team_t team);
/**
* @brief registers the arrival of a PE at a barrier.
* The caller is blocked until the synchronization is resolved.
*
* In contrast with the shmem_barrier_all routine, shmem_sync_all only ensures
* completion and visibility of previously issued memory stores and does not
* ensure completion of remote memory updates issued via OpenSHMEM routines.
*
* This function must be invoked by a single thread within the PE.
*
* @param[in] handle GPU side handle.
*
* @return void
*/
__device__ ATTR_NO_INLINE void rocshmem_ctx_sync_all(rocshmem_ctx_t ctx);
/**
* @brief registers the arrival of a PE at a barrier.
* The caller is blocked until the synchronization is resolved.
*
* In contrast with the shmem_barrier_all routine, shmem_sync_all only ensures
* completion and visibility of previously issued memory stores and does not
* ensure completion of remote memory updates issued via OpenSHMEM routines.
*
* This function must be called as a wave-front collective.
*
* @param[in] handle GPU side handle.
*
* @return void
*/
__device__ ATTR_NO_INLINE void rocshmem_ctx_sync_all_wave(rocshmem_ctx_t ctx);
/**
* @brief registers the arrival of a PE at a barrier.
* The caller is blocked until the synchronization is resolved.
*
* In contrast with the shmem_barrier_all routine, shmem_sync_all only ensures
* completion and visibility of previously issued memory stores and does not
* ensure completion of remote memory updates issued via OpenSHMEM routines.
*
* This function must be called as a work-group collective.
*
* @param[in] handle GPU side handle.
*
* @return void
*/
__device__ ATTR_NO_INLINE void rocshmem_ctx_sync_all_wg(rocshmem_ctx_t ctx);
/**
* @brief registers the arrival of a PE at a barrier.
* The caller is blocked until the synchronization is resolved.
*
* In contrast with the shmem_barrier_all routine, shmem_team_sync only ensures
* completion and visibility of previously issued memory stores and does not
* ensure completion of remote memory updates issued via OpenSHMEM routines.
*
* This function must be invoked by a single thread within the PE.
*
* @param[in] handle GPU side handle.
* @param[in] team Handle of the team being synchronized
*
* @return void
*/
__device__ ATTR_NO_INLINE void rocshmem_ctx_sync(
rocshmem_ctx_t ctx, rocshmem_team_t team);
/**
* @brief registers the arrival of a PE at a barrier.
* The caller is blocked until the synchronization is resolved.
*
* In contrast with the shmem_barrier_all routine, shmem_team_sync only ensures
* completion and visibility of previously issued memory stores and does not
* ensure completion of remote memory updates issued via OpenSHMEM routines.
*
* This function must be called as a wave-front collective.
*
* @param[in] handle GPU side handle.
* @param[in] team Handle of the team being synchronized
*
* @return void
*/
__device__ ATTR_NO_INLINE void rocshmem_ctx_sync_wave(
rocshmem_ctx_t ctx, rocshmem_team_t team);
/**
* @brief registers the arrival of a PE at a barrier.
* The caller is blocked until the synchronization is resolved.
*
* In contrast with the shmem_barrier_all routine, shmem_team_sync only ensures
* completion and visibility of previously issued memory stores and does not
* ensure completion of remote memory updates issued via OpenSHMEM routines.
*
* This function must be called as a work-group collective.
*
* @param[in] handle GPU side handle.
* @param[in] team Handle of the team being synchronized
*
* @return void
*/
__device__ ATTR_NO_INLINE void rocshmem_ctx_sync_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team);
/**
* @brief Query a local pointer to a symmetric data object on the
* specified \pe . Returns an address that may be used to directly reference
* dest on the specified \pe. This address can be accesses with LD/ST ops.
*
* Can be called per thread with no performance penalty.
*/
__device__ ATTR_NO_INLINE void *rocshmem_ptr(const void *dest, int pe);
/**
* @brief Make all uncacheable GPU data visible to other agents in the sytem.
*
* This only works for data that was explicitly allocated uncacheable on the
* GPU!
*
* Can be called per thread with no performance penalty.
*
* @param[in] GPU-side handle.
*
* @return void
*/
} // namespace rocshmem
+34 -34
View File
@@ -453,7 +453,7 @@ __device__ void *rocshmem_ptr(const void *dest, int pe) {
}
template <typename T, ROCSHMEM_OP Op>
__device__ int rocshmem_wg_reduce(rocshmem_ctx_t ctx, rocshmem_team_t team,
__device__ int rocshmem_reduce_wg(rocshmem_ctx_t ctx, rocshmem_team_t team,
T *dest, const T *source, int nreduce) {
GPU_DPRINTF("Function: rocshmem_reduce\n");
@@ -461,7 +461,7 @@ __device__ int rocshmem_wg_reduce(rocshmem_ctx_t ctx, rocshmem_team_t team,
}
template <typename T>
__device__ void rocshmem_wg_broadcast(rocshmem_ctx_t ctx,
__device__ void rocshmem_broadcast_wg(rocshmem_ctx_t ctx,
rocshmem_team_t team, T *dest,
const T *source, int nelem,
int pe_root) {
@@ -471,7 +471,7 @@ __device__ void rocshmem_wg_broadcast(rocshmem_ctx_t ctx,
}
template <typename T>
__device__ void rocshmem_wg_alltoall(rocshmem_ctx_t ctx,
__device__ void rocshmem_alltoall_wg(rocshmem_ctx_t ctx,
rocshmem_team_t team, T *dest,
const T *source, int nelem) {
GPU_DPRINTF("Function: rocshmem_alltoall\n");
@@ -480,7 +480,7 @@ __device__ void rocshmem_wg_alltoall(rocshmem_ctx_t ctx,
}
template <typename T>
__device__ void rocshmem_wg_fcollect(rocshmem_ctx_t ctx,
__device__ void rocshmem_fcollect_wg(rocshmem_ctx_t ctx,
rocshmem_team_t team, T *dest,
const T *source, int nelem) {
GPU_DPRINTF("Function: rocshmem_fcollect\n");
@@ -576,14 +576,14 @@ __device__ void rocshmem_ctx_barrier_all(rocshmem_ctx_t ctx) {
get_internal_ctx(ctx)->barrier_all();
}
__device__ void rocshmem_ctx_wave_barrier_all(rocshmem_ctx_t ctx) {
GPU_DPRINTF("Function: rocshmem_ctx_wave_barrier_all\n");
__device__ void rocshmem_ctx_barrier_all_wave(rocshmem_ctx_t ctx) {
GPU_DPRINTF("Function: rocshmem_ctx_barrier_all_wave\n");
get_internal_ctx(ctx)->barrier_all_wave();
}
__device__ void rocshmem_ctx_wg_barrier_all(rocshmem_ctx_t ctx) {
GPU_DPRINTF("Function: rocshmem_ctx_wg_barrier_all\n");
__device__ void rocshmem_ctx_barrier_all_wg(rocshmem_ctx_t ctx) {
GPU_DPRINTF("Function: rocshmem_ctx_barrier_all_wg\n");
get_internal_ctx(ctx)->barrier_all_wg();
}
@@ -594,14 +594,14 @@ __device__ void rocshmem_ctx_barrier(rocshmem_ctx_t ctx, rocshmem_team_t team) {
get_internal_ctx(ctx)->barrier(team);
}
__device__ void rocshmem_ctx_wave_barrier(rocshmem_ctx_t ctx, rocshmem_team_t team) {
GPU_DPRINTF("Function: rocshmem_wave_barrier\n");
__device__ void rocshmem_ctx_barrier_wave(rocshmem_ctx_t ctx, rocshmem_team_t team) {
GPU_DPRINTF("Function: rocshmem_barrier_wave\n");
get_internal_ctx(ctx)->barrier_wave(team);
}
__device__ void rocshmem_ctx_wg_barrier(rocshmem_ctx_t ctx, rocshmem_team_t team) {
GPU_DPRINTF("Function: rocshmem_wg_barrier\n");
__device__ void rocshmem_ctx_barrier_wg(rocshmem_ctx_t ctx, rocshmem_team_t team) {
GPU_DPRINTF("Function: rocshmem_barrier_wg\n");
get_internal_ctx(ctx)->barrier_wg(team);
}
@@ -612,35 +612,35 @@ __device__ void rocshmem_ctx_sync_all(rocshmem_ctx_t ctx) {
get_internal_ctx(ctx)->sync_all();
}
__device__ void rocshmem_ctx_wave_sync_all(rocshmem_ctx_t ctx) {
GPU_DPRINTF("Function: rocshmem_ctx_wave_sync_all\n");
__device__ void rocshmem_ctx_sync_all_wave(rocshmem_ctx_t ctx) {
GPU_DPRINTF("Function: rocshmem_ctx_sync_all_wave\n");
get_internal_ctx(ctx)->sync_all_wave();
}
__device__ void rocshmem_ctx_wg_sync_all(rocshmem_ctx_t ctx) {
GPU_DPRINTF("Function: rocshmem_ctx_wg_sync_all\n");
__device__ void rocshmem_ctx_sync_all_wg(rocshmem_ctx_t ctx) {
GPU_DPRINTF("Function: rocshmem_ctx_sync_all_wg\n");
get_internal_ctx(ctx)->sync_all_wg();
}
__device__ void rocshmem_ctx_team_sync(rocshmem_ctx_t ctx,
__device__ void rocshmem_ctx_sync(rocshmem_ctx_t ctx,
rocshmem_team_t team) {
GPU_DPRINTF("Function: rocshmem_ctx_sync_all\n");
GPU_DPRINTF("Function: rocshmem_ctx_sync\n");
get_internal_ctx(ctx)->sync_wg(team);
}
__device__ void rocshmem_ctx_wave_team_sync(rocshmem_ctx_t ctx,
__device__ void rocshmem_ctx_sync_wave(rocshmem_ctx_t ctx,
rocshmem_team_t team) {
GPU_DPRINTF("Function: rocshmem_ctx_wave_sync_all\n");
GPU_DPRINTF("Function: rocshmem_ctx_sync_wave\n");
get_internal_ctx(ctx)->sync_wg(team);
}
__device__ void rocshmem_ctx_wg_team_sync(rocshmem_ctx_t ctx,
__device__ void rocshmem_ctx_sync_wg(rocshmem_ctx_t ctx,
rocshmem_team_t team) {
GPU_DPRINTF("Function: rocshmem_ctx_wg_sync_all\n");
GPU_DPRINTF("Function: rocshmem_ctx_sync_wg\n");
get_internal_ctx(ctx)->sync_wg(team);
}
@@ -981,7 +981,7 @@ __device__ int rocshmem_team_translate_pe(rocshmem_team_t src_team,
* Template generator for reductions
*/
#define REDUCTION_GEN(T, Op) \
template __device__ int rocshmem_wg_reduce<T, Op>( \
template __device__ int rocshmem_reduce_wg<T, Op>( \
rocshmem_ctx_t ctx, rocshmem_team_t team, T * dest, const T *source, \
int nreduce);
@@ -1011,13 +1011,13 @@ __device__ int rocshmem_team_translate_pe(rocshmem_team_t src_team,
template __device__ void rocshmem_get_nbi<T>(T * dest, const T *source, \
size_t nelems, int pe); \
template __device__ T rocshmem_g<T>(const T *source, int pe); \
template __device__ void rocshmem_wg_broadcast<T>( \
template __device__ void rocshmem_broadcast_wg<T>( \
rocshmem_ctx_t ctx, rocshmem_team_t team, T * dest, const T *source, \
int nelem, int pe_root); \
template __device__ void rocshmem_wg_alltoall<T>( \
template __device__ void rocshmem_alltoall_wg<T>( \
rocshmem_ctx_t ctx, rocshmem_team_t team, T * dest, const T *source, \
int nelem); \
template __device__ void rocshmem_wg_fcollect<T>( \
template __device__ void rocshmem_fcollect_wg<T>( \
rocshmem_ctx_t ctx, rocshmem_team_t team, T * dest, const T *source, \
int nelem); \
template __device__ void rocshmem_put_wave<T>( \
@@ -1189,10 +1189,10 @@ __device__ int rocshmem_team_translate_pe(rocshmem_team_t src_team,
**/
#define REDUCTION_DEF_GEN(T, TNAME, Op_API, Op) \
__device__ int rocshmem_ctx_##TNAME##_##Op_API##_wg_reduce( \
__device__ int rocshmem_ctx_##TNAME##_##Op_API##_reduce_wg( \
rocshmem_ctx_t ctx, rocshmem_team_t team, T *dest, const T *source, \
int nreduce) { \
return rocshmem_wg_reduce<T, Op>(ctx, team, dest, source, nreduce); \
return rocshmem_reduce_wg<T, Op>(ctx, team, dest, source, nreduce); \
}
#define ARITH_REDUCTION_DEF_GEN(T, TNAME) \
@@ -1323,20 +1323,20 @@ __device__ int rocshmem_team_translate_pe(rocshmem_team_t src_team,
size_t nelems, int pe) { \
rocshmem_get_nbi_wg<T>(dest, source, nelems, pe); \
} \
__device__ void rocshmem_ctx_##TNAME##_wg_broadcast( \
__device__ void rocshmem_ctx_##TNAME##_broadcast_wg( \
rocshmem_ctx_t ctx, rocshmem_team_t team, T *dest, const T *source, \
int nelem, int pe_root) { \
rocshmem_wg_broadcast<T>(ctx, team, dest, source, nelem, pe_root); \
rocshmem_broadcast_wg<T>(ctx, team, dest, source, nelem, pe_root); \
} \
__device__ void rocshmem_ctx_##TNAME##_wg_alltoall( \
__device__ void rocshmem_ctx_##TNAME##_alltoall_wg( \
rocshmem_ctx_t ctx, rocshmem_team_t team, T *dest, const T *source, \
int nelem) { \
rocshmem_wg_alltoall<T>(ctx, team, dest, source, nelem); \
rocshmem_alltoall_wg<T>(ctx, team, dest, source, nelem); \
} \
__device__ void rocshmem_ctx_##TNAME##_wg_fcollect( \
__device__ void rocshmem_ctx_##TNAME##_fcollect_wg( \
rocshmem_ctx_t ctx, rocshmem_team_t team, T *dest, const T *source, \
int nelem) { \
rocshmem_wg_fcollect<T>(ctx, team, dest, source, nelem); \
rocshmem_fcollect_wg<T>(ctx, team, dest, source, nelem); \
}
#define AMO_STANDARD_DEF_GEN(T, TNAME) \
@@ -53,11 +53,11 @@ __global__ void BarrierAllTest(int loop, int skip, long long int *start_time,
break;
case WAVEBarrierAllTestType:
if(wf_id == 0) {
rocshmem_ctx_wave_barrier_all(ctx);
rocshmem_ctx_barrier_all_wave(ctx);
}
break;
case WGBarrierAllTestType:
rocshmem_ctx_wg_barrier_all(ctx);
rocshmem_ctx_barrier_all_wg(ctx);
break;
default:
break;
@@ -53,11 +53,11 @@ __global__ void SyncAllTest(int loop, int skip, long long int *start_time,
break;
case WAVESyncAllTestType:
if(wf_id == 0) {
rocshmem_ctx_wave_sync_all(ctx);
rocshmem_ctx_sync_all_wave(ctx);
}
break;
case WGSyncAllTestType:
rocshmem_ctx_wg_sync_all(ctx);
rocshmem_ctx_sync_all_wg(ctx);
break;
default:
break;
@@ -45,16 +45,16 @@ __global__ void SyncTest(int loop, int skip, long long int *start_time,
switch (type) {
case SyncTestType:
if(t_id == 0) {
rocshmem_ctx_team_sync(ctx, teams[wg_id]);
rocshmem_ctx_sync(ctx, teams[wg_id]);
}
break;
case WAVESyncTestType:
if(wf_id == 0) {
rocshmem_ctx_wave_team_sync(ctx, teams[wg_id]);
rocshmem_ctx_sync_wave(ctx, teams[wg_id]);
}
break;
case WGSyncTestType:
rocshmem_ctx_wg_team_sync(ctx, teams[wg_id]);
rocshmem_ctx_sync_wg(ctx, teams[wg_id]);
break;
default:
break;
@@ -32,7 +32,7 @@ __device__ void wg_team_alltoall(rocshmem_ctx_t ctx, rocshmem_team_t team,
template <> \
__device__ void wg_team_alltoall<T>(rocshmem_ctx_t ctx, rocshmem_team_t team,\
T * dest, const T *source, int nelem) { \
rocshmem_ctx_##TNAME##_wg_alltoall(ctx, team, dest, source, nelem); \
rocshmem_ctx_##TNAME##_alltoall_wg(ctx, team, dest, source, nelem); \
}
TEAM_ALLTOALL_DEF_GEN(float, float)
@@ -51,11 +51,11 @@ __global__ void TeamBarrierTest(int loop, int skip, long long int *start_time,
break;
case TeamWAVEBarrierTestType:
if(wf_id == 0) {
rocshmem_ctx_wave_barrier(ctx, teams[wg_id]);
rocshmem_ctx_barrier_wave(ctx, teams[wg_id]);
}
break;
case TeamWGBarrierTestType:
rocshmem_ctx_wg_barrier(ctx, teams[wg_id]);
rocshmem_ctx_barrier_wg(ctx, teams[wg_id]);
break;
default:
break;
@@ -34,7 +34,7 @@ __device__ void wg_team_broadcast(rocshmem_ctx_t ctx, rocshmem_team_t team,
__device__ void wg_team_broadcast<T>( \
rocshmem_ctx_t ctx, rocshmem_team_t team, T * dest, const T *source, \
int nelem, int pe_root) { \
rocshmem_ctx_##TNAME##_wg_broadcast(ctx, team, dest, source, nelem, \
rocshmem_ctx_##TNAME##_broadcast_wg(ctx, team, dest, source, nelem, \
pe_root); \
}
@@ -32,7 +32,7 @@ __device__ void wg_team_fcollect(rocshmem_ctx_t ctx, rocshmem_team_t team,
template <> \
__device__ void wg_team_fcollect<T>(rocshmem_ctx_t ctx, rocshmem_team_t team,\
T * dest, const T *source, int nelem) { \
rocshmem_ctx_##TNAME##_wg_fcollect(ctx, team, dest, source, nelem); \
rocshmem_ctx_##TNAME##_fcollect_wg(ctx, team, dest, source, nelem); \
}
TEAM_FCOLLECT_DEF_GEN(float, float)
@@ -35,7 +35,7 @@ __device__ int wg_team_reduce(rocshmem_ctx_t ctx, rocshmem_team_t, T *dest,
__device__ int wg_team_reduce<T, Op>(rocshmem_ctx_t ctx, \
rocshmem_team_t team, T * dest, \
const T *source, int nreduce) { \
return rocshmem_ctx_##TNAME##_##Op_API##_wg_reduce(ctx, team, dest, \
return rocshmem_ctx_##TNAME##_##Op_API##_reduce_wg(ctx, team, dest, \
source, nreduce); \
}
@@ -93,7 +93,7 @@ __global__ void TeamReductionTest(int loop, int skip, long long int *start_time,
start_time[wg_id] = wall_clock64();
}
wg_team_reduce<T1, T2>(ctx, team, r_buf, s_buf, size);
rocshmem_ctx_wg_barrier_all(ctx);
rocshmem_ctx_barrier_all_wg(ctx);
}
__syncthreads();
@@ -43,7 +43,7 @@ types = [
def alltoall_api(T, TNAME):
return (
f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_wg_alltoall(\n"
f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_alltoall_wg(\n"
f" rocshmem_ctx_t ctx, rocshmem_team_t team, {T} *dest,\n"
f" const {T} *source, int nelems);\n\n"
)
@@ -75,7 +75,7 @@ def generate_alltoall_api():
def broadcast_api(T, TNAME):
return (
f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_wg_broadcast(\n"
f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_broadcast_wg(\n"
f" rocshmem_ctx_t ctx, rocshmem_team_t team, {T} *dest,\n"
f" const {T} *source, int nelems, int pe_root);\n"
f"__host__ void rocshmem_ctx_{TNAME}_broadcast(\n"
@@ -120,7 +120,7 @@ def generate_broadcast_api():
def fcollect_api(T, TNAME):
return (
f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_wg_fcollect(\n"
f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_fcollect_wg(\n"
f" rocshmem_ctx_t ctx, rocshmem_team_t team, {T} *dest,\n"
f" const {T} *source, int nelems);\n\n"
)
@@ -152,7 +152,7 @@ def generate_fcollect_api():
def reduction_api(T, TNAME, Op_API):
return (
f"__device__ ATTR_NO_INLINE int rocshmem_ctx_{TNAME}_{Op_API}_wg_reduce(\n"
f"__device__ ATTR_NO_INLINE int rocshmem_ctx_{TNAME}_{Op_API}_reduce_wg(\n"
f" rocshmem_ctx_t ctx, rocshmem_team_t team, {T} *dest, const {T} *source,\n"
f" int nreduce);\n"
f"__host__ int rocshmem_ctx_{TNAME}_{Op_API}_reduce(\n"