Update collective APIs naming (#77)
* Update the naming convention for collective APIs to ensure consistency across the interface.
* Move all collective API declarations to rocshmem_COLL.hpp
* The following APIs were updated as part of this change:
- `barrier`
- `barrier_all`
- `sync`
- `sync_all`
- `all_to_all`
- `broadcast`
- `fcollect`
- `all_reduce`
* Update header file generation code for collective APIs
[ROCm/rocshmem commit: 68421895d6]
This commit is contained in:
committed by
GitHub
parent
5b22ddd1ff
commit
41d5d739e2
@@ -37,7 +37,7 @@ __global__ void allreduce_test(int *source, int *dest, size_t nelem,
|
||||
rocshmem_wg_ctx_create(ctx_type, &ctx);
|
||||
int num_pes = rocshmem_ctx_n_pes(ctx);
|
||||
|
||||
rocshmem_ctx_int_sum_wg_reduce(ctx, team, dest, source, nelem);
|
||||
rocshmem_ctx_int_sum_reduce_wg(ctx, team, dest, source, nelem);
|
||||
|
||||
rocshmem_ctx_quiet(ctx);
|
||||
__syncthreads();
|
||||
|
||||
@@ -37,7 +37,7 @@ __global__ void alltoall_test(int *source, int *dest, size_t nelem,
|
||||
rocshmem_wg_ctx_create(ctx_type, &ctx);
|
||||
int num_pes = rocshmem_ctx_n_pes(ctx);
|
||||
|
||||
rocshmem_ctx_int_wg_alltoall(ctx, team, dest, source, nelem);
|
||||
rocshmem_ctx_int_alltoall_wg(ctx, team, dest, source, nelem);
|
||||
|
||||
rocshmem_ctx_quiet(ctx);
|
||||
__syncthreads();
|
||||
|
||||
@@ -37,7 +37,7 @@ __global__ void broadcast_test(int *source, int *dest, size_t nelem,
|
||||
rocshmem_wg_ctx_create(ctx_type, &ctx);
|
||||
int num_pes = rocshmem_ctx_n_pes(ctx);
|
||||
|
||||
rocshmem_ctx_int_wg_broadcast(ctx, team, dest, source, nelem, root);
|
||||
rocshmem_ctx_int_broadcast_wg(ctx, team, dest, source, nelem, root);
|
||||
|
||||
rocshmem_ctx_quiet(ctx);
|
||||
__syncthreads();
|
||||
|
||||
@@ -490,210 +490,6 @@ __device__ int rocshmem_team_translate_pe(rocshmem_team_t src_team,
|
||||
int src_pe,
|
||||
rocshmem_team_t dest_team);
|
||||
|
||||
/**
|
||||
* @brief perform a collective barrier between all PEs in the system.
|
||||
* The caller is blocked until the barrier is resolved.
|
||||
*
|
||||
* This function must be invoked by a single thread within the PE.
|
||||
*
|
||||
* @param[in] handle GPU side handle.
|
||||
*
|
||||
* @return void
|
||||
*/
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_barrier_all(
|
||||
rocshmem_ctx_t ctx);
|
||||
|
||||
/**
|
||||
* @brief perform a collective barrier between all PEs in the system.
|
||||
* The caller is blocked until the barrier is resolved.
|
||||
*
|
||||
* This function must be called as a wave-front collective.
|
||||
*
|
||||
* @param[in] handle GPU side handle.
|
||||
*
|
||||
* @return void
|
||||
*/
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_wave_barrier_all(
|
||||
rocshmem_ctx_t ctx);
|
||||
|
||||
/**
|
||||
* @brief perform a collective barrier between all PEs in the system.
|
||||
* The caller is blocked until the barrier is resolved.
|
||||
*
|
||||
* This function must be called as a work-group collective.
|
||||
*
|
||||
* @param[in] handle GPU side handle.
|
||||
*
|
||||
* @return void
|
||||
*/
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_wg_barrier_all(
|
||||
rocshmem_ctx_t ctx);
|
||||
|
||||
/**
|
||||
* @brief perform a collective barrier between all PEs in the team.
|
||||
* The caller is blocked until the barrier is resolved.
|
||||
*
|
||||
* This function must be invoked by a single thread within the PE.
|
||||
*
|
||||
* @param[in] handle GPU side handle.
|
||||
*
|
||||
* @param[in] team The team on which to perform barrier synchronization
|
||||
*
|
||||
* @return void
|
||||
*/
|
||||
__device__ void rocshmem_ctx_barrier(rocshmem_ctx_t ctx, rocshmem_team_t team);
|
||||
|
||||
/**
|
||||
* @brief perform a collective barrier between all PEs in the team.
|
||||
* The caller is blocked until the barrier is resolved.
|
||||
*
|
||||
* This function must be called as a wave-front collective.
|
||||
*
|
||||
* @param[in] handle GPU side handle.
|
||||
*
|
||||
* @param[in] team The team on which to perform barrier synchronization
|
||||
*
|
||||
* @return void
|
||||
*/
|
||||
__device__ void rocshmem_ctx_wave_barrier(rocshmem_ctx_t ctx, rocshmem_team_t team);
|
||||
|
||||
/**
|
||||
* @brief perform a collective barrier between all PEs in the team.
|
||||
* The caller is blocked until the barrier is resolved.
|
||||
*
|
||||
* This function must be called as a work-group collective.
|
||||
*
|
||||
* @param[in] handle GPU side handle.
|
||||
*
|
||||
* @param[in] team The team on which to perform barrier synchronization
|
||||
*
|
||||
* @return void
|
||||
*/
|
||||
__device__ void rocshmem_ctx_wg_barrier(rocshmem_ctx_t ctx, rocshmem_team_t team);
|
||||
|
||||
/**
|
||||
* @brief registers the arrival of a PE at a barrier.
|
||||
* The caller is blocked until the synchronization is resolved.
|
||||
*
|
||||
* In contrast with the shmem_barrier_all routine, shmem_sync_all only ensures
|
||||
* completion and visibility of previously issued memory stores and does not
|
||||
* ensure completion of remote memory updates issued via OpenSHMEM routines.
|
||||
*
|
||||
* This function must be invoked by a single thread within the PE.
|
||||
*
|
||||
* @param[in] handle GPU side handle.
|
||||
*
|
||||
* @return void
|
||||
*/
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_sync_all(rocshmem_ctx_t ctx);
|
||||
|
||||
/**
|
||||
* @brief registers the arrival of a PE at a barrier.
|
||||
* The caller is blocked until the synchronization is resolved.
|
||||
*
|
||||
* In contrast with the shmem_barrier_all routine, shmem_sync_all only ensures
|
||||
* completion and visibility of previously issued memory stores and does not
|
||||
* ensure completion of remote memory updates issued via OpenSHMEM routines.
|
||||
*
|
||||
* This function must be called as a wave-front collective.
|
||||
*
|
||||
* @param[in] handle GPU side handle.
|
||||
*
|
||||
* @return void
|
||||
*/
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_wave_sync_all(rocshmem_ctx_t ctx);
|
||||
|
||||
/**
|
||||
* @brief registers the arrival of a PE at a barrier.
|
||||
* The caller is blocked until the synchronization is resolved.
|
||||
*
|
||||
* In contrast with the shmem_barrier_all routine, shmem_sync_all only ensures
|
||||
* completion and visibility of previously issued memory stores and does not
|
||||
* ensure completion of remote memory updates issued via OpenSHMEM routines.
|
||||
*
|
||||
* This function must be called as a work-group collective.
|
||||
*
|
||||
* @param[in] handle GPU side handle.
|
||||
*
|
||||
* @return void
|
||||
*/
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_wg_sync_all(rocshmem_ctx_t ctx);
|
||||
|
||||
/**
|
||||
* @brief registers the arrival of a PE at a barrier.
|
||||
* The caller is blocked until the synchronization is resolved.
|
||||
*
|
||||
* In contrast with the shmem_barrier_all routine, shmem_team_sync only ensures
|
||||
* completion and visibility of previously issued memory stores and does not
|
||||
* ensure completion of remote memory updates issued via OpenSHMEM routines.
|
||||
*
|
||||
* This function must be invoked by a single thread within the PE.
|
||||
*
|
||||
* @param[in] handle GPU side handle.
|
||||
* @param[in] team Handle of the team being synchronized
|
||||
*
|
||||
* @return void
|
||||
*/
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_team_sync(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team);
|
||||
|
||||
/**
|
||||
* @brief registers the arrival of a PE at a barrier.
|
||||
* The caller is blocked until the synchronization is resolved.
|
||||
*
|
||||
* In contrast with the shmem_barrier_all routine, shmem_team_sync only ensures
|
||||
* completion and visibility of previously issued memory stores and does not
|
||||
* ensure completion of remote memory updates issued via OpenSHMEM routines.
|
||||
*
|
||||
* This function must be called as a wave-front collective.
|
||||
*
|
||||
* @param[in] handle GPU side handle.
|
||||
* @param[in] team Handle of the team being synchronized
|
||||
*
|
||||
* @return void
|
||||
*/
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_wave_team_sync(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team);
|
||||
|
||||
/**
|
||||
* @brief registers the arrival of a PE at a barrier.
|
||||
* The caller is blocked until the synchronization is resolved.
|
||||
*
|
||||
* In contrast with the shmem_barrier_all routine, shmem_team_sync only ensures
|
||||
* completion and visibility of previously issued memory stores and does not
|
||||
* ensure completion of remote memory updates issued via OpenSHMEM routines.
|
||||
*
|
||||
* This function must be called as a work-group collective.
|
||||
*
|
||||
* @param[in] handle GPU side handle.
|
||||
* @param[in] team Handle of the team being synchronized
|
||||
*
|
||||
* @return void
|
||||
*/
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_wg_team_sync(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team);
|
||||
|
||||
/**
|
||||
* @brief Query a local pointer to a symmetric data object on the
|
||||
* specified \pe . Returns an address that may be used to directly reference
|
||||
* dest on the specified \pe. This address can be accesses with LD/ST ops.
|
||||
*
|
||||
* Can be called per thread with no performance penalty.
|
||||
*/
|
||||
__device__ ATTR_NO_INLINE void *rocshmem_ptr(const void *dest, int pe);
|
||||
|
||||
/**
|
||||
* @brief Make all uncacheable GPU data visible to other agents in the sytem.
|
||||
*
|
||||
* This only works for data that was explicitly allocated uncacheable on the
|
||||
* GPU!
|
||||
*
|
||||
* Can be called per thread with no performance penalty.
|
||||
*
|
||||
* @param[in] GPU-side handle.
|
||||
*
|
||||
* @return void
|
||||
*/
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_threadfence_system(
|
||||
rocshmem_ctx_t ctx);
|
||||
|
||||
|
||||
@@ -41,55 +41,55 @@ namespace rocshmem {
|
||||
*
|
||||
* @return void
|
||||
*/
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_float_wg_alltoall(
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_float_alltoall_wg(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest,
|
||||
const float *source, int nelems);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_double_wg_alltoall(
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_double_alltoall_wg(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest,
|
||||
const double *source, int nelems);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_char_wg_alltoall(
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_char_alltoall_wg(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, char *dest,
|
||||
const char *source, int nelems);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_wg_alltoall(
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_alltoall_wg(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, signed char *dest,
|
||||
const signed char *source, int nelems);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_short_wg_alltoall(
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_short_alltoall_wg(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest,
|
||||
const short *source, int nelems);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_int_wg_alltoall(
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_int_alltoall_wg(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest,
|
||||
const int *source, int nelems);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_long_wg_alltoall(
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_long_alltoall_wg(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest,
|
||||
const long *source, int nelems);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_wg_alltoall(
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_alltoall_wg(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest,
|
||||
const long long *source, int nelems);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_wg_alltoall(
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_alltoall_wg(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned char *dest,
|
||||
const unsigned char *source, int nelems);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_wg_alltoall(
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_alltoall_wg(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned short *dest,
|
||||
const unsigned short *source, int nelems);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_wg_alltoall(
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_alltoall_wg(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned int *dest,
|
||||
const unsigned int *source, int nelems);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_wg_alltoall(
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_alltoall_wg(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long *dest,
|
||||
const unsigned long *source, int nelems);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_wg_alltoall(
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_alltoall_wg(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long long *dest,
|
||||
const unsigned long long *source, int nelems);
|
||||
|
||||
@@ -116,7 +116,7 @@ __device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_wg_alltoall(
|
||||
*
|
||||
* @return void
|
||||
*/
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_float_wg_broadcast(
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_float_broadcast_wg(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest,
|
||||
const float *source, int nelems, int pe_root);
|
||||
__host__ void rocshmem_ctx_float_broadcast(
|
||||
@@ -127,7 +127,7 @@ __host__ void rocshmem_ctx_float_broadcast(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest,
|
||||
const float *source, int nelems, int pe_root);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_double_wg_broadcast(
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_double_broadcast_wg(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest,
|
||||
const double *source, int nelems, int pe_root);
|
||||
__host__ void rocshmem_ctx_double_broadcast(
|
||||
@@ -138,7 +138,7 @@ __host__ void rocshmem_ctx_double_broadcast(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest,
|
||||
const double *source, int nelems, int pe_root);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_char_wg_broadcast(
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_char_broadcast_wg(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, char *dest,
|
||||
const char *source, int nelems, int pe_root);
|
||||
__host__ void rocshmem_ctx_char_broadcast(
|
||||
@@ -149,7 +149,7 @@ __host__ void rocshmem_ctx_char_broadcast(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, char *dest,
|
||||
const char *source, int nelems, int pe_root);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_wg_broadcast(
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_broadcast_wg(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, signed char *dest,
|
||||
const signed char *source, int nelems, int pe_root);
|
||||
__host__ void rocshmem_ctx_schar_broadcast(
|
||||
@@ -160,7 +160,7 @@ __host__ void rocshmem_ctx_schar_broadcast(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, signed char *dest,
|
||||
const signed char *source, int nelems, int pe_root);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_short_wg_broadcast(
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_short_broadcast_wg(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest,
|
||||
const short *source, int nelems, int pe_root);
|
||||
__host__ void rocshmem_ctx_short_broadcast(
|
||||
@@ -171,7 +171,7 @@ __host__ void rocshmem_ctx_short_broadcast(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest,
|
||||
const short *source, int nelems, int pe_root);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_int_wg_broadcast(
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_int_broadcast_wg(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest,
|
||||
const int *source, int nelems, int pe_root);
|
||||
__host__ void rocshmem_ctx_int_broadcast(
|
||||
@@ -182,7 +182,7 @@ __host__ void rocshmem_ctx_int_broadcast(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest,
|
||||
const int *source, int nelems, int pe_root);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_long_wg_broadcast(
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_long_broadcast_wg(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest,
|
||||
const long *source, int nelems, int pe_root);
|
||||
__host__ void rocshmem_ctx_long_broadcast(
|
||||
@@ -193,7 +193,7 @@ __host__ void rocshmem_ctx_long_broadcast(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest,
|
||||
const long *source, int nelems, int pe_root);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_wg_broadcast(
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_broadcast_wg(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest,
|
||||
const long long *source, int nelems, int pe_root);
|
||||
__host__ void rocshmem_ctx_longlong_broadcast(
|
||||
@@ -204,7 +204,7 @@ __host__ void rocshmem_ctx_longlong_broadcast(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest,
|
||||
const long long *source, int nelems, int pe_root);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_wg_broadcast(
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_broadcast_wg(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned char *dest,
|
||||
const unsigned char *source, int nelems, int pe_root);
|
||||
__host__ void rocshmem_ctx_uchar_broadcast(
|
||||
@@ -215,7 +215,7 @@ __host__ void rocshmem_ctx_uchar_broadcast(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned char *dest,
|
||||
const unsigned char *source, int nelems, int pe_root);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_wg_broadcast(
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_broadcast_wg(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned short *dest,
|
||||
const unsigned short *source, int nelems, int pe_root);
|
||||
__host__ void rocshmem_ctx_ushort_broadcast(
|
||||
@@ -226,7 +226,7 @@ __host__ void rocshmem_ctx_ushort_broadcast(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned short *dest,
|
||||
const unsigned short *source, int nelems, int pe_root);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_wg_broadcast(
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_broadcast_wg(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned int *dest,
|
||||
const unsigned int *source, int nelems, int pe_root);
|
||||
__host__ void rocshmem_ctx_uint_broadcast(
|
||||
@@ -237,7 +237,7 @@ __host__ void rocshmem_ctx_uint_broadcast(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned int *dest,
|
||||
const unsigned int *source, int nelems, int pe_root);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_wg_broadcast(
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_broadcast_wg(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long *dest,
|
||||
const unsigned long *source, int nelems, int pe_root);
|
||||
__host__ void rocshmem_ctx_ulong_broadcast(
|
||||
@@ -248,7 +248,7 @@ __host__ void rocshmem_ctx_ulong_broadcast(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long *dest,
|
||||
const unsigned long *source, int nelems, int pe_root);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_wg_broadcast(
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_broadcast_wg(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long long *dest,
|
||||
const unsigned long long *source, int nelems, int pe_root);
|
||||
__host__ void rocshmem_ctx_ulonglong_broadcast(
|
||||
@@ -276,55 +276,55 @@ __host__ void rocshmem_ctx_ulonglong_broadcast(
|
||||
*
|
||||
* @return void
|
||||
*/
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_float_wg_fcollect(
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_float_fcollect_wg(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest,
|
||||
const float *source, int nelems);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_double_wg_fcollect(
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_double_fcollect_wg(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest,
|
||||
const double *source, int nelems);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_char_wg_fcollect(
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_char_fcollect_wg(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, char *dest,
|
||||
const char *source, int nelems);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_wg_fcollect(
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_fcollect_wg(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, signed char *dest,
|
||||
const signed char *source, int nelems);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_short_wg_fcollect(
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_short_fcollect_wg(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest,
|
||||
const short *source, int nelems);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_int_wg_fcollect(
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_int_fcollect_wg(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest,
|
||||
const int *source, int nelems);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_long_wg_fcollect(
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_long_fcollect_wg(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest,
|
||||
const long *source, int nelems);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_wg_fcollect(
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_fcollect_wg(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest,
|
||||
const long long *source, int nelems);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_wg_fcollect(
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_fcollect_wg(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned char *dest,
|
||||
const unsigned char *source, int nelems);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_wg_fcollect(
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_fcollect_wg(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned short *dest,
|
||||
const unsigned short *source, int nelems);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_wg_fcollect(
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_fcollect_wg(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned int *dest,
|
||||
const unsigned int *source, int nelems);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_wg_fcollect(
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_fcollect_wg(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long *dest,
|
||||
const unsigned long *source, int nelems);
|
||||
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_wg_fcollect(
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_fcollect_wg(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long long *dest,
|
||||
const unsigned long long *source, int nelems);
|
||||
|
||||
@@ -345,258 +345,462 @@ __device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_wg_fcollect(
|
||||
*
|
||||
* @return int (Zero on successful local completion. Nonzero otherwise.)
|
||||
*/
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_short_sum_wg_reduce(
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_short_sum_reduce_wg(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
|
||||
int nreduce);
|
||||
__host__ int rocshmem_ctx_short_sum_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
|
||||
int nreduce);
|
||||
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_short_min_wg_reduce(
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_short_min_reduce_wg(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
|
||||
int nreduce);
|
||||
__host__ int rocshmem_ctx_short_min_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
|
||||
int nreduce);
|
||||
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_short_max_wg_reduce(
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_short_max_reduce_wg(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
|
||||
int nreduce);
|
||||
__host__ int rocshmem_ctx_short_max_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
|
||||
int nreduce);
|
||||
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_short_prod_wg_reduce(
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_short_prod_reduce_wg(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
|
||||
int nreduce);
|
||||
__host__ int rocshmem_ctx_short_prod_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
|
||||
int nreduce);
|
||||
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_short_or_wg_reduce(
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_short_or_reduce_wg(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
|
||||
int nreduce);
|
||||
__host__ int rocshmem_ctx_short_or_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
|
||||
int nreduce);
|
||||
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_short_and_wg_reduce(
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_short_and_reduce_wg(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
|
||||
int nreduce);
|
||||
__host__ int rocshmem_ctx_short_and_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
|
||||
int nreduce);
|
||||
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_short_xor_wg_reduce(
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_short_xor_reduce_wg(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
|
||||
int nreduce);
|
||||
__host__ int rocshmem_ctx_short_xor_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
|
||||
int nreduce);
|
||||
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_sum_wg_reduce(
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_sum_reduce_wg(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
|
||||
int nreduce);
|
||||
__host__ int rocshmem_ctx_int_sum_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
|
||||
int nreduce);
|
||||
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_min_wg_reduce(
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_min_reduce_wg(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
|
||||
int nreduce);
|
||||
__host__ int rocshmem_ctx_int_min_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
|
||||
int nreduce);
|
||||
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_max_wg_reduce(
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_max_reduce_wg(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
|
||||
int nreduce);
|
||||
__host__ int rocshmem_ctx_int_max_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
|
||||
int nreduce);
|
||||
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_prod_wg_reduce(
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_prod_reduce_wg(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
|
||||
int nreduce);
|
||||
__host__ int rocshmem_ctx_int_prod_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
|
||||
int nreduce);
|
||||
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_or_wg_reduce(
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_or_reduce_wg(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
|
||||
int nreduce);
|
||||
__host__ int rocshmem_ctx_int_or_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
|
||||
int nreduce);
|
||||
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_and_wg_reduce(
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_and_reduce_wg(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
|
||||
int nreduce);
|
||||
__host__ int rocshmem_ctx_int_and_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
|
||||
int nreduce);
|
||||
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_xor_wg_reduce(
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_xor_reduce_wg(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
|
||||
int nreduce);
|
||||
__host__ int rocshmem_ctx_int_xor_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
|
||||
int nreduce);
|
||||
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_long_sum_wg_reduce(
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_long_sum_reduce_wg(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
|
||||
int nreduce);
|
||||
__host__ int rocshmem_ctx_long_sum_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
|
||||
int nreduce);
|
||||
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_long_min_wg_reduce(
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_long_min_reduce_wg(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
|
||||
int nreduce);
|
||||
__host__ int rocshmem_ctx_long_min_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
|
||||
int nreduce);
|
||||
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_long_max_wg_reduce(
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_long_max_reduce_wg(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
|
||||
int nreduce);
|
||||
__host__ int rocshmem_ctx_long_max_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
|
||||
int nreduce);
|
||||
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_long_prod_wg_reduce(
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_long_prod_reduce_wg(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
|
||||
int nreduce);
|
||||
__host__ int rocshmem_ctx_long_prod_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
|
||||
int nreduce);
|
||||
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_long_or_wg_reduce(
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_long_or_reduce_wg(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
|
||||
int nreduce);
|
||||
__host__ int rocshmem_ctx_long_or_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
|
||||
int nreduce);
|
||||
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_long_and_wg_reduce(
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_long_and_reduce_wg(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
|
||||
int nreduce);
|
||||
__host__ int rocshmem_ctx_long_and_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
|
||||
int nreduce);
|
||||
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_long_xor_wg_reduce(
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_long_xor_reduce_wg(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
|
||||
int nreduce);
|
||||
__host__ int rocshmem_ctx_long_xor_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
|
||||
int nreduce);
|
||||
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_sum_wg_reduce(
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_sum_reduce_wg(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
|
||||
int nreduce);
|
||||
__host__ int rocshmem_ctx_longlong_sum_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
|
||||
int nreduce);
|
||||
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_min_wg_reduce(
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_min_reduce_wg(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
|
||||
int nreduce);
|
||||
__host__ int rocshmem_ctx_longlong_min_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
|
||||
int nreduce);
|
||||
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_max_wg_reduce(
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_max_reduce_wg(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
|
||||
int nreduce);
|
||||
__host__ int rocshmem_ctx_longlong_max_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
|
||||
int nreduce);
|
||||
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_prod_wg_reduce(
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_prod_reduce_wg(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
|
||||
int nreduce);
|
||||
__host__ int rocshmem_ctx_longlong_prod_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
|
||||
int nreduce);
|
||||
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_or_wg_reduce(
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_or_reduce_wg(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
|
||||
int nreduce);
|
||||
__host__ int rocshmem_ctx_longlong_or_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
|
||||
int nreduce);
|
||||
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_and_wg_reduce(
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_and_reduce_wg(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
|
||||
int nreduce);
|
||||
__host__ int rocshmem_ctx_longlong_and_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
|
||||
int nreduce);
|
||||
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_xor_wg_reduce(
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_xor_reduce_wg(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
|
||||
int nreduce);
|
||||
__host__ int rocshmem_ctx_longlong_xor_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
|
||||
int nreduce);
|
||||
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_float_sum_wg_reduce(
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_float_sum_reduce_wg(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source,
|
||||
int nreduce);
|
||||
__host__ int rocshmem_ctx_float_sum_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source,
|
||||
int nreduce);
|
||||
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_float_min_wg_reduce(
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_float_min_reduce_wg(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source,
|
||||
int nreduce);
|
||||
__host__ int rocshmem_ctx_float_min_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source,
|
||||
int nreduce);
|
||||
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_float_max_wg_reduce(
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_float_max_reduce_wg(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source,
|
||||
int nreduce);
|
||||
__host__ int rocshmem_ctx_float_max_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source,
|
||||
int nreduce);
|
||||
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_float_prod_wg_reduce(
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_float_prod_reduce_wg(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source,
|
||||
int nreduce);
|
||||
__host__ int rocshmem_ctx_float_prod_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source,
|
||||
int nreduce);
|
||||
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_double_sum_wg_reduce(
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_double_sum_reduce_wg(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source,
|
||||
int nreduce);
|
||||
__host__ int rocshmem_ctx_double_sum_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source,
|
||||
int nreduce);
|
||||
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_double_min_wg_reduce(
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_double_min_reduce_wg(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source,
|
||||
int nreduce);
|
||||
__host__ int rocshmem_ctx_double_min_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source,
|
||||
int nreduce);
|
||||
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_double_max_wg_reduce(
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_double_max_reduce_wg(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source,
|
||||
int nreduce);
|
||||
__host__ int rocshmem_ctx_double_max_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source,
|
||||
int nreduce);
|
||||
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_double_prod_wg_reduce(
|
||||
__device__ ATTR_NO_INLINE int rocshmem_ctx_double_prod_reduce_wg(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source,
|
||||
int nreduce);
|
||||
__host__ int rocshmem_ctx_double_prod_reduce(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source,
|
||||
int nreduce);
|
||||
|
||||
/**
|
||||
* @brief perform a collective barrier between all PEs in the system.
|
||||
* The caller is blocked until the barrier is resolved.
|
||||
*
|
||||
* This function must be invoked by a single thread within the PE.
|
||||
*
|
||||
* @param[in] handle GPU side handle.
|
||||
*
|
||||
* @return void
|
||||
*/
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_barrier_all(
|
||||
rocshmem_ctx_t ctx);
|
||||
|
||||
/**
|
||||
* @brief perform a collective barrier between all PEs in the system.
|
||||
* The caller is blocked until the barrier is resolved.
|
||||
*
|
||||
* This function must be called as a wave-front collective.
|
||||
*
|
||||
* @param[in] handle GPU side handle.
|
||||
*
|
||||
* @return void
|
||||
*/
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_barrier_all_wave(
|
||||
rocshmem_ctx_t ctx);
|
||||
|
||||
/**
|
||||
* @brief perform a collective barrier between all PEs in the system.
|
||||
* The caller is blocked until the barrier is resolved.
|
||||
*
|
||||
* This function must be called as a work-group collective.
|
||||
*
|
||||
* @param[in] handle GPU side handle.
|
||||
*
|
||||
* @return void
|
||||
*/
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_barrier_all_wg(
|
||||
rocshmem_ctx_t ctx);
|
||||
|
||||
/**
|
||||
* @brief perform a collective barrier between all PEs in the team.
|
||||
* The caller is blocked until the barrier is resolved.
|
||||
*
|
||||
* This function must be invoked by a single thread within the PE.
|
||||
*
|
||||
* @param[in] handle GPU side handle.
|
||||
*
|
||||
* @param[in] team The team on which to perform barrier synchronization
|
||||
*
|
||||
* @return void
|
||||
*/
|
||||
__device__ void rocshmem_ctx_barrier(rocshmem_ctx_t ctx, rocshmem_team_t team);
|
||||
|
||||
/**
|
||||
* @brief perform a collective barrier between all PEs in the team.
|
||||
* The caller is blocked until the barrier is resolved.
|
||||
*
|
||||
* This function must be called as a wave-front collective.
|
||||
*
|
||||
* @param[in] handle GPU side handle.
|
||||
*
|
||||
* @param[in] team The team on which to perform barrier synchronization
|
||||
*
|
||||
* @return void
|
||||
*/
|
||||
__device__ void rocshmem_ctx_barrier_wave(rocshmem_ctx_t ctx, rocshmem_team_t team);
|
||||
|
||||
/**
|
||||
* @brief perform a collective barrier between all PEs in the team.
|
||||
* The caller is blocked until the barrier is resolved.
|
||||
*
|
||||
* This function must be called as a work-group collective.
|
||||
*
|
||||
* @param[in] handle GPU side handle.
|
||||
*
|
||||
* @param[in] team The team on which to perform barrier synchronization
|
||||
*
|
||||
* @return void
|
||||
*/
|
||||
__device__ void rocshmem_ctx_barrier_wg(rocshmem_ctx_t ctx, rocshmem_team_t team);
|
||||
|
||||
/**
|
||||
* @brief registers the arrival of a PE at a barrier.
|
||||
* The caller is blocked until the synchronization is resolved.
|
||||
*
|
||||
* In contrast with the shmem_barrier_all routine, shmem_sync_all only ensures
|
||||
* completion and visibility of previously issued memory stores and does not
|
||||
* ensure completion of remote memory updates issued via OpenSHMEM routines.
|
||||
*
|
||||
* This function must be invoked by a single thread within the PE.
|
||||
*
|
||||
* @param[in] handle GPU side handle.
|
||||
*
|
||||
* @return void
|
||||
*/
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_sync_all(rocshmem_ctx_t ctx);
|
||||
|
||||
/**
|
||||
* @brief registers the arrival of a PE at a barrier.
|
||||
* The caller is blocked until the synchronization is resolved.
|
||||
*
|
||||
* In contrast with the shmem_barrier_all routine, shmem_sync_all only ensures
|
||||
* completion and visibility of previously issued memory stores and does not
|
||||
* ensure completion of remote memory updates issued via OpenSHMEM routines.
|
||||
*
|
||||
* This function must be called as a wave-front collective.
|
||||
*
|
||||
* @param[in] handle GPU side handle.
|
||||
*
|
||||
* @return void
|
||||
*/
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_sync_all_wave(rocshmem_ctx_t ctx);
|
||||
|
||||
/**
|
||||
* @brief registers the arrival of a PE at a barrier.
|
||||
* The caller is blocked until the synchronization is resolved.
|
||||
*
|
||||
* In contrast with the shmem_barrier_all routine, shmem_sync_all only ensures
|
||||
* completion and visibility of previously issued memory stores and does not
|
||||
* ensure completion of remote memory updates issued via OpenSHMEM routines.
|
||||
*
|
||||
* This function must be called as a work-group collective.
|
||||
*
|
||||
* @param[in] handle GPU side handle.
|
||||
*
|
||||
* @return void
|
||||
*/
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_sync_all_wg(rocshmem_ctx_t ctx);
|
||||
|
||||
/**
|
||||
* @brief registers the arrival of a PE at a barrier.
|
||||
* The caller is blocked until the synchronization is resolved.
|
||||
*
|
||||
* In contrast with the shmem_barrier_all routine, shmem_team_sync only ensures
|
||||
* completion and visibility of previously issued memory stores and does not
|
||||
* ensure completion of remote memory updates issued via OpenSHMEM routines.
|
||||
*
|
||||
* This function must be invoked by a single thread within the PE.
|
||||
*
|
||||
* @param[in] handle GPU side handle.
|
||||
* @param[in] team Handle of the team being synchronized
|
||||
*
|
||||
* @return void
|
||||
*/
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_sync(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team);
|
||||
|
||||
/**
|
||||
* @brief registers the arrival of a PE at a barrier.
|
||||
* The caller is blocked until the synchronization is resolved.
|
||||
*
|
||||
* In contrast with the shmem_barrier_all routine, shmem_team_sync only ensures
|
||||
* completion and visibility of previously issued memory stores and does not
|
||||
* ensure completion of remote memory updates issued via OpenSHMEM routines.
|
||||
*
|
||||
* This function must be called as a wave-front collective.
|
||||
*
|
||||
* @param[in] handle GPU side handle.
|
||||
* @param[in] team Handle of the team being synchronized
|
||||
*
|
||||
* @return void
|
||||
*/
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_sync_wave(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team);
|
||||
|
||||
/**
|
||||
* @brief registers the arrival of a PE at a barrier.
|
||||
* The caller is blocked until the synchronization is resolved.
|
||||
*
|
||||
* In contrast with the shmem_barrier_all routine, shmem_team_sync only ensures
|
||||
* completion and visibility of previously issued memory stores and does not
|
||||
* ensure completion of remote memory updates issued via OpenSHMEM routines.
|
||||
*
|
||||
* This function must be called as a work-group collective.
|
||||
*
|
||||
* @param[in] handle GPU side handle.
|
||||
* @param[in] team Handle of the team being synchronized
|
||||
*
|
||||
* @return void
|
||||
*/
|
||||
__device__ ATTR_NO_INLINE void rocshmem_ctx_sync_wg(
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team);
|
||||
|
||||
/**
|
||||
* @brief Query a local pointer to a symmetric data object on the
|
||||
* specified \pe . Returns an address that may be used to directly reference
|
||||
* dest on the specified \pe. This address can be accesses with LD/ST ops.
|
||||
*
|
||||
* Can be called per thread with no performance penalty.
|
||||
*/
|
||||
__device__ ATTR_NO_INLINE void *rocshmem_ptr(const void *dest, int pe);
|
||||
|
||||
/**
|
||||
* @brief Make all uncacheable GPU data visible to other agents in the sytem.
|
||||
*
|
||||
* This only works for data that was explicitly allocated uncacheable on the
|
||||
* GPU!
|
||||
*
|
||||
* Can be called per thread with no performance penalty.
|
||||
*
|
||||
* @param[in] GPU-side handle.
|
||||
*
|
||||
* @return void
|
||||
*/
|
||||
|
||||
} // namespace rocshmem
|
||||
|
||||
|
||||
@@ -453,7 +453,7 @@ __device__ void *rocshmem_ptr(const void *dest, int pe) {
|
||||
}
|
||||
|
||||
template <typename T, ROCSHMEM_OP Op>
|
||||
__device__ int rocshmem_wg_reduce(rocshmem_ctx_t ctx, rocshmem_team_t team,
|
||||
__device__ int rocshmem_reduce_wg(rocshmem_ctx_t ctx, rocshmem_team_t team,
|
||||
T *dest, const T *source, int nreduce) {
|
||||
GPU_DPRINTF("Function: rocshmem_reduce\n");
|
||||
|
||||
@@ -461,7 +461,7 @@ __device__ int rocshmem_wg_reduce(rocshmem_ctx_t ctx, rocshmem_team_t team,
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ void rocshmem_wg_broadcast(rocshmem_ctx_t ctx,
|
||||
__device__ void rocshmem_broadcast_wg(rocshmem_ctx_t ctx,
|
||||
rocshmem_team_t team, T *dest,
|
||||
const T *source, int nelem,
|
||||
int pe_root) {
|
||||
@@ -471,7 +471,7 @@ __device__ void rocshmem_wg_broadcast(rocshmem_ctx_t ctx,
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ void rocshmem_wg_alltoall(rocshmem_ctx_t ctx,
|
||||
__device__ void rocshmem_alltoall_wg(rocshmem_ctx_t ctx,
|
||||
rocshmem_team_t team, T *dest,
|
||||
const T *source, int nelem) {
|
||||
GPU_DPRINTF("Function: rocshmem_alltoall\n");
|
||||
@@ -480,7 +480,7 @@ __device__ void rocshmem_wg_alltoall(rocshmem_ctx_t ctx,
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ void rocshmem_wg_fcollect(rocshmem_ctx_t ctx,
|
||||
__device__ void rocshmem_fcollect_wg(rocshmem_ctx_t ctx,
|
||||
rocshmem_team_t team, T *dest,
|
||||
const T *source, int nelem) {
|
||||
GPU_DPRINTF("Function: rocshmem_fcollect\n");
|
||||
@@ -576,14 +576,14 @@ __device__ void rocshmem_ctx_barrier_all(rocshmem_ctx_t ctx) {
|
||||
get_internal_ctx(ctx)->barrier_all();
|
||||
}
|
||||
|
||||
__device__ void rocshmem_ctx_wave_barrier_all(rocshmem_ctx_t ctx) {
|
||||
GPU_DPRINTF("Function: rocshmem_ctx_wave_barrier_all\n");
|
||||
__device__ void rocshmem_ctx_barrier_all_wave(rocshmem_ctx_t ctx) {
|
||||
GPU_DPRINTF("Function: rocshmem_ctx_barrier_all_wave\n");
|
||||
|
||||
get_internal_ctx(ctx)->barrier_all_wave();
|
||||
}
|
||||
|
||||
__device__ void rocshmem_ctx_wg_barrier_all(rocshmem_ctx_t ctx) {
|
||||
GPU_DPRINTF("Function: rocshmem_ctx_wg_barrier_all\n");
|
||||
__device__ void rocshmem_ctx_barrier_all_wg(rocshmem_ctx_t ctx) {
|
||||
GPU_DPRINTF("Function: rocshmem_ctx_barrier_all_wg\n");
|
||||
|
||||
get_internal_ctx(ctx)->barrier_all_wg();
|
||||
}
|
||||
@@ -594,14 +594,14 @@ __device__ void rocshmem_ctx_barrier(rocshmem_ctx_t ctx, rocshmem_team_t team) {
|
||||
get_internal_ctx(ctx)->barrier(team);
|
||||
}
|
||||
|
||||
__device__ void rocshmem_ctx_wave_barrier(rocshmem_ctx_t ctx, rocshmem_team_t team) {
|
||||
GPU_DPRINTF("Function: rocshmem_wave_barrier\n");
|
||||
__device__ void rocshmem_ctx_barrier_wave(rocshmem_ctx_t ctx, rocshmem_team_t team) {
|
||||
GPU_DPRINTF("Function: rocshmem_barrier_wave\n");
|
||||
|
||||
get_internal_ctx(ctx)->barrier_wave(team);
|
||||
}
|
||||
|
||||
__device__ void rocshmem_ctx_wg_barrier(rocshmem_ctx_t ctx, rocshmem_team_t team) {
|
||||
GPU_DPRINTF("Function: rocshmem_wg_barrier\n");
|
||||
__device__ void rocshmem_ctx_barrier_wg(rocshmem_ctx_t ctx, rocshmem_team_t team) {
|
||||
GPU_DPRINTF("Function: rocshmem_barrier_wg\n");
|
||||
|
||||
get_internal_ctx(ctx)->barrier_wg(team);
|
||||
}
|
||||
@@ -612,35 +612,35 @@ __device__ void rocshmem_ctx_sync_all(rocshmem_ctx_t ctx) {
|
||||
get_internal_ctx(ctx)->sync_all();
|
||||
}
|
||||
|
||||
__device__ void rocshmem_ctx_wave_sync_all(rocshmem_ctx_t ctx) {
|
||||
GPU_DPRINTF("Function: rocshmem_ctx_wave_sync_all\n");
|
||||
__device__ void rocshmem_ctx_sync_all_wave(rocshmem_ctx_t ctx) {
|
||||
GPU_DPRINTF("Function: rocshmem_ctx_sync_all_wave\n");
|
||||
|
||||
get_internal_ctx(ctx)->sync_all_wave();
|
||||
}
|
||||
|
||||
__device__ void rocshmem_ctx_wg_sync_all(rocshmem_ctx_t ctx) {
|
||||
GPU_DPRINTF("Function: rocshmem_ctx_wg_sync_all\n");
|
||||
__device__ void rocshmem_ctx_sync_all_wg(rocshmem_ctx_t ctx) {
|
||||
GPU_DPRINTF("Function: rocshmem_ctx_sync_all_wg\n");
|
||||
|
||||
get_internal_ctx(ctx)->sync_all_wg();
|
||||
}
|
||||
|
||||
__device__ void rocshmem_ctx_team_sync(rocshmem_ctx_t ctx,
|
||||
__device__ void rocshmem_ctx_sync(rocshmem_ctx_t ctx,
|
||||
rocshmem_team_t team) {
|
||||
GPU_DPRINTF("Function: rocshmem_ctx_sync_all\n");
|
||||
GPU_DPRINTF("Function: rocshmem_ctx_sync\n");
|
||||
|
||||
get_internal_ctx(ctx)->sync_wg(team);
|
||||
}
|
||||
|
||||
__device__ void rocshmem_ctx_wave_team_sync(rocshmem_ctx_t ctx,
|
||||
__device__ void rocshmem_ctx_sync_wave(rocshmem_ctx_t ctx,
|
||||
rocshmem_team_t team) {
|
||||
GPU_DPRINTF("Function: rocshmem_ctx_wave_sync_all\n");
|
||||
GPU_DPRINTF("Function: rocshmem_ctx_sync_wave\n");
|
||||
|
||||
get_internal_ctx(ctx)->sync_wg(team);
|
||||
}
|
||||
|
||||
__device__ void rocshmem_ctx_wg_team_sync(rocshmem_ctx_t ctx,
|
||||
__device__ void rocshmem_ctx_sync_wg(rocshmem_ctx_t ctx,
|
||||
rocshmem_team_t team) {
|
||||
GPU_DPRINTF("Function: rocshmem_ctx_wg_sync_all\n");
|
||||
GPU_DPRINTF("Function: rocshmem_ctx_sync_wg\n");
|
||||
|
||||
get_internal_ctx(ctx)->sync_wg(team);
|
||||
}
|
||||
@@ -981,7 +981,7 @@ __device__ int rocshmem_team_translate_pe(rocshmem_team_t src_team,
|
||||
* Template generator for reductions
|
||||
*/
|
||||
#define REDUCTION_GEN(T, Op) \
|
||||
template __device__ int rocshmem_wg_reduce<T, Op>( \
|
||||
template __device__ int rocshmem_reduce_wg<T, Op>( \
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, T * dest, const T *source, \
|
||||
int nreduce);
|
||||
|
||||
@@ -1011,13 +1011,13 @@ __device__ int rocshmem_team_translate_pe(rocshmem_team_t src_team,
|
||||
template __device__ void rocshmem_get_nbi<T>(T * dest, const T *source, \
|
||||
size_t nelems, int pe); \
|
||||
template __device__ T rocshmem_g<T>(const T *source, int pe); \
|
||||
template __device__ void rocshmem_wg_broadcast<T>( \
|
||||
template __device__ void rocshmem_broadcast_wg<T>( \
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, T * dest, const T *source, \
|
||||
int nelem, int pe_root); \
|
||||
template __device__ void rocshmem_wg_alltoall<T>( \
|
||||
template __device__ void rocshmem_alltoall_wg<T>( \
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, T * dest, const T *source, \
|
||||
int nelem); \
|
||||
template __device__ void rocshmem_wg_fcollect<T>( \
|
||||
template __device__ void rocshmem_fcollect_wg<T>( \
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, T * dest, const T *source, \
|
||||
int nelem); \
|
||||
template __device__ void rocshmem_put_wave<T>( \
|
||||
@@ -1189,10 +1189,10 @@ __device__ int rocshmem_team_translate_pe(rocshmem_team_t src_team,
|
||||
**/
|
||||
|
||||
#define REDUCTION_DEF_GEN(T, TNAME, Op_API, Op) \
|
||||
__device__ int rocshmem_ctx_##TNAME##_##Op_API##_wg_reduce( \
|
||||
__device__ int rocshmem_ctx_##TNAME##_##Op_API##_reduce_wg( \
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, T *dest, const T *source, \
|
||||
int nreduce) { \
|
||||
return rocshmem_wg_reduce<T, Op>(ctx, team, dest, source, nreduce); \
|
||||
return rocshmem_reduce_wg<T, Op>(ctx, team, dest, source, nreduce); \
|
||||
}
|
||||
|
||||
#define ARITH_REDUCTION_DEF_GEN(T, TNAME) \
|
||||
@@ -1323,20 +1323,20 @@ __device__ int rocshmem_team_translate_pe(rocshmem_team_t src_team,
|
||||
size_t nelems, int pe) { \
|
||||
rocshmem_get_nbi_wg<T>(dest, source, nelems, pe); \
|
||||
} \
|
||||
__device__ void rocshmem_ctx_##TNAME##_wg_broadcast( \
|
||||
__device__ void rocshmem_ctx_##TNAME##_broadcast_wg( \
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, T *dest, const T *source, \
|
||||
int nelem, int pe_root) { \
|
||||
rocshmem_wg_broadcast<T>(ctx, team, dest, source, nelem, pe_root); \
|
||||
rocshmem_broadcast_wg<T>(ctx, team, dest, source, nelem, pe_root); \
|
||||
} \
|
||||
__device__ void rocshmem_ctx_##TNAME##_wg_alltoall( \
|
||||
__device__ void rocshmem_ctx_##TNAME##_alltoall_wg( \
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, T *dest, const T *source, \
|
||||
int nelem) { \
|
||||
rocshmem_wg_alltoall<T>(ctx, team, dest, source, nelem); \
|
||||
rocshmem_alltoall_wg<T>(ctx, team, dest, source, nelem); \
|
||||
} \
|
||||
__device__ void rocshmem_ctx_##TNAME##_wg_fcollect( \
|
||||
__device__ void rocshmem_ctx_##TNAME##_fcollect_wg( \
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, T *dest, const T *source, \
|
||||
int nelem) { \
|
||||
rocshmem_wg_fcollect<T>(ctx, team, dest, source, nelem); \
|
||||
rocshmem_fcollect_wg<T>(ctx, team, dest, source, nelem); \
|
||||
}
|
||||
|
||||
#define AMO_STANDARD_DEF_GEN(T, TNAME) \
|
||||
|
||||
@@ -53,11 +53,11 @@ __global__ void BarrierAllTest(int loop, int skip, long long int *start_time,
|
||||
break;
|
||||
case WAVEBarrierAllTestType:
|
||||
if(wf_id == 0) {
|
||||
rocshmem_ctx_wave_barrier_all(ctx);
|
||||
rocshmem_ctx_barrier_all_wave(ctx);
|
||||
}
|
||||
break;
|
||||
case WGBarrierAllTestType:
|
||||
rocshmem_ctx_wg_barrier_all(ctx);
|
||||
rocshmem_ctx_barrier_all_wg(ctx);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
|
||||
@@ -53,11 +53,11 @@ __global__ void SyncAllTest(int loop, int skip, long long int *start_time,
|
||||
break;
|
||||
case WAVESyncAllTestType:
|
||||
if(wf_id == 0) {
|
||||
rocshmem_ctx_wave_sync_all(ctx);
|
||||
rocshmem_ctx_sync_all_wave(ctx);
|
||||
}
|
||||
break;
|
||||
case WGSyncAllTestType:
|
||||
rocshmem_ctx_wg_sync_all(ctx);
|
||||
rocshmem_ctx_sync_all_wg(ctx);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
|
||||
@@ -45,16 +45,16 @@ __global__ void SyncTest(int loop, int skip, long long int *start_time,
|
||||
switch (type) {
|
||||
case SyncTestType:
|
||||
if(t_id == 0) {
|
||||
rocshmem_ctx_team_sync(ctx, teams[wg_id]);
|
||||
rocshmem_ctx_sync(ctx, teams[wg_id]);
|
||||
}
|
||||
break;
|
||||
case WAVESyncTestType:
|
||||
if(wf_id == 0) {
|
||||
rocshmem_ctx_wave_team_sync(ctx, teams[wg_id]);
|
||||
rocshmem_ctx_sync_wave(ctx, teams[wg_id]);
|
||||
}
|
||||
break;
|
||||
case WGSyncTestType:
|
||||
rocshmem_ctx_wg_team_sync(ctx, teams[wg_id]);
|
||||
rocshmem_ctx_sync_wg(ctx, teams[wg_id]);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
|
||||
@@ -32,7 +32,7 @@ __device__ void wg_team_alltoall(rocshmem_ctx_t ctx, rocshmem_team_t team,
|
||||
template <> \
|
||||
__device__ void wg_team_alltoall<T>(rocshmem_ctx_t ctx, rocshmem_team_t team,\
|
||||
T * dest, const T *source, int nelem) { \
|
||||
rocshmem_ctx_##TNAME##_wg_alltoall(ctx, team, dest, source, nelem); \
|
||||
rocshmem_ctx_##TNAME##_alltoall_wg(ctx, team, dest, source, nelem); \
|
||||
}
|
||||
|
||||
TEAM_ALLTOALL_DEF_GEN(float, float)
|
||||
|
||||
@@ -51,11 +51,11 @@ __global__ void TeamBarrierTest(int loop, int skip, long long int *start_time,
|
||||
break;
|
||||
case TeamWAVEBarrierTestType:
|
||||
if(wf_id == 0) {
|
||||
rocshmem_ctx_wave_barrier(ctx, teams[wg_id]);
|
||||
rocshmem_ctx_barrier_wave(ctx, teams[wg_id]);
|
||||
}
|
||||
break;
|
||||
case TeamWGBarrierTestType:
|
||||
rocshmem_ctx_wg_barrier(ctx, teams[wg_id]);
|
||||
rocshmem_ctx_barrier_wg(ctx, teams[wg_id]);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
|
||||
@@ -34,7 +34,7 @@ __device__ void wg_team_broadcast(rocshmem_ctx_t ctx, rocshmem_team_t team,
|
||||
__device__ void wg_team_broadcast<T>( \
|
||||
rocshmem_ctx_t ctx, rocshmem_team_t team, T * dest, const T *source, \
|
||||
int nelem, int pe_root) { \
|
||||
rocshmem_ctx_##TNAME##_wg_broadcast(ctx, team, dest, source, nelem, \
|
||||
rocshmem_ctx_##TNAME##_broadcast_wg(ctx, team, dest, source, nelem, \
|
||||
pe_root); \
|
||||
}
|
||||
|
||||
|
||||
@@ -32,7 +32,7 @@ __device__ void wg_team_fcollect(rocshmem_ctx_t ctx, rocshmem_team_t team,
|
||||
template <> \
|
||||
__device__ void wg_team_fcollect<T>(rocshmem_ctx_t ctx, rocshmem_team_t team,\
|
||||
T * dest, const T *source, int nelem) { \
|
||||
rocshmem_ctx_##TNAME##_wg_fcollect(ctx, team, dest, source, nelem); \
|
||||
rocshmem_ctx_##TNAME##_fcollect_wg(ctx, team, dest, source, nelem); \
|
||||
}
|
||||
|
||||
TEAM_FCOLLECT_DEF_GEN(float, float)
|
||||
|
||||
@@ -35,7 +35,7 @@ __device__ int wg_team_reduce(rocshmem_ctx_t ctx, rocshmem_team_t, T *dest,
|
||||
__device__ int wg_team_reduce<T, Op>(rocshmem_ctx_t ctx, \
|
||||
rocshmem_team_t team, T * dest, \
|
||||
const T *source, int nreduce) { \
|
||||
return rocshmem_ctx_##TNAME##_##Op_API##_wg_reduce(ctx, team, dest, \
|
||||
return rocshmem_ctx_##TNAME##_##Op_API##_reduce_wg(ctx, team, dest, \
|
||||
source, nreduce); \
|
||||
}
|
||||
|
||||
@@ -93,7 +93,7 @@ __global__ void TeamReductionTest(int loop, int skip, long long int *start_time,
|
||||
start_time[wg_id] = wall_clock64();
|
||||
}
|
||||
wg_team_reduce<T1, T2>(ctx, team, r_buf, s_buf, size);
|
||||
rocshmem_ctx_wg_barrier_all(ctx);
|
||||
rocshmem_ctx_barrier_all_wg(ctx);
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
@@ -43,7 +43,7 @@ types = [
|
||||
|
||||
def alltoall_api(T, TNAME):
|
||||
return (
|
||||
f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_wg_alltoall(\n"
|
||||
f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_alltoall_wg(\n"
|
||||
f" rocshmem_ctx_t ctx, rocshmem_team_t team, {T} *dest,\n"
|
||||
f" const {T} *source, int nelems);\n\n"
|
||||
)
|
||||
@@ -75,7 +75,7 @@ def generate_alltoall_api():
|
||||
|
||||
def broadcast_api(T, TNAME):
|
||||
return (
|
||||
f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_wg_broadcast(\n"
|
||||
f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_broadcast_wg(\n"
|
||||
f" rocshmem_ctx_t ctx, rocshmem_team_t team, {T} *dest,\n"
|
||||
f" const {T} *source, int nelems, int pe_root);\n"
|
||||
f"__host__ void rocshmem_ctx_{TNAME}_broadcast(\n"
|
||||
@@ -120,7 +120,7 @@ def generate_broadcast_api():
|
||||
|
||||
def fcollect_api(T, TNAME):
|
||||
return (
|
||||
f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_wg_fcollect(\n"
|
||||
f"__device__ ATTR_NO_INLINE void rocshmem_ctx_{TNAME}_fcollect_wg(\n"
|
||||
f" rocshmem_ctx_t ctx, rocshmem_team_t team, {T} *dest,\n"
|
||||
f" const {T} *source, int nelems);\n\n"
|
||||
)
|
||||
@@ -152,7 +152,7 @@ def generate_fcollect_api():
|
||||
|
||||
def reduction_api(T, TNAME, Op_API):
|
||||
return (
|
||||
f"__device__ ATTR_NO_INLINE int rocshmem_ctx_{TNAME}_{Op_API}_wg_reduce(\n"
|
||||
f"__device__ ATTR_NO_INLINE int rocshmem_ctx_{TNAME}_{Op_API}_reduce_wg(\n"
|
||||
f" rocshmem_ctx_t ctx, rocshmem_team_t team, {T} *dest, const {T} *source,\n"
|
||||
f" int nreduce);\n"
|
||||
f"__host__ int rocshmem_ctx_{TNAME}_{Op_API}_reduce(\n"
|
||||
|
||||
Reference in New Issue
Block a user