diff --git a/projects/rocshmem/CMakeLists.txt b/projects/rocshmem/CMakeLists.txt index bce7de7cbb..b07b98cc74 100644 --- a/projects/rocshmem/CMakeLists.txt +++ b/projects/rocshmem/CMakeLists.txt @@ -190,7 +190,7 @@ set_target_properties( ${PROJECT_NAME} PROPERTIES PUBLIC_HEADER - "${CMAKE_BINARY_DIR}/config.h;${CMAKE_CURRENT_SOURCE_DIR}/include/rocshmem/rocshmem.hpp;${CMAKE_CURRENT_SOURCE_DIR}/include/rocshmem/debug.hpp" + "${CMAKE_BINARY_DIR}/config.h" ) ############################################################################### @@ -386,6 +386,11 @@ install( COMPONENT bin ) +install(DIRECTORY ${CMAKE_SOURCE_DIR}/include/ + DESTINATION ${INSTALL_INCLUDEDIR} + COMPONENT dev +) + install( EXPORT ${PROJECT_NAME}Targets diff --git a/projects/rocshmem/examples/rocshmem_allreduce_test.cc b/projects/rocshmem/examples/rocshmem_allreduce_test.cc index fdd81fa8a2..c6f4d05dc9 100644 --- a/projects/rocshmem/examples/rocshmem_allreduce_test.cc +++ b/projects/rocshmem/examples/rocshmem_allreduce_test.cc @@ -1,7 +1,6 @@ /* hipcc -c -fgpu-rdc -x hip rocshmem_allreduce_test.cc \ -I/opt/rocm/include \ - -I$ROCSHMEM_SRC_DIR/include \ -I$ROCSHMEM_INSTALL_DIR/include \ -I$OPENMPI_UCX_INSTALL_DIR/include/ diff --git a/projects/rocshmem/examples/rocshmem_alltoall_test.cc b/projects/rocshmem/examples/rocshmem_alltoall_test.cc index 36847abf3a..775580fba2 100644 --- a/projects/rocshmem/examples/rocshmem_alltoall_test.cc +++ b/projects/rocshmem/examples/rocshmem_alltoall_test.cc @@ -1,7 +1,6 @@ /* hipcc -c -fgpu-rdc -x hip rocshmem_alltoall_test.cc \ -I/opt/rocm/include \ - -I$ROCSHMEM_SRC_DIR/include \ -I$ROCSHMEM_INSTALL_DIR/include \ -I$OPENMPI_UCX_INSTALL_DIR/include/ diff --git a/projects/rocshmem/examples/rocshmem_broadcast_test.cc b/projects/rocshmem/examples/rocshmem_broadcast_test.cc index 9958707893..4a630c75db 100644 --- a/projects/rocshmem/examples/rocshmem_broadcast_test.cc +++ b/projects/rocshmem/examples/rocshmem_broadcast_test.cc @@ -1,7 +1,6 @@ /* hipcc -c -fgpu-rdc -x 
hip rocshmem_broadcast_test.cc \ -I/opt/rocm/include \ - -I$ROCSHMEM_SRC_DIR/include \ -I$ROCSHMEM_INSTALL_DIR/include \ -I$OPENMPI_UCX_INSTALL_DIR/include/ diff --git a/projects/rocshmem/examples/rocshmem_getmem_test.cc b/projects/rocshmem/examples/rocshmem_getmem_test.cc index 9c4a419ddd..942b43068d 100644 --- a/projects/rocshmem/examples/rocshmem_getmem_test.cc +++ b/projects/rocshmem/examples/rocshmem_getmem_test.cc @@ -1,7 +1,6 @@ /* hipcc -c -fgpu-rdc -x hip rocshmem_getmem_test.cc \ -I/opt/rocm/include \ - -I$ROCSHMEM_SRC_DIR/include \ -I$ROCSHMEM_INSTALL_DIR/include \ -I$OPENMPI_UCX_INSTALL_DIR/include/ diff --git a/projects/rocshmem/examples/rocshmem_put_signal_test.cc b/projects/rocshmem/examples/rocshmem_put_signal_test.cc index da52562124..9e41bc4529 100644 --- a/projects/rocshmem/examples/rocshmem_put_signal_test.cc +++ b/projects/rocshmem/examples/rocshmem_put_signal_test.cc @@ -1,7 +1,6 @@ /* hipcc -c -fgpu-rdc -x hip rocshmem_put_signal_test.cc \ -I/opt/rocm/include \ - -I$ROCSHMEM_SRC_DIR/include \ -I$ROCSHMEM_INSTALL_DIR/include \ -I$OPENMPI_UCX_INSTALL_DIR/include/ diff --git a/projects/rocshmem/include/rocshmem/rocshmem.hpp b/projects/rocshmem/include/rocshmem/rocshmem.hpp index 4dd33895ef..04ea6336fe 100644 --- a/projects/rocshmem/include/rocshmem/rocshmem.hpp +++ b/projects/rocshmem/include/rocshmem/rocshmem.hpp @@ -27,6 +27,13 @@ #include #include "config.h" +#include "rocshmem_common.hpp" +#include "rocshmem_RMA.hpp" +#include "rocshmem_AMO.hpp" +#include "rocshmem_SIG_OP.hpp" +#include "rocshmem_COLL.hpp" +#include "rocshmem_P2P_SYNC.hpp" +#include "rocshmem_RMA_X.hpp" /** * @file rocshmem.hpp * @brief Public header for rocSHMEM device and host libraries. 
@@ -43,106 +50,6 @@ namespace rocshmem { -#ifdef USE_FUNC_CALL -#define ATTR_NO_INLINE __attribute__((noinline)) -#else -#define ATTR_NO_INLINE -#endif - - -enum ROCSHMEM_STATUS { - ROCSHMEM_SUCCESS = 0, - ROCSHMEM_ERROR = 1, -}; - -enum ROCSHMEM_OP { - ROCSHMEM_SUM, - ROCSHMEM_MAX, - ROCSHMEM_MIN, - ROCSHMEM_PROD, - ROCSHMEM_AND, - ROCSHMEM_OR, - ROCSHMEM_XOR, - ROCSHMEM_REPLACE -}; - -enum ROCSHMEM_SIGNAL_OPS { - ROCSHMEM_SIGNAL_SET, - ROCSHMEM_SIGNAL_ADD, -}; - -/** - * @brief Types defined for rocshmem_wait() operations. - */ -enum rocshmem_cmps { - ROCSHMEM_CMP_EQ, - ROCSHMEM_CMP_NE, - ROCSHMEM_CMP_GT, - ROCSHMEM_CMP_GE, - ROCSHMEM_CMP_LT, - ROCSHMEM_CMP_LE, -}; - -enum rocshmem_thread_ops { - ROCSHMEM_THREAD_SINGLE, - ROCSHMEM_THREAD_FUNNELED, - ROCSHMEM_THREAD_WG_FUNNELED, - ROCSHMEM_THREAD_SERIALIZED, - ROCSHMEM_THREAD_MULTIPLE -}; - -/** - * @brief Bitwise flags to mask configuration parameters. - */ -enum rocshmem_team_configs { - ROCSHMEM_TEAM_DEFAULT_CONFIGS, - ROCSHMEM_TEAM_NUM_CONTEXTS -}; - -typedef struct { - int num_contexts; -} rocshmem_team_config_t; - -constexpr size_t ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE = 1024; -constexpr size_t ROCSHMEM_ATA_MAX_WRKDATA_SIZE = (4 * 1024 * 1024); -constexpr size_t ROCSHMEM_BARRIER_SYNC_SIZE = 256; -constexpr size_t ROCSHMEM_REDUCE_SYNC_SIZE = 256; -// Internally calls sync function, which matches barrier implementation -constexpr size_t ROCSHMEM_BCAST_SYNC_SIZE = ROCSHMEM_BARRIER_SYNC_SIZE; -constexpr size_t ROCSHMEM_ALLTOALL_SYNC_SIZE = ROCSHMEM_BARRIER_SYNC_SIZE + 1; -constexpr size_t ROCSHMEM_FCOLLECT_SYNC_SIZE = ROCSHMEM_ALLTOALL_SYNC_SIZE; -constexpr size_t ROCSHMEM_SYNC_VALUE = 0; - -const int ROCSHMEM_CTX_ZERO = 0; -const int ROCSHMEM_CTX_NOSTORE = 1; -const int ROCSHMEM_CTX_SERIALIZED = 2; -const int ROCSHMEM_CTX_WG_PRIVATE = 4; -const int ROCSHMEM_CTX_SHARED = 8; - -/** - * @brief GPU side OpenSHMEM context created from each work-groups' - * rocshmem_wg_handle_t - */ -typedef struct { - void *ctx_opaque; - 
void *team_opaque; -} rocshmem_ctx_t; - -/** - * Shmem default context. - */ -extern __constant__ rocshmem_ctx_t ROCSHMEM_CTX_DEFAULT; - -/** - * Used internally to set default context. - */ -void set_internal_ctx(rocshmem_ctx_t *ctx); - -typedef uint64_t *rocshmem_team_t; -extern rocshmem_team_t ROCSHMEM_TEAM_WORLD; - -const rocshmem_team_t ROCSHMEM_TEAM_INVALID = nullptr; - /****************************************************************************** **************************** HOST INTERFACE ********************************** *****************************************************************************/ @@ -322,102 +229,6 @@ __host__ int rocshmem_team_split_strided(rocshmem_team_t parent_team, */ __host__ void rocshmem_team_destroy(rocshmem_team_t team); -/** - * @brief Writes contiguous data of \p nelems bytes from \p source on the - * calling PE to \p dest at \p pe. The caller will block until the operation - * completes locally (it is safe to reuse \p source). The caller must - * call into __host__ rocshmem_quiet() if remote completion is required. - * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. Must be an address on the symmetric - * heap. - * @param[in] source Source address. Must be an address on the symmetric heap. - * @param[in] nelems Size of the transfer in number of elements. - * @param[in] pe PE of the remote process. - * - * @return void. - */ -__host__ void rocshmem_ctx_putmem(rocshmem_ctx_t ctx, void *dest, - const void *source, size_t nelems, int pe); - -__host__ void rocshmem_putmem(void *dest, const void *source, size_t nelems, - int pe); - -/** - * @brief Writes contiguous data of \p nelems bytes from \p source on the - * calling PE to \p dest on \p pe. The operation is not blocking. The caller - * will return as soon as the request is posted. The caller must call - * _host__ rocshmem_quiet() if completion notification is required. 
- * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. Must be an address on the symmetric - heap. - * @param[in] source Source address. Must be an address on the symmetric heap. - * @param[in] nelems Size of the transfer in bytes. - * @param[in] pe PE of the remote process. - * - * @return void. - */ -__host__ void rocshmem_ctx_putmem_nbi(rocshmem_ctx_t ctx, void *dest, - const void *source, size_t nelems, - int pe); - -__host__ void rocshmem_putmem_nbi(void *dest, const void *source, - size_t nelems, int pe); - -/** - * @brief Reads contiguous data of \p nelems bytes from \p source on \p pe - * to \p dest on the calling PE. The calling work-group will block until the - * operation completes (data has been placed in \p dest). - * - * This function can be called from divergent control paths at per-thread - * granularity. However, performance may be improved if the caller can - * coalesce contiguous messages and elect a leader thread to call into the - * rocSHMEM function. - * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. Must be an address on the symmetric - * heap. - * @param[in] source Source address. Must be an address on the symmetric heap. - * @param[in] nelems Size of the transfer in bytes. - * @param[in] pe PE of the remote process. - * - * @return void. - */ -__host__ void rocshmem_ctx_getmem(rocshmem_ctx_t ctx, void *dest, - const void *source, size_t nelems, int pe); - -__host__ void rocshmem_getmem(void *dest, const void *source, size_t nelems, - int pe); - -/** - * @brief Reads contiguous data of \p nelems bytes from \p source on \p pe - * to \p dest on the calling PE. The operation is not blocking. The caller will - * return as soon as the request is posted. The caller must call - * __host__ rocshmem_quiet() on the same context if completion notification is - * required. 
- * - * This function can be called from divergent control paths at per-thread - * granularity. However, performance may be improved if the caller can - * coalesce contiguous messages and elect a leader thread to call into the - * rocSHMEM function. - * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. Must be an address on the symmetric - * heap. - * @param[in] source Source address. Must be an address on the symmetric heap. - * @param[in] nelems Size of the transfer in bytes. - * @param[in] pe PE of the remote process. - * - * @return void. - */ -__host__ void rocshmem_ctx_getmem_nbi(rocshmem_ctx_t ctx, void *dest, - const void *source, size_t nelems, - int pe); - -__host__ void rocshmem_getmem_nbi(void *dest, const void *source, - size_t nelems, int pe); - /** * @brief Guarantees order between messages in this context in accordance with * OpenSHMEM semantics. @@ -549,121 +360,6 @@ __device__ ATTR_NO_INLINE int rocshmem_wg_team_create_ctx( */ __device__ ATTR_NO_INLINE void rocshmem_wg_ctx_destroy(rocshmem_ctx_t *ctx); -/** - * @brief Writes contiguous data of \p nelems bytes from \p source on the - * calling PE to \p dest at \p pe. The caller will block until the operation - * completes locally (it is safe to reuse \p source). The caller must - * call into rocshmem_quiet() if remote completion is required. - * - * This function can be called from divergent control paths at per-thread - * granularity. However, performance may be improved if the caller can - * coalesce contiguous messages and elect a leader thread to call into the - * rocSHMEM function. - * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. Must be an address on the symmetric - * heap. - * @param[in] source Source address. Must be an address on the symmetric heap. - * @param[in] nelems Size of the transfer in number of elements. - * @param[in] pe PE of the remote process. 
- * - * @return void. - */ -__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem(rocshmem_ctx_t ctx, - void *dest, - const void *source, - size_t nelems, int pe); - -__device__ ATTR_NO_INLINE void rocshmem_putmem(void *dest, const void *source, - size_t nelems, int pe); - -/** - * @brief Reads contiguous data of \p nelems bytes from \p source on \p pe - * to \p dest on the calling PE. The calling work-group will block until the - * operation completes (data has been placed in \p dest). - * - * This function can be called from divergent control paths at per-thread - * granularity. However, performance may be improved if the caller can - * coalesce contiguous messages and elect a leader thread to call into the - * rocSHMEM function. - * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. Must be an address on the symmetric - * heap. - * @param[in] source Source address. Must be an address on the symmetric heap. - * @param[in] nelems Size of the transfer in bytes. - * @param[in] pe PE of the remote process. - * - * @return void. - */ -__device__ ATTR_NO_INLINE void rocshmem_ctx_getmem(rocshmem_ctx_t ctx, - void *dest, - const void *source, - size_t nelems, int pe); - -__device__ ATTR_NO_INLINE void rocshmem_getmem(void *dest, const void *source, - size_t nelems, int pe); - -/** - * @brief Writes contiguous data of \p nelems bytes from \p source on the - * calling PE to \p dest on \p pe. The operation is not blocking. The caller - * will return as soon as the request is posted. The caller must call - * rocshmem_quiet() on the same context if completion notification is - * required. - * - * This function can be called from divergent control paths at per-thread - * granularity. However, performance may be improved if the caller can - * coalesce contiguous messages and elect a leader thread to call into the - * rocSHMEM function. - * - * @param[in] ctx Context with which to perform this operation. 
- * @param[in] dest Destination address. Must be an address on the symmetric - heap. - * @param[in] source Source address. Must be an address on the symmetric heap. - * @param[in] nelems Size of the transfer in bytes. - * @param[in] pe PE of the remote process. - * - * @return void. - */ -__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_nbi(rocshmem_ctx_t ctx, - void *dest, - const void *source, - size_t nelems, int pe); - -__device__ ATTR_NO_INLINE void rocshmem_putmem_nbi(void *dest, - const void *source, - size_t nelems, int pe); - -/** - * @brief Reads contiguous data of \p nelems bytes from \p source on \p pe - * to \p dest on the calling PE. The operation is not blocking. The caller will - * return as soon as the request is posted. The caller must call - * rocshmem_quiet() on the same context if completion notification is - * required. - * - * This function can be called from divergent control paths at per-thread - * granularity. However, performance may be improved if the caller can - * coalesce contiguous messages and elect a leader thread to call into the - * rocSHMEM function. - * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. Must be an address on the symmetric - * heap. - * @param[in] source Source address. Must be an address on the symmetric heap. - * @param[in] nelems Size of the transfer in bytes. - * @param[in] pe PE of the remote process. - * - * @return void. - */ -__device__ ATTR_NO_INLINE void rocshmem_ctx_getmem_nbi(rocshmem_ctx_t ctx, - void *dest, - const void *source, - size_t nelems, int pe); - -__device__ ATTR_NO_INLINE void rocshmem_getmem_nbi(void *dest, - const void *source, - size_t nelems, int pe); - /** * @brief Guarantees order between messages in this context in accordance with * OpenSHMEM semantics. 
@@ -844,1858 +540,6 @@ __device__ ATTR_NO_INLINE void rocshmem_ctx_threadfence_system( __device__ ATTR_NO_INLINE void rocshmem_threadfence_system(); -/* - * MACRO DECLARE SHMEM_REDUCTION APIs - */ -#define REDUCTION_API_GEN(T, TNAME, Op_API) \ - __device__ ATTR_NO_INLINE int rocshmem_ctx_##TNAME##_##Op_API##_wg_reduce( \ - rocshmem_ctx_t ctx, rocshmem_team_t team, T *dest, const T *source, \ - int nreduce); \ - __host__ int rocshmem_ctx_##TNAME##_##Op_API##_reduce( \ - rocshmem_ctx_t ctx, rocshmem_team_t team, T *dest, const T *source, \ - int nreduce); - -#define ARITH_REDUCTION_API_GEN(T, TNAME) \ - REDUCTION_API_GEN(T, TNAME, sum) \ - REDUCTION_API_GEN(T, TNAME, min) \ - REDUCTION_API_GEN(T, TNAME, max) \ - REDUCTION_API_GEN(T, TNAME, prod) - -#define BITWISE_REDUCTION_API_GEN(T, TNAME) \ - REDUCTION_API_GEN(T, TNAME, or) \ - REDUCTION_API_GEN(T, TNAME, and) \ - REDUCTION_API_GEN(T, TNAME, xor) - -#define INT_REDUCTION_API_GEN(T, TNAME) \ - ARITH_REDUCTION_API_GEN(T, TNAME) \ - BITWISE_REDUCTION_API_GEN(T, TNAME) - -#define FLOAT_REDUCTION_API_GEN(T, TNAME) ARITH_REDUCTION_API_GEN(T, TNAME) - -/* - * MACRO DECLARE SHMEM_BROADCAST APIs - */ -#define BROADCAST_API_GEN(T, TNAME) \ - __device__ ATTR_NO_INLINE void rocshmem_ctx_##TNAME##_wg_broadcast( \ - rocshmem_ctx_t ctx, rocshmem_team_t team, T *dest, const T *source, \ - int nelem, int pe_root); /* NOLINT */ \ - __host__ void rocshmem_ctx_##TNAME##_broadcast( \ - rocshmem_ctx_t ctx, T *dest, const T *source, int nelem, int pe_root, \ - int pe_start, int log_pe_stride, int pe_size, \ - long *p_sync); /* NOLINT */ \ - __host__ void rocshmem_ctx_##TNAME##_broadcast( \ - rocshmem_ctx_t ctx, rocshmem_team_t team, T *dest, const T *source, \ - int nelem, int pe_root); /* NOLINT */ - -/* - * MACRO DECLARE SHMEM_ALLTOALL APIs - */ -#define ALLTOALL_API_GEN(T, TNAME) \ - __device__ ATTR_NO_INLINE void rocshmem_ctx_##TNAME##_wg_alltoall( \ - rocshmem_ctx_t ctx, rocshmem_team_t team, T *dest, const T *source, \ - int 
nelem); /* NOLINT */ -/* - * MACRO DECLARE SHMEM_FCOLLECT APIs - */ -#define FCOLLECT_API_GEN(T, TNAME) \ - __device__ ATTR_NO_INLINE void rocshmem_ctx_##TNAME##_wg_fcollect( \ - rocshmem_ctx_t ctx, rocshmem_team_t team, T *dest, const T *source, \ - int nelem); /* NOLINT */ - -/* - * MACRO DECLARE SHMEM_PUT APIs - */ -#define PUT_API_GEN(T, TNAME) \ - __device__ ATTR_NO_INLINE void rocshmem_ctx_##TNAME##_put( \ - rocshmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe); \ - __device__ ATTR_NO_INLINE void rocshmem_##TNAME##_put( \ - T *dest, const T *source, size_t nelems, int pe); \ - __host__ void rocshmem_ctx_##TNAME##_put( \ - rocshmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe); \ - __host__ void rocshmem_##TNAME##_put(T *dest, const T *source, \ - size_t nelems, int pe); - -/* - * MACRO DECLARE SHMEM_P APIs - */ -#define P_API_GEN(T, TNAME) \ - __device__ ATTR_NO_INLINE void rocshmem_ctx_##TNAME##_p( \ - rocshmem_ctx_t ctx, T *dest, T value, int pe); \ - __device__ ATTR_NO_INLINE void rocshmem_##TNAME##_p(T *dest, T value, \ - int pe); \ - __host__ void rocshmem_ctx_##TNAME##_p(rocshmem_ctx_t ctx, T *dest, \ - T value, int pe); \ - __host__ void rocshmem_##TNAME##_p(T *dest, T value, int pe); - -/* - * MACRO DECLARE SHMEM_GET APIs - */ -#define GET_API_GEN(T, TNAME) \ - __device__ ATTR_NO_INLINE void rocshmem_ctx_##TNAME##_get( \ - rocshmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe); \ - __device__ ATTR_NO_INLINE void rocshmem_##TNAME##_get( \ - T *dest, const T *source, size_t nelems, int pe); \ - __host__ void rocshmem_ctx_##TNAME##_get( \ - rocshmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe); \ - __host__ void rocshmem_##TNAME##_get(T *dest, const T *source, \ - size_t nelems, int pe); - -/* - * MACRO DECLARE SHMEM_G APIs - */ -#define G_API_GEN(T, TNAME) \ - __device__ ATTR_NO_INLINE T rocshmem_ctx_##TNAME##_g( \ - rocshmem_ctx_t ctx, const T *source, int pe); \ - __device__ ATTR_NO_INLINE T 
rocshmem_##TNAME##_g(const T *source, int pe); \ - __host__ T rocshmem_ctx_##TNAME##_g(rocshmem_ctx_t ctx, const T *source, \ - int pe); \ - __host__ T rocshmem_##TNAME##_g(const T *source, int pe); - -/* - * MACRO DECLARE SHMEM_PUT_NBI APIs - */ -#define PUT_NBI_API_GEN(T, TNAME) \ - __device__ ATTR_NO_INLINE void rocshmem_ctx_##TNAME##_put_nbi( \ - rocshmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe); \ - __device__ ATTR_NO_INLINE void rocshmem_##TNAME##_put_nbi( \ - T *dest, const T *source, size_t nelems, int pe); \ - __host__ void rocshmem_ctx_##TNAME##_put_nbi( \ - rocshmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe); \ - __host__ void rocshmem_##TNAME##_put_nbi(T *dest, const T *source, \ - size_t nelems, int pe); - -/* - * MACRO DECLARE SHMEM_GET_NBI APIs - */ -#define GET_NBI_API_GEN(T, TNAME) \ - __device__ ATTR_NO_INLINE void rocshmem_ctx_##TNAME##_get_nbi( \ - rocshmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe); \ - __device__ ATTR_NO_INLINE void rocshmem_##TNAME##_get_nbi( \ - T *dest, const T *source, size_t nelems, int pe); \ - __host__ void rocshmem_ctx_##TNAME##_get_nbi( \ - rocshmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe); \ - __host__ void rocshmem_##TNAME##_get_nbi(T *dest, const T *source, \ - size_t nelems, int pe); - -/* - * MACRO DECLARE SHMEM_ATOMIC_FETCH_ADD APIs - */ -#define ATOMIC_FETCH_ADD_API_GEN(T, TNAME) \ - __device__ ATTR_NO_INLINE T rocshmem_ctx_##TNAME##_atomic_fetch_add( \ - rocshmem_ctx_t ctx, T *dest, T value, int pe); \ - __device__ ATTR_NO_INLINE T rocshmem_##TNAME##_atomic_fetch_add( \ - T *dest, T value, int pe); \ - __host__ ATTR_NO_INLINE T rocshmem_ctx_##TNAME##_atomic_fetch_add( \ - rocshmem_ctx_t ctx, T *dest, T value, int pe); \ - __host__ ATTR_NO_INLINE T rocshmem_##TNAME##_atomic_fetch_add( \ - T *dest, T value, int pe); - -/* - * MACRO DECLARE SHMEM_ATOMIC_COMPARE_SWAP APIs - */ -#define ATOMIC_COMPARE_SWAP_API_GEN(T, TNAME) \ - __device__ 
ATTR_NO_INLINE T rocshmem_ctx_##TNAME##_atomic_compare_swap( \ - rocshmem_ctx_t ctx, T *dest, T cond, T value, int pe); \ - __device__ ATTR_NO_INLINE T rocshmem_##TNAME##_atomic_compare_swap( \ - T *dest, T cond, T value, int pe); \ - __host__ T rocshmem_ctx_##TNAME##_atomic_compare_swap( \ - rocshmem_ctx_t ctx, T *dest, T cond, T value, int pe); \ - __host__ T rocshmem_##TNAME##_atomic_compare_swap(T *dest, T cond, T value, \ - int pe); - -/* - * MACRO DECLARE SHMEM_ATOMIC_FETCH_INC APIs - */ -#define ATOMIC_FETCH_INC_API_GEN(T, TNAME) \ - __device__ ATTR_NO_INLINE T rocshmem_ctx_##TNAME##_atomic_fetch_inc( \ - rocshmem_ctx_t ctx, T *dest, int pe); \ - __device__ ATTR_NO_INLINE T rocshmem_##TNAME##_atomic_fetch_inc(T *dest, \ - int pe); \ - __host__ T rocshmem_ctx_##TNAME##_atomic_fetch_inc(rocshmem_ctx_t ctx, \ - T *dest, int pe); \ - __host__ T rocshmem_##TNAME##_atomic_fetch_inc(T *dest, int pe); - -/* - * MACRO DECLARE SHMEM_ATOMIC_FETCH APIs - */ -#define ATOMIC_FETCH_API_GEN(T, TNAME) \ - __device__ ATTR_NO_INLINE T rocshmem_ctx_##TNAME##_atomic_fetch( \ - rocshmem_ctx_t ctx, T *source, int pe); \ - __device__ ATTR_NO_INLINE T rocshmem_##TNAME##_atomic_fetch(T *source, \ - int pe); \ - __host__ T rocshmem_ctx_##TNAME##_atomic_fetch(rocshmem_ctx_t ctx, \ - T *source, int pe); \ - __host__ T rocshmem_##TNAME##_atomic_fetch(T *source, int pe); - -/* - * MACRO DECLARE SHMEM_ATOMIC_ADD APIs - */ -#define ATOMIC_ADD_API_GEN(T, TNAME) \ - __device__ ATTR_NO_INLINE void rocshmem_ctx_##TNAME##_atomic_add( \ - rocshmem_ctx_t ctx, T *dest, T value, int pe); \ - __device__ ATTR_NO_INLINE void rocshmem_##TNAME##_atomic_add( \ - T *dest, T value, int pe); \ - __host__ void rocshmem_ctx_##TNAME##_atomic_add(rocshmem_ctx_t ctx, \ - T *dest, T value, int pe); \ - __host__ void rocshmem_##TNAME##_atomic_add(T *dest, T value, int pe); - -/* - * MACRO DECLARE SHMEM_ATOMIC_SET APIs - */ -#define ATOMIC_SET_API_GEN(T, TNAME) \ - __device__ ATTR_NO_INLINE void 
rocshmem_ctx_##TNAME##_atomic_set( \ - rocshmem_ctx_t ctx, T *dest, T value, int pe); \ - __device__ ATTR_NO_INLINE void rocshmem_##TNAME##_atomic_set( \ - T *dest, T value, int pe); \ - __host__ void rocshmem_ctx_##TNAME##_atomic_set(rocshmem_ctx_t ctx, \ - T *dest, T value, int pe); \ - __host__ void rocshmem_##TNAME##_atomic_set(T *dest, T value, int pe); - -/* - * MACRO DECLARE SHMEM_ATOMIC_SWAP APIs - */ -#define ATOMIC_SWAP_API_GEN(T, TNAME) \ - __device__ ATTR_NO_INLINE T rocshmem_ctx_##TNAME##_atomic_swap( \ - rocshmem_ctx_t ctx, T *dest, T value, int pe); \ - __device__ ATTR_NO_INLINE T rocshmem_##TNAME##_atomic_swap( \ - T *dest, T value, int pe); \ - __host__ T rocshmem_ctx_##TNAME##_atomic_swap(rocshmem_ctx_t ctx, T *dest, \ - T value, int pe); \ - __host__ T rocshmem_##TNAME##_atomic_swap(T *dest, T value, int pe); - -/* - * MACRO DECLARE SHMEM_ATOMIC_FETCH_AND APIs - */ -#define ATOMIC_FETCH_AND_API_GEN(T, TNAME) \ - __device__ ATTR_NO_INLINE T rocshmem_ctx_##TNAME##_atomic_fetch_and( \ - rocshmem_ctx_t ctx, T *dest, T value, int pe); \ - __device__ ATTR_NO_INLINE T rocshmem_##TNAME##_atomic_fetch_and( \ - T *dest, T value, int pe); \ - __host__ T rocshmem_ctx_##TNAME##_atomic_fetch_and( \ - rocshmem_ctx_t ctx, T *dest, T value, int pe); \ - __host__ T rocshmem_##TNAME##_atomic_fetch_and(T *dest, T value, int pe); - -/* - * MACRO DECLARE SHMEM_ATOMIC_AND APIs - */ -#define ATOMIC_AND_API_GEN(T, TNAME) \ - __device__ ATTR_NO_INLINE void rocshmem_ctx_##TNAME##_atomic_and( \ - rocshmem_ctx_t ctx, T *dest, T value, int pe); \ - __device__ ATTR_NO_INLINE void rocshmem_##TNAME##_atomic_and( \ - T *dest, T value, int pe); \ - __host__ void rocshmem_ctx_##TNAME##_atomic_and(rocshmem_ctx_t ctx, \ - T *dest, T value, int pe); \ - __host__ void rocshmem_##TNAME##_atomic_and(T *dest, T value, int pe); - -/* - * MACRO DECLARE SHMEM_ATOMIC_FETCH_OR APIs - */ -#define ATOMIC_FETCH_OR_API_GEN(T, TNAME) \ - __device__ ATTR_NO_INLINE T 
rocshmem_ctx_##TNAME##_atomic_fetch_or( \ - rocshmem_ctx_t ctx, T *dest, T value, int pe); \ - __device__ ATTR_NO_INLINE T rocshmem_##TNAME##_atomic_fetch_or( \ - T *dest, T value, int pe); \ - __host__ T rocshmem_ctx_##TNAME##_atomic_fetch_or( \ - rocshmem_ctx_t ctx, T *dest, T value, int pe); \ - __host__ T rocshmem_##TNAME##_atomic_fetch_or(T *dest, T value, int pe); - -/* - * MACRO DECLARE SHMEM_ATOMIC_OR APIs - */ -#define ATOMIC_OR_API_GEN(T, TNAME) \ - __device__ ATTR_NO_INLINE void rocshmem_ctx_##TNAME##_atomic_or( \ - rocshmem_ctx_t ctx, T *dest, T value, int pe); \ - __device__ ATTR_NO_INLINE void rocshmem_##TNAME##_atomic_or( \ - T *dest, T value, int pe); \ - __host__ void rocshmem_ctx_##TNAME##_atomic_or(rocshmem_ctx_t ctx, \ - T *dest, T value, int pe); \ - __host__ void rocshmem_##TNAME##_atomic_or(T *dest, T value, int pe); - -/* - * MACRO DECLARE SHMEM_ATOMIC_FETCH_XOR APIs - */ -#define ATOMIC_FETCH_XOR_API_GEN(T, TNAME) \ - __device__ ATTR_NO_INLINE T rocshmem_ctx_##TNAME##_atomic_fetch_xor( \ - rocshmem_ctx_t ctx, T *dest, T value, int pe); \ - __device__ ATTR_NO_INLINE T rocshmem_##TNAME##_atomic_fetch_xor( \ - T *dest, T value, int pe); \ - __host__ T rocshmem_ctx_##TNAME##_atomic_fetch_xor( \ - rocshmem_ctx_t ctx, T *dest, T value, int pe); \ - __host__ T rocshmem_##TNAME##_atomic_fetch_xor(T *dest, T value, int pe); - -/* - * MACRO DECLARE SHMEM_ATOMIC_XOR APIs - */ -#define ATOMIC_XOR_API_GEN(T, TNAME) \ - __device__ ATTR_NO_INLINE void rocshmem_ctx_##TNAME##_atomic_xor( \ - rocshmem_ctx_t ctx, T *dest, T value, int pe); \ - __device__ ATTR_NO_INLINE void rocshmem_##TNAME##_atomic_xor( \ - T *dest, T value, int pe); \ - __host__ void rocshmem_ctx_##TNAME##_atomic_xor(rocshmem_ctx_t ctx, \ - T *dest, T value, int pe); \ - __host__ void rocshmem_##TNAME##_atomic_xor(T *dest, T value, int pe); - -/* - * MACRO DECLARE SHMEM_ATOMIC_INC APIs - */ -#define ATOMIC_INC_API_GEN(T, TNAME) \ - __device__ ATTR_NO_INLINE void 
rocshmem_ctx_##TNAME##_atomic_inc( \ - rocshmem_ctx_t ctx, T *dest, int pe); \ - __device__ ATTR_NO_INLINE void rocshmem_##TNAME##_atomic_inc(T *dest, \ - int pe); \ - __host__ void rocshmem_ctx_##TNAME##_atomic_inc(rocshmem_ctx_t ctx, \ - T *dest, int pe); \ - __host__ void rocshmem_##TNAME##_atomic_inc(T *dest, int pe); - -/* - * MACRO DECLARE SHMEM_WAIT_UNTIL APIs - */ -#define WAIT_UNTIL_API_GEN(T, TNAME) \ - __device__ void rocshmem_##TNAME##_wait_until(T *ivars, \ - int cmp, \ - T val); \ - __device__ size_t rocshmem_##TNAME##_wait_until_any(T *ivars, \ - size_t nelems, \ - const int* status, \ - int cmp, \ - T val); \ - __device__ void rocshmem_##TNAME##_wait_until_all(T *ivars, \ - size_t nelems, \ - const int* status, \ - int cmp, \ - T val); \ - __device__ size_t rocshmem_##TNAME##_wait_until_some(T *ivars, \ - size_t nelems, \ - size_t* indices, \ - const int* status, \ - int cmp, \ - T val); \ - __device__ size_t rocshmem_##TNAME##_wait_until_any_vector(T *ivars, \ - size_t nelems, \ - const int* status, \ - int cmp, \ - T* vals); \ - __device__ void rocshmem_##TNAME##_wait_until_all_vector(T *ivars, \ - size_t nelems, \ - const int* status, \ - int cmp, \ - T* vals); \ - __device__ size_t rocshmem_##TNAME##_wait_until_some_vector(T *ivars, \ - size_t nelems, \ - size_t* indices, \ - const int* status, \ - int cmp, \ - T* vals); \ - __host__ void rocshmem_##TNAME##_wait_until(T *ivars, \ - int cmp, \ - T val); \ - __host__ size_t rocshmem_##TNAME##_wait_until_any(T *ivars, \ - size_t nelems, \ - const int* status, \ - int cmp, \ - T val); \ - __host__ void rocshmem_##TNAME##_wait_until_all(T *ivars, \ - size_t nelems, \ - const int* status, \ - int cmp, \ - T val); \ - __host__ size_t rocshmem_##TNAME##_wait_until_some(T *ivars, \ - size_t nelems, \ - size_t* indices, \ - const int* status, \ - int cmp, \ - T val); \ - __host__ size_t rocshmem_##TNAME##_wait_until_any_vector(T *ivars, \ - size_t nelems, \ - const int* status, \ - int cmp, \ - T* vals); 
\ - __host__ void rocshmem_##TNAME##_wait_until_all_vector(T *ivars, \ - size_t nelems, \ - const int* status, \ - int cmp, \ - T* vals); \ - __host__ size_t rocshmem_##TNAME##_wait_until_some_vector(T *ivars, \ - size_t nelems, \ - size_t* indices, \ - const int* status, \ - int cmp, \ - T* vals); - -/* - * MACRO DECLARE SHMEM_TEST APIs - */ -#define TEST_API_GEN(T, TNAME) \ - __device__ int rocshmem_##TNAME##_test(T *ivars, int cmp, T val); \ - __host__ int rocshmem_##TNAME##_test(T *ivars, int cmp, T val); - -/** - * @name SHMEM_REDUCTIONS - * @brief Perform an allreduce between PEs in the active set. The caller - * is blocked until the reduction completes. - * - * This function must be called as a work-group collective. - * - * @param[in] dest Destination address. Must be an address on the - * symmetric heap. - * @param[in] source Source address. Must be an address on the symmetric - heap. - * @param[in] nreduce Size of the buffer to participate in the reduction. - * @param[in] PE_start PE to start the reduction. - * @param[in] logPE_stride Stride of PEs participating in the reduction. - * @param[in] PE_size Number PEs participating in the reduction. - * @param[in] pWrk Temporary work buffer provided to rocSHMEM. Must - * be of size at least max(size/2 + 1, - ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE). - * @param[in] pSync Temporary sync buffer provided to rocSHMEM. Must - be of size at least ROCSHMEM_REDUCE_SYNC_SIZE. - * @param[in] handle GPU side handle. - * - * @return void - */ -///@{ -INT_REDUCTION_API_GEN(int, int) -INT_REDUCTION_API_GEN(short, short) // NOLINT(runtime/int) -INT_REDUCTION_API_GEN(long, long) // NOLINT(runtime/int) -INT_REDUCTION_API_GEN(long long, longlong) // NOLINT(runtime/int) -FLOAT_REDUCTION_API_GEN(float, float) -FLOAT_REDUCTION_API_GEN(double, double) -// long double reduction fails. hipcc/device may not support long double. -// so disable it for now. 
-// FLOAT_REDUCTION_API_GEN(long double, longdouble) -///@} - -/** - * @name SHMEM_BROADCAST - * @brief Perform a broadcast between PEs in the active set. The caller - * is blocked until the broadcase completes. - * - * This function must be called as a work-group collective. - * - * @param[in] dest Destination address. Must be an address on the - * symmetric heap. - * @param[in] source Source address. Must be an address on the symmetric - heap. - * @param[in] nelement Size of the buffer to participate in the broadcast. - * @param[in] PE_root Zero-based ordinal of the PE, with respect to the - active set, from which the data is copied - * @param[in] PE_start PE to start the reduction. - * @param[in] logPE_stride Stride of PEs participating in the reduction. - * @param[in] PE_size Number PEs participating in the reduction. - * @param[in] pSync Temporary sync buffer provided to rocSHMEM. Must - be of size at least ROCSHMEM_REDUCE_SYNC_SIZE. - * - * @return void - */ -///@{ -BROADCAST_API_GEN(float, float) -BROADCAST_API_GEN(double, double) -// BROADCAST_API_GEN(long double, longdouble) -BROADCAST_API_GEN(char, char) -BROADCAST_API_GEN(signed char, schar) -BROADCAST_API_GEN(short, short) // NOLINT(runtime/int) -BROADCAST_API_GEN(int, int) -BROADCAST_API_GEN(long, long) // NOLINT(runtime/int) -BROADCAST_API_GEN(long long, longlong) // NOLINT(runtime/int) -BROADCAST_API_GEN(unsigned char, uchar) -BROADCAST_API_GEN(unsigned short, ushort) // NOLINT(runtime/int) -BROADCAST_API_GEN(unsigned int, uint) -BROADCAST_API_GEN(unsigned long, ulong) // NOLINT(runtime/int) -BROADCAST_API_GEN(unsigned long long, ulonglong) // NOLINT(runtime/int) -///@} - -/** - * @name SHMEM_ALLTOALL - * @brief Exchanges a fixed amount of contiguous data blocks between all pairs - * of PEs participating in the collective routine. - * - * This function must be called as a work-group collective. - * - * @param[in] team The team participating in the collective. - * @param[in] dest Destination address. 
Must be an address on the - * symmetric heap. - * @param[in] source Source address. Must be an address on the symmetric - heap. - * @param[in] nelems Number of data blocks transferred per pair of PEs. - * - * @return void - */ -///@{ -ALLTOALL_API_GEN(float, float) -ALLTOALL_API_GEN(double, double) -// ALLTOALL_API_GEN(long double, longdouble) -ALLTOALL_API_GEN(char, char) -ALLTOALL_API_GEN(signed char, schar) -ALLTOALL_API_GEN(short, short) // NOLINT(runtime/int) -ALLTOALL_API_GEN(int, int) -ALLTOALL_API_GEN(long, long) // NOLINT(runtime/int) -ALLTOALL_API_GEN(long long, longlong) // NOLINT(runtime/int) -ALLTOALL_API_GEN(unsigned char, uchar) -ALLTOALL_API_GEN(unsigned short, ushort) // NOLINT(runtime/int) -ALLTOALL_API_GEN(unsigned int, uint) -ALLTOALL_API_GEN(unsigned long, ulong) // NOLINT(runtime/int) -ALLTOALL_API_GEN(unsigned long long, ulonglong) // NOLINT(runtime/int) -///@} - -/** - * @name SHMEM_FCOLLECT - * @brief Concatenates blocks of data from multiple PEs to an array in every - * PE participating in the collective routine. - * - * This function must be called as a work-group collective. - * - * @param[in] team The team participating in the collective. - * @param[in] dest Destination address. Must be an address on the - * symmetric heap. - * @param[in] source Source address. Must be an address on the symmetric - heap. - * @param[in] nelems Number of data blocks in source array. 
- * - * @return void - */ -///@{ -FCOLLECT_API_GEN(float, float) -FCOLLECT_API_GEN(double, double) -// FCOLLECT_API_GEN(long double, longdouble) -FCOLLECT_API_GEN(char, char) -FCOLLECT_API_GEN(signed char, schar) -FCOLLECT_API_GEN(short, short) // NOLINT(runtime/int) -FCOLLECT_API_GEN(int, int) -FCOLLECT_API_GEN(long, long) // NOLINT(runtime/int) -FCOLLECT_API_GEN(long long, longlong) // NOLINT(runtime/int) -FCOLLECT_API_GEN(unsigned char, uchar) -FCOLLECT_API_GEN(unsigned short, ushort) // NOLINT(runtime/int) -FCOLLECT_API_GEN(unsigned int, uint) -FCOLLECT_API_GEN(unsigned long, ulong) // NOLINT(runtime/int) -FCOLLECT_API_GEN(unsigned long long, ulonglong) // NOLINT(runtime/int) -///@} - -/** - * @name SHMEM_PUT - * @brief Writes contiguous data of \p nelems elements from \p source on the - * calling PE to \p dest at \p pe. The caller will block until the operation - * completes locally (it is safe to reuse \p source). The caller must - * call into rocshmem_quiet() if remote completion is required. - * - * This function can be called from divergent control paths at per-thread - * granularity. However, performance may be improved if the caller can - * coalesce contiguous messages and elect a leader thread to call into the - * rocSHMEM function. - * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. Must be an address on the symmetric - * heap. - * @param[in] source Source address. Must be an address on the symmetric heap. - * @param[in] nelems Size of the transfer in number of elements. - * @param[in] pe PE of the remote process. - * - * @return void. 
- */ -///@{ -PUT_API_GEN(float, float) -PUT_API_GEN(double, double) -// PUT_API_GEN(long double, longdouble) -PUT_API_GEN(char, char) -PUT_API_GEN(signed char, schar) -PUT_API_GEN(short, short) // NOLINT(runtime/int) -PUT_API_GEN(int, int) -PUT_API_GEN(long, long) // NOLINT(runtime/int) -PUT_API_GEN(long long, longlong) // NOLINT(runtime/int) -PUT_API_GEN(unsigned char, uchar) -PUT_API_GEN(unsigned short, ushort) // NOLINT(runtime/int) -PUT_API_GEN(unsigned int, uint) -PUT_API_GEN(unsigned long, ulong) // NOLINT(runtime/int) -PUT_API_GEN(unsigned long long, ulonglong) // NOLINT(runtime/int) -///@} - -/** - * @name SHMEM_P - * @brief Writes a single value to \p dest at \p pe PE to \p dst at \p pe. - * The caller must call into rocshmem_quiet() if remote completion is - * required. - * - * This function can be called from divergent control paths at per-thread - * granularity. However, performance may be improved if the caller can - * coalesce contiguous messages and elect a leader thread to call into the - * rocSHMEM function. - * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. Must be an address on the symmetric - * heap. - * @param[in] value Value to write to dest at \p pe. - * @param[in] pe PE of the remote process. - * - * @return void. 
- */ -///@{ -P_API_GEN(float, float) -P_API_GEN(double, double) -// P_API_GEN(long double, longdouble) -P_API_GEN(char, char) -P_API_GEN(signed char, schar) -P_API_GEN(short, short) // NOLINT(runtime/int) -P_API_GEN(int, int) -P_API_GEN(long, long) // NOLINT(runtime/int) -P_API_GEN(long long, longlong) // NOLINT(runtime/int) -P_API_GEN(unsigned char, uchar) -P_API_GEN(unsigned short, ushort) // NOLINT(runtime/int) -P_API_GEN(unsigned int, uint) -P_API_GEN(unsigned long, ulong) // NOLINT(runtime/int) -P_API_GEN(unsigned long long, ulonglong) // NOLINT(runtime/int) -///@} - -/** - * @name SHMEM_GET - * @brief Reads contiguous data of \p nelems elements from \p source on \p pe - * to \p dest on the calling PE. The calling work-group will block until the - * operation completes (data has been placed in \p dest). - * - * This function can be called from divergent control paths at per-thread - * granularity. However, performance may be improved if the caller can - * coalesce contiguous messages and elect a leader thread to call into the - * rocSHMEM function. - * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. Must be an address on the symmetric - * heap. - * @param[in] source Source address. Must be an address on the symmetric heap. - * @param[in] nelems Size of the transfer in bytes. - * @param[in] pe PE of the remote process. - * - * @return void. 
- */ -///@{ -GET_API_GEN(float, float) -GET_API_GEN(double, double) -// GET_API_GEN(long double, longdouble) -GET_API_GEN(char, char) -GET_API_GEN(signed char, schar) -GET_API_GEN(short, short) // NOLINT(runtime/int) -GET_API_GEN(int, int) -GET_API_GEN(long, long) // NOLINT(runtime/int) -GET_API_GEN(long long, longlong) // NOLINT(runtime/int) -GET_API_GEN(unsigned char, uchar) -GET_API_GEN(unsigned short, ushort) // NOLINT(runtime/int) -GET_API_GEN(unsigned int, uint) -GET_API_GEN(unsigned long, ulong) // NOLINT(runtime/int) -GET_API_GEN(unsigned long long, ulonglong) // NOLINT(runtime/int) -///@} - -/** - * @name SHMEM_G - * @brief reads and returns single value from \p source at \p pe. - * The calling work-group/thread will block until the operation completes. - * - * This function can be called from divergent control paths at per-thread - * granularity. However, performance may be improved if the caller can - * coalesce contiguous messages and elect a leader thread to call into the - * rocSHMEM function. - * - * @param[in] ctx Context with which to perform this operation. - * @param[in] source sourcen address. Must be an address on the symmetric - * heap. - * @param[in] pe PE of the remote process. - * - * @return the value read from remote \p source at \p pe. 
- */ -///@{ -G_API_GEN(float, float) -G_API_GEN(double, double) -// G_API_GEN(long double, longdouble) -G_API_GEN(char, char) -G_API_GEN(signed char, schar) -G_API_GEN(short, short) // NOLINT(runtime/int) -G_API_GEN(int, int) -G_API_GEN(long, long) // NOLINT(runtime/int) -G_API_GEN(long long, longlong) // NOLINT(runtime/int) -G_API_GEN(unsigned char, uchar) -G_API_GEN(unsigned short, ushort) // NOLINT(runtime/int) -G_API_GEN(unsigned int, uint) -G_API_GEN(unsigned long, ulong) // NOLINT(runtime/int) -G_API_GEN(unsigned long long, ulonglong) // NOLINT(runtime/int) -///@} - -/** - * @name SHMEM_PUT_NBI - * @brief Writes contiguous data of \p nelems elements from \p source on the - * calling PE to \p dest on \p pe. The operation is not blocking. The caller - * will return as soon as the request is posted. The caller must call - * rocshmem_quiet() on the same context if completion notification is - * required. - * - * This function can be called from divergent control paths at per-thread - * granularity. However, performance may be improved if the caller can - * coalesce contiguous messages and elect a leader thread to call into the - * rocSHMEM function. - * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. Must be an address on the symmetric - heap. - * @param[in] source Source address. Must be an address on the symmetric heap. - * @param[in] nelems Size of the transfer in bytes. - * @param[in] pe PE of the remote process. - * - * @return void. 
- */ -///@{ -PUT_NBI_API_GEN(float, float) -PUT_NBI_API_GEN(double, double) -// PUT_NBI_API_GEN(long double, longdouble) -PUT_NBI_API_GEN(char, char) -PUT_NBI_API_GEN(signed char, schar) -PUT_NBI_API_GEN(short, short) // NOLINT(runtime/int) -PUT_NBI_API_GEN(int, int) -PUT_NBI_API_GEN(long, long) // NOLINT(runtime/int) -PUT_NBI_API_GEN(long long, longlong) // NOLINT(runtime/int) -PUT_NBI_API_GEN(unsigned char, uchar) -PUT_NBI_API_GEN(unsigned short, ushort) // NOLINT(runtime/int) -PUT_NBI_API_GEN(unsigned int, uint) -PUT_NBI_API_GEN(unsigned long, ulong) // NOLINT(runtime/int) -PUT_NBI_API_GEN(unsigned long long, ulonglong) // NOLINT(runtime/int) -///@} - -/** - * @name SHMEM_GET_NBI - * @brief Reads contiguous data of \p nelems elements from \p source on \p pe - * to \p dest on the calling PE. The operation is not blocking. The caller will - * return as soon as the request is posted. The caller must call - * rocshmem_quiet() on the same context if completion notification is - * required. - * - * This function can be called from divergent control paths at per-thread - * granularity. However, performance may be improved if the caller can - * coalesce contiguous messages and elect a leader thread to call into the - * rocSHMEM function. - * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. Must be an address on the symmetric - * heap. - * @param[in] source Source address. Must be an address on the symmetric heap. - * @param[in] nelems Size of the transfer in bytes. - * @param[in] pe PE of the remote process. - * - * @return void. 
- */ -///@{ -GET_NBI_API_GEN(float, float) -GET_NBI_API_GEN(double, double) -// GET_NBI_API_GEN(long double, longdouble) -GET_NBI_API_GEN(char, char) -GET_NBI_API_GEN(signed char, schar) -GET_NBI_API_GEN(short, short) // NOLINT(runtime/int) -GET_NBI_API_GEN(int, int) -GET_NBI_API_GEN(long, long) // NOLINT(runtime/int) -GET_NBI_API_GEN(long long, longlong) // NOLINT(runtime/int) -GET_NBI_API_GEN(unsigned char, uchar) -GET_NBI_API_GEN(unsigned short, ushort) // NOLINT(runtime/int) -GET_NBI_API_GEN(unsigned int, uint) -GET_NBI_API_GEN(unsigned long, ulong) // NOLINT(runtime/int) -GET_NBI_API_GEN(unsigned long long, ulonglong) // NOLINT(runtime/int) -///@} - -/** - * @name SHMEM_ATOMIC_FETCH_ADD - * @brief Atomically add the value \p val to \p dest on \p pe. The operation - * returns the older value of \p dest to the calling PE. - * - * The operation is blocking. - * - * This function can be called from divergent control paths at per-thread - * granularity. - * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. Must be an address on the symmetric - heap. - * @param[in] val The value to be atomically added. - * @param[in] pe PE of the remote process. - * - * @return The old value of \p dest before the \p val was added. - */ -///@{ -ATOMIC_FETCH_ADD_API_GEN(int, int) -ATOMIC_FETCH_ADD_API_GEN(long, long) -ATOMIC_FETCH_ADD_API_GEN(long long, longlong) -ATOMIC_FETCH_ADD_API_GEN(unsigned int, uint) -ATOMIC_FETCH_ADD_API_GEN(unsigned long, ulong) -ATOMIC_FETCH_ADD_API_GEN(unsigned long long, ulonglong) -ATOMIC_FETCH_ADD_API_GEN(int32_t, int32) -ATOMIC_FETCH_ADD_API_GEN(int64_t, int64) -ATOMIC_FETCH_ADD_API_GEN(uint32_t, uint32) -ATOMIC_FETCH_ADD_API_GEN(uint64_t, uint64) -ATOMIC_FETCH_ADD_API_GEN(size_t, size) -ATOMIC_FETCH_ADD_API_GEN(ptrdiff_t, ptrdiff) -///@} - -/** - * @name SHMEM_ATOMIC_COMPARE_SWAP - * @brief Atomically compares if the value in \p dest with \p cond is equal - * then put \p val in \p dest. 
The operation returns the older value of \p dest - * to the calling PE. - * - * The operation is blocking. - * - * This function can be called from divergent control paths at per-thread - * granularity. - * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. Must be an address on the symmetric - heap. - * @param[in] cond The value to be compare with. - * @param[in] val The value to be atomically swapped. - * @param[in] pe PE of the remote process. - * - * @return The old value of \p dest. - */ -///@{ -ATOMIC_COMPARE_SWAP_API_GEN(int, int) -ATOMIC_COMPARE_SWAP_API_GEN(long, long) -ATOMIC_COMPARE_SWAP_API_GEN(long long, longlong) -ATOMIC_COMPARE_SWAP_API_GEN(unsigned int, uint) -ATOMIC_COMPARE_SWAP_API_GEN(unsigned long, ulong) -ATOMIC_COMPARE_SWAP_API_GEN(unsigned long long, ulonglong) -ATOMIC_COMPARE_SWAP_API_GEN(int32_t, int32) -ATOMIC_COMPARE_SWAP_API_GEN(int64_t, int64) -ATOMIC_COMPARE_SWAP_API_GEN(uint32_t, uint32) -ATOMIC_COMPARE_SWAP_API_GEN(uint64_t, uint64) -ATOMIC_COMPARE_SWAP_API_GEN(size_t, size) -ATOMIC_COMPARE_SWAP_API_GEN(ptrdiff_t, ptrdiff) -///@} - -/** - * @name SHMEM_ATOMIC_FETCH_INC - * @brief Atomically add 1 to \p dest on \p pe. The operation - * returns the older value of \p dest to the calling PE. - * - * The operation is blocking. - * - * This function can be called from divergent control paths at per-thread - * granularity. - * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. Must be an address on the symmetric - heap. - * @param[in] pe PE of the remote process. - * - * @return The old value of \p dest before it was incremented by 1. 
- */ -///@{ -ATOMIC_FETCH_INC_API_GEN(int, int) -ATOMIC_FETCH_INC_API_GEN(long, long) -ATOMIC_FETCH_INC_API_GEN(long long, longlong) -ATOMIC_FETCH_INC_API_GEN(unsigned int, uint) -ATOMIC_FETCH_INC_API_GEN(unsigned long, ulong) -ATOMIC_FETCH_INC_API_GEN(unsigned long long, ulonglong) -ATOMIC_FETCH_INC_API_GEN(int32_t, int32) -ATOMIC_FETCH_INC_API_GEN(int64_t, int64) -ATOMIC_FETCH_INC_API_GEN(uint32_t, uint32) -ATOMIC_FETCH_INC_API_GEN(uint64_t, uint64) -ATOMIC_FETCH_INC_API_GEN(size_t, size) -ATOMIC_FETCH_INC_API_GEN(ptrdiff_t, ptrdiff) -///@} - -/** - * @name SHMEM_ATOMIC_FETCH - * @brief Atomically return the value of \p dest to the calling PE. - * - * The operation is blocking. - * - * This function can be called from divergent control paths at per-thread - * granularity. - * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. Must be an address on the symmetric - heap. - * @param[in] val The value to be atomically added. - * @param[in] pe PE of the remote process. - * - * @return The value of \p dest. - */ -///@{ -ATOMIC_FETCH_API_GEN(float, float) -ATOMIC_FETCH_API_GEN(double, double) -ATOMIC_FETCH_API_GEN(int, int) -ATOMIC_FETCH_API_GEN(long, long) -ATOMIC_FETCH_API_GEN(long long, longlong) -ATOMIC_FETCH_API_GEN(unsigned int, uint) -ATOMIC_FETCH_API_GEN(unsigned long, ulong) -ATOMIC_FETCH_API_GEN(unsigned long long, ulonglong) -ATOMIC_FETCH_API_GEN(int32_t, int32) -ATOMIC_FETCH_API_GEN(int64_t, int64) -ATOMIC_FETCH_API_GEN(uint32_t, uint32) -ATOMIC_FETCH_API_GEN(uint64_t, uint64) -ATOMIC_FETCH_API_GEN(size_t, size) -ATOMIC_FETCH_API_GEN(ptrdiff_t, ptrdiff) -///@} - -/** - * @name SHMEM_ATOMIC_ADD - * @brief Atomically add the value \p val to \p dest on \p pe. - * - * The operation is blocking. - * - * This function can be called from divergent control paths at per-thread - * granularity. - * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. 
Must be an address on the symmetric - heap. - * @param[in] val The value to be atomically added. - * @param[in] pe PE of the remote process. - * - * @return void - */ -///@{ -ATOMIC_ADD_API_GEN(int, int) -ATOMIC_ADD_API_GEN(long, long) -ATOMIC_ADD_API_GEN(long long, longlong) -ATOMIC_ADD_API_GEN(unsigned int, uint) -ATOMIC_ADD_API_GEN(unsigned long, ulong) -ATOMIC_ADD_API_GEN(unsigned long long, ulonglong) -ATOMIC_ADD_API_GEN(int32_t, int32) -ATOMIC_ADD_API_GEN(int64_t, int64) -ATOMIC_ADD_API_GEN(uint32_t, uint32) -ATOMIC_ADD_API_GEN(uint64_t, uint64) -ATOMIC_ADD_API_GEN(size_t, size) -ATOMIC_ADD_API_GEN(ptrdiff_t, ptrdiff) -///@} - -/** - * @name SHMEM_ATOMIC_SET - * @brief Atomically set the value \p val to \p dest on \p pe. - * - * The operation is blocking. - * - * This function can be called from divergent control paths at per-thread - * granularity. - * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. Must be an address on the symmetric - heap. - * @param[in] val The value to be atomically added. - * @param[in] pe PE of the remote process. - * - * @return void - */ -///@{ -ATOMIC_SET_API_GEN(float, float) -ATOMIC_SET_API_GEN(double, double) -ATOMIC_SET_API_GEN(int, int) -ATOMIC_SET_API_GEN(long, long) -ATOMIC_SET_API_GEN(long long, longlong) -ATOMIC_SET_API_GEN(unsigned int, uint) -ATOMIC_SET_API_GEN(unsigned long, ulong) -ATOMIC_SET_API_GEN(unsigned long long, ulonglong) -ATOMIC_SET_API_GEN(int32_t, int32) -ATOMIC_SET_API_GEN(int64_t, int64) -ATOMIC_SET_API_GEN(uint32_t, uint32) -ATOMIC_SET_API_GEN(uint64_t, uint64) -ATOMIC_SET_API_GEN(size_t, size) -ATOMIC_SET_API_GEN(ptrdiff_t, ptrdiff) -///@} - -/** - * @name SHMEM_ATOMIC_SWAP - * @brief Atomically swap the value \p val to \p dest on \p pe. - * - * The operation is blocking. - * - * This function can be called from divergent control paths at per-thread - * granularity. - * - * @param[in] ctx Context with which to perform this operation. 
- * @param[in] dest Destination address. Must be an address on the symmetric - heap. - * @param[in] val The value to be atomically added. - * @param[in] pe PE of the remote process. - * - * @return original value - */ -///@{ -ATOMIC_SWAP_API_GEN(float, float) -ATOMIC_SWAP_API_GEN(double, double) -ATOMIC_SWAP_API_GEN(int, int) -ATOMIC_SWAP_API_GEN(long, long) -ATOMIC_SWAP_API_GEN(long long, longlong) -ATOMIC_SWAP_API_GEN(unsigned int, uint) -ATOMIC_SWAP_API_GEN(unsigned long, ulong) -ATOMIC_SWAP_API_GEN(unsigned long long, ulonglong) -ATOMIC_SWAP_API_GEN(int32_t, int32) -ATOMIC_SWAP_API_GEN(int64_t, int64) -ATOMIC_SWAP_API_GEN(uint32_t, uint32) -ATOMIC_SWAP_API_GEN(uint64_t, uint64) -ATOMIC_SWAP_API_GEN(size_t, size) -ATOMIC_SWAP_API_GEN(ptrdiff_t, ptrdiff) -///@} - -/** - * @name SHMEM_ATOMIC_FETCH_AND - * @brief Atomically bitwise-and the value \p val to \p dest on \p pe. - * - * The operation is blocking. - * - * This function can be called from divergent control paths at per-thread - * granularity. - * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. Must be an address on the symmetric - heap. - * @param[in] val The value to be atomically added. - * @param[in] pe PE of the remote process. - * - * @return original value - */ -///@{ -ATOMIC_FETCH_AND_API_GEN(unsigned int, uint) -ATOMIC_FETCH_AND_API_GEN(unsigned long, ulong) -ATOMIC_FETCH_AND_API_GEN(unsigned long long, ulonglong) -ATOMIC_FETCH_AND_API_GEN(int32_t, int32) -ATOMIC_FETCH_AND_API_GEN(int64_t, int64) -ATOMIC_FETCH_AND_API_GEN(uint32_t, uint32) -ATOMIC_FETCH_AND_API_GEN(uint64_t, uint64) -///@} - -/** - * @name SHMEM_ATOMIC_AND - * @brief Atomically bitwise-and the value \p val to \p dest on \p pe. - * - * The operation is blocking. - * - * This function can be called from divergent control paths at per-thread - * granularity. - * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. 
Must be an address on the symmetric - heap. - * @param[in] val The value to be atomically added. - * @param[in] pe PE of the remote process. - * - * @return void - */ -///@{ -ATOMIC_AND_API_GEN(unsigned int, uint) -ATOMIC_AND_API_GEN(unsigned long, ulong) -ATOMIC_AND_API_GEN(unsigned long long, ulonglong) -ATOMIC_AND_API_GEN(int32_t, int32) -ATOMIC_AND_API_GEN(int64_t, int64) -ATOMIC_AND_API_GEN(uint32_t, uint32) -ATOMIC_AND_API_GEN(uint64_t, uint64) -///@} - -/** - * @name SHMEM_ATOMIC_FETCH_OR - * @brief Atomically bitwise-or the value \p val to \p dest on \p pe. - * - * The operation is blocking. - * - * This function can be called from divergent control paths at per-thread - * granularity. - * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. Must be an address on the symmetric - heap. - * @param[in] val The value to be atomically added. - * @param[in] pe PE of the remote process. - * - * @return original value - */ -///@{ -ATOMIC_FETCH_OR_API_GEN(unsigned int, uint) -ATOMIC_FETCH_OR_API_GEN(unsigned long, ulong) -ATOMIC_FETCH_OR_API_GEN(unsigned long long, ulonglong) -ATOMIC_FETCH_OR_API_GEN(int32_t, int32) -ATOMIC_FETCH_OR_API_GEN(int64_t, int64) -ATOMIC_FETCH_OR_API_GEN(uint32_t, uint32) -ATOMIC_FETCH_OR_API_GEN(uint64_t, uint64) -///@} - -/** - * @name SHMEM_ATOMIC_OR - * @brief Atomically bitwise-or the value \p val to \p dest on \p pe. - * - * The operation is blocking. - * - * This function can be called from divergent control paths at per-thread - * granularity. - * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. Must be an address on the symmetric - heap. - * @param[in] val The value to be atomically added. - * @param[in] pe PE of the remote process. 
- * - * @return void - */ -///@{ -ATOMIC_OR_API_GEN(unsigned int, uint) -ATOMIC_OR_API_GEN(unsigned long, ulong) -ATOMIC_OR_API_GEN(unsigned long long, ulonglong) -ATOMIC_OR_API_GEN(int32_t, int32) -ATOMIC_OR_API_GEN(int64_t, int64) -ATOMIC_OR_API_GEN(uint32_t, uint32) -ATOMIC_OR_API_GEN(uint64_t, uint64) -///@} - -/** - * @name SHMEM_ATOMIC_FETCH_XOR - * @brief Atomically bitwise-xor the value \p val to \p dest on \p pe. - * - * The operation is blocking. - * - * This function can be called from divergent control paths at per-thread - * granularity. - * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. Must be an address on the symmetric - heap. - * @param[in] val The value to be atomically added. - * @param[in] pe PE of the remote process. - * - * @return original value - */ -///@{ -ATOMIC_FETCH_XOR_API_GEN(unsigned int, uint) -ATOMIC_FETCH_XOR_API_GEN(unsigned long, ulong) -ATOMIC_FETCH_XOR_API_GEN(unsigned long long, ulonglong) -ATOMIC_FETCH_XOR_API_GEN(int32_t, int32) -ATOMIC_FETCH_XOR_API_GEN(int64_t, int64) -ATOMIC_FETCH_XOR_API_GEN(uint32_t, uint32) -ATOMIC_FETCH_XOR_API_GEN(uint64_t, uint64) -///@} - -/** - * @name SHMEM_ATOMIC_XOR - * @brief Atomically bitwise-xor the value \p val to \p dest on \p pe. - * - * The operation is blocking. - * - * This function can be called from divergent control paths at per-thread - * granularity. - * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. Must be an address on the symmetric - heap. - * @param[in] val The value to be atomically added. - * @param[in] pe PE of the remote process. 
- * - * @return void - */ -///@{ -ATOMIC_XOR_API_GEN(unsigned int, uint) -ATOMIC_XOR_API_GEN(unsigned long, ulong) -ATOMIC_XOR_API_GEN(unsigned long long, ulonglong) -ATOMIC_XOR_API_GEN(int32_t, int32) -ATOMIC_XOR_API_GEN(int64_t, int64) -ATOMIC_XOR_API_GEN(uint32_t, uint32) -ATOMIC_XOR_API_GEN(uint64_t, uint64) -///@} - -/** - * @name SHMEM_ATOMIC_INC - * @brief Atomically add 1 to \p dest on \p pe. - * - * The operation is blocking. - * - * This function can be called from divergent control paths at per-thread - * granularity. - * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. Must be an address on the symmetric - heap. - * @param[in] pe PE of the remote process. - * - * @return void - */ -///@{ -ATOMIC_INC_API_GEN(int, int) -ATOMIC_INC_API_GEN(long, long) -ATOMIC_INC_API_GEN(long long, longlong) -ATOMIC_INC_API_GEN(unsigned int, uint) -ATOMIC_INC_API_GEN(unsigned long, ulong) -ATOMIC_INC_API_GEN(unsigned long long, ulonglong) -ATOMIC_INC_API_GEN(int32_t, int32) -ATOMIC_INC_API_GEN(int64_t, int64) -ATOMIC_INC_API_GEN(uint32_t, uint32) -ATOMIC_INC_API_GEN(uint64_t, uint64) -ATOMIC_INC_API_GEN(size_t, size) -ATOMIC_INC_API_GEN(ptrdiff_t, ptrdiff) -///@} - -/** - * @name SHMEM_WAIT_UNTIL - * @brief Block the caller until the condition (* \p ptr \p cmps \p val) is - * true. - * - * This function can be called from divergent control paths at per-thread - * granularity. However, performance may be improved if the caller can - * coalesce contiguous messages and elect a leader thread to call into the - * rocSHMEM function. - * - * @param[in] ivars Pointer to memory on the symmetric heap to wait for. - * @param[in] cmp Operation for the comparison. - * @param[in] val Value to compare the memory at \p ptr to. 
- * - * @return void - */ -///@{ -WAIT_UNTIL_API_GEN(float, float) -WAIT_UNTIL_API_GEN(double, double) -// WAIT_UNTIL_API_GEN(long double, longdouble) -WAIT_UNTIL_API_GEN(char, char) -WAIT_UNTIL_API_GEN(signed char, schar) -WAIT_UNTIL_API_GEN(short, short) // NOLINT(runtime/int) -WAIT_UNTIL_API_GEN(int, int) -WAIT_UNTIL_API_GEN(long, long) // NOLINT(runtime/int) -WAIT_UNTIL_API_GEN(long long, longlong) // NOLINT(runtime/int) -WAIT_UNTIL_API_GEN(unsigned char, uchar) -WAIT_UNTIL_API_GEN(unsigned short, ushort) // NOLINT(runtime/int) -WAIT_UNTIL_API_GEN(unsigned int, uint) -WAIT_UNTIL_API_GEN(unsigned long, ulong) // NOLINT(runtime/int) -WAIT_UNTIL_API_GEN(unsigned long long, ulonglong) // NOLINT(runtime/int) -///@} - -/** - * @name SHMEM_TEST - * @brief test if the condition (* \p ptr \p cmps \p val) is - * true. - * - * This function can be called from divergent control paths at per-thread - * granularity. However, performance may be improved if the caller can - * coalesce contiguous messages and elect a leader thread to call into the - * rocSHMEM function. - * - * @param[in] ivars Pointer to memory on the symmetric heap to wait for. - * @param[in] cmp Operation for the comparison. - * @param[in] val Value to compare the memory at \p ptr to. 
- * - * @return 1 if the evaluation is true else 0 - */ -///@{ -TEST_API_GEN(float, float) -TEST_API_GEN(double, double) -// TEST_API_GEN(long double, longdouble) -TEST_API_GEN(char, char) -TEST_API_GEN(signed char, schar) -TEST_API_GEN(short, short) // NOLINT(runtime/int) -TEST_API_GEN(int, int) -TEST_API_GEN(long, long) // NOLINT(runtime/int) -TEST_API_GEN(long long, longlong) // NOLINT(runtime/int) -TEST_API_GEN(unsigned char, uchar) -TEST_API_GEN(unsigned short, ushort) // NOLINT(runtime/int) -TEST_API_GEN(unsigned int, uint) -TEST_API_GEN(unsigned long, ulong) // NOLINT(runtime/int) -TEST_API_GEN(unsigned long long, ulonglong) // NOLINT(runtime/int) -///@} - -/****************************************************************************** - ***************************** API EXTENSIONS ********************************* - *****************************************************************************/ - -/* - * MACRO DECLARE SHMEM_PUT APIs - */ -#define PUT_API_EXT_GEN(GRAN, T, TNAME) \ - __device__ ATTR_NO_INLINE void rocshmem_ctx_##TNAME##_put_##GRAN( \ - rocshmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe); \ - __device__ ATTR_NO_INLINE void rocshmem_##TNAME##_put_##GRAN( \ - T *dest, const T *source, size_t nelems, int pe); - -/* - * MACRO DECLARE SHMEM_GET APIs - */ -#define GET_API_EXT_GEN(GRAN, T, TNAME) \ - __device__ ATTR_NO_INLINE void rocshmem_ctx_##TNAME##_get_##GRAN( \ - rocshmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe); \ - __device__ ATTR_NO_INLINE void rocshmem_##TNAME##_get_##GRAN( \ - T *dest, const T *source, size_t nelems, int pe); - -/* - * MACRO DECLARE SHMEM_PUT_NBI APIs - */ -#define PUT_NBI_API_EXT_GEN(GRAN, T, TNAME) \ - __device__ ATTR_NO_INLINE void rocshmem_ctx_##TNAME##_put_nbi_##GRAN( \ - rocshmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe); \ - __device__ ATTR_NO_INLINE void rocshmem_##TNAME##_put_nbi_##GRAN( \ - T *dest, const T *source, size_t nelems, int pe); - -/* - * MACRO 
DECLARE SHMEM_GET_NBI APIs - */ -#define GET_NBI_API_EXT_GEN(GRAN, T, TNAME) \ - __device__ ATTR_NO_INLINE void rocshmem_ctx_##TNAME##_get_nbi_##GRAN( \ - rocshmem_ctx_t ctx, T *dest, const T *source, size_t nelems, int pe); \ - __device__ ATTR_NO_INLINE void rocshmem_##TNAME##_get_nbi_##GRAN( \ - T *dest, const T *source, size_t nelems, int pe); - -/** - * @brief Writes contiguous data of \p nelems bytes from \p source on the - * calling PE to \p dest at \p pe. The caller will block until the operation - * completes locally (it is safe to reuse \p source). The caller must - * call into rocshmem_quiet() if remote completion is required. - * - * This function can be called from divergent control paths at per-wave - * granularity. However, all threads in a wave must participate in the - * call using the same parameters. - * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. Must be an address on the symmetric - * heap. - * @param[in] source Source address. Must be an address on the symmetric heap. - * @param[in] nelems Size of the transfer in number of elements. - * @param[in] pe PE of the remote process. - * - * @return void. - */ -__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_wave( - rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, int pe); - -__device__ ATTR_NO_INLINE void rocshmem_putmem_wave(void *dest, - const void *source, - size_t nelems, int pe); - -/** - * @brief Writes contiguous data of \p nelems bytes from \p source on the - * calling PE to \p dest at \p pe. The caller will block until the operation - * completes locally (it is safe to reuse \p source). The caller must - * call into rocshmem_quiet() if remote completion is required. - * - * This function can be called from divergent control paths at per-workgroup - * (WG) granularity. However, all threads in the workgroup must participate in - * the call using the same parameters. 
- * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. Must be an address on the symmetric - * heap. - * @param[in] source Source address. Must be an address on the symmetric heap. - * @param[in] nelems Size of the transfer in number of elements. - * @param[in] pe PE of the remote process. - * - * @return void. - */ -__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_wg(rocshmem_ctx_t ctx, - void *dest, - const void *source, - size_t nelems, int pe); - -__device__ ATTR_NO_INLINE void rocshmem_putmem_wg(void *dest, - const void *source, - size_t nelems, int pe); - -/** - * @brief Writes contiguous data of \p nelems elements from \p source on the - * calling PE to \p dest at \p pe. The caller will block until the operation - * completes locally (it is safe to reuse \p source). The caller must - * call into rocshmem_quiet() if remote completion is required. - * - * This function can be called from divergent control paths at per-wave - * granularity. However, all threads in a wave must collectively participate - * in the call using the same arguments - * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. Must be an address on the symmetric - * heap. - * @param[in] source Source address. Must be an address on the symmetric heap. - * @param[in] nelems Size of the transfer in number of elements. - * @param[in] pe PE of the remote process. - * - * @return void. 
- */ -///@{ -PUT_API_EXT_GEN(wave, float, float) -PUT_API_EXT_GEN(wave, double, double) -// PUT_API_EXT_GEN(wave, long double, longdouble) -PUT_API_EXT_GEN(wave, char, char) -PUT_API_EXT_GEN(wave, signed char, schar) -PUT_API_EXT_GEN(wave, short, short) // NOLINT(runtime/int) -PUT_API_EXT_GEN(wave, int, int) -PUT_API_EXT_GEN(wave, long, long) // NOLINT(runtime/int) -PUT_API_EXT_GEN(wave, long long, longlong) // NOLINT(runtime/int) -PUT_API_EXT_GEN(wave, unsigned char, uchar) -PUT_API_EXT_GEN(wave, unsigned short, ushort) // NOLINT(runtime/int) -PUT_API_EXT_GEN(wave, unsigned int, uint) -PUT_API_EXT_GEN(wave, unsigned long, ulong) // NOLINT(runtime/int) -PUT_API_EXT_GEN(wave, unsigned long long, ulonglong) // NOLINT(runtime/int) -///@} - -/** - * @brief Writes contiguous data of \p nelems elements from \p source on the - * calling PE to \p dest at \p pe. The caller will block until the operation - * completes locally (it is safe to reuse \p source). The caller must - * call into rocshmem_quiet() if remote completion is required. - * - * This function can be called from divergent control paths at per-workgroub - * (WG) granularity. However, All threads in a WG must collectively participate - * in the call using the same arguments. - * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. Must be an address on the symmetric - * heap. - * @param[in] source Source address. Must be an address on the symmetric heap. - * @param[in] nelems Size of the transfer in number of elements. - * @param[in] pe PE of the remote process. - * - * @return void. 
- */ -///@{ -PUT_API_EXT_GEN(wg, float, float) -PUT_API_EXT_GEN(wg, double, double) -// PUT_API_EXT_GEN(wg, long double, longdouble) -PUT_API_EXT_GEN(wg, char, char) -PUT_API_EXT_GEN(wg, signed char, schar) -PUT_API_EXT_GEN(wg, short, short) // NOLINT(runtime/int) -PUT_API_EXT_GEN(wg, int, int) -PUT_API_EXT_GEN(wg, long, long) // NOLINT(runtime/int) -PUT_API_EXT_GEN(wg, long long, longlong) // NOLINT(runtime/int) -PUT_API_EXT_GEN(wg, unsigned char, uchar) -PUT_API_EXT_GEN(wg, unsigned short, ushort) // NOLINT(runtime/int) -PUT_API_EXT_GEN(wg, unsigned int, uint) -PUT_API_EXT_GEN(wg, unsigned long, ulong) // NOLINT(runtime/int) -PUT_API_EXT_GEN(wg, unsigned long long, ulonglong) // NOLINT(runtime/int) -///@} - -/** - * @brief Reads contiguous data of \p nelems bytes from \p source on \p pe - * to \p dest on the calling PE. The calling work-group will block until the - * operation completes (data has been placed in \p dest). - * - * This function can be called from divergent control paths at per-wave - * granularity. However, all threads in a the wave must participate in the - * call using the same parameters - * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. Must be an address on the symmetric - * heap. - * @param[in] source Source address. Must be an address on the symmetric heap. - * @param[in] nelems Size of the transfer in bytes. - * @param[in] pe PE of the remote process. - * - * @return void. - */ -__device__ ATTR_NO_INLINE void rocshmem_ctx_getmem_wave( - rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, int pe); - -__device__ ATTR_NO_INLINE void rocshmem_getmem_wave(void *dest, - const void *source, - size_t nelems, int pe); - -/** - * @brief Reads contiguous data of \p nelems bytes from \p source on \p pe - * to \p dest on the calling PE. The calling work-group will block until the - * operation completes (data has been placed in \p dest). 
- * - * This function can be called from divergent control paths at per-workgroup - * (WG) granularity. However, all threads in the workgroup must participate - * in the call using the same parameters - * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. Must be an address on the symmetric - * heap. - * @param[in] source Source address. Must be an address on the symmetric heap. - * @param[in] nelems Size of the transfer in bytes. - * @param[in] pe PE of the remote process. - * - * @return void. - */ -__device__ ATTR_NO_INLINE void rocshmem_ctx_getmem_wg(rocshmem_ctx_t ctx, - void *dest, - const void *source, - size_t nelems, int pe); - -__device__ ATTR_NO_INLINE void rocshmem_getmem_wg(void *dest, - const void *source, - size_t nelems, int pe); - -/** - * @brief Reads contiguous data of \p nelems elements from \p source on \p pe - * to \p dest on the calling PE. The calling work-group will block until the - * operation completes (data has been placed in \p dest). - * - * This function can be called from divergent control paths at per-wave - * granularity. However, all threads in the wave must participate in the - * call using the same parameters - * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. Must be an address on the symmetric - * heap. - * @param[in] source Source address. Must be an address on the symmetric heap. - * @param[in] nelems Size of the transfer in bytes. - * @param[in] pe PE of the remote process. - * - * @return void. 
- */ -///@{ -GET_API_EXT_GEN(wave, float, float) -GET_API_EXT_GEN(wave, double, double) -// GET_API_EXT_GEN(wave, long double, longdouble) -GET_API_EXT_GEN(wave, char, char) -GET_API_EXT_GEN(wave, signed char, schar) -GET_API_EXT_GEN(wave, short, short) // NOLINT(runtime/int) -GET_API_EXT_GEN(wave, int, int) -GET_API_EXT_GEN(wave, long, long) // NOLINT(runtime/int) -GET_API_EXT_GEN(wave, long long, longlong) // NOLINT(runtime/int) -GET_API_EXT_GEN(wave, unsigned char, uchar) -GET_API_EXT_GEN(wave, unsigned short, ushort) // NOLINT(runtime/int) -GET_API_EXT_GEN(wave, unsigned int, uint) -GET_API_EXT_GEN(wave, unsigned long, ulong) // NOLINT(runtime/int) -GET_API_EXT_GEN(wave, unsigned long long, ulonglong) // NOLINT(runtime/int) -///@} - -/** - * @brief Reads contiguous data of \p nelems elements from \p source on \p pe - * to \p dest on the calling PE. The calling work-group will block until the - * operation completes (data has been placed in \p dest). - * - * This function can be called from divergent control paths at per-workgroup - * granularity. However, all threads in the workgroup must participate in - * the call using the same parameters - * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. Must be an address on the symmetric - * heap. - * @param[in] source Source address. Must be an address on the symmetric heap. - * @param[in] nelems Size of the transfer in bytes. - * @param[in] pe PE of the remote process. - * - * @return void. 
- */ -///@{ -GET_API_EXT_GEN(wg, float, float) -GET_API_EXT_GEN(wg, double, double) -// GET_API_EXT_GEN(wg, long double, longdouble) -GET_API_EXT_GEN(wg, char, char) -GET_API_EXT_GEN(wg, signed char, schar) -GET_API_EXT_GEN(wg, short, short) // NOLINT(runtime/int) -GET_API_EXT_GEN(wg, int, int) -GET_API_EXT_GEN(wg, long, long) // NOLINT(runtime/int) -GET_API_EXT_GEN(wg, long long, longlong) // NOLINT(runtime/int) -GET_API_EXT_GEN(wg, unsigned char, uchar) -GET_API_EXT_GEN(wg, unsigned short, ushort) // NOLINT(runtime/int) -GET_API_EXT_GEN(wg, unsigned int, uint) -GET_API_EXT_GEN(wg, unsigned long, ulong) // NOLINT(runtime/int) -GET_API_EXT_GEN(wg, unsigned long long, ulonglong) // NOLINT(runtime/int) -///@} - -/** - * @brief Writes contiguous data of \p nelems bytes from \p source on the - * calling PE to \p dest on \p pe. The operation is not blocking. The caller - * will return as soon as the request is posted. The caller must call - * rocshmem_quiet() on the same context if completion notification is - * required. - * - * This function can be called from divergent control paths at per-wave - * granularity. However, all threads in a wave must call in with the same - * parameters - * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. Must be an address on the symmetric - heap. - * @param[in] source Source address. Must be an address on the symmetric heap. - * @param[in] nelems Size of the transfer in bytes. - * @param[in] pe PE of the remote process. - * - * @return void. - */ -__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_nbi_wave( - rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, int pe); - -__device__ ATTR_NO_INLINE void rocshmem_putmem_nbi_wave(void *dest, - const void *source, - size_t nelems, - int pe); - -/** - * @brief Writes contiguous data of \p nelems elements from \p source on the - * calling PE to \p dest on \p pe. The operation is not blocking. 
The caller - * will return as soon as the request is posted. The caller must call - * rocshmem_quiet() on the same context if completion notification is - * required. - * - * This function can be called from divergent control paths at per-wave - * granularity. However, all threads in the wave must call in with the same - * arguments. - * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. Must be an address on the symmetric - heap. - * @param[in] source Source address. Must be an address on the symmetric heap. - * @param[in] nelems Size of the transfer in bytes. - * @param[in] pe PE of the remote process. - * - * @return void. - */ -///@{ -PUT_NBI_API_EXT_GEN(wave, float, float) -PUT_NBI_API_EXT_GEN(wave, double, double) -// PUT_NBI_API_EXT_GEN(wave, long double, longdouble) -PUT_NBI_API_EXT_GEN(wave, char, char) -PUT_NBI_API_EXT_GEN(wave, signed char, schar) -PUT_NBI_API_EXT_GEN(wave, short, short) // NOLINT(runtime/int) -PUT_NBI_API_EXT_GEN(wave, int, int) -PUT_NBI_API_EXT_GEN(wave, long, long) // NOLINT(runtime/int) -PUT_NBI_API_EXT_GEN(wave, long long, longlong) // NOLINT(runtime/int) -PUT_NBI_API_EXT_GEN(wave, unsigned char, uchar) -PUT_NBI_API_EXT_GEN(wave, unsigned short, ushort) // NOLINT(runtime/int) -PUT_NBI_API_EXT_GEN(wave, unsigned int, uint) -PUT_NBI_API_EXT_GEN(wave, unsigned long, ulong) // NOLINT(runtime/int) -PUT_NBI_API_EXT_GEN(wave, unsigned long long, ulonglong) // NOLINT -///@} - -/** - * @brief Writes contiguous data of \p nelems bytes from \p source on the - * calling PE to \p dest on \p pe. The operation is not blocking. The caller - * will return as soon as the request is posted. The caller must call - * rocshmem_quiet() on the same context if completion notification is - * required. - * - * This function can be called from divergent control paths at per-workgroup - * granularity. 
However, all threads in a WG must call in with the same - * parameters - * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. Must be an address on the symmetric - heap. - * @param[in] source Source address. Must be an address on the symmetric heap. - * @param[in] nelems Size of the transfer in bytes. - * @param[in] pe PE of the remote process. - * - * @return void. - */ -__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_nbi_wg( - rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, int pe); - -__device__ ATTR_NO_INLINE void rocshmem_putmem_nbi_wg(void *dest, - const void *source, - size_t nelems, int pe); - -/** - * @brief Writes contiguous data of \p nelems elements from \p source on the - * calling PE to \p dest on \p pe. The operation is not blocking. The caller - * will return as soon as the request is posted. The caller must call - * rocshmem_quiet() on the same context if completion notification is - * required. - * - * This function can be called from divergent control paths at per-workgroup - * granularity. However, all threads in the WG must call in with the sameo - * arguments. - * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. Must be an address on the symmetric - heap. - * @param[in] source Source address. Must be an address on the symmetric heap. - * @param[in] nelems Size of the transfer in bytes. - * @param[in] pe PE of the remote process. - * - * @return void. 
- */ -///@{ -PUT_NBI_API_EXT_GEN(wg, float, float) -PUT_NBI_API_EXT_GEN(wg, double, double) -// PUT_NBI_API_EXT_GEN(wg, long double, longdouble) -PUT_NBI_API_EXT_GEN(wg, char, char) -PUT_NBI_API_EXT_GEN(wg, signed char, schar) -PUT_NBI_API_EXT_GEN(wg, short, short) // NOLINT(runtime/int) -PUT_NBI_API_EXT_GEN(wg, int, int) -PUT_NBI_API_EXT_GEN(wg, long, long) // NOLINT(runtime/int) -PUT_NBI_API_EXT_GEN(wg, long long, longlong) // NOLINT(runtime/int) -PUT_NBI_API_EXT_GEN(wg, unsigned char, uchar) -PUT_NBI_API_EXT_GEN(wg, unsigned short, ushort) // NOLINT(runtime/int) -PUT_NBI_API_EXT_GEN(wg, unsigned int, uint) -PUT_NBI_API_EXT_GEN(wg, unsigned long, ulong) // NOLINT(runtime/int) -PUT_NBI_API_EXT_GEN(wg, unsigned long long, ulonglong) // NOLINT(runtime/int) -///@} - -/** - * @brief Reads contiguous data of \p nelems bytes from \p source on \p pe - * to \p dest on the calling PE. The operation is not blocking. The caller - * will return as soon as the request is posted. The caller must call - * rocshmem_quiet() on the same context if completion notification is - * required. - * - * This function can be called from divergent control paths at per-wave - * granularity. However, all threads in the wave must call in with the same - * arguments. - * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. Must be an address on the symmetric - * heap. - * @param[in] source Source address. Must be an address on the symmetric heap. - * @param[in] nelems Size of the transfer in bytes. - * @param[in] pe PE of the remote process. - * - * @return void. - */ -__device__ ATTR_NO_INLINE void rocshmem_ctx_getmem_nbi_wave( - rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, int pe); - -__device__ ATTR_NO_INLINE void rocshmem_getmem_nbi_wave(void *dest, - const void *source, - size_t nelems, - int pe); - -/** - * @brief Reads contiguous data of \p nelems elements from \p source on \p pe - * to \p dest on the calling PE. 
The operation is not blocking. The caller - * will return as soon as the request is posted. The caller must call - * rocshmem_quiet() on the same context if completion notification is - * required. - * - * This function can be called from divergent control paths at per-wave - * granularity. However, all threads in the wave must call in with the same - * arguments. - * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. Must be an address on the symmetric - * heap. - * @param[in] source Source address. Must be an address on the symmetric heap. - * @param[in] nelems Size of the transfer in bytes. - * @param[in] pe PE of the remote process. - * - * @return void. - */ -///@{ -GET_NBI_API_EXT_GEN(wave, float, float) -GET_NBI_API_EXT_GEN(wave, double, double) -// GET_NBI_API_EXT_GEN(wave, long double, longdouble) -GET_NBI_API_EXT_GEN(wave, char, char) -GET_NBI_API_EXT_GEN(wave, signed char, schar) -GET_NBI_API_EXT_GEN(wave, short, short) // NOLINT(runtime/int) -GET_NBI_API_EXT_GEN(wave, int, int) -GET_NBI_API_EXT_GEN(wave, long, long) // NOLINT(runtime/int) -GET_NBI_API_EXT_GEN(wave, long long, longlong) // NOLINT(runtime/int) -GET_NBI_API_EXT_GEN(wave, unsigned char, uchar) -GET_NBI_API_EXT_GEN(wave, unsigned short, ushort) // NOLINT(runtime/int) -GET_NBI_API_EXT_GEN(wave, unsigned int, uint) -GET_NBI_API_EXT_GEN(wave, unsigned long, ulong) // NOLINT(runtime/int) -GET_NBI_API_EXT_GEN(wave, unsigned long long, ulonglong) // NOLINT -///@} - -/** - * @brief Reads contiguous data of \p nelems bytes from \p source on \p pe - * to \p dest on the calling PE. The operation is not blocking. The caller - * will return as soon as the request is posted. The caller must call - * rocshmem_quiet() on the same context if completion notification is - * required. - * - * This function can be called from divergent control paths at per-workgroup - * granularity. However, all threads in the WG must call in with the same - * arguments. 
- * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. Must be an address on the symmetric - * heap. - * @param[in] source Source address. Must be an address on the symmetric heap. - * @param[in] nelems Size of the transfer in bytes. - * @param[in] pe PE of the remote process. - * - * @return void. - */ -__device__ ATTR_NO_INLINE void rocshmem_ctx_getmem_nbi_wg( - rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, int pe); - -__device__ ATTR_NO_INLINE void rocshmem_getmem_nbi_wg(void *dest, - const void *source, - size_t nelems, int pe); - -/** - * @brief Reads contiguous data of \p nelems elements from \p source on \p pe - * to \p dest on the calling PE. The operation is not blocking. The caller - * will return as soon as the request is posted. The caller must call - * rocshmem_quiet() on the same context if completion notification is - * required. - * - * This function can be called from divergent control paths at per-workgroup - * granularity. However, all threads in the WG must call in with the same - * arguments. - * - * @param[in] ctx Context with which to perform this operation. - * @param[in] dest Destination address. Must be an address on the symmetric - * heap. - * @param[in] source Source address. Must be an address on the symmetric heap. - * @param[in] nelems Size of the transfer in bytes. - * @param[in] pe PE of the remote process. - * - * @return void. 
- */ -///@{ -GET_NBI_API_EXT_GEN(wg, float, float) -GET_NBI_API_EXT_GEN(wg, double, double) -// GET_NBI_API_EXT_GEN(wg, long double, longdouble) -GET_NBI_API_EXT_GEN(wg, char, char) -GET_NBI_API_EXT_GEN(wg, signed char, schar) -GET_NBI_API_EXT_GEN(wg, short, short) // NOLINT(runtime/int) -GET_NBI_API_EXT_GEN(wg, int, int) -GET_NBI_API_EXT_GEN(wg, long, long) // NOLINT(runtime/int) -GET_NBI_API_EXT_GEN(wg, long long, longlong) // NOLINT(runtime/int) -GET_NBI_API_EXT_GEN(wg, unsigned char, uchar) -GET_NBI_API_EXT_GEN(wg, unsigned short, ushort) // NOLINT(runtime/int) -GET_NBI_API_EXT_GEN(wg, unsigned int, uint) -GET_NBI_API_EXT_GEN(wg, unsigned long, ulong) // NOLINT(runtime/int) -GET_NBI_API_EXT_GEN(wg, unsigned long long, ulonglong) // NOLINT(runtime/int) -///@} - - -/* - * ROCSHMEM Signalling Operations - */ -#define PUTMEM_SIGNAL_DEC(SUFFIX) \ - __device__ ATTR_NO_INLINE void rocshmem_putmem_signal##SUFFIX(void *dest, \ - const void *source, \ - size_t nelems, \ - uint64_t *sig_addr, \ - uint64_t signal, \ - int sig_op, int pe); \ - __device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_signal##SUFFIX(rocshmem_ctx_t ctx, \ - void *dest, \ - const void *source, \ - size_t nelems, \ - uint64_t *sig_addr, \ - uint64_t signal, \ - int sig_op, int pe); - -#define PUT_SIGNAL_TYPED_DEC(T, TNAME, SUFFIX) \ - __device__ ATTR_NO_INLINE void rocshmem_ctx_##TNAME##_put_signal##SUFFIX(rocshmem_ctx_t ctx, \ - T *dest, \ - const T *source, \ - size_t nelems, \ - uint64_t *sig_addr, \ - uint64_t signal, \ - int sig_op, int pe); \ - __device__ ATTR_NO_INLINE void rocshmem_##TNAME##_put_signal##SUFFIX(T *dest, \ - const T *source, \ - size_t nelems, \ - uint64_t *sig_addr, \ - uint64_t signal, \ - int sig_op, int pe); - -#define PUT_SIGNAL_DEC(SUFFIX) \ - PUT_SIGNAL_TYPED_DEC(float, float, SUFFIX) \ - PUT_SIGNAL_TYPED_DEC(double, double, SUFFIX) \ - PUT_SIGNAL_TYPED_DEC(char, char, SUFFIX) \ - PUT_SIGNAL_TYPED_DEC(signed char, schar, SUFFIX) \ - PUT_SIGNAL_TYPED_DEC(short, short, 
SUFFIX) \ - PUT_SIGNAL_TYPED_DEC(int, int, SUFFIX) \ - PUT_SIGNAL_TYPED_DEC(long, long, SUFFIX) \ - PUT_SIGNAL_TYPED_DEC(long long, longlong, SUFFIX) \ - PUT_SIGNAL_TYPED_DEC(unsigned char, uchar, SUFFIX) \ - PUT_SIGNAL_TYPED_DEC(unsigned short, ushort, SUFFIX) \ - PUT_SIGNAL_TYPED_DEC(unsigned int, uint, SUFFIX) \ - PUT_SIGNAL_TYPED_DEC(unsigned long, ulong, SUFFIX) \ - PUT_SIGNAL_TYPED_DEC(unsigned long long, ulonglong, SUFFIX) - -#define SIGNALING_API_DEC(SUFFIX) \ - PUTMEM_SIGNAL_DEC(SUFFIX) \ - PUT_SIGNAL_DEC(SUFFIX) - -SIGNALING_API_DEC() -SIGNALING_API_DEC(_wg) -SIGNALING_API_DEC(_wave) -SIGNALING_API_DEC(_nbi) -SIGNALING_API_DEC(_nbi_wg) -SIGNALING_API_DEC(_nbi_wave) - -__device__ ATTR_NO_INLINE uint64_t rocshmem_signal_fetch(const uint64_t *sig_addr); -__device__ ATTR_NO_INLINE uint64_t rocshmem_signal_fetch_wg(const uint64_t *sig_addr); -__device__ ATTR_NO_INLINE uint64_t rocshmem_signal_fetch_wave(const uint64_t *sig_addr); - } // namespace rocshmem #endif // LIBRARY_INCLUDE_ROCSHMEM_HPP diff --git a/projects/rocshmem/include/rocshmem/rocshmem_AMO.hpp b/projects/rocshmem/include/rocshmem/rocshmem_AMO.hpp new file mode 100644 index 0000000000..76fc47d52b --- /dev/null +++ b/projects/rocshmem/include/rocshmem/rocshmem_AMO.hpp @@ -0,0 +1,1581 @@ +/****************************************************************************** + * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + *****************************************************************************/ + +#ifndef LIBRARY_INCLUDE_ROCSHMEM_AMO_HPP +#define LIBRARY_INCLUDE_ROCSHMEM_AMO_HPP + +namespace rocshmem { + +/** + * @name SHMEM_ATOMIC_FETCH + * @brief Atomically return the value of \p source on \p pe to the calling + * PE. + * + * The operation is blocking. + * + * This function can be called from divergent control paths at per-thread + * granularity. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] source Source address. Must be an address on the symmetric + * heap. + * @param[in] pe PE of the remote process. + * + * @return The value of \p source.
+ */ +__device__ ATTR_NO_INLINE float rocshmem_ctx_float_atomic_fetch( + rocshmem_ctx_t ctx, float *source, int pe); +__device__ ATTR_NO_INLINE float rocshmem_float_atomic_fetch( + float *source, int pe); +__host__ float rocshmem_ctx_float_atomic_fetch( + rocshmem_ctx_t ctx, float *source, int pe); +__host__ float rocshmem_float_atomic_fetch( + float *source, int pe); + +__device__ ATTR_NO_INLINE double rocshmem_ctx_double_atomic_fetch( + rocshmem_ctx_t ctx, double *source, int pe); +__device__ ATTR_NO_INLINE double rocshmem_double_atomic_fetch( + double *source, int pe); +__host__ double rocshmem_ctx_double_atomic_fetch( + rocshmem_ctx_t ctx, double *source, int pe); +__host__ double rocshmem_double_atomic_fetch( + double *source, int pe); + +__device__ ATTR_NO_INLINE int rocshmem_ctx_int_atomic_fetch( + rocshmem_ctx_t ctx, int *source, int pe); +__device__ ATTR_NO_INLINE int rocshmem_int_atomic_fetch( + int *source, int pe); +__host__ int rocshmem_ctx_int_atomic_fetch( + rocshmem_ctx_t ctx, int *source, int pe); +__host__ int rocshmem_int_atomic_fetch( + int *source, int pe); + +__device__ ATTR_NO_INLINE long rocshmem_ctx_long_atomic_fetch( + rocshmem_ctx_t ctx, long *source, int pe); +__device__ ATTR_NO_INLINE long rocshmem_long_atomic_fetch( + long *source, int pe); +__host__ long rocshmem_ctx_long_atomic_fetch( + rocshmem_ctx_t ctx, long *source, int pe); +__host__ long rocshmem_long_atomic_fetch( + long *source, int pe); + +__device__ ATTR_NO_INLINE long long rocshmem_ctx_longlong_atomic_fetch( + rocshmem_ctx_t ctx, long long *source, int pe); +__device__ ATTR_NO_INLINE long long rocshmem_longlong_atomic_fetch( + long long *source, int pe); +__host__ long long rocshmem_ctx_longlong_atomic_fetch( + rocshmem_ctx_t ctx, long long *source, int pe); +__host__ long long rocshmem_longlong_atomic_fetch( + long long *source, int pe); + +__device__ ATTR_NO_INLINE unsigned int rocshmem_ctx_uint_atomic_fetch( + rocshmem_ctx_t ctx, unsigned int *source, int pe); 
+__device__ ATTR_NO_INLINE unsigned int rocshmem_uint_atomic_fetch( + unsigned int *source, int pe); +__host__ unsigned int rocshmem_ctx_uint_atomic_fetch( + rocshmem_ctx_t ctx, unsigned int *source, int pe); +__host__ unsigned int rocshmem_uint_atomic_fetch( + unsigned int *source, int pe); + +__device__ ATTR_NO_INLINE unsigned long rocshmem_ctx_ulong_atomic_fetch( + rocshmem_ctx_t ctx, unsigned long *source, int pe); +__device__ ATTR_NO_INLINE unsigned long rocshmem_ulong_atomic_fetch( + unsigned long *source, int pe); +__host__ unsigned long rocshmem_ctx_ulong_atomic_fetch( + rocshmem_ctx_t ctx, unsigned long *source, int pe); +__host__ unsigned long rocshmem_ulong_atomic_fetch( + unsigned long *source, int pe); + +__device__ ATTR_NO_INLINE unsigned long long rocshmem_ctx_ulonglong_atomic_fetch( + rocshmem_ctx_t ctx, unsigned long long *source, int pe); +__device__ ATTR_NO_INLINE unsigned long long rocshmem_ulonglong_atomic_fetch( + unsigned long long *source, int pe); +__host__ unsigned long long rocshmem_ctx_ulonglong_atomic_fetch( + rocshmem_ctx_t ctx, unsigned long long *source, int pe); +__host__ unsigned long long rocshmem_ulonglong_atomic_fetch( + unsigned long long *source, int pe); + +__device__ ATTR_NO_INLINE int32_t rocshmem_ctx_int32_atomic_fetch( + rocshmem_ctx_t ctx, int32_t *source, int pe); +__device__ ATTR_NO_INLINE int32_t rocshmem_int32_atomic_fetch( + int32_t *source, int pe); +__host__ int32_t rocshmem_ctx_int32_atomic_fetch( + rocshmem_ctx_t ctx, int32_t *source, int pe); +__host__ int32_t rocshmem_int32_atomic_fetch( + int32_t *source, int pe); + +__device__ ATTR_NO_INLINE int64_t rocshmem_ctx_int64_atomic_fetch( + rocshmem_ctx_t ctx, int64_t *source, int pe); +__device__ ATTR_NO_INLINE int64_t rocshmem_int64_atomic_fetch( + int64_t *source, int pe); +__host__ int64_t rocshmem_ctx_int64_atomic_fetch( + rocshmem_ctx_t ctx, int64_t *source, int pe); +__host__ int64_t rocshmem_int64_atomic_fetch( + int64_t *source, int pe); + +__device__ 
ATTR_NO_INLINE uint32_t rocshmem_ctx_uint32_atomic_fetch( + rocshmem_ctx_t ctx, uint32_t *source, int pe); +__device__ ATTR_NO_INLINE uint32_t rocshmem_uint32_atomic_fetch( + uint32_t *source, int pe); +__host__ uint32_t rocshmem_ctx_uint32_atomic_fetch( + rocshmem_ctx_t ctx, uint32_t *source, int pe); +__host__ uint32_t rocshmem_uint32_atomic_fetch( + uint32_t *source, int pe); + +__device__ ATTR_NO_INLINE uint64_t rocshmem_ctx_uint64_atomic_fetch( + rocshmem_ctx_t ctx, uint64_t *source, int pe); +__device__ ATTR_NO_INLINE uint64_t rocshmem_uint64_atomic_fetch( + uint64_t *source, int pe); +__host__ uint64_t rocshmem_ctx_uint64_atomic_fetch( + rocshmem_ctx_t ctx, uint64_t *source, int pe); +__host__ uint64_t rocshmem_uint64_atomic_fetch( + uint64_t *source, int pe); + +__device__ ATTR_NO_INLINE size_t rocshmem_ctx_size_atomic_fetch( + rocshmem_ctx_t ctx, size_t *source, int pe); +__device__ ATTR_NO_INLINE size_t rocshmem_size_atomic_fetch( + size_t *source, int pe); +__host__ size_t rocshmem_ctx_size_atomic_fetch( + rocshmem_ctx_t ctx, size_t *source, int pe); +__host__ size_t rocshmem_size_atomic_fetch( + size_t *source, int pe); + +__device__ ATTR_NO_INLINE ptrdiff_t rocshmem_ctx_ptrdiff_atomic_fetch( + rocshmem_ctx_t ctx, ptrdiff_t *source, int pe); +__device__ ATTR_NO_INLINE ptrdiff_t rocshmem_ptrdiff_atomic_fetch( + ptrdiff_t *source, int pe); +__host__ ptrdiff_t rocshmem_ctx_ptrdiff_atomic_fetch( + rocshmem_ctx_t ctx, ptrdiff_t *source, int pe); +__host__ ptrdiff_t rocshmem_ptrdiff_atomic_fetch( + ptrdiff_t *source, int pe); + + +/** + * @name SHMEM_ATOMIC_SET + * @brief Atomically set the value \p value to \p dest on \p pe. + * + * The operation is blocking. + * + * This function can be called from divergent control paths at per-thread + * granularity. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + * heap. + * @param[in] value The value to be atomically set.
+ * @param[in] pe PE of the remote process. + * + * @return void + */ +__device__ ATTR_NO_INLINE void rocshmem_ctx_float_atomic_set( + rocshmem_ctx_t ctx, float *dest, float value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_float_atomic_set( + float *dest, float value, int pe); +__host__ void rocshmem_ctx_float_atomic_set( + rocshmem_ctx_t ctx, float *dest, float value, int pe); +__host__ void rocshmem_float_atomic_set( + float *dest, float value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_double_atomic_set( + rocshmem_ctx_t ctx, double *dest, double value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_double_atomic_set( + double *dest, double value, int pe); +__host__ void rocshmem_ctx_double_atomic_set( + rocshmem_ctx_t ctx, double *dest, double value, int pe); +__host__ void rocshmem_double_atomic_set( + double *dest, double value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_int_atomic_set( + rocshmem_ctx_t ctx, int *dest, int value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_int_atomic_set( + int *dest, int value, int pe); +__host__ void rocshmem_ctx_int_atomic_set( + rocshmem_ctx_t ctx, int *dest, int value, int pe); +__host__ void rocshmem_int_atomic_set( + int *dest, int value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_long_atomic_set( + rocshmem_ctx_t ctx, long *dest, long value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_long_atomic_set( + long *dest, long value, int pe); +__host__ void rocshmem_ctx_long_atomic_set( + rocshmem_ctx_t ctx, long *dest, long value, int pe); +__host__ void rocshmem_long_atomic_set( + long *dest, long value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_atomic_set( + rocshmem_ctx_t ctx, long long *dest, long long value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_longlong_atomic_set( + long long *dest, long long value, int pe); +__host__ void rocshmem_ctx_longlong_atomic_set( + rocshmem_ctx_t ctx, long long *dest, long long value, int pe); 
+__host__ void rocshmem_longlong_atomic_set( + long long *dest, long long value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_atomic_set( + rocshmem_ctx_t ctx, unsigned int *dest, unsigned int value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uint_atomic_set( + unsigned int *dest, unsigned int value, int pe); +__host__ void rocshmem_ctx_uint_atomic_set( + rocshmem_ctx_t ctx, unsigned int *dest, unsigned int value, int pe); +__host__ void rocshmem_uint_atomic_set( + unsigned int *dest, unsigned int value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_atomic_set( + rocshmem_ctx_t ctx, unsigned long *dest, unsigned long value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulong_atomic_set( + unsigned long *dest, unsigned long value, int pe); +__host__ void rocshmem_ctx_ulong_atomic_set( + rocshmem_ctx_t ctx, unsigned long *dest, unsigned long value, int pe); +__host__ void rocshmem_ulong_atomic_set( + unsigned long *dest, unsigned long value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_atomic_set( + rocshmem_ctx_t ctx, unsigned long long *dest, unsigned long long value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulonglong_atomic_set( + unsigned long long *dest, unsigned long long value, int pe); +__host__ void rocshmem_ctx_ulonglong_atomic_set( + rocshmem_ctx_t ctx, unsigned long long *dest, unsigned long long value, int pe); +__host__ void rocshmem_ulonglong_atomic_set( + unsigned long long *dest, unsigned long long value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_int32_atomic_set( + rocshmem_ctx_t ctx, int32_t *dest, int32_t value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_int32_atomic_set( + int32_t *dest, int32_t value, int pe); +__host__ void rocshmem_ctx_int32_atomic_set( + rocshmem_ctx_t ctx, int32_t *dest, int32_t value, int pe); +__host__ void rocshmem_int32_atomic_set( + int32_t *dest, int32_t value, int pe); + +__device__ ATTR_NO_INLINE void 
rocshmem_ctx_int64_atomic_set( + rocshmem_ctx_t ctx, int64_t *dest, int64_t value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_int64_atomic_set( + int64_t *dest, int64_t value, int pe); +__host__ void rocshmem_ctx_int64_atomic_set( + rocshmem_ctx_t ctx, int64_t *dest, int64_t value, int pe); +__host__ void rocshmem_int64_atomic_set( + int64_t *dest, int64_t value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint32_atomic_set( + rocshmem_ctx_t ctx, uint32_t *dest, uint32_t value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uint32_atomic_set( + uint32_t *dest, uint32_t value, int pe); +__host__ void rocshmem_ctx_uint32_atomic_set( + rocshmem_ctx_t ctx, uint32_t *dest, uint32_t value, int pe); +__host__ void rocshmem_uint32_atomic_set( + uint32_t *dest, uint32_t value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint64_atomic_set( + rocshmem_ctx_t ctx, uint64_t *dest, uint64_t value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uint64_atomic_set( + uint64_t *dest, uint64_t value, int pe); +__host__ void rocshmem_ctx_uint64_atomic_set( + rocshmem_ctx_t ctx, uint64_t *dest, uint64_t value, int pe); +__host__ void rocshmem_uint64_atomic_set( + uint64_t *dest, uint64_t value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_size_atomic_set( + rocshmem_ctx_t ctx, size_t *dest, size_t value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_size_atomic_set( + size_t *dest, size_t value, int pe); +__host__ void rocshmem_ctx_size_atomic_set( + rocshmem_ctx_t ctx, size_t *dest, size_t value, int pe); +__host__ void rocshmem_size_atomic_set( + size_t *dest, size_t value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ptrdiff_atomic_set( + rocshmem_ctx_t ctx, ptrdiff_t *dest, ptrdiff_t value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ptrdiff_atomic_set( + ptrdiff_t *dest, ptrdiff_t value, int pe); +__host__ void rocshmem_ctx_ptrdiff_atomic_set( + rocshmem_ctx_t ctx, ptrdiff_t *dest, ptrdiff_t value, int pe); 
+__host__ void rocshmem_ptrdiff_atomic_set( + ptrdiff_t *dest, ptrdiff_t value, int pe); + + +/** + * @name SHMEM_ATOMIC_COMPARE_SWAP + * @brief Atomically compares the value in \p dest with \p cond and, if they + * are equal, writes \p val to \p dest. The operation returns the old value of + * \p dest to the calling PE. + * + * The operation is blocking. + * + * This function can be called from divergent control paths at per-thread + * granularity. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + heap. + * @param[in] cond The value to compare \p dest against. + * @param[in] val The value to be atomically swapped. + * @param[in] pe PE of the remote process. + * + * @return The old value of \p dest. + */ +__device__ ATTR_NO_INLINE int rocshmem_ctx_int_atomic_compare_swap( + rocshmem_ctx_t ctx, int *dest, int cond, int value, int pe); +__device__ ATTR_NO_INLINE int rocshmem_int_atomic_compare_swap( + int *dest, int cond, int value, int pe); +__host__ int rocshmem_ctx_int_atomic_compare_swap( + rocshmem_ctx_t ctx, int *dest, int cond, int value, int pe); +__host__ int rocshmem_int_atomic_compare_swap( + int *dest, int cond, int value, int pe); + +__device__ ATTR_NO_INLINE long rocshmem_ctx_long_atomic_compare_swap( + rocshmem_ctx_t ctx, long *dest, long cond, long value, int pe); +__device__ ATTR_NO_INLINE long rocshmem_long_atomic_compare_swap( + long *dest, long cond, long value, int pe); +__host__ long rocshmem_ctx_long_atomic_compare_swap( + rocshmem_ctx_t ctx, long *dest, long cond, long value, int pe); +__host__ long rocshmem_long_atomic_compare_swap( + long *dest, long cond, long value, int pe); + +__device__ ATTR_NO_INLINE long long rocshmem_ctx_longlong_atomic_compare_swap( + rocshmem_ctx_t ctx, long long *dest, long long cond, long long value, int pe); +__device__ ATTR_NO_INLINE long long rocshmem_longlong_atomic_compare_swap( + long long *dest, long long cond, long long
value, int pe); +__host__ long long rocshmem_ctx_longlong_atomic_compare_swap( + rocshmem_ctx_t ctx, long long *dest, long long cond, long long value, int pe); +__host__ long long rocshmem_longlong_atomic_compare_swap( + long long *dest, long long cond, long long value, int pe); + +__device__ ATTR_NO_INLINE unsigned int rocshmem_ctx_uint_atomic_compare_swap( + rocshmem_ctx_t ctx, unsigned int *dest, unsigned int cond, unsigned int value, int pe); +__device__ ATTR_NO_INLINE unsigned int rocshmem_uint_atomic_compare_swap( + unsigned int *dest, unsigned int cond, unsigned int value, int pe); +__host__ unsigned int rocshmem_ctx_uint_atomic_compare_swap( + rocshmem_ctx_t ctx, unsigned int *dest, unsigned int cond, unsigned int value, int pe); +__host__ unsigned int rocshmem_uint_atomic_compare_swap( + unsigned int *dest, unsigned int cond, unsigned int value, int pe); + +__device__ ATTR_NO_INLINE unsigned long rocshmem_ctx_ulong_atomic_compare_swap( + rocshmem_ctx_t ctx, unsigned long *dest, unsigned long cond, unsigned long value, int pe); +__device__ ATTR_NO_INLINE unsigned long rocshmem_ulong_atomic_compare_swap( + unsigned long *dest, unsigned long cond, unsigned long value, int pe); +__host__ unsigned long rocshmem_ctx_ulong_atomic_compare_swap( + rocshmem_ctx_t ctx, unsigned long *dest, unsigned long cond, unsigned long value, int pe); +__host__ unsigned long rocshmem_ulong_atomic_compare_swap( + unsigned long *dest, unsigned long cond, unsigned long value, int pe); + +__device__ ATTR_NO_INLINE unsigned long long rocshmem_ctx_ulonglong_atomic_compare_swap( + rocshmem_ctx_t ctx, unsigned long long *dest, unsigned long long cond, unsigned long long value, int pe); +__device__ ATTR_NO_INLINE unsigned long long rocshmem_ulonglong_atomic_compare_swap( + unsigned long long *dest, unsigned long long cond, unsigned long long value, int pe); +__host__ unsigned long long rocshmem_ctx_ulonglong_atomic_compare_swap( + rocshmem_ctx_t ctx, unsigned long long *dest, unsigned 
long long cond, unsigned long long value, int pe); +__host__ unsigned long long rocshmem_ulonglong_atomic_compare_swap( + unsigned long long *dest, unsigned long long cond, unsigned long long value, int pe); + +__device__ ATTR_NO_INLINE int32_t rocshmem_ctx_int32_atomic_compare_swap( + rocshmem_ctx_t ctx, int32_t *dest, int32_t cond, int32_t value, int pe); +__device__ ATTR_NO_INLINE int32_t rocshmem_int32_atomic_compare_swap( + int32_t *dest, int32_t cond, int32_t value, int pe); +__host__ int32_t rocshmem_ctx_int32_atomic_compare_swap( + rocshmem_ctx_t ctx, int32_t *dest, int32_t cond, int32_t value, int pe); +__host__ int32_t rocshmem_int32_atomic_compare_swap( + int32_t *dest, int32_t cond, int32_t value, int pe); + +__device__ ATTR_NO_INLINE int64_t rocshmem_ctx_int64_atomic_compare_swap( + rocshmem_ctx_t ctx, int64_t *dest, int64_t cond, int64_t value, int pe); +__device__ ATTR_NO_INLINE int64_t rocshmem_int64_atomic_compare_swap( + int64_t *dest, int64_t cond, int64_t value, int pe); +__host__ int64_t rocshmem_ctx_int64_atomic_compare_swap( + rocshmem_ctx_t ctx, int64_t *dest, int64_t cond, int64_t value, int pe); +__host__ int64_t rocshmem_int64_atomic_compare_swap( + int64_t *dest, int64_t cond, int64_t value, int pe); + +__device__ ATTR_NO_INLINE uint32_t rocshmem_ctx_uint32_atomic_compare_swap( + rocshmem_ctx_t ctx, uint32_t *dest, uint32_t cond, uint32_t value, int pe); +__device__ ATTR_NO_INLINE uint32_t rocshmem_uint32_atomic_compare_swap( + uint32_t *dest, uint32_t cond, uint32_t value, int pe); +__host__ uint32_t rocshmem_ctx_uint32_atomic_compare_swap( + rocshmem_ctx_t ctx, uint32_t *dest, uint32_t cond, uint32_t value, int pe); +__host__ uint32_t rocshmem_uint32_atomic_compare_swap( + uint32_t *dest, uint32_t cond, uint32_t value, int pe); + +__device__ ATTR_NO_INLINE uint64_t rocshmem_ctx_uint64_atomic_compare_swap( + rocshmem_ctx_t ctx, uint64_t *dest, uint64_t cond, uint64_t value, int pe); +__device__ ATTR_NO_INLINE uint64_t 
rocshmem_uint64_atomic_compare_swap( + uint64_t *dest, uint64_t cond, uint64_t value, int pe); +__host__ uint64_t rocshmem_ctx_uint64_atomic_compare_swap( + rocshmem_ctx_t ctx, uint64_t *dest, uint64_t cond, uint64_t value, int pe); +__host__ uint64_t rocshmem_uint64_atomic_compare_swap( + uint64_t *dest, uint64_t cond, uint64_t value, int pe); + +__device__ ATTR_NO_INLINE size_t rocshmem_ctx_size_atomic_compare_swap( + rocshmem_ctx_t ctx, size_t *dest, size_t cond, size_t value, int pe); +__device__ ATTR_NO_INLINE size_t rocshmem_size_atomic_compare_swap( + size_t *dest, size_t cond, size_t value, int pe); +__host__ size_t rocshmem_ctx_size_atomic_compare_swap( + rocshmem_ctx_t ctx, size_t *dest, size_t cond, size_t value, int pe); +__host__ size_t rocshmem_size_atomic_compare_swap( + size_t *dest, size_t cond, size_t value, int pe); + +__device__ ATTR_NO_INLINE ptrdiff_t rocshmem_ctx_ptrdiff_atomic_compare_swap( + rocshmem_ctx_t ctx, ptrdiff_t *dest, ptrdiff_t cond, ptrdiff_t value, int pe); +__device__ ATTR_NO_INLINE ptrdiff_t rocshmem_ptrdiff_atomic_compare_swap( + ptrdiff_t *dest, ptrdiff_t cond, ptrdiff_t value, int pe); +__host__ ptrdiff_t rocshmem_ctx_ptrdiff_atomic_compare_swap( + rocshmem_ctx_t ctx, ptrdiff_t *dest, ptrdiff_t cond, ptrdiff_t value, int pe); +__host__ ptrdiff_t rocshmem_ptrdiff_atomic_compare_swap( + ptrdiff_t *dest, ptrdiff_t cond, ptrdiff_t value, int pe); + + +/** + * @name SHMEM_ATOMIC_SWAP + * @brief Atomically swap the value \p val into \p dest on \p pe. + * + * The operation is blocking. + * + * This function can be called from divergent control paths at per-thread + * granularity. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + heap. + * @param[in] val The value to be atomically swapped into \p dest. + * @param[in] pe PE of the remote process.
+ * + * @return original value + */ +__device__ ATTR_NO_INLINE float rocshmem_ctx_float_atomic_swap( + rocshmem_ctx_t ctx, float *dest, float value, int pe); +__device__ ATTR_NO_INLINE float rocshmem_float_atomic_swap( + float *dest, float value, int pe); +__host__ float rocshmem_ctx_float_atomic_swap( + rocshmem_ctx_t ctx, float *dest, float value, int pe); +__host__ float rocshmem_float_atomic_swap( + float *dest, float value, int pe); + +__device__ ATTR_NO_INLINE double rocshmem_ctx_double_atomic_swap( + rocshmem_ctx_t ctx, double *dest, double value, int pe); +__device__ ATTR_NO_INLINE double rocshmem_double_atomic_swap( + double *dest, double value, int pe); +__host__ double rocshmem_ctx_double_atomic_swap( + rocshmem_ctx_t ctx, double *dest, double value, int pe); +__host__ double rocshmem_double_atomic_swap( + double *dest, double value, int pe); + +__device__ ATTR_NO_INLINE int rocshmem_ctx_int_atomic_swap( + rocshmem_ctx_t ctx, int *dest, int value, int pe); +__device__ ATTR_NO_INLINE int rocshmem_int_atomic_swap( + int *dest, int value, int pe); +__host__ int rocshmem_ctx_int_atomic_swap( + rocshmem_ctx_t ctx, int *dest, int value, int pe); +__host__ int rocshmem_int_atomic_swap( + int *dest, int value, int pe); + +__device__ ATTR_NO_INLINE long rocshmem_ctx_long_atomic_swap( + rocshmem_ctx_t ctx, long *dest, long value, int pe); +__device__ ATTR_NO_INLINE long rocshmem_long_atomic_swap( + long *dest, long value, int pe); +__host__ long rocshmem_ctx_long_atomic_swap( + rocshmem_ctx_t ctx, long *dest, long value, int pe); +__host__ long rocshmem_long_atomic_swap( + long *dest, long value, int pe); + +__device__ ATTR_NO_INLINE long long rocshmem_ctx_longlong_atomic_swap( + rocshmem_ctx_t ctx, long long *dest, long long value, int pe); +__device__ ATTR_NO_INLINE long long rocshmem_longlong_atomic_swap( + long long *dest, long long value, int pe); +__host__ long long rocshmem_ctx_longlong_atomic_swap( + rocshmem_ctx_t ctx, long long *dest, long long value, 
int pe); +__host__ long long rocshmem_longlong_atomic_swap( + long long *dest, long long value, int pe); + +__device__ ATTR_NO_INLINE unsigned int rocshmem_ctx_uint_atomic_swap( + rocshmem_ctx_t ctx, unsigned int *dest, unsigned int value, int pe); +__device__ ATTR_NO_INLINE unsigned int rocshmem_uint_atomic_swap( + unsigned int *dest, unsigned int value, int pe); +__host__ unsigned int rocshmem_ctx_uint_atomic_swap( + rocshmem_ctx_t ctx, unsigned int *dest, unsigned int value, int pe); +__host__ unsigned int rocshmem_uint_atomic_swap( + unsigned int *dest, unsigned int value, int pe); + +__device__ ATTR_NO_INLINE unsigned long rocshmem_ctx_ulong_atomic_swap( + rocshmem_ctx_t ctx, unsigned long *dest, unsigned long value, int pe); +__device__ ATTR_NO_INLINE unsigned long rocshmem_ulong_atomic_swap( + unsigned long *dest, unsigned long value, int pe); +__host__ unsigned long rocshmem_ctx_ulong_atomic_swap( + rocshmem_ctx_t ctx, unsigned long *dest, unsigned long value, int pe); +__host__ unsigned long rocshmem_ulong_atomic_swap( + unsigned long *dest, unsigned long value, int pe); + +__device__ ATTR_NO_INLINE unsigned long long rocshmem_ctx_ulonglong_atomic_swap( + rocshmem_ctx_t ctx, unsigned long long *dest, unsigned long long value, int pe); +__device__ ATTR_NO_INLINE unsigned long long rocshmem_ulonglong_atomic_swap( + unsigned long long *dest, unsigned long long value, int pe); +__host__ unsigned long long rocshmem_ctx_ulonglong_atomic_swap( + rocshmem_ctx_t ctx, unsigned long long *dest, unsigned long long value, int pe); +__host__ unsigned long long rocshmem_ulonglong_atomic_swap( + unsigned long long *dest, unsigned long long value, int pe); + +__device__ ATTR_NO_INLINE int32_t rocshmem_ctx_int32_atomic_swap( + rocshmem_ctx_t ctx, int32_t *dest, int32_t value, int pe); +__device__ ATTR_NO_INLINE int32_t rocshmem_int32_atomic_swap( + int32_t *dest, int32_t value, int pe); +__host__ int32_t rocshmem_ctx_int32_atomic_swap( + rocshmem_ctx_t ctx, int32_t *dest, 
int32_t value, int pe); +__host__ int32_t rocshmem_int32_atomic_swap( + int32_t *dest, int32_t value, int pe); + +__device__ ATTR_NO_INLINE int64_t rocshmem_ctx_int64_atomic_swap( + rocshmem_ctx_t ctx, int64_t *dest, int64_t value, int pe); +__device__ ATTR_NO_INLINE int64_t rocshmem_int64_atomic_swap( + int64_t *dest, int64_t value, int pe); +__host__ int64_t rocshmem_ctx_int64_atomic_swap( + rocshmem_ctx_t ctx, int64_t *dest, int64_t value, int pe); +__host__ int64_t rocshmem_int64_atomic_swap( + int64_t *dest, int64_t value, int pe); + +__device__ ATTR_NO_INLINE uint32_t rocshmem_ctx_uint32_atomic_swap( + rocshmem_ctx_t ctx, uint32_t *dest, uint32_t value, int pe); +__device__ ATTR_NO_INLINE uint32_t rocshmem_uint32_atomic_swap( + uint32_t *dest, uint32_t value, int pe); +__host__ uint32_t rocshmem_ctx_uint32_atomic_swap( + rocshmem_ctx_t ctx, uint32_t *dest, uint32_t value, int pe); +__host__ uint32_t rocshmem_uint32_atomic_swap( + uint32_t *dest, uint32_t value, int pe); + +__device__ ATTR_NO_INLINE uint64_t rocshmem_ctx_uint64_atomic_swap( + rocshmem_ctx_t ctx, uint64_t *dest, uint64_t value, int pe); +__device__ ATTR_NO_INLINE uint64_t rocshmem_uint64_atomic_swap( + uint64_t *dest, uint64_t value, int pe); +__host__ uint64_t rocshmem_ctx_uint64_atomic_swap( + rocshmem_ctx_t ctx, uint64_t *dest, uint64_t value, int pe); +__host__ uint64_t rocshmem_uint64_atomic_swap( + uint64_t *dest, uint64_t value, int pe); + +__device__ ATTR_NO_INLINE size_t rocshmem_ctx_size_atomic_swap( + rocshmem_ctx_t ctx, size_t *dest, size_t value, int pe); +__device__ ATTR_NO_INLINE size_t rocshmem_size_atomic_swap( + size_t *dest, size_t value, int pe); +__host__ size_t rocshmem_ctx_size_atomic_swap( + rocshmem_ctx_t ctx, size_t *dest, size_t value, int pe); +__host__ size_t rocshmem_size_atomic_swap( + size_t *dest, size_t value, int pe); + +__device__ ATTR_NO_INLINE ptrdiff_t rocshmem_ctx_ptrdiff_atomic_swap( + rocshmem_ctx_t ctx, ptrdiff_t *dest, ptrdiff_t value, int pe); 
+__device__ ATTR_NO_INLINE ptrdiff_t rocshmem_ptrdiff_atomic_swap( + ptrdiff_t *dest, ptrdiff_t value, int pe); +__host__ ptrdiff_t rocshmem_ctx_ptrdiff_atomic_swap( + rocshmem_ctx_t ctx, ptrdiff_t *dest, ptrdiff_t value, int pe); +__host__ ptrdiff_t rocshmem_ptrdiff_atomic_swap( + ptrdiff_t *dest, ptrdiff_t value, int pe); + + +/** + * @name SHMEM_ATOMIC_FETCH_INC + * @brief Atomically add 1 to \p dest on \p pe. The operation + * returns the older value of \p dest to the calling PE. + * + * The operation is blocking. + * + * This function can be called from divergent control paths at per-thread + * granularity. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + heap. + * @param[in] pe PE of the remote process. + * + * @return The old value of \p dest before it was incremented by 1. + */ +__device__ ATTR_NO_INLINE int rocshmem_ctx_int_atomic_fetch_inc( + rocshmem_ctx_t ctx, int *dest, int pe); +__device__ ATTR_NO_INLINE int rocshmem_int_atomic_fetch_inc( + int *dest, int pe); +__host__ int rocshmem_ctx_int_atomic_fetch_inc( + rocshmem_ctx_t ctx, int *dest, int pe); +__host__ int rocshmem_int_atomic_fetch_inc( + int *dest, int pe); + +__device__ ATTR_NO_INLINE long rocshmem_ctx_long_atomic_fetch_inc( + rocshmem_ctx_t ctx, long *dest, int pe); +__device__ ATTR_NO_INLINE long rocshmem_long_atomic_fetch_inc( + long *dest, int pe); +__host__ long rocshmem_ctx_long_atomic_fetch_inc( + rocshmem_ctx_t ctx, long *dest, int pe); +__host__ long rocshmem_long_atomic_fetch_inc( + long *dest, int pe); + +__device__ ATTR_NO_INLINE long long rocshmem_ctx_longlong_atomic_fetch_inc( + rocshmem_ctx_t ctx, long long *dest, int pe); +__device__ ATTR_NO_INLINE long long rocshmem_longlong_atomic_fetch_inc( + long long *dest, int pe); +__host__ long long rocshmem_ctx_longlong_atomic_fetch_inc( + rocshmem_ctx_t ctx, long long *dest, int pe); +__host__ long long 
rocshmem_longlong_atomic_fetch_inc( + long long *dest, int pe); + +__device__ ATTR_NO_INLINE unsigned int rocshmem_ctx_uint_atomic_fetch_inc( + rocshmem_ctx_t ctx, unsigned int *dest, int pe); +__device__ ATTR_NO_INLINE unsigned int rocshmem_uint_atomic_fetch_inc( + unsigned int *dest, int pe); +__host__ unsigned int rocshmem_ctx_uint_atomic_fetch_inc( + rocshmem_ctx_t ctx, unsigned int *dest, int pe); +__host__ unsigned int rocshmem_uint_atomic_fetch_inc( + unsigned int *dest, int pe); + +__device__ ATTR_NO_INLINE unsigned long rocshmem_ctx_ulong_atomic_fetch_inc( + rocshmem_ctx_t ctx, unsigned long *dest, int pe); +__device__ ATTR_NO_INLINE unsigned long rocshmem_ulong_atomic_fetch_inc( + unsigned long *dest, int pe); +__host__ unsigned long rocshmem_ctx_ulong_atomic_fetch_inc( + rocshmem_ctx_t ctx, unsigned long *dest, int pe); +__host__ unsigned long rocshmem_ulong_atomic_fetch_inc( + unsigned long *dest, int pe); + +__device__ ATTR_NO_INLINE unsigned long long rocshmem_ctx_ulonglong_atomic_fetch_inc( + rocshmem_ctx_t ctx, unsigned long long *dest, int pe); +__device__ ATTR_NO_INLINE unsigned long long rocshmem_ulonglong_atomic_fetch_inc( + unsigned long long *dest, int pe); +__host__ unsigned long long rocshmem_ctx_ulonglong_atomic_fetch_inc( + rocshmem_ctx_t ctx, unsigned long long *dest, int pe); +__host__ unsigned long long rocshmem_ulonglong_atomic_fetch_inc( + unsigned long long *dest, int pe); + +__device__ ATTR_NO_INLINE int32_t rocshmem_ctx_int32_atomic_fetch_inc( + rocshmem_ctx_t ctx, int32_t *dest, int pe); +__device__ ATTR_NO_INLINE int32_t rocshmem_int32_atomic_fetch_inc( + int32_t *dest, int pe); +__host__ int32_t rocshmem_ctx_int32_atomic_fetch_inc( + rocshmem_ctx_t ctx, int32_t *dest, int pe); +__host__ int32_t rocshmem_int32_atomic_fetch_inc( + int32_t *dest, int pe); + +__device__ ATTR_NO_INLINE int64_t rocshmem_ctx_int64_atomic_fetch_inc( + rocshmem_ctx_t ctx, int64_t *dest, int pe); +__device__ ATTR_NO_INLINE int64_t 
rocshmem_int64_atomic_fetch_inc( + int64_t *dest, int pe); +__host__ int64_t rocshmem_ctx_int64_atomic_fetch_inc( + rocshmem_ctx_t ctx, int64_t *dest, int pe); +__host__ int64_t rocshmem_int64_atomic_fetch_inc( + int64_t *dest, int pe); + +__device__ ATTR_NO_INLINE uint32_t rocshmem_ctx_uint32_atomic_fetch_inc( + rocshmem_ctx_t ctx, uint32_t *dest, int pe); +__device__ ATTR_NO_INLINE uint32_t rocshmem_uint32_atomic_fetch_inc( + uint32_t *dest, int pe); +__host__ uint32_t rocshmem_ctx_uint32_atomic_fetch_inc( + rocshmem_ctx_t ctx, uint32_t *dest, int pe); +__host__ uint32_t rocshmem_uint32_atomic_fetch_inc( + uint32_t *dest, int pe); + +__device__ ATTR_NO_INLINE uint64_t rocshmem_ctx_uint64_atomic_fetch_inc( + rocshmem_ctx_t ctx, uint64_t *dest, int pe); +__device__ ATTR_NO_INLINE uint64_t rocshmem_uint64_atomic_fetch_inc( + uint64_t *dest, int pe); +__host__ uint64_t rocshmem_ctx_uint64_atomic_fetch_inc( + rocshmem_ctx_t ctx, uint64_t *dest, int pe); +__host__ uint64_t rocshmem_uint64_atomic_fetch_inc( + uint64_t *dest, int pe); + +__device__ ATTR_NO_INLINE size_t rocshmem_ctx_size_atomic_fetch_inc( + rocshmem_ctx_t ctx, size_t *dest, int pe); +__device__ ATTR_NO_INLINE size_t rocshmem_size_atomic_fetch_inc( + size_t *dest, int pe); +__host__ size_t rocshmem_ctx_size_atomic_fetch_inc( + rocshmem_ctx_t ctx, size_t *dest, int pe); +__host__ size_t rocshmem_size_atomic_fetch_inc( + size_t *dest, int pe); + +__device__ ATTR_NO_INLINE ptrdiff_t rocshmem_ctx_ptrdiff_atomic_fetch_inc( + rocshmem_ctx_t ctx, ptrdiff_t *dest, int pe); +__device__ ATTR_NO_INLINE ptrdiff_t rocshmem_ptrdiff_atomic_fetch_inc( + ptrdiff_t *dest, int pe); +__host__ ptrdiff_t rocshmem_ctx_ptrdiff_atomic_fetch_inc( + rocshmem_ctx_t ctx, ptrdiff_t *dest, int pe); +__host__ ptrdiff_t rocshmem_ptrdiff_atomic_fetch_inc( + ptrdiff_t *dest, int pe); + + +/** + * @name SHMEM_ATOMIC_INC + * @brief Atomically add 1 to \p dest on \p pe. + * + * The operation is blocking. 
+ * + * This function can be called from divergent control paths at per-thread + * granularity. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + heap. + * @param[in] pe PE of the remote process. + * + * @return void + */ +__device__ ATTR_NO_INLINE void rocshmem_ctx_int_atomic_inc( + rocshmem_ctx_t ctx, int *dest, int pe); +__device__ ATTR_NO_INLINE void rocshmem_int_atomic_inc( + int *dest, int pe); +__host__ void rocshmem_ctx_int_atomic_inc( + rocshmem_ctx_t ctx, int *dest, int pe); +__host__ void rocshmem_int_atomic_inc( + int *dest, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_long_atomic_inc( + rocshmem_ctx_t ctx, long *dest, int pe); +__device__ ATTR_NO_INLINE void rocshmem_long_atomic_inc( + long *dest, int pe); +__host__ void rocshmem_ctx_long_atomic_inc( + rocshmem_ctx_t ctx, long *dest, int pe); +__host__ void rocshmem_long_atomic_inc( + long *dest, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_atomic_inc( + rocshmem_ctx_t ctx, long long *dest, int pe); +__device__ ATTR_NO_INLINE void rocshmem_longlong_atomic_inc( + long long *dest, int pe); +__host__ void rocshmem_ctx_longlong_atomic_inc( + rocshmem_ctx_t ctx, long long *dest, int pe); +__host__ void rocshmem_longlong_atomic_inc( + long long *dest, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_atomic_inc( + rocshmem_ctx_t ctx, unsigned int *dest, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uint_atomic_inc( + unsigned int *dest, int pe); +__host__ void rocshmem_ctx_uint_atomic_inc( + rocshmem_ctx_t ctx, unsigned int *dest, int pe); +__host__ void rocshmem_uint_atomic_inc( + unsigned int *dest, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_atomic_inc( + rocshmem_ctx_t ctx, unsigned long *dest, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulong_atomic_inc( + unsigned long *dest, int pe); +__host__ void rocshmem_ctx_ulong_atomic_inc( + 
rocshmem_ctx_t ctx, unsigned long *dest, int pe); +__host__ void rocshmem_ulong_atomic_inc( + unsigned long *dest, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_atomic_inc( + rocshmem_ctx_t ctx, unsigned long long *dest, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulonglong_atomic_inc( + unsigned long long *dest, int pe); +__host__ void rocshmem_ctx_ulonglong_atomic_inc( + rocshmem_ctx_t ctx, unsigned long long *dest, int pe); +__host__ void rocshmem_ulonglong_atomic_inc( + unsigned long long *dest, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_int32_atomic_inc( + rocshmem_ctx_t ctx, int32_t *dest, int pe); +__device__ ATTR_NO_INLINE void rocshmem_int32_atomic_inc( + int32_t *dest, int pe); +__host__ void rocshmem_ctx_int32_atomic_inc( + rocshmem_ctx_t ctx, int32_t *dest, int pe); +__host__ void rocshmem_int32_atomic_inc( + int32_t *dest, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_int64_atomic_inc( + rocshmem_ctx_t ctx, int64_t *dest, int pe); +__device__ ATTR_NO_INLINE void rocshmem_int64_atomic_inc( + int64_t *dest, int pe); +__host__ void rocshmem_ctx_int64_atomic_inc( + rocshmem_ctx_t ctx, int64_t *dest, int pe); +__host__ void rocshmem_int64_atomic_inc( + int64_t *dest, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint32_atomic_inc( + rocshmem_ctx_t ctx, uint32_t *dest, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uint32_atomic_inc( + uint32_t *dest, int pe); +__host__ void rocshmem_ctx_uint32_atomic_inc( + rocshmem_ctx_t ctx, uint32_t *dest, int pe); +__host__ void rocshmem_uint32_atomic_inc( + uint32_t *dest, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint64_atomic_inc( + rocshmem_ctx_t ctx, uint64_t *dest, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uint64_atomic_inc( + uint64_t *dest, int pe); +__host__ void rocshmem_ctx_uint64_atomic_inc( + rocshmem_ctx_t ctx, uint64_t *dest, int pe); +__host__ void rocshmem_uint64_atomic_inc( + uint64_t *dest, int pe); + +__device__ 
ATTR_NO_INLINE void rocshmem_ctx_size_atomic_inc( + rocshmem_ctx_t ctx, size_t *dest, int pe); +__device__ ATTR_NO_INLINE void rocshmem_size_atomic_inc( + size_t *dest, int pe); +__host__ void rocshmem_ctx_size_atomic_inc( + rocshmem_ctx_t ctx, size_t *dest, int pe); +__host__ void rocshmem_size_atomic_inc( + size_t *dest, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ptrdiff_atomic_inc( + rocshmem_ctx_t ctx, ptrdiff_t *dest, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ptrdiff_atomic_inc( + ptrdiff_t *dest, int pe); +__host__ void rocshmem_ctx_ptrdiff_atomic_inc( + rocshmem_ctx_t ctx, ptrdiff_t *dest, int pe); +__host__ void rocshmem_ptrdiff_atomic_inc( + ptrdiff_t *dest, int pe); + + +/** + * @name SHMEM_ATOMIC_FETCH_ADD + * @brief Atomically add the value \p val to \p dest on \p pe. The operation + * returns the older value of \p dest to the calling PE. + * + * The operation is blocking. + * + * This function can be called from divergent control paths at per-thread + * granularity. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + heap. + * @param[in] val The value to be atomically added. + * @param[in] pe PE of the remote process. + * + * @return The old value of \p dest before the \p val was added. 
+ */ +__device__ ATTR_NO_INLINE int rocshmem_ctx_int_atomic_fetch_add( + rocshmem_ctx_t ctx, int *dest, int value, int pe); +__device__ ATTR_NO_INLINE int rocshmem_int_atomic_fetch_add( + int *dest, int value, int pe); +__host__ int rocshmem_ctx_int_atomic_fetch_add( + rocshmem_ctx_t ctx, int *dest, int value, int pe); +__host__ int rocshmem_int_atomic_fetch_add( + int *dest, int value, int pe); + +__device__ ATTR_NO_INLINE long rocshmem_ctx_long_atomic_fetch_add( + rocshmem_ctx_t ctx, long *dest, long value, int pe); +__device__ ATTR_NO_INLINE long rocshmem_long_atomic_fetch_add( + long *dest, long value, int pe); +__host__ long rocshmem_ctx_long_atomic_fetch_add( + rocshmem_ctx_t ctx, long *dest, long value, int pe); +__host__ long rocshmem_long_atomic_fetch_add( + long *dest, long value, int pe); + +__device__ ATTR_NO_INLINE long long rocshmem_ctx_longlong_atomic_fetch_add( + rocshmem_ctx_t ctx, long long *dest, long long value, int pe); +__device__ ATTR_NO_INLINE long long rocshmem_longlong_atomic_fetch_add( + long long *dest, long long value, int pe); +__host__ long long rocshmem_ctx_longlong_atomic_fetch_add( + rocshmem_ctx_t ctx, long long *dest, long long value, int pe); +__host__ long long rocshmem_longlong_atomic_fetch_add( + long long *dest, long long value, int pe); + +__device__ ATTR_NO_INLINE unsigned int rocshmem_ctx_uint_atomic_fetch_add( + rocshmem_ctx_t ctx, unsigned int *dest, unsigned int value, int pe); +__device__ ATTR_NO_INLINE unsigned int rocshmem_uint_atomic_fetch_add( + unsigned int *dest, unsigned int value, int pe); +__host__ unsigned int rocshmem_ctx_uint_atomic_fetch_add( + rocshmem_ctx_t ctx, unsigned int *dest, unsigned int value, int pe); +__host__ unsigned int rocshmem_uint_atomic_fetch_add( + unsigned int *dest, unsigned int value, int pe); + +__device__ ATTR_NO_INLINE unsigned long rocshmem_ctx_ulong_atomic_fetch_add( + rocshmem_ctx_t ctx, unsigned long *dest, unsigned long value, int pe); +__device__ ATTR_NO_INLINE unsigned 
long rocshmem_ulong_atomic_fetch_add( + unsigned long *dest, unsigned long value, int pe); +__host__ unsigned long rocshmem_ctx_ulong_atomic_fetch_add( + rocshmem_ctx_t ctx, unsigned long *dest, unsigned long value, int pe); +__host__ unsigned long rocshmem_ulong_atomic_fetch_add( + unsigned long *dest, unsigned long value, int pe); + +__device__ ATTR_NO_INLINE unsigned long long rocshmem_ctx_ulonglong_atomic_fetch_add( + rocshmem_ctx_t ctx, unsigned long long *dest, unsigned long long value, int pe); +__device__ ATTR_NO_INLINE unsigned long long rocshmem_ulonglong_atomic_fetch_add( + unsigned long long *dest, unsigned long long value, int pe); +__host__ unsigned long long rocshmem_ctx_ulonglong_atomic_fetch_add( + rocshmem_ctx_t ctx, unsigned long long *dest, unsigned long long value, int pe); +__host__ unsigned long long rocshmem_ulonglong_atomic_fetch_add( + unsigned long long *dest, unsigned long long value, int pe); + +__device__ ATTR_NO_INLINE int32_t rocshmem_ctx_int32_atomic_fetch_add( + rocshmem_ctx_t ctx, int32_t *dest, int32_t value, int pe); +__device__ ATTR_NO_INLINE int32_t rocshmem_int32_atomic_fetch_add( + int32_t *dest, int32_t value, int pe); +__host__ int32_t rocshmem_ctx_int32_atomic_fetch_add( + rocshmem_ctx_t ctx, int32_t *dest, int32_t value, int pe); +__host__ int32_t rocshmem_int32_atomic_fetch_add( + int32_t *dest, int32_t value, int pe); + +__device__ ATTR_NO_INLINE int64_t rocshmem_ctx_int64_atomic_fetch_add( + rocshmem_ctx_t ctx, int64_t *dest, int64_t value, int pe); +__device__ ATTR_NO_INLINE int64_t rocshmem_int64_atomic_fetch_add( + int64_t *dest, int64_t value, int pe); +__host__ int64_t rocshmem_ctx_int64_atomic_fetch_add( + rocshmem_ctx_t ctx, int64_t *dest, int64_t value, int pe); +__host__ int64_t rocshmem_int64_atomic_fetch_add( + int64_t *dest, int64_t value, int pe); + +__device__ ATTR_NO_INLINE uint32_t rocshmem_ctx_uint32_atomic_fetch_add( + rocshmem_ctx_t ctx, uint32_t *dest, uint32_t value, int pe); +__device__ 
ATTR_NO_INLINE uint32_t rocshmem_uint32_atomic_fetch_add( + uint32_t *dest, uint32_t value, int pe); +__host__ uint32_t rocshmem_ctx_uint32_atomic_fetch_add( + rocshmem_ctx_t ctx, uint32_t *dest, uint32_t value, int pe); +__host__ uint32_t rocshmem_uint32_atomic_fetch_add( + uint32_t *dest, uint32_t value, int pe); + +__device__ ATTR_NO_INLINE uint64_t rocshmem_ctx_uint64_atomic_fetch_add( + rocshmem_ctx_t ctx, uint64_t *dest, uint64_t value, int pe); +__device__ ATTR_NO_INLINE uint64_t rocshmem_uint64_atomic_fetch_add( + uint64_t *dest, uint64_t value, int pe); +__host__ uint64_t rocshmem_ctx_uint64_atomic_fetch_add( + rocshmem_ctx_t ctx, uint64_t *dest, uint64_t value, int pe); +__host__ uint64_t rocshmem_uint64_atomic_fetch_add( + uint64_t *dest, uint64_t value, int pe); + +__device__ ATTR_NO_INLINE size_t rocshmem_ctx_size_atomic_fetch_add( + rocshmem_ctx_t ctx, size_t *dest, size_t value, int pe); +__device__ ATTR_NO_INLINE size_t rocshmem_size_atomic_fetch_add( + size_t *dest, size_t value, int pe); +__host__ size_t rocshmem_ctx_size_atomic_fetch_add( + rocshmem_ctx_t ctx, size_t *dest, size_t value, int pe); +__host__ size_t rocshmem_size_atomic_fetch_add( + size_t *dest, size_t value, int pe); + +__device__ ATTR_NO_INLINE ptrdiff_t rocshmem_ctx_ptrdiff_atomic_fetch_add( + rocshmem_ctx_t ctx, ptrdiff_t *dest, ptrdiff_t value, int pe); +__device__ ATTR_NO_INLINE ptrdiff_t rocshmem_ptrdiff_atomic_fetch_add( + ptrdiff_t *dest, ptrdiff_t value, int pe); +__host__ ptrdiff_t rocshmem_ctx_ptrdiff_atomic_fetch_add( + rocshmem_ctx_t ctx, ptrdiff_t *dest, ptrdiff_t value, int pe); +__host__ ptrdiff_t rocshmem_ptrdiff_atomic_fetch_add( + ptrdiff_t *dest, ptrdiff_t value, int pe); + + +/** + * @name SHMEM_ATOMIC_ADD + * @brief Atomically add the value \p val to \p dest on \p pe. + * + * The operation is blocking. + * + * This function can be called from divergent control paths at per-thread + * granularity. 
+ * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + heap. + * @param[in] val The value to be atomically added. + * @param[in] pe PE of the remote process. + * + * @return void + */ +__device__ ATTR_NO_INLINE void rocshmem_ctx_int_atomic_add( + rocshmem_ctx_t ctx, int *dest, int value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_int_atomic_add( + int *dest, int value, int pe); +__host__ void rocshmem_ctx_int_atomic_add( + rocshmem_ctx_t ctx, int *dest, int value, int pe); +__host__ void rocshmem_int_atomic_add( + int *dest, int value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_long_atomic_add( + rocshmem_ctx_t ctx, long *dest, long value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_long_atomic_add( + long *dest, long value, int pe); +__host__ void rocshmem_ctx_long_atomic_add( + rocshmem_ctx_t ctx, long *dest, long value, int pe); +__host__ void rocshmem_long_atomic_add( + long *dest, long value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_atomic_add( + rocshmem_ctx_t ctx, long long *dest, long long value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_longlong_atomic_add( + long long *dest, long long value, int pe); +__host__ void rocshmem_ctx_longlong_atomic_add( + rocshmem_ctx_t ctx, long long *dest, long long value, int pe); +__host__ void rocshmem_longlong_atomic_add( + long long *dest, long long value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_atomic_add( + rocshmem_ctx_t ctx, unsigned int *dest, unsigned int value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uint_atomic_add( + unsigned int *dest, unsigned int value, int pe); +__host__ void rocshmem_ctx_uint_atomic_add( + rocshmem_ctx_t ctx, unsigned int *dest, unsigned int value, int pe); +__host__ void rocshmem_uint_atomic_add( + unsigned int *dest, unsigned int value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_atomic_add( 
+ rocshmem_ctx_t ctx, unsigned long *dest, unsigned long value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulong_atomic_add( + unsigned long *dest, unsigned long value, int pe); +__host__ void rocshmem_ctx_ulong_atomic_add( + rocshmem_ctx_t ctx, unsigned long *dest, unsigned long value, int pe); +__host__ void rocshmem_ulong_atomic_add( + unsigned long *dest, unsigned long value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_atomic_add( + rocshmem_ctx_t ctx, unsigned long long *dest, unsigned long long value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulonglong_atomic_add( + unsigned long long *dest, unsigned long long value, int pe); +__host__ void rocshmem_ctx_ulonglong_atomic_add( + rocshmem_ctx_t ctx, unsigned long long *dest, unsigned long long value, int pe); +__host__ void rocshmem_ulonglong_atomic_add( + unsigned long long *dest, unsigned long long value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_int32_atomic_add( + rocshmem_ctx_t ctx, int32_t *dest, int32_t value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_int32_atomic_add( + int32_t *dest, int32_t value, int pe); +__host__ void rocshmem_ctx_int32_atomic_add( + rocshmem_ctx_t ctx, int32_t *dest, int32_t value, int pe); +__host__ void rocshmem_int32_atomic_add( + int32_t *dest, int32_t value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_int64_atomic_add( + rocshmem_ctx_t ctx, int64_t *dest, int64_t value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_int64_atomic_add( + int64_t *dest, int64_t value, int pe); +__host__ void rocshmem_ctx_int64_atomic_add( + rocshmem_ctx_t ctx, int64_t *dest, int64_t value, int pe); +__host__ void rocshmem_int64_atomic_add( + int64_t *dest, int64_t value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint32_atomic_add( + rocshmem_ctx_t ctx, uint32_t *dest, uint32_t value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uint32_atomic_add( + uint32_t *dest, uint32_t value, int pe); +__host__ void 
rocshmem_ctx_uint32_atomic_add( + rocshmem_ctx_t ctx, uint32_t *dest, uint32_t value, int pe); +__host__ void rocshmem_uint32_atomic_add( + uint32_t *dest, uint32_t value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint64_atomic_add( + rocshmem_ctx_t ctx, uint64_t *dest, uint64_t value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uint64_atomic_add( + uint64_t *dest, uint64_t value, int pe); +__host__ void rocshmem_ctx_uint64_atomic_add( + rocshmem_ctx_t ctx, uint64_t *dest, uint64_t value, int pe); +__host__ void rocshmem_uint64_atomic_add( + uint64_t *dest, uint64_t value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_size_atomic_add( + rocshmem_ctx_t ctx, size_t *dest, size_t value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_size_atomic_add( + size_t *dest, size_t value, int pe); +__host__ void rocshmem_ctx_size_atomic_add( + rocshmem_ctx_t ctx, size_t *dest, size_t value, int pe); +__host__ void rocshmem_size_atomic_add( + size_t *dest, size_t value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ptrdiff_atomic_add( + rocshmem_ctx_t ctx, ptrdiff_t *dest, ptrdiff_t value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ptrdiff_atomic_add( + ptrdiff_t *dest, ptrdiff_t value, int pe); +__host__ void rocshmem_ctx_ptrdiff_atomic_add( + rocshmem_ctx_t ctx, ptrdiff_t *dest, ptrdiff_t value, int pe); +__host__ void rocshmem_ptrdiff_atomic_add( + ptrdiff_t *dest, ptrdiff_t value, int pe); + + +/** + * @name SHMEM_ATOMIC_FETCH_AND + * @brief Atomically bitwise-and the value \p val to \p dest on \p pe. + * + * The operation is blocking. + * + * This function can be called from divergent control paths at per-thread + * granularity. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + heap. + * @param[in] val The value to be atomically bitwise-ANDed. + * @param[in] pe PE of the remote process. 
+ * + * @return original value + */ +__device__ ATTR_NO_INLINE unsigned int rocshmem_ctx_uint_atomic_fetch_and( + rocshmem_ctx_t ctx, unsigned int *dest, unsigned int value, int pe); +__device__ ATTR_NO_INLINE unsigned int rocshmem_uint_atomic_fetch_and( + unsigned int *dest, unsigned int value, int pe); +__host__ unsigned int rocshmem_ctx_uint_atomic_fetch_and( + rocshmem_ctx_t ctx, unsigned int *dest, unsigned int value, int pe); +__host__ unsigned int rocshmem_uint_atomic_fetch_and( + unsigned int *dest, unsigned int value, int pe); + +__device__ ATTR_NO_INLINE unsigned long rocshmem_ctx_ulong_atomic_fetch_and( + rocshmem_ctx_t ctx, unsigned long *dest, unsigned long value, int pe); +__device__ ATTR_NO_INLINE unsigned long rocshmem_ulong_atomic_fetch_and( + unsigned long *dest, unsigned long value, int pe); +__host__ unsigned long rocshmem_ctx_ulong_atomic_fetch_and( + rocshmem_ctx_t ctx, unsigned long *dest, unsigned long value, int pe); +__host__ unsigned long rocshmem_ulong_atomic_fetch_and( + unsigned long *dest, unsigned long value, int pe); + +__device__ ATTR_NO_INLINE unsigned long long rocshmem_ctx_ulonglong_atomic_fetch_and( + rocshmem_ctx_t ctx, unsigned long long *dest, unsigned long long value, int pe); +__device__ ATTR_NO_INLINE unsigned long long rocshmem_ulonglong_atomic_fetch_and( + unsigned long long *dest, unsigned long long value, int pe); +__host__ unsigned long long rocshmem_ctx_ulonglong_atomic_fetch_and( + rocshmem_ctx_t ctx, unsigned long long *dest, unsigned long long value, int pe); +__host__ unsigned long long rocshmem_ulonglong_atomic_fetch_and( + unsigned long long *dest, unsigned long long value, int pe); + +__device__ ATTR_NO_INLINE int32_t rocshmem_ctx_int32_atomic_fetch_and( + rocshmem_ctx_t ctx, int32_t *dest, int32_t value, int pe); +__device__ ATTR_NO_INLINE int32_t rocshmem_int32_atomic_fetch_and( + int32_t *dest, int32_t value, int pe); +__host__ int32_t rocshmem_ctx_int32_atomic_fetch_and( + rocshmem_ctx_t ctx, int32_t 
*dest, int32_t value, int pe); +__host__ int32_t rocshmem_int32_atomic_fetch_and( + int32_t *dest, int32_t value, int pe); + +__device__ ATTR_NO_INLINE int64_t rocshmem_ctx_int64_atomic_fetch_and( + rocshmem_ctx_t ctx, int64_t *dest, int64_t value, int pe); +__device__ ATTR_NO_INLINE int64_t rocshmem_int64_atomic_fetch_and( + int64_t *dest, int64_t value, int pe); +__host__ int64_t rocshmem_ctx_int64_atomic_fetch_and( + rocshmem_ctx_t ctx, int64_t *dest, int64_t value, int pe); +__host__ int64_t rocshmem_int64_atomic_fetch_and( + int64_t *dest, int64_t value, int pe); + +__device__ ATTR_NO_INLINE uint32_t rocshmem_ctx_uint32_atomic_fetch_and( + rocshmem_ctx_t ctx, uint32_t *dest, uint32_t value, int pe); +__device__ ATTR_NO_INLINE uint32_t rocshmem_uint32_atomic_fetch_and( + uint32_t *dest, uint32_t value, int pe); +__host__ uint32_t rocshmem_ctx_uint32_atomic_fetch_and( + rocshmem_ctx_t ctx, uint32_t *dest, uint32_t value, int pe); +__host__ uint32_t rocshmem_uint32_atomic_fetch_and( + uint32_t *dest, uint32_t value, int pe); + +__device__ ATTR_NO_INLINE uint64_t rocshmem_ctx_uint64_atomic_fetch_and( + rocshmem_ctx_t ctx, uint64_t *dest, uint64_t value, int pe); +__device__ ATTR_NO_INLINE uint64_t rocshmem_uint64_atomic_fetch_and( + uint64_t *dest, uint64_t value, int pe); +__host__ uint64_t rocshmem_ctx_uint64_atomic_fetch_and( + rocshmem_ctx_t ctx, uint64_t *dest, uint64_t value, int pe); +__host__ uint64_t rocshmem_uint64_atomic_fetch_and( + uint64_t *dest, uint64_t value, int pe); + + +/** + * @name SHMEM_ATOMIC_AND + * @brief Atomically bitwise-and the value \p val to \p dest on \p pe. + * + * The operation is blocking. + * + * This function can be called from divergent control paths at per-thread + * granularity. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + heap. + * @param[in] val The value to be atomically bitwise-ANDed. 
+ * @param[in] pe PE of the remote process. + * + * @return void + */ +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_atomic_and( + rocshmem_ctx_t ctx, unsigned int *dest, unsigned int value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uint_atomic_and( + unsigned int *dest, unsigned int value, int pe); +__host__ void rocshmem_ctx_uint_atomic_and( + rocshmem_ctx_t ctx, unsigned int *dest, unsigned int value, int pe); +__host__ void rocshmem_uint_atomic_and( + unsigned int *dest, unsigned int value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_atomic_and( + rocshmem_ctx_t ctx, unsigned long *dest, unsigned long value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulong_atomic_and( + unsigned long *dest, unsigned long value, int pe); +__host__ void rocshmem_ctx_ulong_atomic_and( + rocshmem_ctx_t ctx, unsigned long *dest, unsigned long value, int pe); +__host__ void rocshmem_ulong_atomic_and( + unsigned long *dest, unsigned long value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_atomic_and( + rocshmem_ctx_t ctx, unsigned long long *dest, unsigned long long value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulonglong_atomic_and( + unsigned long long *dest, unsigned long long value, int pe); +__host__ void rocshmem_ctx_ulonglong_atomic_and( + rocshmem_ctx_t ctx, unsigned long long *dest, unsigned long long value, int pe); +__host__ void rocshmem_ulonglong_atomic_and( + unsigned long long *dest, unsigned long long value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_int32_atomic_and( + rocshmem_ctx_t ctx, int32_t *dest, int32_t value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_int32_atomic_and( + int32_t *dest, int32_t value, int pe); +__host__ void rocshmem_ctx_int32_atomic_and( + rocshmem_ctx_t ctx, int32_t *dest, int32_t value, int pe); +__host__ void rocshmem_int32_atomic_and( + int32_t *dest, int32_t value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_int64_atomic_and( + 
rocshmem_ctx_t ctx, int64_t *dest, int64_t value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_int64_atomic_and( + int64_t *dest, int64_t value, int pe); +__host__ void rocshmem_ctx_int64_atomic_and( + rocshmem_ctx_t ctx, int64_t *dest, int64_t value, int pe); +__host__ void rocshmem_int64_atomic_and( + int64_t *dest, int64_t value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint32_atomic_and( + rocshmem_ctx_t ctx, uint32_t *dest, uint32_t value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uint32_atomic_and( + uint32_t *dest, uint32_t value, int pe); +__host__ void rocshmem_ctx_uint32_atomic_and( + rocshmem_ctx_t ctx, uint32_t *dest, uint32_t value, int pe); +__host__ void rocshmem_uint32_atomic_and( + uint32_t *dest, uint32_t value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint64_atomic_and( + rocshmem_ctx_t ctx, uint64_t *dest, uint64_t value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uint64_atomic_and( + uint64_t *dest, uint64_t value, int pe); +__host__ void rocshmem_ctx_uint64_atomic_and( + rocshmem_ctx_t ctx, uint64_t *dest, uint64_t value, int pe); +__host__ void rocshmem_uint64_atomic_and( + uint64_t *dest, uint64_t value, int pe); + + +/** + * @name SHMEM_ATOMIC_FETCH_OR + * @brief Atomically bitwise-or the value \p val to \p dest on \p pe. + * + * The operation is blocking. + * + * This function can be called from divergent control paths at per-thread + * granularity. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + heap. + * @param[in] val The value to be atomically bitwise-ORed. + * @param[in] pe PE of the remote process. 
+ * + * @return original value + */ +__device__ ATTR_NO_INLINE unsigned int rocshmem_ctx_uint_atomic_fetch_or( + rocshmem_ctx_t ctx, unsigned int *dest, unsigned int value, int pe); +__device__ ATTR_NO_INLINE unsigned int rocshmem_uint_atomic_fetch_or( + unsigned int *dest, unsigned int value, int pe); +__host__ unsigned int rocshmem_ctx_uint_atomic_fetch_or( + rocshmem_ctx_t ctx, unsigned int *dest, unsigned int value, int pe); +__host__ unsigned int rocshmem_uint_atomic_fetch_or( + unsigned int *dest, unsigned int value, int pe); + +__device__ ATTR_NO_INLINE unsigned long rocshmem_ctx_ulong_atomic_fetch_or( + rocshmem_ctx_t ctx, unsigned long *dest, unsigned long value, int pe); +__device__ ATTR_NO_INLINE unsigned long rocshmem_ulong_atomic_fetch_or( + unsigned long *dest, unsigned long value, int pe); +__host__ unsigned long rocshmem_ctx_ulong_atomic_fetch_or( + rocshmem_ctx_t ctx, unsigned long *dest, unsigned long value, int pe); +__host__ unsigned long rocshmem_ulong_atomic_fetch_or( + unsigned long *dest, unsigned long value, int pe); + +__device__ ATTR_NO_INLINE unsigned long long rocshmem_ctx_ulonglong_atomic_fetch_or( + rocshmem_ctx_t ctx, unsigned long long *dest, unsigned long long value, int pe); +__device__ ATTR_NO_INLINE unsigned long long rocshmem_ulonglong_atomic_fetch_or( + unsigned long long *dest, unsigned long long value, int pe); +__host__ unsigned long long rocshmem_ctx_ulonglong_atomic_fetch_or( + rocshmem_ctx_t ctx, unsigned long long *dest, unsigned long long value, int pe); +__host__ unsigned long long rocshmem_ulonglong_atomic_fetch_or( + unsigned long long *dest, unsigned long long value, int pe); + +__device__ ATTR_NO_INLINE int32_t rocshmem_ctx_int32_atomic_fetch_or( + rocshmem_ctx_t ctx, int32_t *dest, int32_t value, int pe); +__device__ ATTR_NO_INLINE int32_t rocshmem_int32_atomic_fetch_or( + int32_t *dest, int32_t value, int pe); +__host__ int32_t rocshmem_ctx_int32_atomic_fetch_or( + rocshmem_ctx_t ctx, int32_t *dest, int32_t 
value, int pe); +__host__ int32_t rocshmem_int32_atomic_fetch_or( + int32_t *dest, int32_t value, int pe); + +__device__ ATTR_NO_INLINE int64_t rocshmem_ctx_int64_atomic_fetch_or( + rocshmem_ctx_t ctx, int64_t *dest, int64_t value, int pe); +__device__ ATTR_NO_INLINE int64_t rocshmem_int64_atomic_fetch_or( + int64_t *dest, int64_t value, int pe); +__host__ int64_t rocshmem_ctx_int64_atomic_fetch_or( + rocshmem_ctx_t ctx, int64_t *dest, int64_t value, int pe); +__host__ int64_t rocshmem_int64_atomic_fetch_or( + int64_t *dest, int64_t value, int pe); + +__device__ ATTR_NO_INLINE uint32_t rocshmem_ctx_uint32_atomic_fetch_or( + rocshmem_ctx_t ctx, uint32_t *dest, uint32_t value, int pe); +__device__ ATTR_NO_INLINE uint32_t rocshmem_uint32_atomic_fetch_or( + uint32_t *dest, uint32_t value, int pe); +__host__ uint32_t rocshmem_ctx_uint32_atomic_fetch_or( + rocshmem_ctx_t ctx, uint32_t *dest, uint32_t value, int pe); +__host__ uint32_t rocshmem_uint32_atomic_fetch_or( + uint32_t *dest, uint32_t value, int pe); + +__device__ ATTR_NO_INLINE uint64_t rocshmem_ctx_uint64_atomic_fetch_or( + rocshmem_ctx_t ctx, uint64_t *dest, uint64_t value, int pe); +__device__ ATTR_NO_INLINE uint64_t rocshmem_uint64_atomic_fetch_or( + uint64_t *dest, uint64_t value, int pe); +__host__ uint64_t rocshmem_ctx_uint64_atomic_fetch_or( + rocshmem_ctx_t ctx, uint64_t *dest, uint64_t value, int pe); +__host__ uint64_t rocshmem_uint64_atomic_fetch_or( + uint64_t *dest, uint64_t value, int pe); + + +/** + * @name SHMEM_ATOMIC_OR + * @brief Atomically bitwise-or the value \p val to \p dest on \p pe. + * + * The operation is blocking. + * + * This function can be called from divergent control paths at per-thread + * granularity. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + heap. + * @param[in] val The value to be atomically bitwise-ORed. + * @param[in] pe PE of the remote process. 
+ * + * @return void + */ +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_atomic_or( + rocshmem_ctx_t ctx, unsigned int *dest, unsigned int value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uint_atomic_or( + unsigned int *dest, unsigned int value, int pe); +__host__ void rocshmem_ctx_uint_atomic_or( + rocshmem_ctx_t ctx, unsigned int *dest, unsigned int value, int pe); +__host__ void rocshmem_uint_atomic_or( + unsigned int *dest, unsigned int value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_atomic_or( + rocshmem_ctx_t ctx, unsigned long *dest, unsigned long value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulong_atomic_or( + unsigned long *dest, unsigned long value, int pe); +__host__ void rocshmem_ctx_ulong_atomic_or( + rocshmem_ctx_t ctx, unsigned long *dest, unsigned long value, int pe); +__host__ void rocshmem_ulong_atomic_or( + unsigned long *dest, unsigned long value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_atomic_or( + rocshmem_ctx_t ctx, unsigned long long *dest, unsigned long long value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulonglong_atomic_or( + unsigned long long *dest, unsigned long long value, int pe); +__host__ void rocshmem_ctx_ulonglong_atomic_or( + rocshmem_ctx_t ctx, unsigned long long *dest, unsigned long long value, int pe); +__host__ void rocshmem_ulonglong_atomic_or( + unsigned long long *dest, unsigned long long value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_int32_atomic_or( + rocshmem_ctx_t ctx, int32_t *dest, int32_t value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_int32_atomic_or( + int32_t *dest, int32_t value, int pe); +__host__ void rocshmem_ctx_int32_atomic_or( + rocshmem_ctx_t ctx, int32_t *dest, int32_t value, int pe); +__host__ void rocshmem_int32_atomic_or( + int32_t *dest, int32_t value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_int64_atomic_or( + rocshmem_ctx_t ctx, int64_t *dest, int64_t value, int pe); +__device__ 
ATTR_NO_INLINE void rocshmem_int64_atomic_or( + int64_t *dest, int64_t value, int pe); +__host__ void rocshmem_ctx_int64_atomic_or( + rocshmem_ctx_t ctx, int64_t *dest, int64_t value, int pe); +__host__ void rocshmem_int64_atomic_or( + int64_t *dest, int64_t value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint32_atomic_or( + rocshmem_ctx_t ctx, uint32_t *dest, uint32_t value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uint32_atomic_or( + uint32_t *dest, uint32_t value, int pe); +__host__ void rocshmem_ctx_uint32_atomic_or( + rocshmem_ctx_t ctx, uint32_t *dest, uint32_t value, int pe); +__host__ void rocshmem_uint32_atomic_or( + uint32_t *dest, uint32_t value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint64_atomic_or( + rocshmem_ctx_t ctx, uint64_t *dest, uint64_t value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uint64_atomic_or( + uint64_t *dest, uint64_t value, int pe); +__host__ void rocshmem_ctx_uint64_atomic_or( + rocshmem_ctx_t ctx, uint64_t *dest, uint64_t value, int pe); +__host__ void rocshmem_uint64_atomic_or( + uint64_t *dest, uint64_t value, int pe); + + +/** + * @name SHMEM_ATOMIC_FETCH_XOR + * @brief Atomically bitwise-xor the value \p val to \p dest on \p pe. + * + * The operation is blocking. + * + * This function can be called from divergent control paths at per-thread + * granularity. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + heap. + * @param[in] val The value to be atomically bitwise-XORed. + * @param[in] pe PE of the remote process. 
+ * + * @return original value + */ +__device__ ATTR_NO_INLINE unsigned int rocshmem_ctx_uint_atomic_fetch_xor( + rocshmem_ctx_t ctx, unsigned int *dest, unsigned int value, int pe); +__device__ ATTR_NO_INLINE unsigned int rocshmem_uint_atomic_fetch_xor( + unsigned int *dest, unsigned int value, int pe); +__host__ unsigned int rocshmem_ctx_uint_atomic_fetch_xor( + rocshmem_ctx_t ctx, unsigned int *dest, unsigned int value, int pe); +__host__ unsigned int rocshmem_uint_atomic_fetch_xor( + unsigned int *dest, unsigned int value, int pe); + +__device__ ATTR_NO_INLINE unsigned long rocshmem_ctx_ulong_atomic_fetch_xor( + rocshmem_ctx_t ctx, unsigned long *dest, unsigned long value, int pe); +__device__ ATTR_NO_INLINE unsigned long rocshmem_ulong_atomic_fetch_xor( + unsigned long *dest, unsigned long value, int pe); +__host__ unsigned long rocshmem_ctx_ulong_atomic_fetch_xor( + rocshmem_ctx_t ctx, unsigned long *dest, unsigned long value, int pe); +__host__ unsigned long rocshmem_ulong_atomic_fetch_xor( + unsigned long *dest, unsigned long value, int pe); + +__device__ ATTR_NO_INLINE unsigned long long rocshmem_ctx_ulonglong_atomic_fetch_xor( + rocshmem_ctx_t ctx, unsigned long long *dest, unsigned long long value, int pe); +__device__ ATTR_NO_INLINE unsigned long long rocshmem_ulonglong_atomic_fetch_xor( + unsigned long long *dest, unsigned long long value, int pe); +__host__ unsigned long long rocshmem_ctx_ulonglong_atomic_fetch_xor( + rocshmem_ctx_t ctx, unsigned long long *dest, unsigned long long value, int pe); +__host__ unsigned long long rocshmem_ulonglong_atomic_fetch_xor( + unsigned long long *dest, unsigned long long value, int pe); + +__device__ ATTR_NO_INLINE int32_t rocshmem_ctx_int32_atomic_fetch_xor( + rocshmem_ctx_t ctx, int32_t *dest, int32_t value, int pe); +__device__ ATTR_NO_INLINE int32_t rocshmem_int32_atomic_fetch_xor( + int32_t *dest, int32_t value, int pe); +__host__ int32_t rocshmem_ctx_int32_atomic_fetch_xor( + rocshmem_ctx_t ctx, int32_t 
*dest, int32_t value, int pe); +__host__ int32_t rocshmem_int32_atomic_fetch_xor( + int32_t *dest, int32_t value, int pe); + +__device__ ATTR_NO_INLINE int64_t rocshmem_ctx_int64_atomic_fetch_xor( + rocshmem_ctx_t ctx, int64_t *dest, int64_t value, int pe); +__device__ ATTR_NO_INLINE int64_t rocshmem_int64_atomic_fetch_xor( + int64_t *dest, int64_t value, int pe); +__host__ int64_t rocshmem_ctx_int64_atomic_fetch_xor( + rocshmem_ctx_t ctx, int64_t *dest, int64_t value, int pe); +__host__ int64_t rocshmem_int64_atomic_fetch_xor( + int64_t *dest, int64_t value, int pe); + +__device__ ATTR_NO_INLINE uint32_t rocshmem_ctx_uint32_atomic_fetch_xor( + rocshmem_ctx_t ctx, uint32_t *dest, uint32_t value, int pe); +__device__ ATTR_NO_INLINE uint32_t rocshmem_uint32_atomic_fetch_xor( + uint32_t *dest, uint32_t value, int pe); +__host__ uint32_t rocshmem_ctx_uint32_atomic_fetch_xor( + rocshmem_ctx_t ctx, uint32_t *dest, uint32_t value, int pe); +__host__ uint32_t rocshmem_uint32_atomic_fetch_xor( + uint32_t *dest, uint32_t value, int pe); + +__device__ ATTR_NO_INLINE uint64_t rocshmem_ctx_uint64_atomic_fetch_xor( + rocshmem_ctx_t ctx, uint64_t *dest, uint64_t value, int pe); +__device__ ATTR_NO_INLINE uint64_t rocshmem_uint64_atomic_fetch_xor( + uint64_t *dest, uint64_t value, int pe); +__host__ uint64_t rocshmem_ctx_uint64_atomic_fetch_xor( + rocshmem_ctx_t ctx, uint64_t *dest, uint64_t value, int pe); +__host__ uint64_t rocshmem_uint64_atomic_fetch_xor( + uint64_t *dest, uint64_t value, int pe); + + +/** + * @name SHMEM_ATOMIC_XOR + * @brief Atomically bitwise-xor the value \p val to \p dest on \p pe. + * + * The operation is blocking. + * + * This function can be called from divergent control paths at per-thread + * granularity. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + heap. + * @param[in] val The value to be atomically bitwise-XORed. 
+ * @param[in] pe PE of the remote process. + * + * @return void + */ +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_atomic_xor( + rocshmem_ctx_t ctx, unsigned int *dest, unsigned int value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uint_atomic_xor( + unsigned int *dest, unsigned int value, int pe); +__host__ void rocshmem_ctx_uint_atomic_xor( + rocshmem_ctx_t ctx, unsigned int *dest, unsigned int value, int pe); +__host__ void rocshmem_uint_atomic_xor( + unsigned int *dest, unsigned int value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_atomic_xor( + rocshmem_ctx_t ctx, unsigned long *dest, unsigned long value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulong_atomic_xor( + unsigned long *dest, unsigned long value, int pe); +__host__ void rocshmem_ctx_ulong_atomic_xor( + rocshmem_ctx_t ctx, unsigned long *dest, unsigned long value, int pe); +__host__ void rocshmem_ulong_atomic_xor( + unsigned long *dest, unsigned long value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_atomic_xor( + rocshmem_ctx_t ctx, unsigned long long *dest, unsigned long long value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulonglong_atomic_xor( + unsigned long long *dest, unsigned long long value, int pe); +__host__ void rocshmem_ctx_ulonglong_atomic_xor( + rocshmem_ctx_t ctx, unsigned long long *dest, unsigned long long value, int pe); +__host__ void rocshmem_ulonglong_atomic_xor( + unsigned long long *dest, unsigned long long value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_int32_atomic_xor( + rocshmem_ctx_t ctx, int32_t *dest, int32_t value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_int32_atomic_xor( + int32_t *dest, int32_t value, int pe); +__host__ void rocshmem_ctx_int32_atomic_xor( + rocshmem_ctx_t ctx, int32_t *dest, int32_t value, int pe); +__host__ void rocshmem_int32_atomic_xor( + int32_t *dest, int32_t value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_int64_atomic_xor( + 
rocshmem_ctx_t ctx, int64_t *dest, int64_t value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_int64_atomic_xor( + int64_t *dest, int64_t value, int pe); +__host__ void rocshmem_ctx_int64_atomic_xor( + rocshmem_ctx_t ctx, int64_t *dest, int64_t value, int pe); +__host__ void rocshmem_int64_atomic_xor( + int64_t *dest, int64_t value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint32_atomic_xor( + rocshmem_ctx_t ctx, uint32_t *dest, uint32_t value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uint32_atomic_xor( + uint32_t *dest, uint32_t value, int pe); +__host__ void rocshmem_ctx_uint32_atomic_xor( + rocshmem_ctx_t ctx, uint32_t *dest, uint32_t value, int pe); +__host__ void rocshmem_uint32_atomic_xor( + uint32_t *dest, uint32_t value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint64_atomic_xor( + rocshmem_ctx_t ctx, uint64_t *dest, uint64_t value, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uint64_atomic_xor( + uint64_t *dest, uint64_t value, int pe); +__host__ void rocshmem_ctx_uint64_atomic_xor( + rocshmem_ctx_t ctx, uint64_t *dest, uint64_t value, int pe); +__host__ void rocshmem_uint64_atomic_xor( + uint64_t *dest, uint64_t value, int pe); + + +} // namespace rocshmem + +#endif // LIBRARY_INCLUDE_ROCSHMEM_AMO_HPP diff --git a/projects/rocshmem/include/rocshmem/rocshmem_COLL.hpp b/projects/rocshmem/include/rocshmem/rocshmem_COLL.hpp new file mode 100644 index 0000000000..c15498705d --- /dev/null +++ b/projects/rocshmem/include/rocshmem/rocshmem_COLL.hpp @@ -0,0 +1,603 @@ +/****************************************************************************** + * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + *****************************************************************************/ + +#ifndef LIBRARY_INCLUDE_ROCSHMEM_COLL_HPP +#define LIBRARY_INCLUDE_ROCSHMEM_COLL_HPP + +namespace rocshmem { + +/** + * @name SHMEM_ALLTOALL + * @brief Exchanges a fixed amount of contiguous data blocks between all pairs + * of PEs participating in the collective routine. + * + * This function must be called as a work-group collective. + * + * @param[in] team The team participating in the collective. + * @param[in] dest Destination address. Must be an address on the + * symmetric heap. + * @param[in] source Source address. Must be an address on the symmetric + heap. + * @param[in] nelems Number of data blocks transferred per pair of PEs. 
+ * + * @return void + */ +__device__ ATTR_NO_INLINE void rocshmem_ctx_float_wg_alltoall( + rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, + const float *source, int nelems); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_double_wg_alltoall( + rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, + const double *source, int nelems); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_char_wg_alltoall( + rocshmem_ctx_t ctx, rocshmem_team_t team, char *dest, + const char *source, int nelems); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_wg_alltoall( + rocshmem_ctx_t ctx, rocshmem_team_t team, signed char *dest, + const signed char *source, int nelems); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_short_wg_alltoall( + rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, + const short *source, int nelems); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_int_wg_alltoall( + rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, + const int *source, int nelems); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_long_wg_alltoall( + rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, + const long *source, int nelems); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_wg_alltoall( + rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, + const long long *source, int nelems); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_wg_alltoall( + rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned char *dest, + const unsigned char *source, int nelems); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_wg_alltoall( + rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned short *dest, + const unsigned short *source, int nelems); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_wg_alltoall( + rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned int *dest, + const unsigned int *source, int nelems); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_wg_alltoall( + rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long *dest, + const unsigned long *source, 
int nelems); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_wg_alltoall( + rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long long *dest, + const unsigned long long *source, int nelems); + + +/** + * @name SHMEM_BROADCAST + * @brief Perform a broadcast between PEs in the active set. The caller + * is blocked until the broadcast completes. + * + * This function must be called as a work-group collective. + * + * @param[in] dest Destination address. Must be an address on the + * symmetric heap. + * @param[in] source Source address. Must be an address on the symmetric + * heap. + * @param[in] nelems Size of the buffer to participate in the broadcast. + * @param[in] pe_root Zero-based ordinal of the PE, with respect to the + * active set, from which the data is copied. + * @param[in] pe_start PE to start the broadcast. + * @param[in] log_pe_stride Stride of PEs participating in the broadcast. + * @param[in] pe_size Number of PEs participating in the broadcast. + * @param[in] p_sync Temporary sync buffer provided to ROCSHMEM. Must + * be of size at least ROCSHMEM_REDUCE_SYNC_SIZE. 
+ * + * @return void + */ +__device__ ATTR_NO_INLINE void rocshmem_ctx_float_wg_broadcast( + rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, + const float *source, int nelems, int pe_root); +__host__ void rocshmem_ctx_float_broadcast( + rocshmem_ctx_t ctx, float *dest, const float *source, + int nelems, int pe_root, int pe_start, int log_pe_stride, + int pe_size, long *p_sync); +__host__ void rocshmem_ctx_float_broadcast( + rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, + const float *source, int nelems, int pe_root); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_double_wg_broadcast( + rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, + const double *source, int nelems, int pe_root); +__host__ void rocshmem_ctx_double_broadcast( + rocshmem_ctx_t ctx, double *dest, const double *source, + int nelems, int pe_root, int pe_start, int log_pe_stride, + int pe_size, long *p_sync); +__host__ void rocshmem_ctx_double_broadcast( + rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, + const double *source, int nelems, int pe_root); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_char_wg_broadcast( + rocshmem_ctx_t ctx, rocshmem_team_t team, char *dest, + const char *source, int nelems, int pe_root); +__host__ void rocshmem_ctx_char_broadcast( + rocshmem_ctx_t ctx, char *dest, const char *source, + int nelems, int pe_root, int pe_start, int log_pe_stride, + int pe_size, long *p_sync); +__host__ void rocshmem_ctx_char_broadcast( + rocshmem_ctx_t ctx, rocshmem_team_t team, char *dest, + const char *source, int nelems, int pe_root); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_wg_broadcast( + rocshmem_ctx_t ctx, rocshmem_team_t team, signed char *dest, + const signed char *source, int nelems, int pe_root); +__host__ void rocshmem_ctx_schar_broadcast( + rocshmem_ctx_t ctx, signed char *dest, const signed char *source, + int nelems, int pe_root, int pe_start, int log_pe_stride, + int pe_size, long *p_sync); +__host__ void 
rocshmem_ctx_schar_broadcast( + rocshmem_ctx_t ctx, rocshmem_team_t team, signed char *dest, + const signed char *source, int nelems, int pe_root); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_short_wg_broadcast( + rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, + const short *source, int nelems, int pe_root); +__host__ void rocshmem_ctx_short_broadcast( + rocshmem_ctx_t ctx, short *dest, const short *source, + int nelems, int pe_root, int pe_start, int log_pe_stride, + int pe_size, long *p_sync); +__host__ void rocshmem_ctx_short_broadcast( + rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, + const short *source, int nelems, int pe_root); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_int_wg_broadcast( + rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, + const int *source, int nelems, int pe_root); +__host__ void rocshmem_ctx_int_broadcast( + rocshmem_ctx_t ctx, int *dest, const int *source, + int nelems, int pe_root, int pe_start, int log_pe_stride, + int pe_size, long *p_sync); +__host__ void rocshmem_ctx_int_broadcast( + rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, + const int *source, int nelems, int pe_root); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_long_wg_broadcast( + rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, + const long *source, int nelems, int pe_root); +__host__ void rocshmem_ctx_long_broadcast( + rocshmem_ctx_t ctx, long *dest, const long *source, + int nelems, int pe_root, int pe_start, int log_pe_stride, + int pe_size, long *p_sync); +__host__ void rocshmem_ctx_long_broadcast( + rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, + const long *source, int nelems, int pe_root); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_wg_broadcast( + rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, + const long long *source, int nelems, int pe_root); +__host__ void rocshmem_ctx_longlong_broadcast( + rocshmem_ctx_t ctx, long long *dest, const long long *source, + int nelems, int pe_root, int 
pe_start, int log_pe_stride, + int pe_size, long *p_sync); +__host__ void rocshmem_ctx_longlong_broadcast( + rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, + const long long *source, int nelems, int pe_root); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_wg_broadcast( + rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned char *dest, + const unsigned char *source, int nelems, int pe_root); +__host__ void rocshmem_ctx_uchar_broadcast( + rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source, + int nelems, int pe_root, int pe_start, int log_pe_stride, + int pe_size, long *p_sync); +__host__ void rocshmem_ctx_uchar_broadcast( + rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned char *dest, + const unsigned char *source, int nelems, int pe_root); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_wg_broadcast( + rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned short *dest, + const unsigned short *source, int nelems, int pe_root); +__host__ void rocshmem_ctx_ushort_broadcast( + rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source, + int nelems, int pe_root, int pe_start, int log_pe_stride, + int pe_size, long *p_sync); +__host__ void rocshmem_ctx_ushort_broadcast( + rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned short *dest, + const unsigned short *source, int nelems, int pe_root); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_wg_broadcast( + rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned int *dest, + const unsigned int *source, int nelems, int pe_root); +__host__ void rocshmem_ctx_uint_broadcast( + rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source, + int nelems, int pe_root, int pe_start, int log_pe_stride, + int pe_size, long *p_sync); +__host__ void rocshmem_ctx_uint_broadcast( + rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned int *dest, + const unsigned int *source, int nelems, int pe_root); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_wg_broadcast( + rocshmem_ctx_t ctx, 
rocshmem_team_t team, unsigned long *dest, + const unsigned long *source, int nelems, int pe_root); +__host__ void rocshmem_ctx_ulong_broadcast( + rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source, + int nelems, int pe_root, int pe_start, int log_pe_stride, + int pe_size, long *p_sync); +__host__ void rocshmem_ctx_ulong_broadcast( + rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long *dest, + const unsigned long *source, int nelems, int pe_root); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_wg_broadcast( + rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long long *dest, + const unsigned long long *source, int nelems, int pe_root); +__host__ void rocshmem_ctx_ulonglong_broadcast( + rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source, + int nelems, int pe_root, int pe_start, int log_pe_stride, + int pe_size, long *p_sync); +__host__ void rocshmem_ctx_ulonglong_broadcast( + rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long long *dest, + const unsigned long long *source, int nelems, int pe_root); + + +/** + * @name SHMEM_FCOLLECT + * @brief Concatenates blocks of data from multiple PEs to an array in every + * PE participating in the collective routine. + * + * This function must be called as a work-group collective. + * + * @param[in] team The team participating in the collective. + * @param[in] dest Destination address. Must be an address on the + * symmetric heap. + * @param[in] source Source address. Must be an address on the symmetric + heap. + * @param[in] nelems Number of data blocks in source array. 
+ * + * @return void + */ +__device__ ATTR_NO_INLINE void rocshmem_ctx_float_wg_fcollect( + rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, + const float *source, int nelems); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_double_wg_fcollect( + rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, + const double *source, int nelems); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_char_wg_fcollect( + rocshmem_ctx_t ctx, rocshmem_team_t team, char *dest, + const char *source, int nelems); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_wg_fcollect( + rocshmem_ctx_t ctx, rocshmem_team_t team, signed char *dest, + const signed char *source, int nelems); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_short_wg_fcollect( + rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, + const short *source, int nelems); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_int_wg_fcollect( + rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, + const int *source, int nelems); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_long_wg_fcollect( + rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, + const long *source, int nelems); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_wg_fcollect( + rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, + const long long *source, int nelems); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_wg_fcollect( + rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned char *dest, + const unsigned char *source, int nelems); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_wg_fcollect( + rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned short *dest, + const unsigned short *source, int nelems); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_wg_fcollect( + rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned int *dest, + const unsigned int *source, int nelems); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_wg_fcollect( + rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long *dest, + const unsigned long *source, 
int nelems); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_wg_fcollect( + rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long long *dest, + const unsigned long long *source, int nelems); + + +/** + * @name SHMEM_REDUCTIONS + * @brief Perform an allreduce between PEs in the active set. The caller + * is blocked until the reduction completes. + * + * This function must be called as a work-group collective. + * + * @param[in] team The team participating in the collective. + * @param[in] dest Destination address. Must be an address on the + * symmetric heap. + * @param[in] source Source address. Must be an address on the symmetric + heap. + * @param[in] nreduce Size of the buffer to participate in the reduction. + * + * @return int (Zero on successful local completion. Nonzero otherwise.) + */ +__device__ ATTR_NO_INLINE int rocshmem_ctx_short_sum_wg_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source, + int nreduce); +__host__ int rocshmem_ctx_short_sum_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source, + int nreduce); + +__device__ ATTR_NO_INLINE int rocshmem_ctx_short_min_wg_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source, + int nreduce); +__host__ int rocshmem_ctx_short_min_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source, + int nreduce); + +__device__ ATTR_NO_INLINE int rocshmem_ctx_short_max_wg_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source, + int nreduce); +__host__ int rocshmem_ctx_short_max_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source, + int nreduce); + +__device__ ATTR_NO_INLINE int rocshmem_ctx_short_prod_wg_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source, + int nreduce); +__host__ int rocshmem_ctx_short_prod_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source, 
+ int nreduce); + +__device__ ATTR_NO_INLINE int rocshmem_ctx_short_or_wg_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source, + int nreduce); +__host__ int rocshmem_ctx_short_or_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source, + int nreduce); + +__device__ ATTR_NO_INLINE int rocshmem_ctx_short_and_wg_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source, + int nreduce); +__host__ int rocshmem_ctx_short_and_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source, + int nreduce); + +__device__ ATTR_NO_INLINE int rocshmem_ctx_short_xor_wg_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source, + int nreduce); +__host__ int rocshmem_ctx_short_xor_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source, + int nreduce); + +__device__ ATTR_NO_INLINE int rocshmem_ctx_int_sum_wg_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source, + int nreduce); +__host__ int rocshmem_ctx_int_sum_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source, + int nreduce); + +__device__ ATTR_NO_INLINE int rocshmem_ctx_int_min_wg_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source, + int nreduce); +__host__ int rocshmem_ctx_int_min_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source, + int nreduce); + +__device__ ATTR_NO_INLINE int rocshmem_ctx_int_max_wg_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source, + int nreduce); +__host__ int rocshmem_ctx_int_max_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source, + int nreduce); + +__device__ ATTR_NO_INLINE int rocshmem_ctx_int_prod_wg_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source, + int nreduce); +__host__ int rocshmem_ctx_int_prod_reduce( + rocshmem_ctx_t ctx, 
rocshmem_team_t team, int *dest, const int *source, + int nreduce); + +__device__ ATTR_NO_INLINE int rocshmem_ctx_int_or_wg_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source, + int nreduce); +__host__ int rocshmem_ctx_int_or_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source, + int nreduce); + +__device__ ATTR_NO_INLINE int rocshmem_ctx_int_and_wg_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source, + int nreduce); +__host__ int rocshmem_ctx_int_and_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source, + int nreduce); + +__device__ ATTR_NO_INLINE int rocshmem_ctx_int_xor_wg_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source, + int nreduce); +__host__ int rocshmem_ctx_int_xor_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source, + int nreduce); + +__device__ ATTR_NO_INLINE int rocshmem_ctx_long_sum_wg_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source, + int nreduce); +__host__ int rocshmem_ctx_long_sum_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source, + int nreduce); + +__device__ ATTR_NO_INLINE int rocshmem_ctx_long_min_wg_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source, + int nreduce); +__host__ int rocshmem_ctx_long_min_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source, + int nreduce); + +__device__ ATTR_NO_INLINE int rocshmem_ctx_long_max_wg_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source, + int nreduce); +__host__ int rocshmem_ctx_long_max_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source, + int nreduce); + +__device__ ATTR_NO_INLINE int rocshmem_ctx_long_prod_wg_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source, + int nreduce); +__host__ int 
rocshmem_ctx_long_prod_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source, + int nreduce); + +__device__ ATTR_NO_INLINE int rocshmem_ctx_long_or_wg_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source, + int nreduce); +__host__ int rocshmem_ctx_long_or_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source, + int nreduce); + +__device__ ATTR_NO_INLINE int rocshmem_ctx_long_and_wg_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source, + int nreduce); +__host__ int rocshmem_ctx_long_and_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source, + int nreduce); + +__device__ ATTR_NO_INLINE int rocshmem_ctx_long_xor_wg_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source, + int nreduce); +__host__ int rocshmem_ctx_long_xor_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source, + int nreduce); + +__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_sum_wg_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source, + int nreduce); +__host__ int rocshmem_ctx_longlong_sum_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source, + int nreduce); + +__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_min_wg_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source, + int nreduce); +__host__ int rocshmem_ctx_longlong_min_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source, + int nreduce); + +__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_max_wg_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source, + int nreduce); +__host__ int rocshmem_ctx_longlong_max_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source, + int nreduce); + +__device__ ATTR_NO_INLINE 
int rocshmem_ctx_longlong_prod_wg_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source, + int nreduce); +__host__ int rocshmem_ctx_longlong_prod_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source, + int nreduce); + +__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_or_wg_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source, + int nreduce); +__host__ int rocshmem_ctx_longlong_or_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source, + int nreduce); + +__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_and_wg_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source, + int nreduce); +__host__ int rocshmem_ctx_longlong_and_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source, + int nreduce); + +__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_xor_wg_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source, + int nreduce); +__host__ int rocshmem_ctx_longlong_xor_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source, + int nreduce); + +__device__ ATTR_NO_INLINE int rocshmem_ctx_float_sum_wg_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source, + int nreduce); +__host__ int rocshmem_ctx_float_sum_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source, + int nreduce); + +__device__ ATTR_NO_INLINE int rocshmem_ctx_float_min_wg_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source, + int nreduce); +__host__ int rocshmem_ctx_float_min_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source, + int nreduce); + +__device__ ATTR_NO_INLINE int rocshmem_ctx_float_max_wg_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float 
*source, + int nreduce); +__host__ int rocshmem_ctx_float_max_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source, + int nreduce); + +__device__ ATTR_NO_INLINE int rocshmem_ctx_float_prod_wg_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source, + int nreduce); +__host__ int rocshmem_ctx_float_prod_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source, + int nreduce); + +__device__ ATTR_NO_INLINE int rocshmem_ctx_double_sum_wg_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source, + int nreduce); +__host__ int rocshmem_ctx_double_sum_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source, + int nreduce); + +__device__ ATTR_NO_INLINE int rocshmem_ctx_double_min_wg_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source, + int nreduce); +__host__ int rocshmem_ctx_double_min_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source, + int nreduce); + +__device__ ATTR_NO_INLINE int rocshmem_ctx_double_max_wg_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source, + int nreduce); +__host__ int rocshmem_ctx_double_max_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source, + int nreduce); + +__device__ ATTR_NO_INLINE int rocshmem_ctx_double_prod_wg_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source, + int nreduce); +__host__ int rocshmem_ctx_double_prod_reduce( + rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source, + int nreduce); + + +} // namespace rocshmem + +#endif // LIBRARY_INCLUDE_ROCSHMEM_COLL_HPP diff --git a/projects/rocshmem/include/rocshmem/rocshmem_P2P_SYNC.hpp b/projects/rocshmem/include/rocshmem/rocshmem_P2P_SYNC.hpp new file mode 100644 index 0000000000..8ebe9b0390 --- /dev/null +++ 
b/projects/rocshmem/include/rocshmem/rocshmem_P2P_SYNC.hpp @@ -0,0 +1,662 @@ +/****************************************************************************** + * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + *****************************************************************************/ + +#ifndef LIBRARY_INCLUDE_ROCSHMEM_P2P_SYNC_HPP +#define LIBRARY_INCLUDE_ROCSHMEM_P2P_SYNC_HPP + +namespace rocshmem { + +/** + * @name SHMEM_WAIT_UNTIL + * @brief Block the caller until the condition (* \p ivars \p cmp \p val) is + * true. + * + * This function can be called from divergent control paths at per-thread + * granularity. However, performance may be improved if the caller can + * coalesce contiguous messages and elect a leader thread to call into the + * ROCSHMEM function. + * + * @param[in] ivars Pointer to memory on the symmetric heap to wait for. 
+ * @param[in] cmp Operation for the comparison. + * @param[in] val Value to compare the memory at \p ivars to. + * + * @return void + */ +__device__ void rocshmem_float_wait_until( + float *ivars, int cmp, float val); +__device__ size_t rocshmem_float_wait_until_any( + float *ivars, size_t nelems, const int* status, + int cmp, float val); +__device__ void rocshmem_float_wait_until_all( + float *ivars, size_t nelems, const int* status, + int cmp, float val); +__device__ size_t rocshmem_float_wait_until_some( + float *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, float val); +__device__ size_t rocshmem_float_wait_until_any_vector( + float *ivars, size_t nelems, const int* status, + int cmp, float val); +__device__ void rocshmem_float_wait_until_all_vector( + float *ivars, size_t nelems, const int* status, + int cmp, float val); +__device__ size_t rocshmem_float_wait_until_some_vector( + float *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, float val); +__host__ void rocshmem_float_wait_until( + float *ivars, int cmp, float val); +__host__ size_t rocshmem_float_wait_until_any( + float *ivars, size_t nelems, const int* status, + int cmp, float val); +__host__ void rocshmem_float_wait_until_all( + float *ivars, size_t nelems, const int* status, + int cmp, float val); +__host__ size_t rocshmem_float_wait_until_some( + float *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, float val); +__host__ size_t rocshmem_float_wait_until_any_vector( + float *ivars, size_t nelems, const int* status, + int cmp, float val); +__host__ void rocshmem_float_wait_until_all_vector( + float *ivars, size_t nelems, const int* status, + int cmp, float val); +__host__ size_t rocshmem_float_wait_until_some_vector( + float *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, float val); + +__device__ void rocshmem_double_wait_until( + double *ivars, int cmp, double val); +__device__ size_t 
rocshmem_double_wait_until_any( + double *ivars, size_t nelems, const int* status, + int cmp, double val); +__device__ void rocshmem_double_wait_until_all( + double *ivars, size_t nelems, const int* status, + int cmp, double val); +__device__ size_t rocshmem_double_wait_until_some( + double *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, double val); +__device__ size_t rocshmem_double_wait_until_any_vector( + double *ivars, size_t nelems, const int* status, + int cmp, double val); +__device__ void rocshmem_double_wait_until_all_vector( + double *ivars, size_t nelems, const int* status, + int cmp, double val); +__device__ size_t rocshmem_double_wait_until_some_vector( + double *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, double val); +__host__ void rocshmem_double_wait_until( + double *ivars, int cmp, double val); +__host__ size_t rocshmem_double_wait_until_any( + double *ivars, size_t nelems, const int* status, + int cmp, double val); +__host__ void rocshmem_double_wait_until_all( + double *ivars, size_t nelems, const int* status, + int cmp, double val); +__host__ size_t rocshmem_double_wait_until_some( + double *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, double val); +__host__ size_t rocshmem_double_wait_until_any_vector( + double *ivars, size_t nelems, const int* status, + int cmp, double val); +__host__ void rocshmem_double_wait_until_all_vector( + double *ivars, size_t nelems, const int* status, + int cmp, double val); +__host__ size_t rocshmem_double_wait_until_some_vector( + double *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, double val); + +__device__ void rocshmem_char_wait_until( + char *ivars, int cmp, char val); +__device__ size_t rocshmem_char_wait_until_any( + char *ivars, size_t nelems, const int* status, + int cmp, char val); +__device__ void rocshmem_char_wait_until_all( + char *ivars, size_t nelems, const int* status, + int cmp, char val); +__device__ 
size_t rocshmem_char_wait_until_some( + char *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, char val); +__device__ size_t rocshmem_char_wait_until_any_vector( + char *ivars, size_t nelems, const int* status, + int cmp, char val); +__device__ void rocshmem_char_wait_until_all_vector( + char *ivars, size_t nelems, const int* status, + int cmp, char val); +__device__ size_t rocshmem_char_wait_until_some_vector( + char *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, char val); +__host__ void rocshmem_char_wait_until( + char *ivars, int cmp, char val); +__host__ size_t rocshmem_char_wait_until_any( + char *ivars, size_t nelems, const int* status, + int cmp, char val); +__host__ void rocshmem_char_wait_until_all( + char *ivars, size_t nelems, const int* status, + int cmp, char val); +__host__ size_t rocshmem_char_wait_until_some( + char *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, char val); +__host__ size_t rocshmem_char_wait_until_any_vector( + char *ivars, size_t nelems, const int* status, + int cmp, char val); +__host__ void rocshmem_char_wait_until_all_vector( + char *ivars, size_t nelems, const int* status, + int cmp, char val); +__host__ size_t rocshmem_char_wait_until_some_vector( + char *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, char val); + +__device__ void rocshmem_schar_wait_until( + signed char *ivars, int cmp, signed char val); +__device__ size_t rocshmem_schar_wait_until_any( + signed char *ivars, size_t nelems, const int* status, + int cmp, signed char val); +__device__ void rocshmem_schar_wait_until_all( + signed char *ivars, size_t nelems, const int* status, + int cmp, signed char val); +__device__ size_t rocshmem_schar_wait_until_some( + signed char *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, signed char val); +__device__ size_t rocshmem_schar_wait_until_any_vector( + signed char *ivars, size_t nelems, const int* status, + int cmp, 
signed char val); +__device__ void rocshmem_schar_wait_until_all_vector( + signed char *ivars, size_t nelems, const int* status, + int cmp, signed char val); +__device__ size_t rocshmem_schar_wait_until_some_vector( + signed char *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, signed char val); +__host__ void rocshmem_schar_wait_until( + signed char *ivars, int cmp, signed char val); +__host__ size_t rocshmem_schar_wait_until_any( + signed char *ivars, size_t nelems, const int* status, + int cmp, signed char val); +__host__ void rocshmem_schar_wait_until_all( + signed char *ivars, size_t nelems, const int* status, + int cmp, signed char val); +__host__ size_t rocshmem_schar_wait_until_some( + signed char *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, signed char val); +__host__ size_t rocshmem_schar_wait_until_any_vector( + signed char *ivars, size_t nelems, const int* status, + int cmp, signed char val); +__host__ void rocshmem_schar_wait_until_all_vector( + signed char *ivars, size_t nelems, const int* status, + int cmp, signed char val); +__host__ size_t rocshmem_schar_wait_until_some_vector( + signed char *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, signed char val); + +__device__ void rocshmem_short_wait_until( + short *ivars, int cmp, short val); +__device__ size_t rocshmem_short_wait_until_any( + short *ivars, size_t nelems, const int* status, + int cmp, short val); +__device__ void rocshmem_short_wait_until_all( + short *ivars, size_t nelems, const int* status, + int cmp, short val); +__device__ size_t rocshmem_short_wait_until_some( + short *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, short val); +__device__ size_t rocshmem_short_wait_until_any_vector( + short *ivars, size_t nelems, const int* status, + int cmp, short val); +__device__ void rocshmem_short_wait_until_all_vector( + short *ivars, size_t nelems, const int* status, + int cmp, short val); +__device__ 
size_t rocshmem_short_wait_until_some_vector( + short *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, short val); +__host__ void rocshmem_short_wait_until( + short *ivars, int cmp, short val); +__host__ size_t rocshmem_short_wait_until_any( + short *ivars, size_t nelems, const int* status, + int cmp, short val); +__host__ void rocshmem_short_wait_until_all( + short *ivars, size_t nelems, const int* status, + int cmp, short val); +__host__ size_t rocshmem_short_wait_until_some( + short *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, short val); +__host__ size_t rocshmem_short_wait_until_any_vector( + short *ivars, size_t nelems, const int* status, + int cmp, short val); +__host__ void rocshmem_short_wait_until_all_vector( + short *ivars, size_t nelems, const int* status, + int cmp, short val); +__host__ size_t rocshmem_short_wait_until_some_vector( + short *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, short val); + +__device__ void rocshmem_int_wait_until( + int *ivars, int cmp, int val); +__device__ size_t rocshmem_int_wait_until_any( + int *ivars, size_t nelems, const int* status, + int cmp, int val); +__device__ void rocshmem_int_wait_until_all( + int *ivars, size_t nelems, const int* status, + int cmp, int val); +__device__ size_t rocshmem_int_wait_until_some( + int *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, int val); +__device__ size_t rocshmem_int_wait_until_any_vector( + int *ivars, size_t nelems, const int* status, + int cmp, int val); +__device__ void rocshmem_int_wait_until_all_vector( + int *ivars, size_t nelems, const int* status, + int cmp, int val); +__device__ size_t rocshmem_int_wait_until_some_vector( + int *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, int val); +__host__ void rocshmem_int_wait_until( + int *ivars, int cmp, int val); +__host__ size_t rocshmem_int_wait_until_any( + int *ivars, size_t nelems, const int* status, + int 
cmp, int val); +__host__ void rocshmem_int_wait_until_all( + int *ivars, size_t nelems, const int* status, + int cmp, int val); +__host__ size_t rocshmem_int_wait_until_some( + int *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, int val); +__host__ size_t rocshmem_int_wait_until_any_vector( + int *ivars, size_t nelems, const int* status, + int cmp, int val); +__host__ void rocshmem_int_wait_until_all_vector( + int *ivars, size_t nelems, const int* status, + int cmp, int val); +__host__ size_t rocshmem_int_wait_until_some_vector( + int *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, int val); + +__device__ void rocshmem_long_wait_until( + long *ivars, int cmp, long val); +__device__ size_t rocshmem_long_wait_until_any( + long *ivars, size_t nelems, const int* status, + int cmp, long val); +__device__ void rocshmem_long_wait_until_all( + long *ivars, size_t nelems, const int* status, + int cmp, long val); +__device__ size_t rocshmem_long_wait_until_some( + long *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, long val); +__device__ size_t rocshmem_long_wait_until_any_vector( + long *ivars, size_t nelems, const int* status, + int cmp, long val); +__device__ void rocshmem_long_wait_until_all_vector( + long *ivars, size_t nelems, const int* status, + int cmp, long val); +__device__ size_t rocshmem_long_wait_until_some_vector( + long *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, long val); +__host__ void rocshmem_long_wait_until( + long *ivars, int cmp, long val); +__host__ size_t rocshmem_long_wait_until_any( + long *ivars, size_t nelems, const int* status, + int cmp, long val); +__host__ void rocshmem_long_wait_until_all( + long *ivars, size_t nelems, const int* status, + int cmp, long val); +__host__ size_t rocshmem_long_wait_until_some( + long *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, long val); +__host__ size_t rocshmem_long_wait_until_any_vector( + 
long *ivars, size_t nelems, const int* status, + int cmp, long val); +__host__ void rocshmem_long_wait_until_all_vector( + long *ivars, size_t nelems, const int* status, + int cmp, long val); +__host__ size_t rocshmem_long_wait_until_some_vector( + long *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, long val); + +__device__ void rocshmem_longlong_wait_until( + long long *ivars, int cmp, long long val); +__device__ size_t rocshmem_longlong_wait_until_any( + long long *ivars, size_t nelems, const int* status, + int cmp, long long val); +__device__ void rocshmem_longlong_wait_until_all( + long long *ivars, size_t nelems, const int* status, + int cmp, long long val); +__device__ size_t rocshmem_longlong_wait_until_some( + long long *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, long long val); +__device__ size_t rocshmem_longlong_wait_until_any_vector( + long long *ivars, size_t nelems, const int* status, + int cmp, long long val); +__device__ void rocshmem_longlong_wait_until_all_vector( + long long *ivars, size_t nelems, const int* status, + int cmp, long long val); +__device__ size_t rocshmem_longlong_wait_until_some_vector( + long long *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, long long val); +__host__ void rocshmem_longlong_wait_until( + long long *ivars, int cmp, long long val); +__host__ size_t rocshmem_longlong_wait_until_any( + long long *ivars, size_t nelems, const int* status, + int cmp, long long val); +__host__ void rocshmem_longlong_wait_until_all( + long long *ivars, size_t nelems, const int* status, + int cmp, long long val); +__host__ size_t rocshmem_longlong_wait_until_some( + long long *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, long long val); +__host__ size_t rocshmem_longlong_wait_until_any_vector( + long long *ivars, size_t nelems, const int* status, + int cmp, long long val); +__host__ void rocshmem_longlong_wait_until_all_vector( + long long 
*ivars, size_t nelems, const int* status, + int cmp, long long val); +__host__ size_t rocshmem_longlong_wait_until_some_vector( + long long *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, long long val); + +__device__ void rocshmem_uchar_wait_until( + unsigned char *ivars, int cmp, unsigned char val); +__device__ size_t rocshmem_uchar_wait_until_any( + unsigned char *ivars, size_t nelems, const int* status, + int cmp, unsigned char val); +__device__ void rocshmem_uchar_wait_until_all( + unsigned char *ivars, size_t nelems, const int* status, + int cmp, unsigned char val); +__device__ size_t rocshmem_uchar_wait_until_some( + unsigned char *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, unsigned char val); +__device__ size_t rocshmem_uchar_wait_until_any_vector( + unsigned char *ivars, size_t nelems, const int* status, + int cmp, unsigned char val); +__device__ void rocshmem_uchar_wait_until_all_vector( + unsigned char *ivars, size_t nelems, const int* status, + int cmp, unsigned char val); +__device__ size_t rocshmem_uchar_wait_until_some_vector( + unsigned char *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, unsigned char val); +__host__ void rocshmem_uchar_wait_until( + unsigned char *ivars, int cmp, unsigned char val); +__host__ size_t rocshmem_uchar_wait_until_any( + unsigned char *ivars, size_t nelems, const int* status, + int cmp, unsigned char val); +__host__ void rocshmem_uchar_wait_until_all( + unsigned char *ivars, size_t nelems, const int* status, + int cmp, unsigned char val); +__host__ size_t rocshmem_uchar_wait_until_some( + unsigned char *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, unsigned char val); +__host__ size_t rocshmem_uchar_wait_until_any_vector( + unsigned char *ivars, size_t nelems, const int* status, + int cmp, unsigned char val); +__host__ void rocshmem_uchar_wait_until_all_vector( + unsigned char *ivars, size_t nelems, const int* status, + int cmp, 
unsigned char val); +__host__ size_t rocshmem_uchar_wait_until_some_vector( + unsigned char *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, unsigned char val); + +__device__ void rocshmem_ushort_wait_until( + unsigned short *ivars, int cmp, unsigned short val); +__device__ size_t rocshmem_ushort_wait_until_any( + unsigned short *ivars, size_t nelems, const int* status, + int cmp, unsigned short val); +__device__ void rocshmem_ushort_wait_until_all( + unsigned short *ivars, size_t nelems, const int* status, + int cmp, unsigned short val); +__device__ size_t rocshmem_ushort_wait_until_some( + unsigned short *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, unsigned short val); +__device__ size_t rocshmem_ushort_wait_until_any_vector( + unsigned short *ivars, size_t nelems, const int* status, + int cmp, unsigned short val); +__device__ void rocshmem_ushort_wait_until_all_vector( + unsigned short *ivars, size_t nelems, const int* status, + int cmp, unsigned short val); +__device__ size_t rocshmem_ushort_wait_until_some_vector( + unsigned short *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, unsigned short val); +__host__ void rocshmem_ushort_wait_until( + unsigned short *ivars, int cmp, unsigned short val); +__host__ size_t rocshmem_ushort_wait_until_any( + unsigned short *ivars, size_t nelems, const int* status, + int cmp, unsigned short val); +__host__ void rocshmem_ushort_wait_until_all( + unsigned short *ivars, size_t nelems, const int* status, + int cmp, unsigned short val); +__host__ size_t rocshmem_ushort_wait_until_some( + unsigned short *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, unsigned short val); +__host__ size_t rocshmem_ushort_wait_until_any_vector( + unsigned short *ivars, size_t nelems, const int* status, + int cmp, unsigned short val); +__host__ void rocshmem_ushort_wait_until_all_vector( + unsigned short *ivars, size_t nelems, const int* status, + int cmp, 
unsigned short val); +__host__ size_t rocshmem_ushort_wait_until_some_vector( + unsigned short *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, unsigned short val); + +__device__ void rocshmem_uint_wait_until( + unsigned int *ivars, int cmp, unsigned int val); +__device__ size_t rocshmem_uint_wait_until_any( + unsigned int *ivars, size_t nelems, const int* status, + int cmp, unsigned int val); +__device__ void rocshmem_uint_wait_until_all( + unsigned int *ivars, size_t nelems, const int* status, + int cmp, unsigned int val); +__device__ size_t rocshmem_uint_wait_until_some( + unsigned int *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, unsigned int val); +__device__ size_t rocshmem_uint_wait_until_any_vector( + unsigned int *ivars, size_t nelems, const int* status, + int cmp, unsigned int val); +__device__ void rocshmem_uint_wait_until_all_vector( + unsigned int *ivars, size_t nelems, const int* status, + int cmp, unsigned int val); +__device__ size_t rocshmem_uint_wait_until_some_vector( + unsigned int *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, unsigned int val); +__host__ void rocshmem_uint_wait_until( + unsigned int *ivars, int cmp, unsigned int val); +__host__ size_t rocshmem_uint_wait_until_any( + unsigned int *ivars, size_t nelems, const int* status, + int cmp, unsigned int val); +__host__ void rocshmem_uint_wait_until_all( + unsigned int *ivars, size_t nelems, const int* status, + int cmp, unsigned int val); +__host__ size_t rocshmem_uint_wait_until_some( + unsigned int *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, unsigned int val); +__host__ size_t rocshmem_uint_wait_until_any_vector( + unsigned int *ivars, size_t nelems, const int* status, + int cmp, unsigned int val); +__host__ void rocshmem_uint_wait_until_all_vector( + unsigned int *ivars, size_t nelems, const int* status, + int cmp, unsigned int val); +__host__ size_t rocshmem_uint_wait_until_some_vector( + 
unsigned int *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, unsigned int val); + +__device__ void rocshmem_ulong_wait_until( + unsigned long *ivars, int cmp, unsigned long val); +__device__ size_t rocshmem_ulong_wait_until_any( + unsigned long *ivars, size_t nelems, const int* status, + int cmp, unsigned long val); +__device__ void rocshmem_ulong_wait_until_all( + unsigned long *ivars, size_t nelems, const int* status, + int cmp, unsigned long val); +__device__ size_t rocshmem_ulong_wait_until_some( + unsigned long *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, unsigned long val); +__device__ size_t rocshmem_ulong_wait_until_any_vector( + unsigned long *ivars, size_t nelems, const int* status, + int cmp, unsigned long val); +__device__ void rocshmem_ulong_wait_until_all_vector( + unsigned long *ivars, size_t nelems, const int* status, + int cmp, unsigned long val); +__device__ size_t rocshmem_ulong_wait_until_some_vector( + unsigned long *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, unsigned long val); +__host__ void rocshmem_ulong_wait_until( + unsigned long *ivars, int cmp, unsigned long val); +__host__ size_t rocshmem_ulong_wait_until_any( + unsigned long *ivars, size_t nelems, const int* status, + int cmp, unsigned long val); +__host__ void rocshmem_ulong_wait_until_all( + unsigned long *ivars, size_t nelems, const int* status, + int cmp, unsigned long val); +__host__ size_t rocshmem_ulong_wait_until_some( + unsigned long *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, unsigned long val); +__host__ size_t rocshmem_ulong_wait_until_any_vector( + unsigned long *ivars, size_t nelems, const int* status, + int cmp, unsigned long val); +__host__ void rocshmem_ulong_wait_until_all_vector( + unsigned long *ivars, size_t nelems, const int* status, + int cmp, unsigned long val); +__host__ size_t rocshmem_ulong_wait_until_some_vector( + unsigned long *ivars, size_t nelems, size_t* 
indices, const int* status, + int cmp, unsigned long val); + +__device__ void rocshmem_ulonglong_wait_until( + unsigned long long *ivars, int cmp, unsigned long long val); +__device__ size_t rocshmem_ulonglong_wait_until_any( + unsigned long long *ivars, size_t nelems, const int* status, + int cmp, unsigned long long val); +__device__ void rocshmem_ulonglong_wait_until_all( + unsigned long long *ivars, size_t nelems, const int* status, + int cmp, unsigned long long val); +__device__ size_t rocshmem_ulonglong_wait_until_some( + unsigned long long *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, unsigned long long val); +__device__ size_t rocshmem_ulonglong_wait_until_any_vector( + unsigned long long *ivars, size_t nelems, const int* status, + int cmp, unsigned long long val); +__device__ void rocshmem_ulonglong_wait_until_all_vector( + unsigned long long *ivars, size_t nelems, const int* status, + int cmp, unsigned long long val); +__device__ size_t rocshmem_ulonglong_wait_until_some_vector( + unsigned long long *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, unsigned long long val); +__host__ void rocshmem_ulonglong_wait_until( + unsigned long long *ivars, int cmp, unsigned long long val); +__host__ size_t rocshmem_ulonglong_wait_until_any( + unsigned long long *ivars, size_t nelems, const int* status, + int cmp, unsigned long long val); +__host__ void rocshmem_ulonglong_wait_until_all( + unsigned long long *ivars, size_t nelems, const int* status, + int cmp, unsigned long long val); +__host__ size_t rocshmem_ulonglong_wait_until_some( + unsigned long long *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, unsigned long long val); +__host__ size_t rocshmem_ulonglong_wait_until_any_vector( + unsigned long long *ivars, size_t nelems, const int* status, + int cmp, unsigned long long val); +__host__ void rocshmem_ulonglong_wait_until_all_vector( + unsigned long long *ivars, size_t nelems, const int* status, 
+ int cmp, unsigned long long val); +__host__ size_t rocshmem_ulonglong_wait_until_some_vector( + unsigned long long *ivars, size_t nelems, size_t* indices, const int* status, + int cmp, unsigned long long val); + + +/** + * @name SHMEM_TEST + * @brief Tests whether the condition (* \p ivars \p cmp \p val) is + * true. + * + * This function can be called from divergent control paths at per-thread + * granularity. However, performance may be improved if the caller can + * coalesce contiguous messages and elect a leader thread to call into the + * ROCSHMEM function. + * + * @param[in] ivars Pointer to memory on the symmetric heap to test. + * @param[in] cmp Operation for the comparison. + * @param[in] val Value to compare the memory at \p ivars to. + * + * @return 1 if the evaluation is true else 0 + */ +__device__ int rocshmem_float_test( + float *ivars, int cmp, float val); +__host__ int rocshmem_float_test( + float *ivars, int cmp, float val); + +__device__ int rocshmem_double_test( + double *ivars, int cmp, double val); +__host__ int rocshmem_double_test( + double *ivars, int cmp, double val); + +__device__ int rocshmem_char_test( + char *ivars, int cmp, char val); +__host__ int rocshmem_char_test( + char *ivars, int cmp, char val); + +__device__ int rocshmem_schar_test( + signed char *ivars, int cmp, signed char val); +__host__ int rocshmem_schar_test( + signed char *ivars, int cmp, signed char val); + +__device__ int rocshmem_short_test( + short *ivars, int cmp, short val); +__host__ int rocshmem_short_test( + short *ivars, int cmp, short val); + +__device__ int rocshmem_int_test( + int *ivars, int cmp, int val); +__host__ int rocshmem_int_test( + int *ivars, int cmp, int val); + +__device__ int rocshmem_long_test( + long *ivars, int cmp, long val); +__host__ int rocshmem_long_test( + long *ivars, int cmp, long val); + +__device__ int rocshmem_longlong_test( + long long *ivars, int cmp, long long val); +__host__ int rocshmem_longlong_test( + long long *ivars, int
cmp, long long val); + +__device__ int rocshmem_uchar_test( + unsigned char *ivars, int cmp, unsigned char val); +__host__ int rocshmem_uchar_test( + unsigned char *ivars, int cmp, unsigned char val); + +__device__ int rocshmem_ushort_test( + unsigned short *ivars, int cmp, unsigned short val); +__host__ int rocshmem_ushort_test( + unsigned short *ivars, int cmp, unsigned short val); + +__device__ int rocshmem_uint_test( + unsigned int *ivars, int cmp, unsigned int val); +__host__ int rocshmem_uint_test( + unsigned int *ivars, int cmp, unsigned int val); + +__device__ int rocshmem_ulong_test( + unsigned long *ivars, int cmp, unsigned long val); +__host__ int rocshmem_ulong_test( + unsigned long *ivars, int cmp, unsigned long val); + +__device__ int rocshmem_ulonglong_test( + unsigned long long *ivars, int cmp, unsigned long long val); +__host__ int rocshmem_ulonglong_test( + unsigned long long *ivars, int cmp, unsigned long long val); + + +} // namespace rocshmem + +#endif // LIBRARY_INCLUDE_ROCSHMEM_P2P_SYNC_HPP diff --git a/projects/rocshmem/include/rocshmem/rocshmem_RMA.hpp b/projects/rocshmem/include/rocshmem/rocshmem_RMA.hpp new file mode 100644 index 0000000000..ccd71f061b --- /dev/null +++ b/projects/rocshmem/include/rocshmem/rocshmem_RMA.hpp @@ -0,0 +1,1208 @@ +/****************************************************************************** + * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + *****************************************************************************/ + +#ifndef LIBRARY_INCLUDE_ROCSHMEM_RMA_HPP +#define LIBRARY_INCLUDE_ROCSHMEM_RMA_HPP + +namespace rocshmem { + +/** + * @name SHMEM_PUT + * @brief Writes contiguous data of \p nelems elements from \p source on the + * calling PE to \p dest at \p pe. The caller will block until the operation + * completes locally (it is safe to reuse \p source). The caller must + * call into rocshmem_quiet() if remote completion is required. + * + * This function can be called from divergent control paths at per-thread + * granularity. However, performance may be improved if the caller can + * coalesce contiguous messages and elect a leader thread to call into the + * ROCSHMEM function. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. 
Must be an address on the symmetric + * heap. + * @param[in] source Source address. Must be an address on the symmetric heap. + * @param[in] nelems Size of the transfer in number of elements. + * @param[in] pe PE of the remote process. + * + * @return void. + */ +__device__ ATTR_NO_INLINE void rocshmem_ctx_float_put( + rocshmem_ctx_t ctx, float *dest, const float *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_float_put( + float *dest, const float *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_float_put( + rocshmem_ctx_t ctx, float *dest, const float *source, + size_t nelems, int pe); +__host__ void rocshmem_float_put(float *dest, + const float *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_double_put( + rocshmem_ctx_t ctx, double *dest, const double *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_double_put( + double *dest, const double *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_double_put( + rocshmem_ctx_t ctx, double *dest, const double *source, + size_t nelems, int pe); +__host__ void rocshmem_double_put(double *dest, + const double *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_char_put( + rocshmem_ctx_t ctx, char *dest, const char *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_char_put( + char *dest, const char *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_char_put( + rocshmem_ctx_t ctx, char *dest, const char *source, + size_t nelems, int pe); +__host__ void rocshmem_char_put(char *dest, + const char *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_put( + rocshmem_ctx_t ctx, signed char *dest, const signed char *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_schar_put( + signed char *dest, const signed char *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_schar_put( + rocshmem_ctx_t ctx, 
signed char *dest, const signed char *source, + size_t nelems, int pe); +__host__ void rocshmem_schar_put(signed char *dest, + const signed char *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_short_put( + rocshmem_ctx_t ctx, short *dest, const short *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_short_put( + short *dest, const short *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_short_put( + rocshmem_ctx_t ctx, short *dest, const short *source, + size_t nelems, int pe); +__host__ void rocshmem_short_put(short *dest, + const short *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_int_put( + rocshmem_ctx_t ctx, int *dest, const int *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_int_put( + int *dest, const int *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_int_put( + rocshmem_ctx_t ctx, int *dest, const int *source, + size_t nelems, int pe); +__host__ void rocshmem_int_put(int *dest, + const int *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_long_put( + rocshmem_ctx_t ctx, long *dest, const long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_long_put( + long *dest, const long *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_long_put( + rocshmem_ctx_t ctx, long *dest, const long *source, + size_t nelems, int pe); +__host__ void rocshmem_long_put(long *dest, + const long *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_put( + rocshmem_ctx_t ctx, long long *dest, const long long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_longlong_put( + long long *dest, const long long *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_longlong_put( + rocshmem_ctx_t ctx, long long *dest, const long long *source, + size_t nelems, int pe); +__host__ void rocshmem_longlong_put(long long 
*dest, + const long long *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_put( + rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uchar_put( + unsigned char *dest, const unsigned char *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_uchar_put( + rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source, + size_t nelems, int pe); +__host__ void rocshmem_uchar_put(unsigned char *dest, + const unsigned char *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_put( + rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ushort_put( + unsigned short *dest, const unsigned short *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_ushort_put( + rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source, + size_t nelems, int pe); +__host__ void rocshmem_ushort_put(unsigned short *dest, + const unsigned short *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_put( + rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uint_put( + unsigned int *dest, const unsigned int *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_uint_put( + rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source, + size_t nelems, int pe); +__host__ void rocshmem_uint_put(unsigned int *dest, + const unsigned int *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_put( + rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulong_put( + unsigned long *dest, const unsigned long *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_ulong_put( + rocshmem_ctx_t ctx, 
unsigned long *dest, const unsigned long *source, + size_t nelems, int pe); +__host__ void rocshmem_ulong_put(unsigned long *dest, + const unsigned long *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_put( + rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulonglong_put( + unsigned long long *dest, const unsigned long long *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_ulonglong_put( + rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source, + size_t nelems, int pe); +__host__ void rocshmem_ulonglong_put(unsigned long long *dest, + const unsigned long long *source, size_t nelems, int pe); + + +/** + * @brief Writes contiguous data of \p nelems bytes from \p source on the + * calling PE to \p dest at \p pe. The caller will block until the operation + * completes locally (it is safe to reuse \p source). The caller must + * call into rocshmem_quiet() if remote completion is required. + * + * This function can be called from divergent control paths at per-thread + * granularity. However, performance may be improved if the caller can + * coalesce contiguous messages and elect a leader thread to call into the + * ROCSHMEM function. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + * heap. + * @param[in] source Source address. Must be an address on the symmetric heap. + * @param[in] nelems Size of the transfer in bytes. + * @param[in] pe PE of the remote process. + * + * @return void. 
+ */ +__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem(rocshmem_ctx_t ctx, + void *dest, + const void *source, + size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_putmem(void *dest, const void *source, + size_t nelems, int pe); + + +/** + * @brief Writes contiguous data of \p nelems bytes from \p source on the + * calling PE to \p dest at \p pe. The caller will block until the operation + * completes locally (it is safe to reuse \p source). The caller must + * call into __host__ rocshmem_quiet() if remote completion is required. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + * heap. + * @param[in] source Source address. Must be an address on the symmetric heap. + * @param[in] nelems Size of the transfer in bytes. + * @param[in] pe PE of the remote process. + * + * @return void. + */ +__host__ void rocshmem_ctx_putmem(rocshmem_ctx_t ctx, void *dest, + const void *source, size_t nelems, int pe); + +__host__ void rocshmem_putmem(void *dest, const void *source, size_t nelems, + int pe); + + +/** + * @name SHMEM_P + * @brief Writes a single value to \p dest at \p pe. + * The caller must call into rocshmem_quiet() if remote completion is + * required. + * + * This function can be called from divergent control paths at per-thread + * granularity. However, performance may be improved if the caller can + * coalesce contiguous messages and elect a leader thread to call into the + * ROCSHMEM function. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + * heap. + * @param[in] value Value to write to dest at \p pe. + * @param[in] pe PE of the remote process. + * + * @return void. 
+ */ +__device__ ATTR_NO_INLINE void rocshmem_ctx_float_p( + rocshmem_ctx_t ctx, float *dest, float value, + int pe); +__device__ ATTR_NO_INLINE void rocshmem_float_p( + float *dest, float value, int pe); +__host__ void rocshmem_ctx_float_p( + rocshmem_ctx_t ctx, float *dest, float value, + int pe); +__host__ void rocshmem_float_p( + float *dest, float value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_double_p( + rocshmem_ctx_t ctx, double *dest, double value, + int pe); +__device__ ATTR_NO_INLINE void rocshmem_double_p( + double *dest, double value, int pe); +__host__ void rocshmem_ctx_double_p( + rocshmem_ctx_t ctx, double *dest, double value, + int pe); +__host__ void rocshmem_double_p( + double *dest, double value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_char_p( + rocshmem_ctx_t ctx, char *dest, char value, + int pe); +__device__ ATTR_NO_INLINE void rocshmem_char_p( + char *dest, char value, int pe); +__host__ void rocshmem_ctx_char_p( + rocshmem_ctx_t ctx, char *dest, char value, + int pe); +__host__ void rocshmem_char_p( + char *dest, char value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_p( + rocshmem_ctx_t ctx, signed char *dest, signed char value, + int pe); +__device__ ATTR_NO_INLINE void rocshmem_schar_p( + signed char *dest, signed char value, int pe); +__host__ void rocshmem_ctx_schar_p( + rocshmem_ctx_t ctx, signed char *dest, signed char value, + int pe); +__host__ void rocshmem_schar_p( + signed char *dest, signed char value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_short_p( + rocshmem_ctx_t ctx, short *dest, short value, + int pe); +__device__ ATTR_NO_INLINE void rocshmem_short_p( + short *dest, short value, int pe); +__host__ void rocshmem_ctx_short_p( + rocshmem_ctx_t ctx, short *dest, short value, + int pe); +__host__ void rocshmem_short_p( + short *dest, short value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_int_p( + rocshmem_ctx_t ctx, int *dest, int value, + int pe); 
+__device__ ATTR_NO_INLINE void rocshmem_int_p( + int *dest, int value, int pe); +__host__ void rocshmem_ctx_int_p( + rocshmem_ctx_t ctx, int *dest, int value, + int pe); +__host__ void rocshmem_int_p( + int *dest, int value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_long_p( + rocshmem_ctx_t ctx, long *dest, long value, + int pe); +__device__ ATTR_NO_INLINE void rocshmem_long_p( + long *dest, long value, int pe); +__host__ void rocshmem_ctx_long_p( + rocshmem_ctx_t ctx, long *dest, long value, + int pe); +__host__ void rocshmem_long_p( + long *dest, long value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_p( + rocshmem_ctx_t ctx, long long *dest, long long value, + int pe); +__device__ ATTR_NO_INLINE void rocshmem_longlong_p( + long long *dest, long long value, int pe); +__host__ void rocshmem_ctx_longlong_p( + rocshmem_ctx_t ctx, long long *dest, long long value, + int pe); +__host__ void rocshmem_longlong_p( + long long *dest, long long value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_p( + rocshmem_ctx_t ctx, unsigned char *dest, unsigned char value, + int pe); +__device__ ATTR_NO_INLINE void rocshmem_uchar_p( + unsigned char *dest, unsigned char value, int pe); +__host__ void rocshmem_ctx_uchar_p( + rocshmem_ctx_t ctx, unsigned char *dest, unsigned char value, + int pe); +__host__ void rocshmem_uchar_p( + unsigned char *dest, unsigned char value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_p( + rocshmem_ctx_t ctx, unsigned short *dest, unsigned short value, + int pe); +__device__ ATTR_NO_INLINE void rocshmem_ushort_p( + unsigned short *dest, unsigned short value, int pe); +__host__ void rocshmem_ctx_ushort_p( + rocshmem_ctx_t ctx, unsigned short *dest, unsigned short value, + int pe); +__host__ void rocshmem_ushort_p( + unsigned short *dest, unsigned short value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_p( + rocshmem_ctx_t ctx, unsigned int *dest, unsigned int value, + 
int pe); +__device__ ATTR_NO_INLINE void rocshmem_uint_p( + unsigned int *dest, unsigned int value, int pe); +__host__ void rocshmem_ctx_uint_p( + rocshmem_ctx_t ctx, unsigned int *dest, unsigned int value, + int pe); +__host__ void rocshmem_uint_p( + unsigned int *dest, unsigned int value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_p( + rocshmem_ctx_t ctx, unsigned long *dest, unsigned long value, + int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulong_p( + unsigned long *dest, unsigned long value, int pe); +__host__ void rocshmem_ctx_ulong_p( + rocshmem_ctx_t ctx, unsigned long *dest, unsigned long value, + int pe); +__host__ void rocshmem_ulong_p( + unsigned long *dest, unsigned long value, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_p( + rocshmem_ctx_t ctx, unsigned long long *dest, unsigned long long value, + int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulonglong_p( + unsigned long long *dest, unsigned long long value, int pe); +__host__ void rocshmem_ctx_ulonglong_p( + rocshmem_ctx_t ctx, unsigned long long *dest, unsigned long long value, + int pe); +__host__ void rocshmem_ulonglong_p( + unsigned long long *dest, unsigned long long value, int pe); + + +/** + * @name SHMEM_GET + * @brief Reads contiguous data of \p nelems elements from \p source on \p pe + * to \p dest on the calling PE. The calling work-group will block until the + * operation completes (data has been placed in \p dest). + * + * This function can be called from divergent control paths at per-thread + * granularity. However, performance may be improved if the caller can + * coalesce contiguous messages and elect a leader thread to call into the + * ROCSHMEM function. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + * heap. + * @param[in] source Source address. Must be an address on the symmetric heap. 
+ * @param[in] nelems Size of the transfer in number of elements. + * @param[in] pe PE of the remote process. + * + * @return void. + */ +__device__ ATTR_NO_INLINE void rocshmem_ctx_float_get( + rocshmem_ctx_t ctx, float *dest, const float *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_float_get( + float *dest, const float *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_float_get( + rocshmem_ctx_t ctx, float *dest, const float *source, + size_t nelems, int pe); +__host__ void rocshmem_float_get(float *dest, + const float *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_double_get( + rocshmem_ctx_t ctx, double *dest, const double *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_double_get( + double *dest, const double *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_double_get( + rocshmem_ctx_t ctx, double *dest, const double *source, + size_t nelems, int pe); +__host__ void rocshmem_double_get(double *dest, + const double *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_char_get( + rocshmem_ctx_t ctx, char *dest, const char *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_char_get( + char *dest, const char *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_char_get( + rocshmem_ctx_t ctx, char *dest, const char *source, + size_t nelems, int pe); +__host__ void rocshmem_char_get(char *dest, + const char *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_get( + rocshmem_ctx_t ctx, signed char *dest, const signed char *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_schar_get( + signed char *dest, const signed char *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_schar_get( + rocshmem_ctx_t ctx, signed char *dest, const signed char *source, + size_t nelems, int pe); +__host__ void rocshmem_schar_get(signed char *dest, + const signed 
char *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_short_get( + rocshmem_ctx_t ctx, short *dest, const short *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_short_get( + short *dest, const short *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_short_get( + rocshmem_ctx_t ctx, short *dest, const short *source, + size_t nelems, int pe); +__host__ void rocshmem_short_get(short *dest, + const short *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_int_get( + rocshmem_ctx_t ctx, int *dest, const int *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_int_get( + int *dest, const int *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_int_get( + rocshmem_ctx_t ctx, int *dest, const int *source, + size_t nelems, int pe); +__host__ void rocshmem_int_get(int *dest, + const int *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_long_get( + rocshmem_ctx_t ctx, long *dest, const long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_long_get( + long *dest, const long *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_long_get( + rocshmem_ctx_t ctx, long *dest, const long *source, + size_t nelems, int pe); +__host__ void rocshmem_long_get(long *dest, + const long *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_get( + rocshmem_ctx_t ctx, long long *dest, const long long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_longlong_get( + long long *dest, const long long *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_longlong_get( + rocshmem_ctx_t ctx, long long *dest, const long long *source, + size_t nelems, int pe); +__host__ void rocshmem_longlong_get(long long *dest, + const long long *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_get( + rocshmem_ctx_t ctx, 
unsigned char *dest, const unsigned char *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uchar_get( + unsigned char *dest, const unsigned char *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_uchar_get( + rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source, + size_t nelems, int pe); +__host__ void rocshmem_uchar_get(unsigned char *dest, + const unsigned char *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_get( + rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ushort_get( + unsigned short *dest, const unsigned short *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_ushort_get( + rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source, + size_t nelems, int pe); +__host__ void rocshmem_ushort_get(unsigned short *dest, + const unsigned short *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_get( + rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uint_get( + unsigned int *dest, const unsigned int *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_uint_get( + rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source, + size_t nelems, int pe); +__host__ void rocshmem_uint_get(unsigned int *dest, + const unsigned int *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_get( + rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulong_get( + unsigned long *dest, const unsigned long *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_ulong_get( + rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source, + size_t nelems, int pe); +__host__ void rocshmem_ulong_get(unsigned long *dest, + const 
unsigned long *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_get( + rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulonglong_get( + unsigned long long *dest, const unsigned long long *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_ulonglong_get( + rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source, + size_t nelems, int pe); +__host__ void rocshmem_ulonglong_get(unsigned long long *dest, + const unsigned long long *source, size_t nelems, int pe); + + +/** + * @brief Reads contiguous data of \p nelems bytes from \p source on \p pe + * to \p dest on the calling PE. The calling work-group will block until the + * operation completes (data has been placed in \p dest). + * + * This function can be called from divergent control paths at per-thread + * granularity. However, performance may be improved if the caller can + * coalesce contiguous messages and elect a leader thread to call into the + * ROCSHMEM function. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + * heap. + * @param[in] source Source address. Must be an address on the symmetric heap. + * @param[in] nelems Size of the transfer in bytes. + * @param[in] pe PE of the remote process. + * + * @return void. + */ +__device__ ATTR_NO_INLINE void rocshmem_ctx_getmem(rocshmem_ctx_t ctx, + void *dest, + const void *source, + size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_getmem(void *dest, const void *source, + size_t nelems, int pe); + + +/** + * @brief Reads contiguous data of \p nelems bytes from \p source on \p pe + * to \p dest on the calling PE. The calling work-group will block until the + * operation completes (data has been placed in \p dest). 
+ * + * This function can be called from divergent control paths at per-thread + * granularity. However, performance may be improved if the caller can + * coalesce contiguous messages and elect a leader thread to call into the + * ROCSHMEM function. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + * heap. + * @param[in] source Source address. Must be an address on the symmetric heap. + * @param[in] nelems Size of the transfer in bytes. + * @param[in] pe PE of the remote process. + * + * @return void. + */ +__host__ void rocshmem_ctx_getmem(rocshmem_ctx_t ctx, void *dest, + const void *source, size_t nelems, int pe); + +__host__ void rocshmem_getmem(void *dest, const void *source, size_t nelems, + int pe); + + +/** + * @name SHMEM_G + * @brief reads and returns single value from \p source at \p pe. + * The calling work-group/thread will block until the operation completes. + * + * This function can be called from divergent control paths at per-thread + * granularity. However, performance may be improved if the caller can + * coalesce contiguous messages and elect a leader thread to call into the + * ROCSHMEM function. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] source Source address. Must be an address on the symmetric + * heap. + * @param[in] pe PE of the remote process. + * + * @return the value read from remote \p source at \p pe. 
+ */ +__device__ ATTR_NO_INLINE float rocshmem_ctx_float_g( + rocshmem_ctx_t ctx, const float *source, int pe); +__device__ ATTR_NO_INLINE float rocshmem_float_g( + const float *source, int pe); +__host__ float rocshmem_ctx_float_g( + rocshmem_ctx_t ctx, const float *source, int pe); +__host__ float rocshmem_float_g( + const float *source, int pe); + +__device__ ATTR_NO_INLINE double rocshmem_ctx_double_g( + rocshmem_ctx_t ctx, const double *source, int pe); +__device__ ATTR_NO_INLINE double rocshmem_double_g( + const double *source, int pe); +__host__ double rocshmem_ctx_double_g( + rocshmem_ctx_t ctx, const double *source, int pe); +__host__ double rocshmem_double_g( + const double *source, int pe); + +__device__ ATTR_NO_INLINE char rocshmem_ctx_char_g( + rocshmem_ctx_t ctx, const char *source, int pe); +__device__ ATTR_NO_INLINE char rocshmem_char_g( + const char *source, int pe); +__host__ char rocshmem_ctx_char_g( + rocshmem_ctx_t ctx, const char *source, int pe); +__host__ char rocshmem_char_g( + const char *source, int pe); + +__device__ ATTR_NO_INLINE signed char rocshmem_ctx_schar_g( + rocshmem_ctx_t ctx, const signed char *source, int pe); +__device__ ATTR_NO_INLINE signed char rocshmem_schar_g( + const signed char *source, int pe); +__host__ signed char rocshmem_ctx_schar_g( + rocshmem_ctx_t ctx, const signed char *source, int pe); +__host__ signed char rocshmem_schar_g( + const signed char *source, int pe); + +__device__ ATTR_NO_INLINE short rocshmem_ctx_short_g( + rocshmem_ctx_t ctx, const short *source, int pe); +__device__ ATTR_NO_INLINE short rocshmem_short_g( + const short *source, int pe); +__host__ short rocshmem_ctx_short_g( + rocshmem_ctx_t ctx, const short *source, int pe); +__host__ short rocshmem_short_g( + const short *source, int pe); + +__device__ ATTR_NO_INLINE int rocshmem_ctx_int_g( + rocshmem_ctx_t ctx, const int *source, int pe); +__device__ ATTR_NO_INLINE int rocshmem_int_g( + const int *source, int pe); +__host__ int 
rocshmem_ctx_int_g( + rocshmem_ctx_t ctx, const int *source, int pe); +__host__ int rocshmem_int_g( + const int *source, int pe); + +__device__ ATTR_NO_INLINE long rocshmem_ctx_long_g( + rocshmem_ctx_t ctx, const long *source, int pe); +__device__ ATTR_NO_INLINE long rocshmem_long_g( + const long *source, int pe); +__host__ long rocshmem_ctx_long_g( + rocshmem_ctx_t ctx, const long *source, int pe); +__host__ long rocshmem_long_g( + const long *source, int pe); + +__device__ ATTR_NO_INLINE long long rocshmem_ctx_longlong_g( + rocshmem_ctx_t ctx, const long long *source, int pe); +__device__ ATTR_NO_INLINE long long rocshmem_longlong_g( + const long long *source, int pe); +__host__ long long rocshmem_ctx_longlong_g( + rocshmem_ctx_t ctx, const long long *source, int pe); +__host__ long long rocshmem_longlong_g( + const long long *source, int pe); + +__device__ ATTR_NO_INLINE unsigned char rocshmem_ctx_uchar_g( + rocshmem_ctx_t ctx, const unsigned char *source, int pe); +__device__ ATTR_NO_INLINE unsigned char rocshmem_uchar_g( + const unsigned char *source, int pe); +__host__ unsigned char rocshmem_ctx_uchar_g( + rocshmem_ctx_t ctx, const unsigned char *source, int pe); +__host__ unsigned char rocshmem_uchar_g( + const unsigned char *source, int pe); + +__device__ ATTR_NO_INLINE unsigned short rocshmem_ctx_ushort_g( + rocshmem_ctx_t ctx, const unsigned short *source, int pe); +__device__ ATTR_NO_INLINE unsigned short rocshmem_ushort_g( + const unsigned short *source, int pe); +__host__ unsigned short rocshmem_ctx_ushort_g( + rocshmem_ctx_t ctx, const unsigned short *source, int pe); +__host__ unsigned short rocshmem_ushort_g( + const unsigned short *source, int pe); + +__device__ ATTR_NO_INLINE unsigned int rocshmem_ctx_uint_g( + rocshmem_ctx_t ctx, const unsigned int *source, int pe); +__device__ ATTR_NO_INLINE unsigned int rocshmem_uint_g( + const unsigned int *source, int pe); +__host__ unsigned int rocshmem_ctx_uint_g( + rocshmem_ctx_t ctx, const unsigned int 
*source, int pe); +__host__ unsigned int rocshmem_uint_g( + const unsigned int *source, int pe); + +__device__ ATTR_NO_INLINE unsigned long rocshmem_ctx_ulong_g( + rocshmem_ctx_t ctx, const unsigned long *source, int pe); +__device__ ATTR_NO_INLINE unsigned long rocshmem_ulong_g( + const unsigned long *source, int pe); +__host__ unsigned long rocshmem_ctx_ulong_g( + rocshmem_ctx_t ctx, const unsigned long *source, int pe); +__host__ unsigned long rocshmem_ulong_g( + const unsigned long *source, int pe); + +__device__ ATTR_NO_INLINE unsigned long long rocshmem_ctx_ulonglong_g( + rocshmem_ctx_t ctx, const unsigned long long *source, int pe); +__device__ ATTR_NO_INLINE unsigned long long rocshmem_ulonglong_g( + const unsigned long long *source, int pe); +__host__ unsigned long long rocshmem_ctx_ulonglong_g( + rocshmem_ctx_t ctx, const unsigned long long *source, int pe); +__host__ unsigned long long rocshmem_ulonglong_g( + const unsigned long long *source, int pe); + + +/** + * @name SHMEM_PUT_NBI + * @brief Writes contiguous data of \p nelems elements from \p source on the + * calling PE to \p dest on \p pe. The operation is not blocking. The caller + * will return as soon as the request is posted. The caller must call + * rocshmem_quiet() on the same context if completion notification is + * required. + * + * This function can be called from divergent control paths at per-thread + * granularity. However, performance may be improved if the caller can + * coalesce contiguous messages and elect a leader thread to call into the + * ROCSHMEM function. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + * heap. + * @param[in] source Source address. Must be an address on the symmetric heap. + * @param[in] nelems Size of the transfer in number of elements. + * @param[in] pe PE of the remote process. + * + * @return void. 
+ */ +__device__ ATTR_NO_INLINE void rocshmem_ctx_float_put_nbi( + rocshmem_ctx_t ctx, float *dest, const float *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_float_put_nbi( + float *dest, const float *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_float_put_nbi( + rocshmem_ctx_t ctx, float *dest, const float *source, + size_t nelems, int pe); +__host__ void rocshmem_float_put_nbi( + float *dest, const float *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_double_put_nbi( + rocshmem_ctx_t ctx, double *dest, const double *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_double_put_nbi( + double *dest, const double *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_double_put_nbi( + rocshmem_ctx_t ctx, double *dest, const double *source, + size_t nelems, int pe); +__host__ void rocshmem_double_put_nbi( + double *dest, const double *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_char_put_nbi( + rocshmem_ctx_t ctx, char *dest, const char *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_char_put_nbi( + char *dest, const char *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_char_put_nbi( + rocshmem_ctx_t ctx, char *dest, const char *source, + size_t nelems, int pe); +__host__ void rocshmem_char_put_nbi( + char *dest, const char *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_put_nbi( + rocshmem_ctx_t ctx, signed char *dest, const signed char *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_schar_put_nbi( + signed char *dest, const signed char *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_schar_put_nbi( + rocshmem_ctx_t ctx, signed char *dest, const signed char *source, + size_t nelems, int pe); +__host__ void rocshmem_schar_put_nbi( + signed char *dest, const signed char *source, size_t nelems, int pe); + 
+__device__ ATTR_NO_INLINE void rocshmem_ctx_short_put_nbi( + rocshmem_ctx_t ctx, short *dest, const short *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_short_put_nbi( + short *dest, const short *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_short_put_nbi( + rocshmem_ctx_t ctx, short *dest, const short *source, + size_t nelems, int pe); +__host__ void rocshmem_short_put_nbi( + short *dest, const short *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_int_put_nbi( + rocshmem_ctx_t ctx, int *dest, const int *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_int_put_nbi( + int *dest, const int *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_int_put_nbi( + rocshmem_ctx_t ctx, int *dest, const int *source, + size_t nelems, int pe); +__host__ void rocshmem_int_put_nbi( + int *dest, const int *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_long_put_nbi( + rocshmem_ctx_t ctx, long *dest, const long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_long_put_nbi( + long *dest, const long *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_long_put_nbi( + rocshmem_ctx_t ctx, long *dest, const long *source, + size_t nelems, int pe); +__host__ void rocshmem_long_put_nbi( + long *dest, const long *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_put_nbi( + rocshmem_ctx_t ctx, long long *dest, const long long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_longlong_put_nbi( + long long *dest, const long long *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_longlong_put_nbi( + rocshmem_ctx_t ctx, long long *dest, const long long *source, + size_t nelems, int pe); +__host__ void rocshmem_longlong_put_nbi( + long long *dest, const long long *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void 
rocshmem_ctx_uchar_put_nbi( + rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uchar_put_nbi( + unsigned char *dest, const unsigned char *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_uchar_put_nbi( + rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source, + size_t nelems, int pe); +__host__ void rocshmem_uchar_put_nbi( + unsigned char *dest, const unsigned char *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_put_nbi( + rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ushort_put_nbi( + unsigned short *dest, const unsigned short *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_ushort_put_nbi( + rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source, + size_t nelems, int pe); +__host__ void rocshmem_ushort_put_nbi( + unsigned short *dest, const unsigned short *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_put_nbi( + rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uint_put_nbi( + unsigned int *dest, const unsigned int *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_uint_put_nbi( + rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source, + size_t nelems, int pe); +__host__ void rocshmem_uint_put_nbi( + unsigned int *dest, const unsigned int *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_put_nbi( + rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulong_put_nbi( + unsigned long *dest, const unsigned long *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_ulong_put_nbi( + rocshmem_ctx_t ctx, unsigned long *dest, const 
unsigned long *source, + size_t nelems, int pe); +__host__ void rocshmem_ulong_put_nbi( + unsigned long *dest, const unsigned long *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_put_nbi( + rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulonglong_put_nbi( + unsigned long long *dest, const unsigned long long *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_ulonglong_put_nbi( + rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source, + size_t nelems, int pe); +__host__ void rocshmem_ulonglong_put_nbi( + unsigned long long *dest, const unsigned long long *source, size_t nelems, int pe); + + +/** + * @brief Writes contiguous data of \p nelems bytes from \p source on the + * calling PE to \p dest on \p pe. The operation is not blocking. The caller + * will return as soon as the request is posted. The caller must call + * rocshmem_quiet() on the same context if completion notification is + * required. + * + * This function can be called from divergent control paths at per-thread + * granularity. However, performance may be improved if the caller can + * coalesce contiguous messages and elect a leader thread to call into the + * ROCSHMEM function. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + heap. + * @param[in] source Source address. Must be an address on the symmetric heap. + * @param[in] nelems Size of the transfer in bytes. + * @param[in] pe PE of the remote process. + * + * @return void. 
+ */ +__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_nbi(rocshmem_ctx_t ctx, + void *dest, + const void *source, + size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_putmem_nbi(void *dest, + const void *source, + size_t nelems, int pe); + + +/** + * @brief Writes contiguous data of \p nelems bytes from \p source on the + * calling PE to \p dest on \p pe. The operation is not blocking. The caller + * will return as soon as the request is posted. The caller must call + * __host__ rocshmem_quiet() if completion notification is required. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + * heap. + * @param[in] source Source address. Must be an address on the symmetric heap. + * @param[in] nelems Size of the transfer in bytes. + * @param[in] pe PE of the remote process. + * + * @return void. + */ +__host__ void rocshmem_ctx_putmem_nbi(rocshmem_ctx_t ctx, void *dest, + const void *source, size_t nelems, + int pe); + +__host__ void rocshmem_putmem_nbi(void *dest, const void *source, + size_t nelems, int pe); + + +/** + * @name SHMEM_GET_NBI + * @brief Reads contiguous data of \p nelems elements from \p source on \p pe + * to \p dest on the calling PE. The operation is not blocking. The caller will + * return as soon as the request is posted. The caller must call + * rocshmem_quiet() on the same context if completion notification is + * required. + * + * This function can be called from divergent control paths at per-thread + * granularity. However, performance may be improved if the caller can + * coalesce contiguous messages and elect a leader thread to call into the + * ROCSHMEM function. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + * heap. + * @param[in] source Source address. Must be an address on the symmetric heap.
+ * @param[in] nelems Size of the transfer in number of elements. + * @param[in] pe PE of the remote process. + * + * @return void. + */ +__device__ ATTR_NO_INLINE void rocshmem_ctx_float_get_nbi( + rocshmem_ctx_t ctx, float *dest, const float *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_float_get_nbi( + float *dest, const float *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_float_get_nbi( + rocshmem_ctx_t ctx, float *dest, const float *source, + size_t nelems, int pe); +__host__ void rocshmem_float_get_nbi(float *dest, + const float *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_double_get_nbi( + rocshmem_ctx_t ctx, double *dest, const double *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_double_get_nbi( + double *dest, const double *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_double_get_nbi( + rocshmem_ctx_t ctx, double *dest, const double *source, + size_t nelems, int pe); +__host__ void rocshmem_double_get_nbi(double *dest, + const double *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_char_get_nbi( + rocshmem_ctx_t ctx, char *dest, const char *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_char_get_nbi( + char *dest, const char *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_char_get_nbi( + rocshmem_ctx_t ctx, char *dest, const char *source, + size_t nelems, int pe); +__host__ void rocshmem_char_get_nbi(char *dest, + const char *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_get_nbi( + rocshmem_ctx_t ctx, signed char *dest, const signed char *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_schar_get_nbi( + signed char *dest, const signed char *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_schar_get_nbi( + rocshmem_ctx_t ctx, signed char *dest, const signed char *source, + size_t nelems, int pe);
+__host__ void rocshmem_schar_get_nbi(signed char *dest, + const signed char *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_short_get_nbi( + rocshmem_ctx_t ctx, short *dest, const short *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_short_get_nbi( + short *dest, const short *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_short_get_nbi( + rocshmem_ctx_t ctx, short *dest, const short *source, + size_t nelems, int pe); +__host__ void rocshmem_short_get_nbi(short *dest, + const short *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_int_get_nbi( + rocshmem_ctx_t ctx, int *dest, const int *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_int_get_nbi( + int *dest, const int *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_int_get_nbi( + rocshmem_ctx_t ctx, int *dest, const int *source, + size_t nelems, int pe); +__host__ void rocshmem_int_get_nbi(int *dest, + const int *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_long_get_nbi( + rocshmem_ctx_t ctx, long *dest, const long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_long_get_nbi( + long *dest, const long *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_long_get_nbi( + rocshmem_ctx_t ctx, long *dest, const long *source, + size_t nelems, int pe); +__host__ void rocshmem_long_get_nbi(long *dest, + const long *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_get_nbi( + rocshmem_ctx_t ctx, long long *dest, const long long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_longlong_get_nbi( + long long *dest, const long long *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_longlong_get_nbi( + rocshmem_ctx_t ctx, long long *dest, const long long *source, + size_t nelems, int pe); +__host__ void rocshmem_longlong_get_nbi(long long *dest, 
+ const long long *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_get_nbi( + rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uchar_get_nbi( + unsigned char *dest, const unsigned char *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_uchar_get_nbi( + rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source, + size_t nelems, int pe); +__host__ void rocshmem_uchar_get_nbi(unsigned char *dest, + const unsigned char *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_get_nbi( + rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ushort_get_nbi( + unsigned short *dest, const unsigned short *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_ushort_get_nbi( + rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source, + size_t nelems, int pe); +__host__ void rocshmem_ushort_get_nbi(unsigned short *dest, + const unsigned short *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_get_nbi( + rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uint_get_nbi( + unsigned int *dest, const unsigned int *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_uint_get_nbi( + rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source, + size_t nelems, int pe); +__host__ void rocshmem_uint_get_nbi(unsigned int *dest, + const unsigned int *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_get_nbi( + rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulong_get_nbi( + unsigned long *dest, const unsigned long *source, size_t nelems, int pe); +__host__ void 
rocshmem_ctx_ulong_get_nbi( + rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source, + size_t nelems, int pe); +__host__ void rocshmem_ulong_get_nbi(unsigned long *dest, + const unsigned long *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_get_nbi( + rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulonglong_get_nbi( + unsigned long long *dest, const unsigned long long *source, size_t nelems, int pe); +__host__ void rocshmem_ctx_ulonglong_get_nbi( + rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source, + size_t nelems, int pe); +__host__ void rocshmem_ulonglong_get_nbi(unsigned long long *dest, + const unsigned long long *source, size_t nelems, int pe); + + +/** + * @brief Reads contiguous data of \p nelems bytes from \p source on \p pe + * to \p dest on the calling PE. The operation is not blocking. The caller will + * return as soon as the request is posted. The caller must call + * rocshmem_quiet() on the same context if completion notification is + * required. + * + * This function can be called from divergent control paths at per-thread + * granularity. However, performance may be improved if the caller can + * coalesce contiguous messages and elect a leader thread to call into the + * ROCSHMEM function. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + * heap. + * @param[in] source Source address. Must be an address on the symmetric heap. + * @param[in] nelems Size of the transfer in bytes. + * @param[in] pe PE of the remote process. + * + * @return void. 
+ */ +__device__ ATTR_NO_INLINE void rocshmem_ctx_getmem_nbi(rocshmem_ctx_t ctx, + void *dest, + const void *source, + size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_getmem_nbi(void *dest, + const void *source, + size_t nelems, int pe); + + +/** + * @brief Reads contiguous data of \p nelems bytes from \p source on \p pe + * to \p dest on the calling PE. The operation is not blocking. The caller will + * return as soon as the request is posted. The caller must call + * __host__ rocshmem_quiet() on the same context if completion notification is + * required. + * + * This function can be called from divergent control paths at per-thread + * granularity. However, performance may be improved if the caller can + * coalesce contiguous messages and elect a leader thread to call into the + * ROCSHMEM function. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + * heap. + * @param[in] source Source address. Must be an address on the symmetric heap. + * @param[in] nelems Size of the transfer in bytes. + * @param[in] pe PE of the remote process. + * + * @return void. + */ +__host__ void rocshmem_ctx_getmem_nbi(rocshmem_ctx_t ctx, void *dest, + const void *source, size_t nelems, + int pe); + +__host__ void rocshmem_getmem_nbi(void *dest, const void *source, + size_t nelems, int pe); + + +} // namespace rocshmem + +#endif // LIBRARY_INCLUDE_ROCSHMEM_RMA_HPP diff --git a/projects/rocshmem/include/rocshmem/rocshmem_RMA_X.hpp b/projects/rocshmem/include/rocshmem/rocshmem_RMA_X.hpp new file mode 100644 index 0000000000..34b9185e9d --- /dev/null +++ b/projects/rocshmem/include/rocshmem/rocshmem_RMA_X.hpp @@ -0,0 +1,1036 @@ +/****************************************************************************** + * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + *****************************************************************************/ + +#ifndef LIBRARY_INCLUDE_ROCSHMEM_RMA_X_HPP +#define LIBRARY_INCLUDE_ROCSHMEM_RMA_X_HPP + +namespace rocshmem { + +/** + * @brief Writes contiguous data of \p nelems elements from \p source on the + * calling PE to \p dest at \p pe. The caller will block until the operation + * completes locally (it is safe to reuse \p source). The caller must + * call into rocshmem_quiet() if remote completion is required. + * + * This function can be called from divergent control paths at per-wave + * granularity. However, all threads in a wave must collectively participate + * in the call using the same arguments + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + * heap. + * @param[in] source Source address. 
Must be an address on the symmetric heap. + * @param[in] nelems Size of the transfer in number of elements. + * @param[in] pe PE of the remote process. + * + * @return void. + */ +__device__ ATTR_NO_INLINE void rocshmem_ctx_float_put_wave( + rocshmem_ctx_t ctx, float *dest, const float *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_float_put_wave( + float *dest, const float *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_double_put_wave( + rocshmem_ctx_t ctx, double *dest, const double *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_double_put_wave( + double *dest, const double *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_char_put_wave( + rocshmem_ctx_t ctx, char *dest, const char *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_char_put_wave( + char *dest, const char *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_put_wave( + rocshmem_ctx_t ctx, signed char *dest, const signed char *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_schar_put_wave( + signed char *dest, const signed char *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_short_put_wave( + rocshmem_ctx_t ctx, short *dest, const short *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_short_put_wave( + short *dest, const short *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_int_put_wave( + rocshmem_ctx_t ctx, int *dest, const int *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_int_put_wave( + int *dest, const int *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_long_put_wave( + rocshmem_ctx_t ctx, long *dest, const long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_long_put_wave( + long *dest, const long *source, size_t 
nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_put_wave( + rocshmem_ctx_t ctx, long long *dest, const long long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_longlong_put_wave( + long long *dest, const long long *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_put_wave( + rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uchar_put_wave( + unsigned char *dest, const unsigned char *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_put_wave( + rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ushort_put_wave( + unsigned short *dest, const unsigned short *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_put_wave( + rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uint_put_wave( + unsigned int *dest, const unsigned int *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_put_wave( + rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulong_put_wave( + unsigned long *dest, const unsigned long *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_put_wave( + rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulonglong_put_wave( + unsigned long long *dest, const unsigned long long *source, size_t nelems, int pe); + + +/** + * @brief Writes contiguous data of \p nelems elements from \p source on the + * calling PE to \p dest at \p pe. 
The caller will block until the operation + * completes locally (it is safe to reuse \p source). The caller must + * call into rocshmem_quiet() if remote completion is required. + * + * This function can be called from divergent control paths at per-workgroup + * (WG) granularity. However, All threads in a WG must collectively participate + * in the call using the same arguments. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + * heap. + * @param[in] source Source address. Must be an address on the symmetric heap. + * @param[in] nelems Size of the transfer in number of elements. + * @param[in] pe PE of the remote process. + * + * @return void. + */ +__device__ ATTR_NO_INLINE void rocshmem_ctx_float_put_wg( + rocshmem_ctx_t ctx, float *dest, const float *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_float_put_wg( + float *dest, const float *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_double_put_wg( + rocshmem_ctx_t ctx, double *dest, const double *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_double_put_wg( + double *dest, const double *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_char_put_wg( + rocshmem_ctx_t ctx, char *dest, const char *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_char_put_wg( + char *dest, const char *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_put_wg( + rocshmem_ctx_t ctx, signed char *dest, const signed char *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_schar_put_wg( + signed char *dest, const signed char *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_short_put_wg( + rocshmem_ctx_t ctx, short *dest, const short *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void 
rocshmem_short_put_wg( + short *dest, const short *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_int_put_wg( + rocshmem_ctx_t ctx, int *dest, const int *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_int_put_wg( + int *dest, const int *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_long_put_wg( + rocshmem_ctx_t ctx, long *dest, const long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_long_put_wg( + long *dest, const long *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_put_wg( + rocshmem_ctx_t ctx, long long *dest, const long long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_longlong_put_wg( + long long *dest, const long long *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_put_wg( + rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uchar_put_wg( + unsigned char *dest, const unsigned char *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_put_wg( + rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ushort_put_wg( + unsigned short *dest, const unsigned short *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_put_wg( + rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uint_put_wg( + unsigned int *dest, const unsigned int *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_put_wg( + rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulong_put_wg( + unsigned long *dest, const unsigned long *source, 
size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_put_wg( + rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulonglong_put_wg( + unsigned long long *dest, const unsigned long long *source, size_t nelems, int pe); + + +/** + * @brief Writes contiguous data of \p nelems bytes from \p source on the + * calling PE to \p dest at \p pe. The caller will block until the operation + * completes locally (it is safe to reuse \p source). The caller must + * call into rocshmem_quiet() if remote completion is required. + * + * This function can be called from divergent control paths at per-wave + * granularity. However, all threads in a wave must participate in the + * call using the same parameters. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + * heap. + * @param[in] source Source address. Must be an address on the symmetric heap. + * @param[in] nelems Size of the transfer in bytes. + * @param[in] pe PE of the remote process. + * + * @return void. + */ +__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_wave( + rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_putmem_wave(void *dest, + const void *source, + size_t nelems, int pe); + +/** + * @brief Writes contiguous data of \p nelems bytes from \p source on the + * calling PE to \p dest at \p pe. The caller will block until the operation + * completes locally (it is safe to reuse \p source). The caller must + * call into rocshmem_quiet() if remote completion is required. + * + * This function can be called from divergent control paths at per-workgroup + * (WG) granularity. However, all threads in the workgroup must participate in + * the call using the same parameters.
+ * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + * heap. + * @param[in] source Source address. Must be an address on the symmetric heap. + * @param[in] nelems Size of the transfer in bytes. + * @param[in] pe PE of the remote process. + * + * @return void. + */ +__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_wg(rocshmem_ctx_t ctx, + void *dest, + const void *source, + size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_putmem_wg(void *dest, + const void *source, + size_t nelems, int pe); + + +/** + * @brief Reads contiguous data of \p nelems elements from \p source on \p pe + * to \p dest on the calling PE. The calling work-group will block until the + * operation completes (data has been placed in \p dest). + * + * This function can be called from divergent control paths at per-wave + * granularity. However, all threads in the wave must participate in the + * call using the same parameters. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + * heap. + * @param[in] source Source address. Must be an address on the symmetric heap. + * @param[in] nelems Size of the transfer in number of elements. + * @param[in] pe PE of the remote process. + * + * @return void.
+ */ +__device__ ATTR_NO_INLINE void rocshmem_ctx_float_get_wave( + rocshmem_ctx_t ctx, float *dest, const float *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_float_get_wave( + float *dest, const float *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_double_get_wave( + rocshmem_ctx_t ctx, double *dest, const double *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_double_get_wave( + double *dest, const double *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_char_get_wave( + rocshmem_ctx_t ctx, char *dest, const char *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_char_get_wave( + char *dest, const char *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_get_wave( + rocshmem_ctx_t ctx, signed char *dest, const signed char *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_schar_get_wave( + signed char *dest, const signed char *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_short_get_wave( + rocshmem_ctx_t ctx, short *dest, const short *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_short_get_wave( + short *dest, const short *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_int_get_wave( + rocshmem_ctx_t ctx, int *dest, const int *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_int_get_wave( + int *dest, const int *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_long_get_wave( + rocshmem_ctx_t ctx, long *dest, const long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_long_get_wave( + long *dest, const long *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_get_wave( + rocshmem_ctx_t ctx, long long *dest, const long long *source, + size_t nelems, int pe);
+__device__ ATTR_NO_INLINE void rocshmem_longlong_get_wave( + long long *dest, const long long *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_get_wave( + rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uchar_get_wave( + unsigned char *dest, const unsigned char *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_get_wave( + rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ushort_get_wave( + unsigned short *dest, const unsigned short *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_get_wave( + rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uint_get_wave( + unsigned int *dest, const unsigned int *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_get_wave( + rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulong_get_wave( + unsigned long *dest, const unsigned long *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_get_wave( + rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulonglong_get_wave( + unsigned long long *dest, const unsigned long long *source, size_t nelems, int pe); + + +/** + * @brief Reads contiguous data of \p nelems elements from \p source on \p pe + * to \p dest on the calling PE. The calling work-group will block until the + * operation completes (data has been placed in \p dest). + * + * This function can be called from divergent control paths at per-workgroup + * granularity.
However, all threads in the workgroup must participate in + * the call using the same parameters. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + * heap. + * @param[in] source Source address. Must be an address on the symmetric heap. + * @param[in] nelems Size of the transfer in number of elements. + * @param[in] pe PE of the remote process. + * + * @return void. + */ +__device__ ATTR_NO_INLINE void rocshmem_ctx_float_get_wg( + rocshmem_ctx_t ctx, float *dest, const float *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_float_get_wg( + float *dest, const float *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_double_get_wg( + rocshmem_ctx_t ctx, double *dest, const double *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_double_get_wg( + double *dest, const double *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_char_get_wg( + rocshmem_ctx_t ctx, char *dest, const char *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_char_get_wg( + char *dest, const char *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_get_wg( + rocshmem_ctx_t ctx, signed char *dest, const signed char *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_schar_get_wg( + signed char *dest, const signed char *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_short_get_wg( + rocshmem_ctx_t ctx, short *dest, const short *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_short_get_wg( + short *dest, const short *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_int_get_wg( + rocshmem_ctx_t ctx, int *dest, const int *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_int_get_wg( + int *dest, const int *source, size_t
nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_long_get_wg( + rocshmem_ctx_t ctx, long *dest, const long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_long_get_wg( + long *dest, const long *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_get_wg( + rocshmem_ctx_t ctx, long long *dest, const long long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_longlong_get_wg( + long long *dest, const long long *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_get_wg( + rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uchar_get_wg( + unsigned char *dest, const unsigned char *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_get_wg( + rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ushort_get_wg( + unsigned short *dest, const unsigned short *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_get_wg( + rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uint_get_wg( + unsigned int *dest, const unsigned int *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_get_wg( + rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulong_get_wg( + unsigned long *dest, const unsigned long *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_get_wg( + rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulonglong_get_wg( + unsigned long long *dest, const unsigned long long 
*source, size_t nelems, int pe); + + +/** + * @brief Reads contiguous data of \p nelems bytes from \p source on \p pe + * to \p dest on the calling PE. The calling work-group will block until the + * operation completes (data has been placed in \p dest). + * + * This function can be called from divergent control paths at per-wave + * granularity. However, all threads in the wave must participate in the + * call using the same parameters. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + * heap. + * @param[in] source Source address. Must be an address on the symmetric heap. + * @param[in] nelems Size of the transfer in bytes. + * @param[in] pe PE of the remote process. + * + * @return void. + */ +__device__ ATTR_NO_INLINE void rocshmem_ctx_getmem_wave( + rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_getmem_wave(void *dest, + const void *source, + size_t nelems, int pe); + +/** + * @brief Reads contiguous data of \p nelems bytes from \p source on \p pe + * to \p dest on the calling PE. The calling work-group will block until the + * operation completes (data has been placed in \p dest). + * + * This function can be called from divergent control paths at per-workgroup + * (WG) granularity. However, all threads in the workgroup must participate + * in the call using the same parameters. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + * heap. + * @param[in] source Source address. Must be an address on the symmetric heap. + * @param[in] nelems Size of the transfer in bytes. + * @param[in] pe PE of the remote process. + * + * @return void. 
+ */ +__device__ ATTR_NO_INLINE void rocshmem_ctx_getmem_wg(rocshmem_ctx_t ctx, + void *dest, + const void *source, + size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_getmem_wg(void *dest, + const void *source, + size_t nelems, int pe); + + +/** + * @brief Writes contiguous data of \p nelems elements from \p source on the + * calling PE to \p dest on \p pe. The operation is not blocking. The caller + * will return as soon as the request is posted. The caller must call + * rocshmem_quiet() on the same context if completion notification is + * required. + * + * This function can be called from divergent control paths at per-wave + * granularity. However, all threads in the wave must call in with the same + * arguments. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + * heap. + * @param[in] source Source address. Must be an address on the symmetric heap. + * @param[in] nelems Number of elements to transfer. + * @param[in] pe PE of the remote process. + * + * @return void. 
+ */ +__device__ ATTR_NO_INLINE void rocshmem_ctx_float_put_nbi_wave( + rocshmem_ctx_t ctx, float *dest, const float *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_float_put_nbi_wave( + float *dest, const float *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_double_put_nbi_wave( + rocshmem_ctx_t ctx, double *dest, const double *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_double_put_nbi_wave( + double *dest, const double *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_char_put_nbi_wave( + rocshmem_ctx_t ctx, char *dest, const char *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_char_put_nbi_wave( + char *dest, const char *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_put_nbi_wave( + rocshmem_ctx_t ctx, signed char *dest, const signed char *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_schar_put_nbi_wave( + signed char *dest, const signed char *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_short_put_nbi_wave( + rocshmem_ctx_t ctx, short *dest, const short *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_short_put_nbi_wave( + short *dest, const short *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_int_put_nbi_wave( + rocshmem_ctx_t ctx, int *dest, const int *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_int_put_nbi_wave( + int *dest, const int *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_long_put_nbi_wave( + rocshmem_ctx_t ctx, long *dest, const long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_long_put_nbi_wave( + long *dest, const long *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_put_nbi_wave( + rocshmem_ctx_t ctx, long 
long *dest, const long long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_longlong_put_nbi_wave( + long long *dest, const long long *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_put_nbi_wave( + rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uchar_put_nbi_wave( + unsigned char *dest, const unsigned char *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_put_nbi_wave( + rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ushort_put_nbi_wave( + unsigned short *dest, const unsigned short *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_put_nbi_wave( + rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uint_put_nbi_wave( + unsigned int *dest, const unsigned int *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_put_nbi_wave( + rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulong_put_nbi_wave( + unsigned long *dest, const unsigned long *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_put_nbi_wave( + rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulonglong_put_nbi_wave( + unsigned long long *dest, const unsigned long long *source, size_t nelems, int pe); + + +/** + * @brief Writes contiguous data of \p nelems elements from \p source on the + * calling PE to \p dest on \p pe. The operation is not blocking. The caller + * will return as soon as the request is posted. 
The caller must call + * rocshmem_quiet() on the same context if completion notification is + * required. + * + * This function can be called from divergent control paths at per-workgroup + * granularity. However, all threads in the WG must call in with the same + * arguments. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + * heap. + * @param[in] source Source address. Must be an address on the symmetric heap. + * @param[in] nelems Number of elements to transfer. + * @param[in] pe PE of the remote process. + * + * @return void. + */ +__device__ ATTR_NO_INLINE void rocshmem_ctx_float_put_nbi_wg( + rocshmem_ctx_t ctx, float *dest, const float *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_float_put_nbi_wg( + float *dest, const float *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_double_put_nbi_wg( + rocshmem_ctx_t ctx, double *dest, const double *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_double_put_nbi_wg( + double *dest, const double *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_char_put_nbi_wg( + rocshmem_ctx_t ctx, char *dest, const char *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_char_put_nbi_wg( + char *dest, const char *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_put_nbi_wg( + rocshmem_ctx_t ctx, signed char *dest, const signed char *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_schar_put_nbi_wg( + signed char *dest, const signed char *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_short_put_nbi_wg( + rocshmem_ctx_t ctx, short *dest, const short *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_short_put_nbi_wg( + short *dest, const short *source, size_t nelems, int pe); + 
+__device__ ATTR_NO_INLINE void rocshmem_ctx_int_put_nbi_wg( + rocshmem_ctx_t ctx, int *dest, const int *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_int_put_nbi_wg( + int *dest, const int *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_long_put_nbi_wg( + rocshmem_ctx_t ctx, long *dest, const long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_long_put_nbi_wg( + long *dest, const long *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_put_nbi_wg( + rocshmem_ctx_t ctx, long long *dest, const long long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_longlong_put_nbi_wg( + long long *dest, const long long *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_put_nbi_wg( + rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uchar_put_nbi_wg( + unsigned char *dest, const unsigned char *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_put_nbi_wg( + rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ushort_put_nbi_wg( + unsigned short *dest, const unsigned short *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_put_nbi_wg( + rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uint_put_nbi_wg( + unsigned int *dest, const unsigned int *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_put_nbi_wg( + rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulong_put_nbi_wg( + unsigned long *dest, const unsigned long *source, size_t nelems, int pe); + 
+__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_put_nbi_wg( + rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulonglong_put_nbi_wg( + unsigned long long *dest, const unsigned long long *source, size_t nelems, int pe); + + +/** + * @brief Writes contiguous data of \p nelems bytes from \p source on the + * calling PE to \p dest on \p pe. The operation is not blocking. The caller + * will return as soon as the request is posted. The caller must call + * rocshmem_quiet() on the same context if completion notification is + * required. + * + * This function can be called from divergent control paths at per-wave + * granularity. However, all threads in a wave must call in with the same + * parameters + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + heap. + * @param[in] source Source address. Must be an address on the symmetric heap. + * @param[in] nelems Size of the transfer in bytes. + * @param[in] pe PE of the remote process. + * + * @return void. + */ +__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_nbi_wave( + rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_putmem_nbi_wave(void *dest, + const void *source, + size_t nelems, + int pe); + +/** + * @brief Writes contiguous data of \p nelems bytes from \p source on the + * calling PE to \p dest on \p pe. The operation is not blocking. The caller + * will return as soon as the request is posted. The caller must call + * rocshmem_quiet() on the same context if completion notification is + * required. + * + * This function can be called from divergent control paths at per-workgroup + * granularity. However, all threads in a WG must call in with the same + * parameters + * + * @param[in] ctx Context with which to perform this operation. 
+ * @param[in] dest Destination address. Must be an address on the symmetric + * heap. + * @param[in] source Source address. Must be an address on the symmetric heap. + * @param[in] nelems Size of the transfer in bytes. + * @param[in] pe PE of the remote process. + * + * @return void. + */ +__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_nbi_wg( + rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_putmem_nbi_wg(void *dest, + const void *source, + size_t nelems, int pe); + + +/** + * @brief Reads contiguous data of \p nelems elements from \p source on \p pe + * to \p dest on the calling PE. The operation is not blocking. The caller + * will return as soon as the request is posted. The caller must call + * rocshmem_quiet() on the same context if completion notification is + * required. + * + * This function can be called from divergent control paths at per-wave + * granularity. However, all threads in the wave must call in with the same + * arguments. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + * heap. + * @param[in] source Source address. Must be an address on the symmetric heap. + * @param[in] nelems Number of elements to transfer. + * @param[in] pe PE of the remote process. + * + * @return void. 
+ */ +__device__ ATTR_NO_INLINE void rocshmem_ctx_float_get_nbi_wave( + rocshmem_ctx_t ctx, float *dest, const float *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_float_get_nbi_wave( + float *dest, const float *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_double_get_nbi_wave( + rocshmem_ctx_t ctx, double *dest, const double *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_double_get_nbi_wave( + double *dest, const double *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_char_get_nbi_wave( + rocshmem_ctx_t ctx, char *dest, const char *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_char_get_nbi_wave( + char *dest, const char *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_get_nbi_wave( + rocshmem_ctx_t ctx, signed char *dest, const signed char *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_schar_get_nbi_wave( + signed char *dest, const signed char *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_short_get_nbi_wave( + rocshmem_ctx_t ctx, short *dest, const short *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_short_get_nbi_wave( + short *dest, const short *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_int_get_nbi_wave( + rocshmem_ctx_t ctx, int *dest, const int *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_int_get_nbi_wave( + int *dest, const int *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_long_get_nbi_wave( + rocshmem_ctx_t ctx, long *dest, const long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_long_get_nbi_wave( + long *dest, const long *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_get_nbi_wave( + rocshmem_ctx_t ctx, long 
long *dest, const long long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_longlong_get_nbi_wave( + long long *dest, const long long *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_get_nbi_wave( + rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uchar_get_nbi_wave( + unsigned char *dest, const unsigned char *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_get_nbi_wave( + rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ushort_get_nbi_wave( + unsigned short *dest, const unsigned short *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_get_nbi_wave( + rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uint_get_nbi_wave( + unsigned int *dest, const unsigned int *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_get_nbi_wave( + rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulong_get_nbi_wave( + unsigned long *dest, const unsigned long *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_get_nbi_wave( + rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulonglong_get_nbi_wave( + unsigned long long *dest, const unsigned long long *source, size_t nelems, int pe); + + +/** + * @brief Reads contiguous data of \p nelems elements from \p source on \p pe + * to \p dest on the calling PE. The operation is not blocking. The caller + * will return as soon as the request is posted. 
The caller must call + * rocshmem_quiet() on the same context if completion notification is + * required. + * + * This function can be called from divergent control paths at per-workgroup + * granularity. However, all threads in the WG must call in with the same + * arguments. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + * heap. + * @param[in] source Source address. Must be an address on the symmetric heap. + * @param[in] nelems Number of elements to transfer. + * @param[in] pe PE of the remote process. + * + * @return void. + */ +__device__ ATTR_NO_INLINE void rocshmem_ctx_float_get_nbi_wg( + rocshmem_ctx_t ctx, float *dest, const float *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_float_get_nbi_wg( + float *dest, const float *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_double_get_nbi_wg( + rocshmem_ctx_t ctx, double *dest, const double *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_double_get_nbi_wg( + double *dest, const double *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_char_get_nbi_wg( + rocshmem_ctx_t ctx, char *dest, const char *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_char_get_nbi_wg( + char *dest, const char *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_get_nbi_wg( + rocshmem_ctx_t ctx, signed char *dest, const signed char *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_schar_get_nbi_wg( + signed char *dest, const signed char *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_short_get_nbi_wg( + rocshmem_ctx_t ctx, short *dest, const short *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_short_get_nbi_wg( + short *dest, const short *source, size_t nelems, int pe); + 
+__device__ ATTR_NO_INLINE void rocshmem_ctx_int_get_nbi_wg( + rocshmem_ctx_t ctx, int *dest, const int *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_int_get_nbi_wg( + int *dest, const int *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_long_get_nbi_wg( + rocshmem_ctx_t ctx, long *dest, const long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_long_get_nbi_wg( + long *dest, const long *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_get_nbi_wg( + rocshmem_ctx_t ctx, long long *dest, const long long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_longlong_get_nbi_wg( + long long *dest, const long long *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_get_nbi_wg( + rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uchar_get_nbi_wg( + unsigned char *dest, const unsigned char *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_get_nbi_wg( + rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ushort_get_nbi_wg( + unsigned short *dest, const unsigned short *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_get_nbi_wg( + rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uint_get_nbi_wg( + unsigned int *dest, const unsigned int *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_get_nbi_wg( + rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulong_get_nbi_wg( + unsigned long *dest, const unsigned long *source, size_t nelems, int pe); + 
+__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_get_nbi_wg( + rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source, + size_t nelems, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulonglong_get_nbi_wg( + unsigned long long *dest, const unsigned long long *source, size_t nelems, int pe); + + +/** + * @brief Reads contiguous data of \p nelems bytes from \p source on \p pe + * to \p dest on the calling PE. The operation is not blocking. The caller + * will return as soon as the request is posted. The caller must call + * rocshmem_quiet() on the same context if completion notification is + * required. + * + * This function can be called from divergent control paths at per-wave + * granularity. However, all threads in the wave must call in with the same + * arguments. + * + * @param[in] ctx Context with which to perform this operation. + * @param[in] dest Destination address. Must be an address on the symmetric + * heap. + * @param[in] source Source address. Must be an address on the symmetric heap. + * @param[in] nelems Size of the transfer in bytes. + * @param[in] pe PE of the remote process. + * + * @return void. + */ +__device__ ATTR_NO_INLINE void rocshmem_ctx_getmem_nbi_wave( + rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_getmem_nbi_wave(void *dest, + const void *source, + size_t nelems, + int pe); + +/** + * @brief Reads contiguous data of \p nelems bytes from \p source on \p pe + * to \p dest on the calling PE. The operation is not blocking. The caller + * will return as soon as the request is posted. The caller must call + * rocshmem_quiet() on the same context if completion notification is + * required. + * + * This function can be called from divergent control paths at per-workgroup + * granularity. However, all threads in the WG must call in with the same + * arguments. + * + * @param[in] ctx Context with which to perform this operation. 
+ * @param[in] dest Destination address. Must be an address on the symmetric + * heap. + * @param[in] source Source address. Must be an address on the symmetric heap. + * @param[in] nelems Size of the transfer in bytes. + * @param[in] pe PE of the remote process. + * + * @return void. + */ +__device__ ATTR_NO_INLINE void rocshmem_ctx_getmem_nbi_wg( + rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_getmem_nbi_wg(void *dest, + const void *source, + size_t nelems, int pe); + + +} // namespace rocshmem + +#endif // LIBRARY_INCLUDE_ROCSHMEM_RMA_X_HPP diff --git a/projects/rocshmem/include/rocshmem/rocshmem_SIG_OP.hpp b/projects/rocshmem/include/rocshmem/rocshmem_SIG_OP.hpp new file mode 100644 index 0000000000..00ad57a10b --- /dev/null +++ b/projects/rocshmem/include/rocshmem/rocshmem_SIG_OP.hpp @@ -0,0 +1,623 @@ +/****************************************************************************** + * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + *****************************************************************************/ + +#ifndef LIBRARY_INCLUDE_ROCSHMEM_SIG_OP_HPP +#define LIBRARY_INCLUDE_ROCSHMEM_SIG_OP_HPP + +namespace rocshmem { +__device__ ATTR_NO_INLINE void rocshmem_putmem_signal( + void *dest, const void *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_signal( + rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_float_put_signal( + rocshmem_ctx_t ctx, float *dest, const float *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_float_put_signal( + float *dest, const float *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_double_put_signal( + rocshmem_ctx_t ctx, double *dest, const double *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_double_put_signal( + double *dest, const double *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_char_put_signal( + rocshmem_ctx_t ctx, char *dest, const char *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_char_put_signal( + char *dest, const char *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_put_signal( + 
rocshmem_ctx_t ctx, signed char *dest, const signed char *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_schar_put_signal( + signed char *dest, const signed char *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_short_put_signal( + rocshmem_ctx_t ctx, short *dest, const short *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_short_put_signal( + short *dest, const short *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_int_put_signal( + rocshmem_ctx_t ctx, int *dest, const int *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_int_put_signal( + int *dest, const int *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_long_put_signal( + rocshmem_ctx_t ctx, long *dest, const long *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_long_put_signal( + long *dest, const long *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_put_signal( + rocshmem_ctx_t ctx, long long *dest, const long long *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_longlong_put_signal( + long long *dest, const long long *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_put_signal( + rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int 
pe); +__device__ ATTR_NO_INLINE void rocshmem_uchar_put_signal( + unsigned char *dest, const unsigned char *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_put_signal( + rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ushort_put_signal( + unsigned short *dest, const unsigned short *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_put_signal( + rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uint_put_signal( + unsigned int *dest, const unsigned int *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_put_signal( + rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulong_put_signal( + unsigned long *dest, const unsigned long *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_put_signal( + rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulonglong_put_signal( + unsigned long long *dest, const unsigned long long *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_putmem_signal_wg( + void *dest, const void *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); +__device__ 
ATTR_NO_INLINE void rocshmem_ctx_putmem_signal_wg( + rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_float_put_signal_wg( + rocshmem_ctx_t ctx, float *dest, const float *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_float_put_signal_wg( + float *dest, const float *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_double_put_signal_wg( + rocshmem_ctx_t ctx, double *dest, const double *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_double_put_signal_wg( + double *dest, const double *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_char_put_signal_wg( + rocshmem_ctx_t ctx, char *dest, const char *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_char_put_signal_wg( + char *dest, const char *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_put_signal_wg( + rocshmem_ctx_t ctx, signed char *dest, const signed char *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_schar_put_signal_wg( + signed char *dest, const signed char *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_short_put_signal_wg( + rocshmem_ctx_t ctx, short *dest, const short *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_short_put_signal_wg( + short *dest, const short *source, size_t 
nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_int_put_signal_wg( + rocshmem_ctx_t ctx, int *dest, const int *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_int_put_signal_wg( + int *dest, const int *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_long_put_signal_wg( + rocshmem_ctx_t ctx, long *dest, const long *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_long_put_signal_wg( + long *dest, const long *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_put_signal_wg( + rocshmem_ctx_t ctx, long long *dest, const long long *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_longlong_put_signal_wg( + long long *dest, const long long *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_put_signal_wg( + rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uchar_put_signal_wg( + unsigned char *dest, const unsigned char *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_put_signal_wg( + rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ushort_put_signal_wg( + unsigned short *dest, const unsigned short *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int 
pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_put_signal_wg( + rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uint_put_signal_wg( + unsigned int *dest, const unsigned int *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_put_signal_wg( + rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulong_put_signal_wg( + unsigned long *dest, const unsigned long *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_put_signal_wg( + rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulonglong_put_signal_wg( + unsigned long long *dest, const unsigned long long *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_putmem_signal_wave( + void *dest, const void *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_signal_wave( + rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_float_put_signal_wave( + rocshmem_ctx_t ctx, float *dest, const float *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_float_put_signal_wave( + float *dest, const float *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ 
ATTR_NO_INLINE void rocshmem_ctx_double_put_signal_wave( + rocshmem_ctx_t ctx, double *dest, const double *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_double_put_signal_wave( + double *dest, const double *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_char_put_signal_wave( + rocshmem_ctx_t ctx, char *dest, const char *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_char_put_signal_wave( + char *dest, const char *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_put_signal_wave( + rocshmem_ctx_t ctx, signed char *dest, const signed char *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_schar_put_signal_wave( + signed char *dest, const signed char *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_short_put_signal_wave( + rocshmem_ctx_t ctx, short *dest, const short *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_short_put_signal_wave( + short *dest, const short *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_int_put_signal_wave( + rocshmem_ctx_t ctx, int *dest, const int *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_int_put_signal_wave( + int *dest, const int *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_long_put_signal_wave( + rocshmem_ctx_t ctx, long *dest, const long 
*source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_long_put_signal_wave( + long *dest, const long *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_put_signal_wave( + rocshmem_ctx_t ctx, long long *dest, const long long *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_longlong_put_signal_wave( + long long *dest, const long long *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_put_signal_wave( + rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uchar_put_signal_wave( + unsigned char *dest, const unsigned char *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_put_signal_wave( + rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ushort_put_signal_wave( + unsigned short *dest, const unsigned short *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_put_signal_wave( + rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uint_put_signal_wave( + unsigned int *dest, const unsigned int *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_put_signal_wave( + rocshmem_ctx_t ctx, unsigned long *dest, const 
unsigned long *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulong_put_signal_wave( + unsigned long *dest, const unsigned long *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_put_signal_wave( + rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulonglong_put_signal_wave( + unsigned long long *dest, const unsigned long long *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_putmem_signal_nbi( + void *dest, const void *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_signal_nbi( + rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_float_put_signal_nbi( + rocshmem_ctx_t ctx, float *dest, const float *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_float_put_signal_nbi( + float *dest, const float *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_double_put_signal_nbi( + rocshmem_ctx_t ctx, double *dest, const double *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_double_put_signal_nbi( + double *dest, const double *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_char_put_signal_nbi( + rocshmem_ctx_t ctx, char *dest, const char *source, size_t nelems, + uint64_t 
*sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_char_put_signal_nbi( + char *dest, const char *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_put_signal_nbi( + rocshmem_ctx_t ctx, signed char *dest, const signed char *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_schar_put_signal_nbi( + signed char *dest, const signed char *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_short_put_signal_nbi( + rocshmem_ctx_t ctx, short *dest, const short *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_short_put_signal_nbi( + short *dest, const short *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_int_put_signal_nbi( + rocshmem_ctx_t ctx, int *dest, const int *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_int_put_signal_nbi( + int *dest, const int *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_long_put_signal_nbi( + rocshmem_ctx_t ctx, long *dest, const long *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_long_put_signal_nbi( + long *dest, const long *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_put_signal_nbi( + rocshmem_ctx_t ctx, long long *dest, const long long *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_longlong_put_signal_nbi( 
+ long long *dest, const long long *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_put_signal_nbi( + rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uchar_put_signal_nbi( + unsigned char *dest, const unsigned char *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_put_signal_nbi( + rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ushort_put_signal_nbi( + unsigned short *dest, const unsigned short *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_put_signal_nbi( + rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uint_put_signal_nbi( + unsigned int *dest, const unsigned int *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_put_signal_nbi( + rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulong_put_signal_nbi( + unsigned long *dest, const unsigned long *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_put_signal_nbi( + rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE 
void rocshmem_ulonglong_put_signal_nbi( + unsigned long long *dest, const unsigned long long *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_putmem_signal_nbi_wg( + void *dest, const void *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_signal_nbi_wg( + rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_float_put_signal_nbi_wg( + rocshmem_ctx_t ctx, float *dest, const float *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_float_put_signal_nbi_wg( + float *dest, const float *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_double_put_signal_nbi_wg( + rocshmem_ctx_t ctx, double *dest, const double *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_double_put_signal_nbi_wg( + double *dest, const double *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_char_put_signal_nbi_wg( + rocshmem_ctx_t ctx, char *dest, const char *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_char_put_signal_nbi_wg( + char *dest, const char *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_put_signal_nbi_wg( + rocshmem_ctx_t ctx, signed char *dest, const signed char *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_schar_put_signal_nbi_wg( + signed char *dest, const 
signed char *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_short_put_signal_nbi_wg( + rocshmem_ctx_t ctx, short *dest, const short *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_short_put_signal_nbi_wg( + short *dest, const short *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_int_put_signal_nbi_wg( + rocshmem_ctx_t ctx, int *dest, const int *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_int_put_signal_nbi_wg( + int *dest, const int *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_long_put_signal_nbi_wg( + rocshmem_ctx_t ctx, long *dest, const long *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_long_put_signal_nbi_wg( + long *dest, const long *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_put_signal_nbi_wg( + rocshmem_ctx_t ctx, long long *dest, const long long *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_longlong_put_signal_nbi_wg( + long long *dest, const long long *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_put_signal_nbi_wg( + rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uchar_put_signal_nbi_wg( + unsigned char *dest, const unsigned char *source, size_t nelems, uint64_t *sig_addr, + 
uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_put_signal_nbi_wg( + rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ushort_put_signal_nbi_wg( + unsigned short *dest, const unsigned short *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_put_signal_nbi_wg( + rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uint_put_signal_nbi_wg( + unsigned int *dest, const unsigned int *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_put_signal_nbi_wg( + rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulong_put_signal_nbi_wg( + unsigned long *dest, const unsigned long *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_put_signal_nbi_wg( + rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulonglong_put_signal_nbi_wg( + unsigned long long *dest, const unsigned long long *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_putmem_signal_nbi_wave( + void *dest, const void *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_signal_nbi_wave( + rocshmem_ctx_t ctx, void *dest, const 
void *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_float_put_signal_nbi_wave( + rocshmem_ctx_t ctx, float *dest, const float *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_float_put_signal_nbi_wave( + float *dest, const float *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_double_put_signal_nbi_wave( + rocshmem_ctx_t ctx, double *dest, const double *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_double_put_signal_nbi_wave( + double *dest, const double *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_char_put_signal_nbi_wave( + rocshmem_ctx_t ctx, char *dest, const char *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_char_put_signal_nbi_wave( + char *dest, const char *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_put_signal_nbi_wave( + rocshmem_ctx_t ctx, signed char *dest, const signed char *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_schar_put_signal_nbi_wave( + signed char *dest, const signed char *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_short_put_signal_nbi_wave( + rocshmem_ctx_t ctx, short *dest, const short *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_short_put_signal_nbi_wave( + short *dest, const short *source, size_t nelems, uint64_t *sig_addr, + 
uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_int_put_signal_nbi_wave( + rocshmem_ctx_t ctx, int *dest, const int *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_int_put_signal_nbi_wave( + int *dest, const int *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_long_put_signal_nbi_wave( + rocshmem_ctx_t ctx, long *dest, const long *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_long_put_signal_nbi_wave( + long *dest, const long *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_put_signal_nbi_wave( + rocshmem_ctx_t ctx, long long *dest, const long long *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_longlong_put_signal_nbi_wave( + long long *dest, const long long *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_put_signal_nbi_wave( + rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uchar_put_signal_nbi_wave( + unsigned char *dest, const unsigned char *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_put_signal_nbi_wave( + rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ushort_put_signal_nbi_wave( + unsigned short *dest, const unsigned short *source, size_t nelems, uint64_t *sig_addr, + 
uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_put_signal_nbi_wave( + rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_uint_put_signal_nbi_wave( + unsigned int *dest, const unsigned int *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_put_signal_nbi_wave( + rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulong_put_signal_nbi_wave( + unsigned long *dest, const unsigned long *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + +__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_put_signal_nbi_wave( + rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source, size_t nelems, + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe); +__device__ ATTR_NO_INLINE void rocshmem_ulonglong_put_signal_nbi_wave( + unsigned long long *dest, const unsigned long long *source, size_t nelems, uint64_t *sig_addr, + uint64_t signal, int sig_op, int pe); + + +__device__ ATTR_NO_INLINE uint64_t rocshmem_signal_fetch(const uint64_t *sig_addr); +__device__ ATTR_NO_INLINE uint64_t rocshmem_signal_fetch_wg(const uint64_t *sig_addr); +__device__ ATTR_NO_INLINE uint64_t rocshmem_signal_fetch_wave(const uint64_t *sig_addr); + + +} // namespace rocshmem + +#endif // LIBRARY_INCLUDE_ROCSHMEM_SIG_OP_HPP diff --git a/projects/rocshmem/include/rocshmem/rocshmem_common.hpp b/projects/rocshmem/include/rocshmem/rocshmem_common.hpp new file mode 100644 index 0000000000..baea438244 --- /dev/null +++ b/projects/rocshmem/include/rocshmem/rocshmem_common.hpp @@ -0,0 +1,130 @@ 
+/****************************************************************************** + * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + *****************************************************************************/ + +#ifndef LIBRARY_INCLUDE_ROCSHMEM_COMMON_HPP +#define LIBRARY_INCLUDE_ROCSHMEM_COMMON_HPP + +namespace rocshmem { /* Shared macro, enums, constants and handle types declared in namespace rocshmem. */ + +#ifdef USE_FUNC_CALL /* ATTR_NO_INLINE expands to the noinline attribute only when USE_FUNC_CALL is defined; otherwise it expands to nothing. */ +#define ATTR_NO_INLINE __attribute__((noinline)) +#else +#define ATTR_NO_INLINE +#endif + + +enum ROCSHMEM_STATUS { /* Status codes: success is 0, error is 1. */ + ROCSHMEM_SUCCESS = 0, + ROCSHMEM_ERROR = 1, +}; + +enum ROCSHMEM_OP { /* Operation selectors, implicitly numbered from 0 in declaration order. NOTE(review): presumably the reduction operators - confirm against the routines that consume them. */ + ROCSHMEM_SUM, + ROCSHMEM_MAX, + ROCSHMEM_MIN, + ROCSHMEM_PROD, + ROCSHMEM_AND, + ROCSHMEM_OR, + ROCSHMEM_XOR, + ROCSHMEM_REPLACE +}; + +enum ROCSHMEM_SIGNAL_OPS { /* Signal update modes (set or add). NOTE(review): presumably the values expected by the int sig_op parameter of the *_put_signal routines - confirm. */ + ROCSHMEM_SIGNAL_SET, + ROCSHMEM_SIGNAL_ADD, +}; + +/** + * @brief Types defined for rocshmem_wait() operations.
+ */ +enum rocshmem_cmps { + ROCSHMEM_CMP_EQ, + ROCSHMEM_CMP_NE, + ROCSHMEM_CMP_GT, + ROCSHMEM_CMP_GE, + ROCSHMEM_CMP_LT, + ROCSHMEM_CMP_LE, +}; + +enum rocshmem_thread_ops { /* Threading levels, implicitly numbered from 0 in declaration order. NOTE(review): presumably the thread-support levels requested at initialization - confirm. */ + ROCSHMEM_THREAD_SINGLE, + ROCSHMEM_THREAD_FUNNELED, + ROCSHMEM_THREAD_WG_FUNNELED, + ROCSHMEM_THREAD_SERIALIZED, + ROCSHMEM_THREAD_MULTIPLE +}; + +/** + * @brief Bitwise flags to mask configuration parameters. + */ +enum rocshmem_team_configs { + ROCSHMEM_TEAM_DEFAULT_CONFIGS, + ROCSHMEM_TEAM_NUM_CONTEXTS +}; + +typedef struct { /* Team configuration record; its only field is num_contexts. */ + int num_contexts; +} rocshmem_team_config_t; + +constexpr size_t ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE = 1024; /* NOTE(review): size_t (here) and uint64_t (below) are used with no #include visible in this header - presumably supplied by the including header; confirm. */ +constexpr size_t ROCSHMEM_ATA_MAX_WRKDATA_SIZE = (4 * 1024 * 1024); +constexpr size_t ROCSHMEM_BARRIER_SYNC_SIZE = 256; +constexpr size_t ROCSHMEM_REDUCE_SYNC_SIZE = 256; +// Internally calls sync function, which matches barrier implementation +constexpr size_t ROCSHMEM_BCAST_SYNC_SIZE = ROCSHMEM_BARRIER_SYNC_SIZE; +constexpr size_t ROCSHMEM_ALLTOALL_SYNC_SIZE = ROCSHMEM_BARRIER_SYNC_SIZE + 1; /* One more than the barrier sync size. */ +constexpr size_t ROCSHMEM_FCOLLECT_SYNC_SIZE = ROCSHMEM_ALLTOALL_SYNC_SIZE; +constexpr size_t ROCSHMEM_SYNC_VALUE = 0; + +const int ROCSHMEM_CTX_ZERO = 0; /* Context option values: 0, followed by the distinct powers of two 1, 2, 4 and 8. */ +const int ROCSHMEM_CTX_NOSTORE = 1; +const int ROCSHMEM_CTX_SERIALIZED = 2; +const int ROCSHMEM_CTX_WG_PRIVATE = 4; +const int ROCSHMEM_CTX_SHARED = 8; + +/** + * @brief GPU side OpenSHMEM context created from each work-group's + * rocshmem_wg_handle_t + */ +typedef struct { + void *ctx_opaque; + void *team_opaque; +} rocshmem_ctx_t; + +/** + * Shmem default context; declared extern here with the __constant__ qualifier. + */ +extern __constant__ rocshmem_ctx_t ROCSHMEM_CTX_DEFAULT; + +/** + * Used internally to set default context. + */ +void set_internal_ctx(rocshmem_ctx_t *ctx); + +typedef uint64_t *rocshmem_team_t; /* Team handle, represented as a pointer to uint64_t. */ +extern rocshmem_team_t ROCSHMEM_TEAM_WORLD; + +const rocshmem_team_t ROCSHMEM_TEAM_INVALID = nullptr; /* Invalid-team sentinel: the null handle. */ + +} // namespace rocshmem + +#endif // LIBRARY_INCLUDE_ROCSHMEM_COMMON_HPP